Source code for evcouplings.couplings.mapping

Mapping indices for complexes / multi-domain sequences to
internal model numbering.

  Thomas A. Hopf
  Anna G. Green (MultiSegmentCouplingsModel)

from collections import Iterable
from copy import deepcopy
from evcouplings.couplings.model import CouplingsModel
import pandas as pd
import numpy as np

[docs]class Segment: """ Represents a continuous stretch of sequence in a sequence alignment to infer evolutionary couplings (e.g. multiple domains, or monomers in a concatenated complex alignment) """ def __init__(self, segment_type, sequence_id, region_start, region_end, positions=None, segment_id="A"): """ Create a new sequence segment Parameters ---------- segment_type : {"aa", "dna", "rna"} Type of sequence sequence_id : str Identifier of sequence region_start : int Start index of sequence segment region_end : int End index of sequence segment (position is inclusive) positions : list(int), optional (default: None) Positions in the sequence alignment that will be used for EC calculation (all positions corresponding to uppercase residues). Compulsory parameter when using non-focus mode. segment_id : str Identifier of segment (must be unique) """ self.segment_type = segment_type self.sequence_id = sequence_id self.region_start = region_start self.region_end = region_end if positions is not None: self.positions = list(map(int, positions)) else: self.positions = None self.segment_id = segment_id
[docs] @classmethod def from_list(cls, segment): """ Create a segment object from list representation (e.g. from config). Parameters ---------- segment : list List representation of segment, with the following items: segment_id (str), segment_type (str), sequence_id (str), region_start (int), region_end (int), positions (list(int)) Returns ------- Segment New Segment instance from list """ segment_id, segment_type, sequence_id, region_start, region_end, positions = segment return cls(segment_type, sequence_id, region_start, region_end, positions, segment_id)
[docs] def to_list(self): """ Represent segment as list (for storing in configs) Returns ------- list List representation of segment, with the following items: segment_id (str), segment_type (str), sequence_id (str), region_start (int), region_end (int), positions (list(int)) """ return [ self.segment_id, self.segment_type, self.sequence_id, self.region_start, self.region_end, self.positions ]
[docs] def default_chain_name(self): """ Retrieve default PDB chain identifier the segment will be mapped to in 3D structures (by convention, segments in the pipeline are named A_1, A_2, ..., B_1, B_2, ...; the default chain identifier is anything before the underscore). Returns ------- chain : str Default PDB chain identifier the segment maps to """ return self.segment_id.split("_")[0]
[docs]class SegmentIndexMapper: """ Map indices of one or more sequence segments into CouplingsModel internal numbering space. Can also be used to (trivially) remap indices for a single sequence. """ def __init__(self, focus_mode, first_index, *segments): """ Create index mapping from individual segments Parameters ---------- focus_mode : bool Set to true if model was inferred in focus mode, False otherwise. first_index : int Index of first position in model/sequence. For nonfocus mode, should always be one. For focus mode, corresponds to index given in sequence header (1 if not in alignment) *segments: evcouplings.couplings.mapping.Segment: Segments containing numberings for each individual segment """ # store segments so we retain full information self.segments = deepcopy(segments) # build up target indices by going through all segments self.target_pos = [] for s in segments: if focus_mode: # in focus mode, we simply assemble the # ranges of continuous indices, because # numbering in model is also continuous cur_target = range( s.region_start, s.region_end + 1 ) else: # in non-focus mode, we need to assemble # the indices of actual model positions, # since the numbering in model may be # discontinuous cur_target = s.positions # create tuples of (segment_id, target_pos) self.target_pos += list(zip( [s.segment_id] * len(cur_target), cur_target )) # create correspond list of model positions; # note that in focus mode, not all of these # positions might actually be in the model # (if they correspond to lowercase columns) self.model_pos = list(range( first_index, first_index + len(self.target_pos) )) # mapping from target sequences (segments) into # model numbering space (continuous numbering) self.target_to_model = dict( zip(self.target_pos, self.model_pos) ) # inverse mapping from model numbering into target # numbering self.model_to_target = dict( zip(self.model_pos, self.target_pos) )
[docs] def patch_model(self, model, inplace=True): """ Change numbering of CouplingModel object so that it uses segment-based numbering Parameters ---------- model : CouplingsModel Model that will be updated to segment- based numbering inplace : bool, optional (default: True) If True, change passed model; otherwise returnnew object Returns ------- CouplingsModel Model with updated numbering (if inplace is False, this will point to original model) Raises ------ ValueError If segment mapping does not match internal model numbering """ if not inplace: model = deepcopy(model) try: mapped = [ self.model_to_target[pos] for pos in model.index_list ] except KeyError: raise ValueError( "Mapping from target to model positions does " "not contain all positions of internal model numbering" ) # update model mapping model.index_list = mapped # return updated model return model
def __map(self, indices, mapping_dict): """ Applies index mapping either to a single index, or to a list of indices Parameters ---------- indices: int, or (str, int), or lists thereof Indices in input numbering space mapping_dict : dict(int->(str, int)) or dict((str, int)-> int) Mapping from one indexing space into the other Returns ------- list of int, or list of (str, int) Mapped indices """ if isinstance(indices, Iterable) and not isinstance(indices, tuple): return [mapping_dict[x] for x in indices] else: return mapping_dict[indices] def __call__(self, segment_id, pos): """ Function-style syntax for single position to be mapped (calls to_model method) Parameters ---------- segment_id : str Identifier of segment pos : int Position in segment numbering Returns ------- int Index in coupling object numbering space """ return self.to_model((segment_id, pos))
[docs] def to_target(self, x): """ Map model index to target index Parameters ---------- x : int, or list of ints Indices in model numbering Returns ------- (str, int), or list of (str, int) Indices mapped into target numbering. Tuples are (segment_id, index_in_segment) """ return self.__map(x, self.model_to_target)
[docs] def to_model(self, x): """ Map target index to model index Parameters ---------- x : (str, int), or list of (str, int) Indices in target indexing (segment_id, index_in_segment) Returns ------- int, or list of int Monomer indices mapped into couplings object numbering """ return self.__map(x, self.target_to_model)
[docs]def segment_map_ecs(ecs, mapper): """ Map EC dataframe in model numbering into segment numbering Parameters ---------- ecs : pandas.DataFrame EC table (with columns i and j) Returns ------- pandas.DataFrame Mapped EC table (with columns i and j mapped, and additional columns segment_i and segment_j) """ ecs = deepcopy(ecs) def _map_column(col): seg_col = "segment_" + col # create new dataframe with two columns # 1) mapped segment, 2) mapped position col_m = pd.DataFrame( mapper.to_target(ecs.loc[:, col]), columns=[seg_col, col] ) # assign values instead of Series, because otherwise # wrong values end up in column ecs.loc[:, col] = col_m.loc[:, col].values ecs.loc[:, seg_col] = col_m.loc[:, seg_col].values # map both position columns (and add segment id) _map_column("i") _map_column("j") return ecs
[docs]class MultiSegmentCouplingsModel(CouplingsModel): """ Complex specific Couplings Model that handles segments and provides the option to convert model into inter-segment only. """ def __init__(self, filename, *segments, precision="float32", file_format="plmc_v2", **kwargs): """ filename : str Binary Jij file containing model parameters from plmc software precision : {"float32", "float64"}, default: "float32" Sets if input file has single (float32) or double precision (float64) } file_format : {"plmc_v2", "plmc_v1"}, default: "plmc_v2" File format of parameter file. segments: list of evcouplings.couplings.Segment TODO: have a additional constructor/class method that can take an existing loaded instance of a CouplingsModel and turn it into a MultiSegmentCouplingsModel """ super().__init__(filename, precision, file_format, **kwargs) # initialize the segment index mapper to update model numbering if len(segments) == 0: raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel") first_segment = segments[0] index_start = first_segment.region_start r = SegmentIndexMapper( True, # use focus mode index_start, # first index of first segment *segments ) # update model numbering r.patch_model(model=self)
[docs] def to_inter_segment_model(self): """ Convert model to inter-segment only parameters, ie the J_ijs that correspond to inter-protein or inter-domain residue pairs. All other parameters are set to 0. Returns ------- CouplingsModel Copy of object turned into inter-only Epistatic model """ h_i = np.zeros((self.L, self.num_symbols)) J_ij = np.zeros(self.J_ij.shape) for idx_i, i in enumerate(self.index_list): for idx_j, j in enumerate(self.index_list): if i[0] != j[0]: # if the segment identifier is different J_ij[idx_i, idx_j] = self.J_ij[idx_i, idx_j] ci = deepcopy(self) ci.h_i = h_i ci.J_ij = J_ij ci._reset_precomputed() return ci