Source code for evcouplings.compare.mapping

"""
Index mapping for PDB structures

Authors:
  Thomas A. Hopf
  Charlotta P. Schärfe
"""

import numpy as np
import pandas as pd

from evcouplings.align.alignment import Alignment, parse_header


[docs]def map_indices(seq_i, start_i, end_i, seq_j, start_j, end_j, gaps=("-", ".")): """ Compute index mapping between positions in two aligned sequences Parameters ---------- seq_i : str First aligned sequence start_i : int Index of first position in first sequence end_i : int Index of last position in first sequence (used for verification purposes only) seq_j : str Second aligned sequence start_j : int Index of first position in second sequence end_j : int Index of last position in second sequence (used for verification purposes only) Returns ------- pandas.DataFrame Mapping table containing assignment of 1. index in first sequence (i) 2. symbol in first sequence (A_i) 3. index in second sequence (j) 4. symbol in second sequence (A_j) """ NA = np.nan pos_i = start_i pos_j = start_j mapping = [] for i, (res_i, res_j) in enumerate(zip(seq_i, seq_j)): # Do we match two residues, or residue and a gap? # if matching two residues, store 1 to 1 mapping. # Store positions as strings, since pandas cannot # handle nan values in integer columns if res_i not in gaps and res_j not in gaps: mapping.append([str(pos_i), res_i, str(pos_j), res_j]) elif res_i not in gaps: mapping.append([str(pos_i), res_i, NA, NA]) elif res_j not in gaps: mapping.append([NA, NA, str(pos_j), res_j]) # adjust position in sequences if we saw a residue if res_i not in gaps: pos_i += 1 if res_j not in gaps: pos_j += 1 assert pos_i - 1 == end_i and pos_j - 1 == end_j return pd.DataFrame( mapping, columns=["i", "A_i", "j", "A_j"] )
[docs]def alignment_index_mapping(alignment_file, format="stockholm", target_seq=None): """ Create index mapping table between sequence positions based on a sequence alignment. Parameters ---------- alignment_file : str Path of alignment file containing sequences for which indices shoul dbe mapped format : {"stockholm", "fasta"} Format of alignment file target_seq : str, optional (default: None) Identifier of sequence around which the index mapping will be centered. If None, first sequence in alignment will be used. Returns ------- pandas.DataFrame Mapping table containing assignment of 1. index in target sequence (i) 2. symbol in target sequence (A_i) For all other sequences in alignment, the following two columns: 3. index in second sequence (j_<sequence id>) 4. symbol in second sequence (A_j_<sequence_id>) """ # read alignment that is basis of mapping with open(alignment_file) as a: ali = Alignment.from_file(a, format) # determine index of target sequence if necessary # (default: first sequence in alignment) if target_seq is None: target_seq_index = 0 else: for i, full_id in enumerate(ali.ids): if full_id.startswith(target_seq): target_seq_index = i # get range and sequence of target id_, target_start, target_end = parse_header( ali.ids[target_seq_index] ) target_seq = ali.matrix[target_seq_index] # now map from target numbering to hit numbering full_map = None for i, full_id in enumerate(ali.ids): if i == target_seq_index: continue # extract information about sequence we are comparing to id_, region_start, region_end = parse_header(full_id) other_seq = ali.matrix[i] # compute mapping table map_df = map_indices( target_seq, target_start, target_end, other_seq, region_start, region_end, [ali._match_gap, ali._insert_gap] ) # adjust column names for non-target sequence map_df = map_df.rename( columns={ "j": "i_" + full_id, "A_j": "A_i_" + full_id, } ) # add to full mapping table, left outer join # so all positions in target sequence are kept if full_map is None: full_map = map_df else: full_map = full_map.merge( map_df, on=("i", "A_i"), how="left" ) return full_map