Source code for evcouplings.compare.mapping

"""
Index mapping for PDB structures

Authors:
  Thomas A. Hopf
  Charlotta P. Schärfe
"""

import numpy as np
import pandas as pd

from evcouplings.align.alignment import Alignment, parse_header


[docs]def map_indices(seq_i, start_i, end_i,
                seq_j, start_j, end_j, gaps=("-", ".")):
    """
    Compute index mapping between positions in two
    aligned sequences

    Parameters
    ----------
    seq_i : str
        First aligned sequence
    start_i : int
        Index of first position in first sequence
    end_i : int
        Index of last position in first sequence
        (used for verification purposes only)
    seq_j : str
        Second aligned sequence
    start_j : int
        Index of first position in second sequence
    end_j : int
        Index of last position in second sequence
        (used for verification purposes only)

    Returns
    -------
    pandas.DataFrame
        Mapping table containing assignment of

        1. index in first sequence (i)
        2. symbol in first sequence (A_i)
        3. index in second sequence (j)
        4. symbol in second sequence (A_j)
    """
    NA = np.nan
    pos_i = start_i
    pos_j = start_j
    mapping = []

    for i, (res_i, res_j) in enumerate(zip(seq_i, seq_j)):
        # Do we match two residues, or residue and a gap?
        # if matching two residues, store 1 to 1 mapping.
        # Store positions as strings, since pandas cannot
        # handle nan values in integer columns
        if res_i not in gaps and res_j not in gaps:
            mapping.append([str(pos_i), res_i, str(pos_j), res_j])
        elif res_i not in gaps:
            mapping.append([str(pos_i), res_i, NA, NA])
        elif res_j not in gaps:
            mapping.append([NA, NA, str(pos_j), res_j])

        # adjust position in sequences if we saw a residue
        if res_i not in gaps:
            pos_i += 1

        if res_j not in gaps:
            pos_j += 1

    assert pos_i - 1 == end_i and pos_j - 1 == end_j

    return pd.DataFrame(
        mapping, columns=["i", "A_i", "j", "A_j"]
    )


[docs]def alignment_index_mapping(alignment_file, format="stockholm",
                            target_seq=None):
    """

    Create index mapping table between sequence positions
    based on a sequence alignment.

    Parameters
    ----------
    alignment_file : str
        Path of alignment file containing sequences for
        which indices shoul dbe mapped
    format : {"stockholm", "fasta"}
        Format of alignment file
    target_seq : str, optional (default: None)
        Identifier of sequence around which the index
        mapping will be centered. If None, first sequence
        in alignment will be used.

    Returns
    -------
    pandas.DataFrame
        Mapping table containing assignment of

        1. index in target sequence (i)
        2. symbol in target sequence (A_i)

        For all other sequences in alignment, the following
        two columns:

        3. index in second sequence (j_<sequence id>)
        4. symbol in second sequence (A_j_<sequence_id>)
    """
    # read alignment that is basis of mapping
    with open(alignment_file) as a:
        ali = Alignment.from_file(a, format)

    # determine index of target sequence if necessary
    # (default: first sequence in alignment)
    if target_seq is None:
        target_seq_index = 0
    else:
        for i, full_id in enumerate(ali.ids):
            if full_id.startswith(target_seq):
                target_seq_index = i

    # get range and sequence of target
    id_, target_start, target_end = parse_header(
        ali.ids[target_seq_index]
    )
    target_seq = ali.matrix[target_seq_index]

    # now map from target numbering to hit numbering
    full_map = None

    for i, full_id in enumerate(ali.ids):
        if i == target_seq_index:
            continue

        # extract information about sequence we are comparing to
        id_, region_start, region_end = parse_header(full_id)
        other_seq = ali.matrix[i]

        # compute mapping table
        map_df = map_indices(
            target_seq, target_start, target_end,
            other_seq, region_start, region_end,
            [ali._match_gap, ali._insert_gap]
        )

        # adjust column names for non-target sequence
        map_df = map_df.rename(
            columns={
                "j": "i_" + full_id,
                "A_j": "A_i_" + full_id,
            }
        )

        # add to full mapping table, left outer join
        # so all positions in target sequence are kept
        if full_map is None:
            full_map = map_df
        else:
            full_map = full_map.merge(
                map_df, on=("i", "A_i"), how="left"
            )

    return full_map