# Source code for evcouplings.mutate.calculations

"""
High-level mutation calculation functions for EVmutation

.. todo::

    implement segment handling

Authors:
  Thomas A. Hopf
  Anna G. Green (generalization for multiple segments)
"""

import numpy as np
import pandas as pd
from evcouplings.utils.calculations import entropy_map


# Maps each accepted value of the `hamiltonian` argument of
# predict_mutation_table() to the index of the matching entry in the
# result of model.delta_hamiltonian(). Key order matters: the keys are
# joined in this order when reporting valid choices in an error message.
COMPONENT_TO_INDEX = {
    name: index
    for index, name in enumerate(("full", "couplings", "fields"))
}


def extract_mutations(mutation_string, offset=0, sep=","):
    """
    Turns a string containing mutations of the format I100V into a list of
    tuples with format (100, 'I', 'V') (index, from, to)

    The wild-type designations "wild", "wt" and the empty string
    (compared case-insensitively) yield an empty list.

    Parameters
    ----------
    mutation_string : str
        Comma-separated list of one or more mutations (e.g. "K50R,I100V")
    offset : int, default: 0
        Offset to be added to the index/position of each mutation
    sep : str, default ","
        String used to separate multiple mutations

    Returns
    -------
    list of tuples
        List of tuples of the form (index+offset, from, to), in the
        order in which the mutations appear in mutation_string

    Raises
    ------
    ValueError
        If the part of a mutation between its first and last character
        cannot be converted to an integer position
    """
    # wild-type sequence: there are no substitutions to extract
    if mutation_string.lower() in ("wild", "wt", ""):
        return []

    # first character is the wild-type symbol, last character the
    # substituted symbol, everything in between the position
    return [
        (int(mutation[1:-1]) + offset, mutation[0], mutation[-1])
        for mutation in mutation_string.split(sep)
    ]


def predict_mutation_table(model, table, output_column="prediction_epistatic",
                           mutant_column="mutant", hamiltonian="full",
                           segment=None):
    """
    Predicts all mutants in a dataframe and adds predictions
    as a new column.

    If mutant_column is None, the dataframe index is used,
    otherwise the given column.

    Mutants for which the model raises a ValueError when computing
    the statistical energy difference (e.g. not covered by alignment,
    or invalid substitution) are set to NaN.

    Parameters
    ----------
    model : CouplingsModel
        CouplingsModel instance used to compute mutation
        effects
    table : pandas.DataFrame
        DataFrame with mutants to which delta of
        statistical energy will be added. The table itself is
        not modified; a copy is returned.
    mutant_column: str
        Name of column in table that contains mutants
    output_column : str
        Name of column in returned dataframe that will
        contain computed effects
    hamiltonian: {"full", "couplings", "fields"},
            default: "full"
        Use full Hamiltonian of exponential model (default),
        or only couplings / fields for statistical energy
        calculation.
    segment: str, default: None
        Specify a segment identifier to use for the positions in the mutation
        table. This will only be used if the mutation table doesn't already have
        a fully populated "segment" column.

    Returns
    -------
    pandas.DataFrame
        Copy of table with added column (output_column) that contains
        computed mutation effects

    Raises
    ------
    ValueError
        If hamiltonian is not a valid selection, if the model has no
        target sequence, if a mutant string cannot be parsed, or if a
        row with mutations lists a different number of segments than
        mutations
    """
    # select Hamiltonian component for prediction
    if hamiltonian not in COMPONENT_TO_INDEX:
        raise ValueError(
            "Invalid selection for hamiltonian. "
            "Valid values are: " + ", ".join(COMPONENT_TO_INDEX)
        )

    _component = COMPONENT_TO_INDEX[hamiltonian]

    # make sure there is a target sequence for which we
    # can compute statistical energy difference
    if not model.has_target_seq:
        raise ValueError(
            "CouplingsModel object does not have a target "
            "sequence (non-focus mode). "
            "Set target sequence, or rerun inference in focus mode."
        )

    def _predict_mutant(m):
        # mutants the model rejects with a ValueError are reported as NaN
        try:
            return model.delta_hamiltonian(m)[_component]
        except ValueError:
            return np.nan

    pred = table.copy()

    # get column which contains mutations
    if mutant_column is None:
        mutations = pred.index
    else:
        mutations = pred.loc[:, mutant_column]

    # split each comma-delimited string of mutations into a list of
    # (pos, aa_from, aa_to) tuples; materialized as a list so that the
    # per-row lengths can be checked below
    mutations_separated = [extract_mutations(m) for m in mutations]

    # if there is a segment column, use that to apply
    # segment information to every mutation
    if "segment" in pred.columns and pred.loc[:, "segment"].notnull().all():
        # split each comma-delimited string of segments into a list
        segments_separated = [
            x.split(",") for x in pred.loc[:, "segment"]
        ]

        # create a list of mutations in the format
        # [[((segment, pos), aa_from, aa_to), ((segment, pos), aa_from, aa_to)],
        #  [((segment, pos), aa_from, aa_to)]]
        mutation_list = []
        for segment_subset, mutation_subset in zip(
            segments_separated, mutations_separated
        ):
            # zip() below would silently drop surplus mutations or
            # segments, so reject mismatching rows explicitly; rows
            # without mutations (wild-type) need no segment pairing
            if mutation_subset and len(segment_subset) != len(mutation_subset):
                raise ValueError(
                    "Number of mutations provided does not match number of segments of origin provided."
                )

            mutation_list.append([
                ((seg, pos), aa_from, aa_to)
                for seg, (pos, aa_from, aa_to)
                in zip(segment_subset, mutation_subset)
            ])

    # else if the segment argument was provided
    # designate that as the segment for every mutation
    elif segment is not None:
        mutation_list = [
            [
                ((segment, pos), aa_from, aa_to)
                for (pos, aa_from, aa_to) in mutation_subset
            ]
            for mutation_subset in mutations_separated
        ]

    # no segment information at all: use plain positions
    else:
        mutation_list = mutations_separated

    # predict mutations and add to table
    pred.loc[:, output_column] = [
        _predict_mutant(m) for m in mutation_list
    ]

    return pred


def single_mutant_matrix(model, output_column="prediction_epistatic",
                         exclude_self_subs=True):
    """
    Create table with all possible single substitutions of
    target sequence in CouplingsModel object.

    Gap symbols ("-" and ".") in the model alphabet are never
    used as substitutions.

    Parameters
    ----------
    model : CouplingsModel
        Model that will be used to predict single mutants
    output_column : str, default: "prediction_epistatic"
        Name of column in Dataframe that will contain predictions
    exclude_self_subs : bool, default: True
        Exclude self-substitutions (e.g. A100A) from results

    Returns
    -------
    pandas.DataFrame
        DataFrame with predictions for all single mutants, with
        columns "segment", "mutant", "pos", "wt", "subs", "frequency",
        "column_conservation" and output_column (in that order).
        If there are no substitutions to report, an empty DataFrame
        with these columns is returned.
    """
    columns = [
        "segment", "mutant", "pos", "wt", "subs", "frequency",
        "column_conservation", output_column
    ]

    res = []
    cons = entropy_map(model)

    # iterate all positions and substitutions per position
    for pos in model.index_list:
        # if position is a tuple, it is in format
        # (segment_id, position). Else, there is
        # no segment information
        if isinstance(pos, tuple):
            segment, position_str = pos[0], pos[1]
        else:
            segment, position_str = np.nan, pos

        # wild-type symbol and conservation only depend on the
        # position, so look them up once per position
        wt = model.seq(pos)
        conservation = cons[pos]

        for subs in model.alphabet:
            # do not predict gaps
            if subs in ("-", "."):
                continue

            # exclude self-substitutions?
            if exclude_self_subs and subs == wt:
                continue

            res.append(
                {
                    "segment": segment,
                    "mutant": "{}{}{}".format(wt, position_str, subs),
                    "pos": position_str,
                    "wt": wt,
                    "subs": subs,
                    "frequency": model.fi(pos, subs),
                    "column_conservation": conservation,
                    output_column: model.smm(pos, subs),
                }
            )

    # a DataFrame built from an empty list has no columns, so selecting
    # the output columns from it would raise a KeyError
    if not res:
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(res).loc[:, columns]


def split_mutants(x, mutant_column="mutant"):
    """
    Splits mutation strings into individual columns in DataFrame
    (wild-type symbol(s), position(s), substitution(s), number of mutations).
    This function is e.g. helpful when computing average
    effects per position using pandas groupby() operations

    Note that the columns are added to x in place; the returned
    DataFrame is the same object that was passed in.

    Parameters
    ----------
    x : pandas.DataFrame
        Table with mutants
    mutant_column : str, default: "mutant"
        Column which contains mutants, set to None
        to use index of DataFrame

    Returns
    -------
    pandas.DataFrame
        DataFrame with added columns "num_mutations", "pos", "wt"
        and "subs" that contain the number of mutations,
        and split mutation strings (if higher-order mutations,
        symbols/numbers are comma-separated, ordered by position).
        Mutants whose string cannot be parsed have NaN in all
        four columns.
    """
    def _split(mut_str):
        # None marks mutants that could not be parsed
        try:
            return sorted(extract_mutations(mut_str))
        except ValueError:
            return None

    def _join(index):
        # comma-join one tuple component (0: pos, 1: wt, 2: subs)
        # over all substitutions of each mutant
        return [
            np.nan if mutant is None
            else ",".join(str(subs[index]) for subs in mutant)
            for mutant in spl
        ]

    # get column which contains mutations
    if mutant_column is None:
        mutations = x.index
    else:
        mutations = x.loc[:, mutant_column]

    # extract wt/pos/subs where possible; a plain list works the same
    # way for a column (Series) and for the index
    spl = [_split(mut_str) for mut_str in mutations]

    # then store in individual columns; assigning whole columns
    # (rather than through .loc) replaces an existing column of a
    # different dtype (e.g. an integer "pos") instead of setting
    # into it
    x["num_mutations"] = [
        np.nan if mutant is None else len(mutant) for mutant in spl
    ]
    for i, column in enumerate(["pos", "wt", "subs"]):
        x[column] = _join(i)

    return x