"""
High-level mutation calculation functions for EVmutation
.. todo::
implement segment handling
Authors:
Thomas A. Hopf
Anna G. Green (generalization for multiple segments)
"""
import numpy as np
import pandas as pd
from evcouplings.utils.calculations import entropy_map
# Maps the name of each selectable Hamiltonian component to the position
# at which predict_mutation_table() indexes the result of
# CouplingsModel.delta_hamiltonian().
COMPONENT_TO_INDEX = {
    name: position
    for position, name in enumerate(("full", "couplings", "fields"))
}
def predict_mutation_table(model, table, output_column="prediction_epistatic",
                           mutant_column="mutant", hamiltonian="full",
                           segment=None):
    """
    Predicts all mutants in a dataframe and adds predictions
    as a new column.

    If mutant_column is None, the dataframe index is used,
    otherwise the given column.

    Mutants for which model.delta_hamiltonian() raises a
    ValueError (e.g. not covered by alignment, or invalid
    substitution) are set to NaN.

    Parameters
    ----------
    model : CouplingsModel
        CouplingsModel instance used to compute mutation
        effects
    table : pandas.DataFrame
        DataFrame with mutants to which delta of
        statistical energy will be added. The table itself
        is not modified; a copy is returned.
    output_column : str
        Name of column in returned dataframe that will
        contain computed effects
    mutant_column : str
        Name of column in table that contains mutants
        (None to use the index of the table)
    hamiltonian : {"full", "couplings", "fields"},
        default: "full"
        Use full Hamiltonian of exponential model (default),
        or only couplings / fields for statistical energy
        calculation.
    segment : str, default: None
        Specify a segment identifier to use for the positions
        in the mutation table. This is only used if the table
        does not have a "segment" column without missing values.

    Returns
    -------
    pandas.DataFrame
        Copy of table with added column (output_column) that
        contains computed mutation effects

    Raises
    ------
    ValueError
        If hamiltonian is not a valid selection, if the model
        has no target sequence, or if a row of the table lists
        a different number of segments than mutations.
    """
    # select Hamiltonian component for prediction
    if hamiltonian not in COMPONENT_TO_INDEX:
        raise ValueError(
            "Invalid selection for hamiltonian. "
            "Valid values are: " + ", ".join(COMPONENT_TO_INDEX)
        )

    _component = COMPONENT_TO_INDEX[hamiltonian]

    # make sure there is a target sequence for which we
    # can compute statistical energy difference
    if not model.has_target_seq:
        raise ValueError(
            "CouplingsModel object does not have a target "
            "sequence (non-focus mode). "
            "Set target sequence, or rerun inference in focus mode."
        )

    def _predict_mutant(m):
        # mutants the model rejects are reported as NaN
        # instead of aborting the whole table
        try:
            return model.delta_hamiltonian(m)[_component]
        except ValueError:
            return np.nan

    pred = table.copy()

    # get column which contains mutations
    if mutant_column is None:
        mutations = pred.index
    else:
        mutations = pred.loc[:, mutant_column]

    if "segment" in pred.columns and pred.loc[:, "segment"].notnull().all():
        # if there is a fully populated segment column, use it to
        # apply segment information to every mutation

        # split each comma-delimited string of segments into a list
        segments_separated = [x.split(",") for x in pred.loc[:, "segment"]]

        # create a list of mutations in the format
        # [[((segment, pos), aa_from, aa_to), ...], [((segment, pos), aa_from, aa_to)], ...]
        mutation_list = []
        for segment_subset, mutation_subset in zip(
            segments_separated, map(extract_mutations, mutations)
        ):
            mutation_subset = list(mutation_subset)

            # Every mutation needs exactly one segment of origin; zip()
            # below would otherwise silently drop the surplus entries.
            # Rows without any parsed mutation have nothing to annotate
            # and are passed through unchanged.
            if mutation_subset and len(segment_subset) != len(mutation_subset):
                raise ValueError(
                    "Number of mutations provided does not match number of segments of origin provided."
                )

            mutation_list.append([
                ((seg, pos), aa_from, aa_to)
                for seg, (pos, aa_from, aa_to)
                in zip(segment_subset, mutation_subset)
            ])
    elif segment is not None:
        # else if the segment argument was provided,
        # designate that as the segment for every mutation
        mutation_list = [
            [
                ((segment, pos), aa_from, aa_to)
                for (pos, aa_from, aa_to) in mutation_subset
            ]
            for mutation_subset in map(extract_mutations, mutations)
        ]
    else:
        # no segment information: use positions as extracted
        mutation_list = map(extract_mutations, mutations)

    # predict mutations and add to table
    pred.loc[:, output_column] = [
        _predict_mutant(m) for m in mutation_list
    ]

    return pred
def single_mutant_matrix(model, output_column="prediction_epistatic",
                         exclude_self_subs=True):
    """
    Create table with all possible single substitutions of
    target sequence in CouplingsModel object.

    Parameters
    ----------
    model : CouplingsModel
        Model that will be used to predict single mutants
    output_column : str, default: "prediction_epistatic"
        Name of column in Dataframe that will contain predictions
    exclude_self_subs : bool, default: True
        Exclude self-substitutions (e.g. A100A) from results

    Returns
    -------
    pandas.DataFrame
        DataFrame with predictions for all single mutants, with
        columns "segment", "mutant", "pos", "wt", "subs",
        "frequency", "column_conservation" and output_column.
        If no substitution is generated, an empty DataFrame
        with these columns is returned.
    """
    columns = [
        "segment", "mutant", "pos", "wt", "subs", "frequency",
        "column_conservation", output_column
    ]

    res = []
    cons = entropy_map(model)

    # iterate all positions and substitutions per position
    for pos in model.index_list:
        # if position is a tuple, it is in format
        # (segment_id, position). Else, there is
        # no segment information
        if isinstance(pos, tuple):
            segment, position = pos[0], pos[1]
        else:
            segment, position = np.nan, pos

        # wild-type symbol is the same for all substitutions
        # at this position, so look it up only once
        wt = model.seq(pos)

        for subs in model.alphabet:
            # do not predict gaps
            if subs in ("-", "."):
                continue

            # exclude self-substitutions?
            if exclude_self_subs and subs == wt:
                continue

            res.append(
                {
                    "segment": segment,
                    "mutant": "{}{}{}".format(wt, position, subs),
                    "pos": position,
                    "wt": wt,
                    "subs": subs,
                    "frequency": model.fi(pos, subs),
                    "column_conservation": cons[pos],
                    output_column: model.smm(pos, subs),
                }
            )

    # an empty list of records yields a DataFrame without any columns,
    # on which the column selection below would raise a KeyError
    if not res:
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(res).loc[:, columns]
def split_mutants(x, mutant_column="mutant"):
    """
    Splits mutation strings into individual columns in DataFrame
    (wild-type symbol(s), position(s), substitution(s), number of mutations).

    This function is e.g. helpful when computing average
    effects per position using pandas groupby() operations

    Parameters
    ----------
    x : pandas.DataFrame
        Table with mutants. Note that the columns are added
        to this table in place.
    mutant_column : str, default: "mutant"
        Column which contains mutants, set to None
        to use index of DataFrame

    Returns
    -------
    pandas.DataFrame
        The input DataFrame x with added columns "num_mutations",
        "pos", "wt" and "subs" that contain the number of mutations,
        and split mutation strings (if higher-order mutations,
        symbols/numbers are comma-separated). Rows for which
        extract_mutations() raises a ValueError contain NaN
        in all four columns.
    """
    def _split(mut_str):
        # None marks a mutant that could not be parsed
        try:
            return sorted(extract_mutations(mut_str))
        except ValueError:
            return None

    def _join(mutant, index):
        # comma-join one component (given by index) of all
        # substitutions of a single mutant
        return ",".join(str(subs[index]) for subs in mutant)

    # get column which contains mutations
    if mutant_column is None:
        mutations = x.index
    else:
        mutations = x.loc[:, mutant_column]

    # extract wt/pos/subs where possible; a plain list is used so that
    # a column and the index of the DataFrame are handled the same way
    spl = [_split(mut_str) for mut_str in mutations]

    # then store in individual columns (NaN for unparsable mutants,
    # on which len() and iteration would otherwise fail)
    x.loc[:, "num_mutations"] = [
        np.nan if mutant is None else len(mutant) for mutant in spl
    ]

    for i, column in enumerate(["pos", "wt", "subs"]):
        x.loc[:, column] = [
            np.nan if mutant is None else _join(mutant, i)
            for mutant in spl
        ]

    return x