# Source code for evcouplings.mutate.protocol

"""
Sequence statistical energy and mutation effect computation
protocols

Authors:
  Thomas A. Hopf
  Anna G. Green (complex)
"""

import pandas as pd
import matplotlib.pyplot as plt
from bokeh.io import save, output_file

from evcouplings.couplings.model import (
    CouplingsModel
)
from evcouplings.couplings.mapping import (
    MultiSegmentCouplingsModel
)
from evcouplings.mutate.calculations import (
    single_mutant_matrix, predict_mutation_table
)
import evcouplings
from evcouplings.utils.config import (
    check_required, InvalidParameterError
)
from evcouplings.utils.system import (
    create_prefix_folders, verify_resources
)
from evcouplings.couplings.mapping import (
    Segment
)


def standard(**kwargs):
    """
    Protocol:
    Mutation effect calculation and visualization for protein monomers

    Loads the couplings model, derives the corresponding independent
    model, and for both models writes mutation matrix plots (interactive
    bokeh HTML and static PDF), a table of all single mutants, and
    Pymol scripts. If a mutation dataset file is given, predictions of
    both models are added to it and the result is written to a CSV file.

    TODO: eventually merge with complexes to make a protocol agnostic to the
    number of segments

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

        Note that mutation_dataset_file must be present as a key,
        but may be None to skip the dataset prediction.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * mutation_matrix_plot_files
        * mutations_epistatic_pml_files
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs,
        [
            "prefix", "model_file",
            "mutation_dataset_file",
        ]
    )

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources(
        "Model parameter file does not exist",
        kwargs["model_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load couplings object, and create independent model
    c = CouplingsModel(kwargs["model_file"])
    c0 = c.to_independent_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent")]:
        filename = prefix + "_{}_model".format(type_.lower())

        # interactive plot using bokeh
        output_file(
            filename + ".html", "{} model".format(type_)
        )
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")

        # savefig wrote the current figure; close that same figure so
        # figures do not pile up across models / repeated pipeline runs
        plt.close()
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(
        c, output_column="prediction_epistatic"
    )

    singles = predict_mutation_table(
        c0, singles, "prediction_independent"
    )

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts (note: despite its name, the output key collects
    # the scripts of all models, not only the epistatic one)
    outcfg["mutations_epistatic_pml_files"] = []
    for model_name in ["epistatic", "independent"]:
        pml_filename = prefix + "_{}_model.pml".format(model_name)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename, effect_column="prediction_" + model_name
        )
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#")

        # add epistatic model prediction
        data_pred = predict_mutation_table(
            c, data, "prediction_epistatic"
        )

        # add independent model prediction
        data_pred = predict_mutation_table(
            c0, data_pred, "prediction_independent"
        )

        outcfg["mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(
            outcfg["mutation_dataset_predicted_file"], index=False
        )

    return outcfg


def complex(**kwargs):
    """
    Protocol:
    Mutation effect prediction and visualization for protein complexes

    Loads the multi-segment couplings model, derives the independent
    and the inter-segment-only models from it, and for all three models
    writes mutation matrix plots (interactive bokeh HTML and static PDF),
    a table of all single mutants, and Pymol scripts. If a mutation
    dataset file is given, predictions of all three models are added
    to it and the result is written to a CSV file.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

        Note that mutation_dataset_file must be present as a key,
        but may be None to skip the dataset prediction.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * mutation_matrix_plot_files
        * mutations_epistatic_pml_files
        * [mutation_dataset_predicted_file]

    Raises
    ------
    ValueError
        If a mutation dataset file is given but does not
        contain a column named "segment"
    """
    check_required(
        kwargs,
        [
            "prefix", "model_file",
            "mutation_dataset_file",
            "segments"
        ]
    )

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources(
        "Model parameter file does not exist",
        kwargs["model_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load segments to create couplings object
    segment_objects = [
        Segment.from_list(segment_list)
        for segment_list in kwargs["segments"]
    ]

    # chain name to use for each segment in the Pymol scripts;
    # built from the already-parsed segments rather than parsing
    # the segment lists again, and covering every segment that
    # goes into the model
    segment_to_chain = {
        segment.segment_id: segment.default_chain_name()
        for segment in segment_objects
    }

    # load couplings object
    c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects)

    # create the independent model
    c0 = c.to_independent_model()

    # create the inter-protein only Jij model
    ci = c.to_inter_segment_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent"), (ci, "Inter_segment")]:
        filename = prefix + "_{}_model".format(type_.lower())

        # interactive plot using bokeh
        output_file(
            filename + ".html", "{} model".format(type_)
        )
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")

        # savefig wrote the current figure; close that same figure so
        # figures do not pile up across models / repeated pipeline runs
        plt.close()
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent and inter-segment
    # models and save to file
    singles = single_mutant_matrix(
        c, output_column="prediction_epistatic"
    )

    singles = predict_mutation_table(
        c0, singles, "prediction_independent"
    )

    singles = predict_mutation_table(
        ci, singles, "prediction_inter_segment"
    )

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts (note: despite its name, the output key collects
    # the scripts of all models, not only the epistatic one)
    outcfg["mutations_epistatic_pml_files"] = []
    for model_name in ["epistatic", "independent", "inter_segment"]:
        pml_filename = prefix + "_{}_model.pml".format(model_name)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename, effect_column="prediction_" + model_name,
            segment_to_chain_mapping=segment_to_chain
        )
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#", sep=",")

        if "segment" not in data.columns:
            raise ValueError(
                "Input mutation dataset file does not contain "
                "a column called 'segment' to specify the "
                "protein of origin for each mutation"
            )

        # add epistatic model prediction
        data_pred = predict_mutation_table(
            c, data, "prediction_epistatic"
        )

        # add independent model prediction
        data_pred = predict_mutation_table(
            c0, data_pred, "prediction_independent"
        )

        # add inter-segment model prediction; column carries the same
        # "prediction_" prefix as the other models and as the
        # corresponding column of the single mutant matrix table
        data_pred = predict_mutation_table(
            ci, data_pred, "prediction_inter_segment"
        )

        outcfg["mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(
            outcfg["mutation_dataset_predicted_file"], index=False
        )

    return outcfg


# Registry of available mutation protocols, mapping the protocol name
# (as passed in the "protocol" argument) to the function implementing it:
#   "standard" - standard EVmutation protocol
#   "complex"  - EVmutation protocol for complexes
PROTOCOLS = {"standard": standard, "complex": complex}


def run(**kwargs):
    """
    Run mutation protocol

    Looks up the function registered in PROTOCOLS under the name
    given by the "protocol" argument and calls it with all keyword
    arguments passed through unchanged.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: Name of the mutation protocol to run
            (must be a key of PROTOCOLS)
        prefix: Output prefix for all generated files
            (required by the individual protocols)

    Returns
    -------
    outcfg : dict
        Output configuration of stage
        (see individual protocol for fields)

    Raises
    ------
    InvalidParameterError
        If the selected protocol is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    selected = kwargs["protocol"]

    # known protocol: hand all arguments over to its implementation
    if selected in PROTOCOLS:
        return PROTOCOLS[selected](**kwargs)

    # unknown protocol: report the selection along with valid choices
    available = ", ".join(PROTOCOLS.keys())
    raise InvalidParameterError(
        "Invalid protocol selection: " +
        "{}. Valid protocols are: {}".format(selected, available)
    )