# Source code for evcouplings.compare.ecs

"""
Compare evolutionary couplings to distances in 3D structures

Authors:
  Thomas A. Hopf
"""

import numpy as np


def add_distances(ec_table, dist_map, target_column="dist"):
    """
    Add pair distances to EC score table

    Parameters
    ----------
    ec_table : pandas.DataFrame
        List of evolutionary couplings, with pair
        positions in columns i and j
    dist_map : DistanceMap
        Distance map that will be used to annotate
        distances in ec_table. Only its method
        dist(i, j, raise_na=False) is used here.
    target_column : str, optional (default: "dist")
        Name of column in which distances will be stored.
        An already existing column of that name is replaced.

    Returns
    -------
    pandas.DataFrame
        Copy of the couplings table with added distances
        in target_column (the table passed in is left
        untouched). Pairs where no distance information
        is available will be np.nan
    """
    # work on a copy so the caller's table is never modified
    ec_table = ec_table.copy()

    # raise_na=False is passed so that pairs without distance information
    # are reported as a value (np.nan per the contract above) rather than
    # as an exception; the actual behaviour lives in DistanceMap.dist
    distances = [
        dist_map.dist(i, j, raise_na=False)
        for i, j in zip(ec_table["i"], ec_table["j"])
    ]

    # plain column assignment replaces a pre-existing column wholesale;
    # .loc[:, column] would instead try to write into the old column
    # in place and could retain its previous dtype
    ec_table[target_column] = distances

    return ec_table


def add_precision(ec_table, dist_cutoff=5, score="cn",
                  min_sequence_dist=6, target_column="precision",
                  dist_column="dist"):
    """
    Compute precision of evolutionary couplings as predictor
    of 3D structure contacts

    Parameters
    ----------
    ec_table : pandas.DataFrame
        List of evolutionary couplings, with pair positions
        in columns i and j and pair distances in dist_column
    dist_cutoff : float, optional (default: 5)
        Upper distance cutoff (in Angstrom) for a
        pair to be considered a true positive contact
    score : str, optional (default: "cn")
        Column which contains coupling score. Table will
        be sorted in descending order by this score.
    min_sequence_dist : int, optional (default: 6)
        Minimal distance in primary sequence for an EC to
        be included in precision calculation. Pairs closer
        than this are dropped from the returned table.
        Use None to keep all pairs.
    target_column : str, optional (default: "precision")
        Name of column in which precision will be stored
    dist_column : str, optional (default: "dist")
        Name of column which contains pair distances

    Returns
    -------
    pandas.DataFrame
        New EC table with added precision values as a
        function of EC rank (returned table will be
        sorted by score column). Precision is np.nan for
        ranks at which no pair with a known distance
        has been seen yet.
    """
    # make sure list is sorted by score (returns a new table)
    ec_table = ec_table.sort_values(by=score, ascending=False)

    if min_sequence_dist is not None:
        # keep only pairs that are far enough apart in sequence
        far_enough = (
            (ec_table["i"] - ec_table["j"]).abs() >= min_sequence_dist
        )
        ec_table = ec_table[far_enough]

    # detach from the (possibly filtered) intermediate table
    # before adding a column
    ec_table = ec_table.copy()

    distances = ec_table[dist_column]

    # running number of true positive contacts; a missing distance
    # (NaN) compares False and therefore never counts as a contact
    true_pos_count = (distances <= dist_cutoff).cumsum()

    # running number of pairs for which a distance is known at all
    pos_count = distances.notnull().cumsum()

    # 0 / 0 (no known distance so far) evaluates to NaN
    ec_table[target_column] = true_pos_count / pos_count
    return ec_table


def coupling_scores_compared(ec_table, dist_map, dist_map_multimer=None,
                             dist_cutoff=5, output_file=None, score="cn",
                             min_sequence_dist=6):
    """
    Utility function to create "CouplingScores.csv"-style
    table

    Parameters
    ----------
    ec_table : pandas.DataFrame
        List of evolutionary couplings, with pair
        positions in columns i and j
    dist_map : DistanceMap
        Distance map that will be used to annotate
        distances in ec_table
    dist_map_multimer : DistanceMap, optional (default: None)
        Additional multimer distance map. If given,
        the distance for any EC pair will be the minimum
        out of the monomer and multimer distances
        (both are also kept, in columns "dist_intra"
        and "dist_multimer").
    dist_cutoff : float, optional (default: 5)
        Upper distance cutoff (in Angstrom) for a
        pair to be considered a true positive contact.
        If None, no precision is computed.
    output_file : str, optional (default: None)
        Store final table to this file (CSV, without index)
    score : str, optional (default: "cn")
        Column which contains coupling score. If precision
        is computed, table will be sorted in descending
        order by this score.
    min_sequence_dist : int, optional (default: 6)
        Minimal distance in primary sequence for an EC to
        be included in the table. Applied whether or not
        precision is computed. Use None to keep all pairs.

    Returns
    -------
    pandas.DataFrame
        EC table with added distances, and precision
        if dist_cutoff is given.
    """
    if dist_map_multimer is None:
        x = add_distances(ec_table, dist_map)
    else:
        x = add_distances(ec_table, dist_map, "dist_intra")
        x = add_distances(x, dist_map_multimer, "dist_multimer")
        # np.fmin returns the non-NaN value when only one of the two
        # distances is available, so a pair resolved in just one map
        # still gets a distance. x is a fresh copy made by
        # add_distances, so plain column assignment is safe here.
        x["dist"] = np.fmin(x["dist_intra"], x["dist_multimer"])

    # filter here (and not only inside add_precision) so that the
    # sequence distance threshold also applies when dist_cutoff is None
    if min_sequence_dist is not None:
        far_enough = (x["i"] - x["j"]).abs() >= min_sequence_dist
        x = x[far_enough]

    # if distance cutoff is given, add precision
    if dist_cutoff is not None:
        x = add_precision(
            x, dist_cutoff, score=score,
            min_sequence_dist=min_sequence_dist
        )

    # also save to file if path is given
    if output_file is not None:
        x.to_csv(output_file, index=False)

    return x