"""
Protocols for matching putatively interacting sequences
in protein complexes to create a concatenated sequence
alignment
Authors:
Anna G. Green
Thomas A. Hopf
"""
from collections import Counter
import numpy as np
import pandas as pd
from evcouplings.couplings.mapping import Segment
from evcouplings.utils.config import (
check_required, InvalidParameterError
)
from evcouplings.utils.system import (
create_prefix_folders, verify_resources
)
from evcouplings.align.protocol import modify_alignment
from evcouplings.complex.alignment import (
write_concatenated_alignment
)
from evcouplings.complex.distance import (
find_possible_partners, best_reciprocal_matching,
plot_distance_distribution
)
from evcouplings.complex.similarity import (
read_species_annotation_table,
most_similar_by_organism,
filter_best_reciprocal,
find_paralogs
)
def modify_complex_segments(outcfg, **kwargs):
    """
    Modifies the output configuration so
    that the segments are correct for a
    concatenated alignment

    Parameters
    ----------
    outcfg : dict
        The output configuration
    **kwargs
        Must contain "first_segments" and "second_segments",
        each a list of segments in list representation
        (as produced by Segment.to_list())

    Returns
    -------
    outcfg : dict
        The output configuration, with
        a new field called "segments"
    """
    def _modify_segments(seg_list, seg_prefix):
        # extract segments from list representation into objects
        segs = [
            Segment.from_list(s) for s in seg_list
        ]

        # update segment IDs so that segments from the first monomer
        # are prefixed "A" and from the second monomer "B"
        for i, s in enumerate(segs, start=1):
            s.segment_id = "{}_{}".format(seg_prefix, i)

        return segs

    # merge segments - this allows to have more than one segment per
    # "monomer" alignment
    segments_1 = _modify_segments(kwargs["first_segments"], "A")
    segments_2 = _modify_segments(kwargs["second_segments"], "B")
    segments_complex = segments_1 + segments_2

    # store back in list representation for serializability
    outcfg["segments"] = [s.to_list() for s in segments_complex]

    return outcfg
def _run_describe_concatenation(outcfg, **kwargs):
    """
    calculate some basic statistics on the concatenated alignment
    """
    # NOTE: the misspelled key ("concatentation") is kept as-is for
    # backward compatibility with downstream consumers of outcfg
    stats_file = kwargs["prefix"] + "_concatenation_statistics.csv"
    outcfg["concatentation_statistics_file"] = stats_file

    describe_concatenation(
        kwargs["first_annotation_file"],
        kwargs["second_annotation_file"],
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"],
        stats_file
    )

    return outcfg
def describe_concatenation(annotation_file_1, annotation_file_2,
                           genome_location_filename_1, genome_location_filename_2,
                           outfile):
    """
    Describes properties of concatenated alignment.

    Writes a csv with the following columns

    num_seqs_1 : number of sequences in the first monomer alignment
    num_seqs_2 : number of sequences in the second monomer alignment
    num_nonred_species_1 : number of unique species annotations in the
        first monomer alignment
    num_nonred_species_2 : number of unique species annotations in the
        second monomer alignment
    num_species_overlap: number of unique species found in both alignments
    median_num_per_species_1 : median number of paralogs per species in the
        first monomer alignment
    median_num_per_species_2 : median number of paralogs per species in
        the second monomer alignment
    num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the
        first monomer alignment (relevant to distance concatenation only)
    num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the
        second monomer alignment (relevant to distance concatenation only)

    Parameters
    ----------
    annotation_file_1 : str
        Path to annotation.csv file for first monomer alignment
    annotation_file_2 : str
        Path to annotation.csv file for second monomer alignment
    genome_location_filename_1 : str
        Path to genome location mapping file for first alignment
    genome_location_filename_2 : str
        Path to genome location mapping file for second alignment
    outfile : str
        Path to output file
    """
    # load the annotations for each alignment
    # as a pd.DataFrame
    annotations_1 = read_species_annotation_table(
        annotation_file_1
    )
    species_1 = annotations_1.species.values

    annotations_2 = read_species_annotation_table(
        annotation_file_2
    )
    species_2 = annotations_2.species.values

    # calculate the number of sequences found in each alignment
    num_seqs_1 = len(annotations_1)
    num_seqs_2 = len(annotations_2)

    # calculate the number of species found in each alignment
    # where a species is defined as a unique OS or Tax annotation field
    nonredundant_annotations_1 = len(set(species_1))
    nonredundant_annotations_2 = len(set(species_2))

    # calculate the number of overlapping species
    species_overlap = list(
        set(species_1).intersection(set(species_2))
    )
    n_species_overlap = len(species_overlap)

    # calculate the median number of paralogs per species
    n_paralogs_1 = float(
        # counts the number of times each species occurs in the list
        # then takes the median
        np.median(list(Counter(species_1).values()))
    )

    n_paralogs_2 = float(
        np.median(list(Counter(species_2).values()))
    )

    # If the user provided genome location files, calculate the number
    # of ids for which we found an embl CDS. Default value is np.nan
    embl_cds1 = np.nan
    embl_cds2 = np.nan

    if (genome_location_filename_1 is not None and
            genome_location_filename_2 is not None):
        genome_location_table_1 = pd.read_csv(genome_location_filename_1)
        genome_location_table_2 = pd.read_csv(genome_location_filename_2)

        # count unique uniprot ACs in each genome location table
        # NOTE(review): set() does not drop NaN entries, so missing
        # CDS annotations are counted too - confirm whether NA values
        # should be filtered out here first
        if "uniprot_ac" in genome_location_table_1.columns:
            embl_cds1 = len(list(set(genome_location_table_1.uniprot_ac)))
        if "uniprot_ac" in genome_location_table_2.columns:
            embl_cds2 = len(list(set(genome_location_table_2.uniprot_ac)))

    concatenation_data = [
        num_seqs_1,
        num_seqs_2,
        nonredundant_annotations_1,
        nonredundant_annotations_2,
        n_species_overlap,
        n_paralogs_1,
        n_paralogs_2,
        embl_cds1,
        embl_cds2,
    ]

    cols = [
        "num_seqs_1",
        "num_seqs_2",
        "num_nonred_species_1",
        "num_nonred_species_2",
        "num_species_overlap",
        "median_num_per_species_1",
        "median_num_per_species_2",
        "num_with_embl_cds_1",
        "num_with_embl_cds_2",
    ]

    # create dataframe and store
    data_df = pd.DataFrame(
        [concatenation_data], columns=cols
    )
    data_df.to_csv(outfile)
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_region_start", "second_region_start",
            "first_segments", "second_segments",
            "genome_distance_threshold",
            "first_genome_location_file",
            "second_genome_location_file",
            "first_annotation_file",
            "second_annotation_file"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    verify_resources(
        "Genome location file does not exist",
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load the information for each monomer alignment
    alignment_1 = kwargs["first_alignment_file"]
    alignment_2 = kwargs["second_alignment_file"]

    gene_location_table_1 = pd.read_csv(
        kwargs["first_genome_location_file"], header=0
    )
    gene_location_table_2 = pd.read_csv(
        kwargs["second_genome_location_file"], header=0
    )

    # find all possible matches
    possible_partners = find_possible_partners(
        gene_location_table_1, gene_location_table_2
    )

    # find the best reciprocal matches
    id_pairing_unfiltered = best_reciprocal_matching(possible_partners)

    # filter best reciprocal matches by genome distance threshold
    if kwargs["genome_distance_threshold"]:
        distance_threshold = kwargs["genome_distance_threshold"]
        id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold")
    else:
        id_pairing = id_pairing_unfiltered

    # work on an explicit copy: assigning columns to the result of
    # query() triggers SettingWithCopyWarning, and in the unfiltered
    # case it would mutate id_pairing_unfiltered, which is still
    # needed below for the distance distribution plot
    id_pairing = id_pairing.copy()
    id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"]
    id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"]

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            id_pairing,
            alignment_1,
            alignment_2,
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    # filter the alignment
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    # plot the genome distance distribution
    outcfg["distance_plot_file"] = prefix + "_distplot.pdf"
    plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"])

    return outcfg
def best_hit(**kwargs):
    """
    Protocol:

    Concatenate alignments based on the best hit
    to the focus sequence in each species

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            # first_region_start is passed to modify_alignment below,
            # so validate it up front like in genome_distance
            "first_region_start",
            "first_segments", "second_segments",
            "first_identities_file", "second_identities_file",
            "first_annotation_file", "second_annotation_file",
            "use_best_reciprocal", "paralog_identity_threshold"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    def _load_monomer_info(annotations_file, identities_file,
                           target_sequence, alignment_file,
                           use_best_reciprocal, identity_threshold):
        # read in annotation to a file and rename the appropriate column
        annotation_table = read_species_annotation_table(annotations_file)

        # read identity file
        similarities = pd.read_csv(identities_file)

        # create a pd.DataFrame containing the best hit in each organism
        most_similar_in_species = most_similar_by_organism(similarities, annotation_table)

        if use_best_reciprocal:
            paralogs = find_paralogs(
                target_sequence, annotation_table, similarities,
                identity_threshold
            )

            most_similar_in_species = filter_best_reciprocal(
                alignment_file, paralogs, most_similar_in_species
            )

        return most_similar_in_species

    # load the information about each monomer alignment
    most_similar_in_species_1 = _load_monomer_info(
        kwargs["first_annotation_file"],
        kwargs["first_identities_file"],
        kwargs["first_focus_sequence"],
        kwargs["first_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    most_similar_in_species_2 = _load_monomer_info(
        kwargs["second_annotation_file"],
        kwargs["second_identities_file"],
        kwargs["second_focus_sequence"],
        kwargs["second_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    # merge the two dataframes to get all species found in
    # both alignments
    species_intersection = most_similar_in_species_1.merge(
        most_similar_in_species_2,
        how="inner",  # takes the intersection
        on="species",  # merges on species identifiers
        suffixes=("_1", "_2")
    )

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            species_intersection,
            kwargs["first_alignment_file"],
            kwargs["second_alignment_file"],
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    # filter the alignment
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    return outcfg
# list of available alignment concatenation protocols
PROTOCOLS = {
    # concatenate based on genomic distance ("operon-based")
    "genome_distance": genome_distance,

    # concatenate based on best hit per genome ("species")
    "best_hit": best_hit
}
def run(**kwargs):
    """
    Run alignment concatenation protocol

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage
        Dictionary with results in following fields:
        (in brackets: not mandatory)

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file

    Raises
    ------
    InvalidParameterError
        If kwargs["protocol"] is not a key of PROTOCOLS
    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())
            )
        )

    # dispatch to the selected protocol implementation
    return PROTOCOLS[kwargs["protocol"]](**kwargs)