Source code for evcouplings.complex.protocol

"""
Protocols for matching putatively interacting sequences
in protein complexes to create a concatenated sequence
alignment

Authors:
  Anna G. Green
  Thomas A. Hopf
"""

from collections import Counter
import numpy as np
import pandas as pd

from evcouplings.couplings.mapping import Segment

from evcouplings.utils.config import (
    check_required, InvalidParameterError
)
from evcouplings.utils.system import (
    create_prefix_folders, verify_resources
)
from evcouplings.align.protocol import modify_alignment

from evcouplings.complex.alignment import (
    write_concatenated_alignment
)
from evcouplings.complex.distance import (
    find_possible_partners, best_reciprocal_matching,
    plot_distance_distribution
)
from evcouplings.complex.similarity import (
    read_species_annotation_table,
    most_similar_by_organism,
    filter_best_reciprocal,
    find_paralogs
)

def modify_complex_segments(outcfg, **kwargs):
    """
    Modifies the output configuration so that the segments
    are correct for a concatenated alignment

    Parameters
    ----------
    outcfg : dict
        The output configuration

    Returns
    -------
    outcfg : dict
        The output configuration, with a new field called "segments"
    """
    def _modify_segments(seg_list, seg_prefix):
        # extract segments from list representation into objects
        segs = [
            Segment.from_list(s) for s in seg_list
        ]

        # update segment IDs
        for i, s in enumerate(segs, start=1):
            s.segment_id = "{}_{}".format(seg_prefix, i)

        return segs

    # merge segments - this allows having more than one segment per
    # "monomer" alignment
    segments_1 = _modify_segments(kwargs["first_segments"], "A")
    segments_2 = _modify_segments(kwargs["second_segments"], "B")
    segments_complex = segments_1 + segments_2

    outcfg["segments"] = [s.to_list() for s in segments_complex]

    return outcfg

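# Usage sketch (illustrative, not part of the module): relabeling monomer
# segments for the concatenated alignment. The variable names below are
# hypothetical placeholders; the segment lists are whatever the two monomer
# alignment stages produced in list form.
#
#     first_segments = monomer_outcfg_1["segments"]    # hypothetical source
#     second_segments = monomer_outcfg_2["segments"]   # hypothetical source
#     outcfg = modify_complex_segments(
#         {}, first_segments=first_segments, second_segments=second_segments
#     )
#     # outcfg["segments"] now holds the first monomer's segments relabeled
#     # "A_1", "A_2", ... followed by the second monomer's as "B_1", "B_2", ...
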
def _run_describe_concatenation(outcfg, **kwargs):
    """
    Calculate some basic statistics on the concatenated alignment
    """
    prefix = kwargs["prefix"]
    outcfg["concatenation_statistics_file"] = prefix + "_concatenation_statistics.csv"

    describe_concatenation(
        kwargs["first_annotation_file"],
        kwargs["second_annotation_file"],
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"],
        outcfg["concatenation_statistics_file"]
    )

    return outcfg

def describe_concatenation(annotation_file_1, annotation_file_2,
                           genome_location_filename_1, genome_location_filename_2,
                           outfile):
    """
    Describes properties of concatenated alignment.

    Writes a csv with the following columns

    num_seqs_1 : number of sequences in the first monomer alignment
    num_seqs_2 : number of sequences in the second monomer alignment
    num_nonred_species_1 : number of unique species annotations in the
        first monomer alignment
    num_nonred_species_2 : number of unique species annotations in the
        second monomer alignment
    num_species_overlap : number of unique species found in both alignments
    median_num_per_species_1 : median number of paralogs per species in the
        first monomer alignment
    median_num_per_species_2 : median number of paralogs per species in the
        second monomer alignment
    num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the
        first monomer alignment (relevant to distance concatenation only)
    num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the
        second monomer alignment (relevant to distance concatenation only)

    Parameters
    ----------
    annotation_file_1 : str
        Path to annotation.csv file for first monomer alignment
    annotation_file_2 : str
        Path to annotation.csv file for second monomer alignment
    genome_location_filename_1 : str
        Path to genome location mapping file for first alignment
    genome_location_filename_2 : str
        Path to genome location mapping file for second alignment
    outfile : str
        Path to output file
    """
    # load the annotations for each alignment
    # as a pd.DataFrame
    annotations_1 = read_species_annotation_table(
        annotation_file_1
    )
    species_1 = annotations_1.species.values

    annotations_2 = read_species_annotation_table(
        annotation_file_2
    )
    species_2 = annotations_2.species.values

    # calculate the number of sequences found in each alignment
    num_seqs_1 = len(annotations_1)
    num_seqs_2 = len(annotations_2)

    # calculate the number of species found in each alignment,
    # where a species is defined as a unique OS or Tax annotation field
    nonredundant_annotations_1 = len(set(species_1))
    nonredundant_annotations_2 = len(set(species_2))

    # calculate the number of overlapping species
    species_overlap = list(
        set(species_1).intersection(set(species_2))
    )
    n_species_overlap = len(species_overlap)

    # calculate the median number of paralogs per species
    n_paralogs_1 = float(
        # counts the number of times each species occurs in the list,
        # then takes the median
        np.median(list(Counter(species_1).values()))
    )

    n_paralogs_2 = float(
        np.median(list(Counter(species_2).values()))
    )

    # if the user provided genome location files, calculate the number
    # of IDs for which we found an EMBL CDS; default value is np.nan
    embl_cds1 = np.nan
    embl_cds2 = np.nan

    if (genome_location_filename_1 is not None and
            genome_location_filename_2 is not None):
        genome_location_table_1 = pd.read_csv(genome_location_filename_1)
        genome_location_table_2 = pd.read_csv(genome_location_filename_2)

        # number of unique uniprot IDs with an EMBL CDS that is not NA
        if "uniprot_ac" in genome_location_table_1.columns:
            embl_cds1 = len(list(set(genome_location_table_1.uniprot_ac)))
        if "uniprot_ac" in genome_location_table_2.columns:
            embl_cds2 = len(list(set(genome_location_table_2.uniprot_ac)))

    concatenation_data = [
        num_seqs_1,
        num_seqs_2,
        nonredundant_annotations_1,
        nonredundant_annotations_2,
        n_species_overlap,
        n_paralogs_1,
        n_paralogs_2,
        embl_cds1,
        embl_cds2,
    ]

    cols = [
        "num_seqs_1",
        "num_seqs_2",
        "num_nonred_species_1",
        "num_nonred_species_2",
        "num_species_overlap",
        "median_num_per_species_1",
        "median_num_per_species_2",
        "num_with_embl_cds_1",
        "num_with_embl_cds_2",
    ]

    # create dataframe and store
    data_df = pd.DataFrame(
        [concatenation_data], columns=cols
    )
    data_df.to_csv(outfile)

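# Usage sketch (illustrative, not part of the module). The file paths are
# hypothetical; in the pipeline they are the annotation and genome location
# outputs of the two monomer alignment stages.
#
#     describe_concatenation(
#         "monomer_1_annotation.csv", "monomer_2_annotation.csv",
#         "monomer_1_genome_location.csv", "monomer_2_genome_location.csv",
#         "complex_concatenation_statistics.csv"
#     )
#     # writes a single-row CSV with the columns documented in the docstring
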
def genome_distance(**kwargs):
    """
    Protocol: Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_region_start", "second_region_start",
            "first_segments", "second_segments",
            "genome_distance_threshold",
            "first_genome_location_file", "second_genome_location_file",
            "first_annotation_file", "second_annotation_file"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    verify_resources(
        "Genome location file does not exist",
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load the information for each monomer alignment
    alignment_1 = kwargs["first_alignment_file"]
    alignment_2 = kwargs["second_alignment_file"]

    genome_location_filename_1 = kwargs["first_genome_location_file"]
    genome_location_filename_2 = kwargs["second_genome_location_file"]

    gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0)
    gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0)

    # find all possible matches
    possible_partners = find_possible_partners(
        gene_location_table_1, gene_location_table_2
    )

    # find the best reciprocal matches
    id_pairing_unfiltered = best_reciprocal_matching(possible_partners)

    # filter best reciprocal matches by genome distance threshold
    if kwargs["genome_distance_threshold"]:
        distance_threshold = kwargs["genome_distance_threshold"]
        id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold")
    else:
        id_pairing = id_pairing_unfiltered

    id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"]
    id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"]

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            id_pairing,
            alignment_1,
            alignment_2,
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    # filter the alignment
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    # plot the genome distance distribution
    outcfg["distance_plot_file"] = prefix + "_distplot.pdf"
    plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"])

    return outcfg

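# Usage sketch (illustrative, not part of the module). The kwargs are a
# hypothetical "complex" stage configuration; every key checked by
# check_required above must be supplied. Identifiers and paths are placeholders.
#
#     outcfg = genome_distance(
#         prefix="output/complex",
#         first_alignment_file="monomer_1.a2m",
#         second_alignment_file="monomer_2.a2m",
#         first_focus_sequence="P0A6A8/1-70",           # hypothetical IDs
#         second_focus_sequence="P0AC62/1-85",
#         first_focus_mode=True, second_focus_mode=True,
#         first_region_start=1, second_region_start=1,
#         first_segments=[...], second_segments=[...],  # from monomer stages
#         genome_distance_threshold=10000,
#         first_genome_location_file="monomer_1_genome_location.csv",
#         second_genome_location_file="monomer_2_genome_location.csv",
#         first_annotation_file="monomer_1_annotation.csv",
#         second_annotation_file="monomer_2_annotation.csv",
#     )
#     # outcfg["alignment_file"] is the concatenated alignment; pairs whose
#     # genomic distance exceeds the threshold have been removed
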
def best_hit(**kwargs):
    """
    Protocol: Concatenate alignments based on the best hit
    to the focus sequence in each species

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_segments", "second_segments",
            "first_identities_file", "second_identities_file",
            "first_annotation_file", "second_annotation_file",
            "use_best_reciprocal", "paralog_identity_threshold"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    def _load_monomer_info(annotations_file, identities_file,
                           target_sequence, alignment_file,
                           use_best_reciprocal, identity_threshold):
        # read in annotation file and rename the appropriate column
        annotation_table = read_species_annotation_table(annotations_file)

        # read identity file
        similarities = pd.read_csv(identities_file)

        # create a pd.DataFrame containing the best hit in each organism
        most_similar_in_species = most_similar_by_organism(similarities, annotation_table)

        if use_best_reciprocal:
            paralogs = find_paralogs(
                target_sequence, annotation_table, similarities,
                identity_threshold
            )

            most_similar_in_species = filter_best_reciprocal(
                alignment_file, paralogs, most_similar_in_species
            )

        return most_similar_in_species

    # load the information about each monomer alignment
    most_similar_in_species_1 = _load_monomer_info(
        kwargs["first_annotation_file"],
        kwargs["first_identities_file"],
        kwargs["first_focus_sequence"],
        kwargs["first_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    most_similar_in_species_2 = _load_monomer_info(
        kwargs["second_annotation_file"],
        kwargs["second_identities_file"],
        kwargs["second_focus_sequence"],
        kwargs["second_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    # merge the two dataframes to get all species found in
    # both alignments
    species_intersection = most_similar_in_species_1.merge(
        most_similar_in_species_2,
        how="inner",  # takes the intersection
        on="species",  # merges on species identifiers
        suffixes=("_1", "_2")
    )

    # write concatenated alignment
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            species_intersection,
            kwargs["first_alignment_file"],
            kwargs["second_alignment_file"],
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    return outcfg

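# Usage sketch (illustrative, not part of the module). Compared to
# genome_distance, this protocol takes identity files and the best-reciprocal
# options instead of genome location files; all values below are hypothetical.
#
#     outcfg = best_hit(
#         prefix="output/complex",
#         first_alignment_file="monomer_1.a2m",
#         second_alignment_file="monomer_2.a2m",
#         first_focus_sequence="P0A6A8/1-70",
#         second_focus_sequence="P0AC62/1-85",
#         first_focus_mode=True, second_focus_mode=True,
#         first_region_start=1, second_region_start=1,
#         first_segments=[...], second_segments=[...],  # from monomer stages
#         first_identities_file="monomer_1_identities.csv",
#         second_identities_file="monomer_2_identities.csv",
#         first_annotation_file="monomer_1_annotation.csv",
#         second_annotation_file="monomer_2_annotation.csv",
#         use_best_reciprocal=True,
#         paralog_identity_threshold=0.95,
#     )
#     # with use_best_reciprocal=True, hits that are not best-reciprocal
#     # within the paralog identity threshold are filtered out before pairing
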
# list of available alignment concatenation protocols
PROTOCOLS = {
    # concatenate based on genomic distance ("operon-based")
    "genome_distance": genome_distance,

    # concatenate based on best hit per genome ("species")
    "best_hit": best_hit
}

def run(**kwargs):
    """
    Run alignment concatenation protocol

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol : concatenation protocol to run
        prefix : output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage
        Dictionary with results in following fields
        (in brackets: not mandatory):

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file
    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[kwargs["protocol"]](**kwargs)

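# Usage sketch (illustrative, not part of the module): dispatching through
# run(), which is how the pipeline invokes this stage. protocol_kwargs is a
# hypothetical dict containing all keys required by the chosen protocol.
#
#     outcfg = run(protocol="best_hit", prefix="output/complex", **protocol_kwargs)
#     # an unknown protocol name raises InvalidParameterError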