Source code for evcouplings.align.pfam

"""
Code for identifying Pfam domains and mapping Pfam
alignments and ECs into target sequence mode.

Authors:
  Thomas A. Hopf

.. todo::

    1. Write code to create list of family sizes
    2. Implement alignments against Pfam-HMM so precomputed results can be reused in focus mode

"""
import gzip
import pandas as pd
from evcouplings.align.tools import run_hmmscan, read_hmmer_domtbl
from evcouplings.utils.helpers import range_overlap


[docs]def create_family_size_table(full_pfam_file, outfile=None): """ Parse family size table from Pfam flat file (ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz) Parameters ---------- full_pfam_file : str Path to the pfam file (gzip). outfile : str, optional (default: None) Save the parsed table to this file as a csv file. Returns ------- pd.DataFrame Parsed Pfam table. """ data = [] with gzip.open(full_pfam_file, "rt", encoding='latin-1') as gz_ref: pfam_id = None num_seqs = None for line in gz_ref: # identifier at the beginning of the family entry if line.startswith("#=GF AC"): pfam_id = line[10:17] # the number of sequences in the family, follows after the identifier elif line.startswith("#=GF SQ"): num_seqs = int(line[10:]) # stores the result at the end of an entry elif (line.startswith("//") and pfam_id is not None and num_seqs is not None): data.append({"pfam_id": pfam_id, "num_seqs": num_seqs}) pfam_id = None num_seqs = None df = pd.DataFrame(data, columns=["pfam_id", "num_seqs"]) if outfile is not None: df.to_csv(outfile, index=False) return df
[docs]def remove_clan_overlaps(pfam_table): """ Remove overlapping Pfam hits from same Pfam clan (equivalent of PfamScan.pl). Currently only allows to remove overlaps by domain bitscore. .. todo:: is bitscore the most sensible choice if different length hits? Parameters ---------- pfam_table : pd.DataFrame Pfam hit table as generated by pfam_hits() function (must contain Pfam clan annotation). Returns ------- pd.DataFrame Pfam hit table with lower-scoring overlaps removed """ # could make this a parameter, if switching to E-values # we would have to changing sorting order of DataFrame # and sign of comparison further below. score = "domain_score" # group by sequence ID and clan to resolve overlaps grouped = pfam_table.sort_values( by=score, ascending=False ).groupby( by=["query_name", "clan_id"], as_index=False, sort=False ) # store index value of all entries to discard remove_hits = [] for (uniprot_ac, clan_name), grp in grouped: # safety check here that we are not grouping hits that are # not in the same clan (missing value) if pandas ever changed # the behaviour of groupby to not iterate through groups # with missing values. Otherwise, we would have to skip grouop. assert clan_name.startswith("CL") # go through all pairwise combinations of hits for idx1, hit1 in grp.iterrows(): for idx2, hit2 in grp.iterrows(): if idx1 < idx2: if range_overlap( (int(hit1["ali_from"]), int(hit1["ali_to"]) + 1), (int(hit2["ali_from"]), int(hit2["ali_to"]) + 1), ) > 0: if float(hit1[score]) >= float(hit2[score]): remove_hits.append(idx2) else: remove_hits.append(idx1) return pfam_table.loc[~pfam_table.index.isin(remove_hits)]
[docs]def pfam_hits(query_file, hmm_database, prefix, clan_table_file, size_table_file, resolve_overlaps=True, **kwargs): """ Identify hits of Pfam HMMs in a set of sequences. Parameters ---------- query_file : str File containing query sequence(s) hmm_database : str File containing HMM database (Pfam-A.hmm, with hmmpress applied) prefix : str Prefix path for output files. Folder structure in the prefix will be created if not existing. clan_table_file : str File with table linking Pfam families to clans (Pfam-A.clans.tsv). Set to None if not available, but resolve_overlaps cannot be True in that case. size_table_file : str File with table of family sizes. Create using create_family_size_table(). Set to None if not available. resolve_overlaps : bool Resolve overlapping hits by families from the same clan. Only possible if clan_table_file is given. **kwargs : dict kwargs passed on to evcouplings.align.tools.run_hmmscan Returns ------- pd.DataFrame Pfam hit table """ # find HMM hits with hmmscan scan_res = run_hmmscan( query_file, hmm_database, prefix, **kwargs ) hits = read_hmmer_domtbl(scan_res.domtblout) # remove version information from family name hits.loc[:, "pfam_id"] = hits.target_accession.map( lambda x: x.split(".")[0] ) # add information about Pfam clan for each family, # this is necessary to resolve overlapping hits # clan file is Pfam-A.clans.tsv from Pfam FTP site if clan_table_file is not None: clans = pd.read_csv( clan_table_file, sep="\t", names=[ "pfam_id", "clan_id", "clan_name", "family_name", "family_text" ] ) hits = hits.merge(clans, on="pfam_id", how="left") # add number of sequences in each Pfam family, # this file has to be created using create_family_sie_table() # from Pfam-A.full.gz flatfile if size_table_file is not None: sizes = pd.read_csv(size_table_file) hits = hits.merge(sizes, on="pfam_id", how="left") hits.loc[:, "num_seqs_over_len"] = ( hits.loc[:, "num_seqs"] / pd.to_numeric(hits.loc[:, "target_len"], errors="raise") ) # multiple members of the same clan might hit overlapping regions # in these cases, we may only want to keep the top-scoring hit if resolve_overlaps: if clan_table_file is None: raise ValueError( "Need to specify clan_table_file to resolve " "overlapping hits from same clan." ) hits = remove_clan_overlaps(hits) return hits