"""
Code for identifying Pfam domains and mapping Pfam
alignments and ECs into target sequence mode.
Authors:
Thomas A. Hopf
.. todo::
1. Write code to create list of family sizes
2. Implement alignments against Pfam-HMM so precomputed results can be reused in focus mode
"""
import gzip
import pandas as pd
from evcouplings.align.tools import run_hmmscan, read_hmmer_domtbl
from evcouplings.utils.helpers import range_overlap
[docs]def create_family_size_table(full_pfam_file, outfile=None):
"""
Parse family size table from Pfam flat file
(ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz)
Parameters
----------
full_pfam_file : str
Path to the pfam file (gzip).
outfile : str, optional (default: None)
Save the parsed table to this file as a csv file.
Returns
-------
pd.DataFrame
Parsed Pfam table.
"""
data = []
with gzip.open(full_pfam_file, "rt", encoding='latin-1') as gz_ref:
pfam_id = None
num_seqs = None
for line in gz_ref:
# identifier at the beginning of the family entry
if line.startswith("#=GF AC"):
pfam_id = line[10:17]
# the number of sequences in the family, follows after the identifier
elif line.startswith("#=GF SQ"):
num_seqs = int(line[10:])
# stores the result at the end of an entry
elif (line.startswith("//") and
pfam_id is not None and num_seqs is not None):
data.append({"pfam_id": pfam_id, "num_seqs": num_seqs})
pfam_id = None
num_seqs = None
df = pd.DataFrame(data, columns=["pfam_id", "num_seqs"])
if outfile is not None:
df.to_csv(outfile, index=False)
return df
[docs]def remove_clan_overlaps(pfam_table):
"""
Remove overlapping Pfam hits from same Pfam clan
(equivalent of PfamScan.pl). Currently only
allows to remove overlaps by domain bitscore.
.. todo::
is bitscore the most sensible choice if different length hits?
Parameters
----------
pfam_table : pd.DataFrame
Pfam hit table as generated by pfam_hits() function
(must contain Pfam clan annotation).
Returns
-------
pd.DataFrame
Pfam hit table with lower-scoring overlaps removed
"""
# could make this a parameter, if switching to E-values
# we would have to changing sorting order of DataFrame
# and sign of comparison further below.
score = "domain_score"
# group by sequence ID and clan to resolve overlaps
grouped = pfam_table.sort_values(
by=score, ascending=False
).groupby(
by=["query_name", "clan_id"], as_index=False, sort=False
)
# store index value of all entries to discard
remove_hits = []
for (uniprot_ac, clan_name), grp in grouped:
# safety check here that we are not grouping hits that are
# not in the same clan (missing value) if pandas ever changed
# the behaviour of groupby to not iterate through groups
# with missing values. Otherwise, we would have to skip grouop.
assert clan_name.startswith("CL")
# go through all pairwise combinations of hits
for idx1, hit1 in grp.iterrows():
for idx2, hit2 in grp.iterrows():
if idx1 < idx2:
if range_overlap(
(int(hit1["ali_from"]), int(hit1["ali_to"]) + 1),
(int(hit2["ali_from"]), int(hit2["ali_to"]) + 1),
) > 0:
if float(hit1[score]) >= float(hit2[score]):
remove_hits.append(idx2)
else:
remove_hits.append(idx1)
return pfam_table.loc[~pfam_table.index.isin(remove_hits)]
[docs]def pfam_hits(query_file, hmm_database, prefix,
clan_table_file, size_table_file,
resolve_overlaps=True,
**kwargs):
"""
Identify hits of Pfam HMMs in a set of sequences.
Parameters
----------
query_file : str
File containing query sequence(s)
hmm_database : str
File containing HMM database
(Pfam-A.hmm, with hmmpress applied)
prefix : str
Prefix path for output files. Folder structure in
the prefix will be created if not existing.
clan_table_file : str
File with table linking Pfam families to clans
(Pfam-A.clans.tsv). Set to None if not available,
but resolve_overlaps cannot be True in that case.
size_table_file : str
File with table of family sizes. Create using
create_family_size_table(). Set to None if not
available.
resolve_overlaps : bool
Resolve overlapping hits by families from
the same clan.
Only possible if clan_table_file is given.
**kwargs : dict
kwargs passed on to evcouplings.align.tools.run_hmmscan
Returns
-------
pd.DataFrame
Pfam hit table
"""
# find HMM hits with hmmscan
scan_res = run_hmmscan(
query_file, hmm_database, prefix,
**kwargs
)
hits = read_hmmer_domtbl(scan_res.domtblout)
# remove version information from family name
hits.loc[:, "pfam_id"] = hits.target_accession.map(
lambda x: x.split(".")[0]
)
# add information about Pfam clan for each family,
# this is necessary to resolve overlapping hits
# clan file is Pfam-A.clans.tsv from Pfam FTP site
if clan_table_file is not None:
clans = pd.read_csv(
clan_table_file, sep="\t",
names=[
"pfam_id", "clan_id", "clan_name",
"family_name", "family_text"
]
)
hits = hits.merge(clans, on="pfam_id", how="left")
# add number of sequences in each Pfam family,
# this file has to be created using create_family_sie_table()
# from Pfam-A.full.gz flatfile
if size_table_file is not None:
sizes = pd.read_csv(size_table_file)
hits = hits.merge(sizes, on="pfam_id", how="left")
hits.loc[:, "num_seqs_over_len"] = (
hits.loc[:, "num_seqs"] /
pd.to_numeric(hits.loc[:, "target_len"], errors="raise")
)
# multiple members of the same clan might hit overlapping regions
# in these cases, we may only want to keep the top-scoring hit
if resolve_overlaps:
if clan_table_file is None:
raise ValueError(
"Need to specify clan_table_file to resolve "
"overlapping hits from same clan."
)
hits = remove_clan_overlaps(hits)
return hits