Source code for evcouplings.align.pfam

"""
Code for identifying Pfam domains and mapping Pfam
alignments and ECs into target sequence mode.

Authors:
  Thomas A. Hopf

.. todo::

    1. Write code to create list of family sizes
    2. Implement alignments against Pfam-HMM so precomputed results can be reused in focus mode

"""
import gzip
import pandas as pd
from evcouplings.align.tools import run_hmmscan, read_hmmer_domtbl
from evcouplings.utils.helpers import range_overlap


[docs]def create_family_size_table(full_pfam_file, outfile=None):
    """
    Parse family size table from Pfam flat file
    (ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz)

    Parameters
    ----------
    full_pfam_file : str
        Path to the pfam file (gzip).
    outfile : str, optional (default: None)
        Save the parsed table to this file as a csv file.
    
    Returns
    -------
    pd.DataFrame
        Parsed Pfam table.
    """
    data = []

    with gzip.open(full_pfam_file, "rt", encoding='latin-1') as gz_ref:
        pfam_id = None
        num_seqs = None

        for line in gz_ref:
            # identifier at the beginning of the family entry
            if line.startswith("#=GF AC"):
                pfam_id = line[10:17]

            # the number of sequences in the family, follows after the identifier
            elif line.startswith("#=GF SQ"):
                num_seqs = int(line[10:])

            # stores the result at the end of an entry
            elif (line.startswith("//") and
                    pfam_id is not None and num_seqs is not None):
                data.append({"pfam_id": pfam_id, "num_seqs": num_seqs})
                pfam_id = None
                num_seqs = None

    df = pd.DataFrame(data, columns=["pfam_id", "num_seqs"])

    if outfile is not None:
        df.to_csv(outfile, index=False)

    return df


[docs]def remove_clan_overlaps(pfam_table):
    """
    Remove overlapping Pfam hits from same Pfam clan
    (equivalent of PfamScan.pl). Currently only
    allows to remove overlaps by domain bitscore.

    .. todo::

        is bitscore the most sensible choice if different length hits?

    Parameters
    ----------
    pfam_table : pd.DataFrame
        Pfam hit table as generated by pfam_hits() function
        (must contain Pfam clan annotation).

    Returns
    -------
    pd.DataFrame
        Pfam hit table with lower-scoring overlaps removed
    """
    # could make this a parameter, if switching to E-values
    # we would have to changing sorting order of DataFrame
    # and sign of comparison further below.
    score = "domain_score"

    # group by sequence ID and clan to resolve overlaps
    grouped = pfam_table.sort_values(
        by=score, ascending=False
    ).groupby(
        by=["query_name", "clan_id"], as_index=False, sort=False
    )

    # store index value of all entries to discard
    remove_hits = []

    for (uniprot_ac, clan_name), grp in grouped:
        # safety check here that we are not grouping hits that are
        # not in the same clan (missing value) if pandas ever changed
        # the behaviour of groupby to not iterate through groups
        # with missing values. Otherwise, we would have to skip grouop.
        assert clan_name.startswith("CL")

        # go through all pairwise combinations of hits
        for idx1, hit1 in grp.iterrows():
            for idx2, hit2 in grp.iterrows():
                if idx1 < idx2:
                    if range_overlap(
                        (int(hit1["ali_from"]), int(hit1["ali_to"]) + 1),
                        (int(hit2["ali_from"]), int(hit2["ali_to"]) + 1),
                    ) > 0:
                        if float(hit1[score]) >= float(hit2[score]):
                            remove_hits.append(idx2)
                        else:
                            remove_hits.append(idx1)

    return pfam_table.loc[~pfam_table.index.isin(remove_hits)]


[docs]def pfam_hits(query_file, hmm_database, prefix,
              clan_table_file, size_table_file,
              resolve_overlaps=True,
              **kwargs):
    """
    Identify hits of Pfam HMMs in a set of sequences.

    Parameters
    ----------
    query_file : str
        File containing query sequence(s)
    hmm_database : str
        File containing HMM database
        (Pfam-A.hmm, with hmmpress applied)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    clan_table_file : str
        File with table linking Pfam families to clans
        (Pfam-A.clans.tsv). Set to None if not available,
        but resolve_overlaps cannot be True in that case.
    size_table_file : str
        File with table of family sizes. Create using
        create_family_size_table(). Set to None if not
        available.
    resolve_overlaps : bool
        Resolve overlapping hits by families from
        the same clan.
        Only possible if clan_table_file is given.
    **kwargs : dict
        kwargs passed on to evcouplings.align.tools.run_hmmscan

    Returns
    -------
    pd.DataFrame
        Pfam hit table
    """
    # find HMM hits with hmmscan
    scan_res = run_hmmscan(
        query_file, hmm_database, prefix,
        **kwargs
    )

    hits = read_hmmer_domtbl(scan_res.domtblout)

    # remove version information from family name
    hits.loc[:, "pfam_id"] = hits.target_accession.map(
        lambda x: x.split(".")[0]
    )

    # add information about Pfam clan for each family,
    # this is necessary to resolve overlapping hits
    # clan file is Pfam-A.clans.tsv from Pfam FTP site
    if clan_table_file is not None:
        clans = pd.read_csv(
            clan_table_file, sep="\t",
            names=[
                "pfam_id", "clan_id", "clan_name",
                "family_name", "family_text"
            ]
        )

        hits = hits.merge(clans, on="pfam_id", how="left")

    # add number of sequences in each Pfam family,
    # this file has to be created using create_family_sie_table()
    # from Pfam-A.full.gz flatfile
    if size_table_file is not None:
        sizes = pd.read_csv(size_table_file)
        hits = hits.merge(sizes, on="pfam_id", how="left")

        hits.loc[:, "num_seqs_over_len"] = (
            hits.loc[:, "num_seqs"] /
            pd.to_numeric(hits.loc[:, "target_len"], errors="raise")
        )

    # multiple members of the same clan might hit overlapping regions
    # in these cases, we may only want to keep the top-scoring hit
    if resolve_overlaps:
        if clan_table_file is None:
            raise ValueError(
                "Need to specify clan_table_file to resolve "
                "overlapping hits from same clan."
            )

        hits = remove_clan_overlaps(hits)

    return hits