# Source code for evcouplings.align.tools

"""
Wrappers for running external sequence alignment tools

Authors:
  Thomas A. Hopf
  Anna G. Green - run_hmmbuild, run_hmmsearch
  Chan Kang - run_hmmbuild, run_hmmsearch
"""

from collections import namedtuple
from io import StringIO

import pandas as pd

from evcouplings.utils.system import (
    run, create_prefix_folders, verify_resources, temp
)
from evcouplings.utils.config import check_required


# Record type returned by run_hmmbuild: holds the output prefix and
# the paths of the files belonging to one hmmbuild run.
HmmbuildResult = namedtuple(
    typename="HmmbuildResult",
    field_names=("prefix", "hmmfile", "output"),
)


def run_hmmbuild(alignment_file, prefix, cpu=None,
                 stdout_redirect=None, symfrac=None,
                 binary="hmmbuild"):
    """
    Profile HMM construction from multiple sequence alignments
    Refer to HMMER documentation for details.

    http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf

    Parameters
    ----------
    alignment_file : str
        File containing the multiple sequence alignment. Can be in
        Stockholm, a2m, or clustal formats, or any other format
        recognized by hmmer. Please note that ALL POSITIONS
        above the symfrac cutoff will be used in HMM
        construction (if the alignment contains columns that are
        insertions relative to the query sequence, this may be
        problematic for structure comparison)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    cpu : int, optional (default: None)
        Number of CPUs, passed to hmmbuild as --cpu. If None,
        the flag is omitted and hmmbuild applies its own default.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose).
        If None, output is written to prefix + ".output".
    symfrac : float, optional (default: None)
        range 0.0 - 1.0, passed to hmmbuild as --symfrac.
        According to the HMMER documentation, columns in which
        the fraction of residues (as opposed to gaps) is at least
        symfrac are used as consensus columns to construct the HMM.
        If None provided, HMMbuild internal default is 0.5.
        (Note: this is calculated after their internal sequence
        weighting is calculated)
    binary : str (default: "hmmbuild")
        Path to hmmbuild binary (put in PATH for
        default to work)

    Returns
    -------
    HmmbuildResult
        namedtuple with fields corresponding to the different
        output files (prefix, hmmfile, output)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty",
        alignment_file
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmbuildResult(
        prefix,
        prefix + ".hmm",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
    )

    cmd = [
        binary,
        "-o", result.output,
    ]

    # number of CPUs (only set explicitly if requested)
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # consensus column cutoff (only set explicitly if requested)
    if symfrac is not None:
        cmd += ["--symfrac", str(symfrac)]

    # positional arguments: output HMM file, then input alignment
    cmd += [result.hmmfile, alignment_file]

    # return code is not inspected here
    _, stdout, stderr = run(cmd)

    # also check we actually created a non-empty HMM profile
    verify_resources(
        "hmmbuild returned empty HMM profile: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.hmmfile
        ),
        result.hmmfile
    )

    return result


# Record type returned by run_hmmsearch: holds the output prefix and
# the paths of the files belonging to one hmmsearch run.
HmmsearchResult = namedtuple(
    typename="HmmsearchResult",
    field_names=("prefix", "alignment", "output", "tblout", "domtblout"),
)


def run_hmmsearch(hmmfile, database, prefix,
                  use_bitscores, domain_threshold, seq_threshold,
                  nobias=False, cpu=None,
                  stdout_redirect=None, binary="hmmsearch"):
    """
    Search profile(s) against a sequence database.
    Refer to HMMER documentation for details.

    http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf

    Parameters
    ----------
    hmmfile : str
        File containing the profile(s)
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search, passed as --cpu.
        If None, the flag is omitted and hmmsearch applies
        its own default.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose).
        If None, output is written to prefix + ".output".
    binary : str (default: "hmmsearch")
        Path to hmmsearch binary (put in PATH for
        default to work)

    Returns
    -------
    HmmsearchResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout).
        Note that this function does not verify that any of
        these files exist or are non-empty after the run.

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty",
        hmmfile, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmsearchResult(
        prefix,
        prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout"
    )

    cmd = [
        binary,
        "-o", result.output,
        "-A", result.alignment,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--noali",
        "--notextw"
    ]

    # reporting thresholds are set accordingly to
    # inclusion threshold to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold),
            "--domT", str(domain_threshold),
            "--incT", str(seq_threshold),
            "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold),
            "--domE", str(domain_threshold),
            "--incE", str(seq_threshold),
            "--incdomE", str(domain_threshold)
        ]

    # number of CPUs (only set explicitly if requested)
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # positional arguments: profile file, then sequence database
    cmd += [hmmfile, database]

    # return code and captured streams are not used; unlike the
    # other wrappers, no check of the output alignment is made here
    run(cmd)

    return result


# Record type returned by run_jackhmmer: holds the output prefix and
# the paths of the files belonging to one jackhmmer run.
JackhmmerResult = namedtuple(
    typename="JackhmmerResult",
    field_names=("prefix", "alignment", "output", "tblout", "domtblout"),
)


def run_jackhmmer(query, database, prefix,
                  use_bitscores, domain_threshold, seq_threshold,
                  iterations=5, nobias=False, cpu=None,
                  stdout_redirect=None, checkpoints_hmm=False,
                  checkpoints_ali=False, binary="jackhmmer"):
    """
    Run jackhmmer sequence search against target database.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    iterations : int, optional (default: 5)
        number of jackhmmer search iterations
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search, passed as --cpu.
        If None, the flag is omitted and jackhmmer applies
        its own default.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose).
        If None, output is written to prefix + ".output".
    checkpoints_hmm : bool, optional (default: False)
        Store checkpoint HMMs (--chkhmm with prefix as file
        prefix; per the HMMER documentation files are
        named <prefix>-<iter>.hmm)
    checkpoints_ali : bool, optional (default: False)
        Store checkpoint alignments (--chkali with prefix as
        file prefix; per the HMMER documentation files are
        named <prefix>-<iter>.sto)
    binary : str (default: "jackhmmer")
        Path to jackhmmer binary (put in PATH for
        default to work)

    Returns
    -------
    JackhmmerResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty",
        query, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = JackhmmerResult(
        prefix,
        prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout"
    )

    cmd = [
        binary,
        "-N", str(iterations),
        "-o", result.output,
        "-A", result.alignment,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--noali",
        "--notextw"
    ]

    # reporting thresholds are set accordingly to
    # inclusion threshold to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold),
            "--domT", str(domain_threshold),
            "--incT", str(seq_threshold),
            "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold),
            "--domE", str(domain_threshold),
            "--incE", str(seq_threshold),
            "--incdomE", str(domain_threshold)
        ]

    # number of CPUs (only set explicitly if requested)
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # save checkpoints for alignments and HMMs?
    if checkpoints_ali:
        cmd += ["--chkali", prefix]
    if checkpoints_hmm:
        cmd += ["--chkhmm", prefix]

    # positional arguments: query sequence file, then database
    cmd += [query, database]

    # return code is not inspected here
    _, stdout, stderr = run(cmd)

    # also check we actually created some sort of alignment
    verify_resources(
        "jackhmmer returned empty alignment: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.alignment
        ),
        result.alignment
    )

    return result


# Record type returned by run_hmmscan: holds the output prefix and
# the paths of the files belonging to one hmmscan run.
HmmscanResult = namedtuple(
    typename="HmmscanResult",
    field_names=("prefix", "output", "tblout", "domtblout", "pfamtblout"),
)


def run_hmmscan(query, database, prefix,
                use_model_threshold=True, threshold_type="cut_ga",
                use_bitscores=True, domain_threshold=None, seq_threshold=None,
                nobias=False, cpu=None, stdout_redirect=None, binary="hmmscan"):
    """
    Run hmmscan of HMMs in database against sequences in query
    to identify matches of these HMMs.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence(s)
    database : str
        File containing HMM database (prepared with hmmpress)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_model_threshold : bool (default: True)
        Use model-specific inclusion thresholds from
        HMM database rather than global bitscore/E-value
        thresholds (use_bitscores, domain_threshold and
        seq_threshold are overridden by this flag).
    threshold_type : {"cut_ga", "cut_nc", "cut_tc"} (default: "cut_ga")
        Use gathering (default), noise or trusted cutoff
        to define scan hits. Please refer to HMMER manual for
        details. Only used if use_model_threshold is True.
    use_bitscores : bool (default: True)
        Use bitscore inclusion thresholds rather than E-values.
        Overridden by use_model_threshold flag.
    domain_threshold : int or float or str (default: None)
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50). Required if
        use_model_threshold is False.
    seq_threshold : int or float or str (default: None)
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50). Required if
        use_model_threshold is False.
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search, passed as --cpu.
        If None, the flag is omitted and hmmscan applies
        its own default.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose).
        If None, output is written to prefix + ".output".
    binary : str (default: "hmmscan")
        Path to hmmscan binary (put in PATH for
        default to work)

    Returns
    -------
    HmmscanResult
        namedtuple with fields corresponding to the different
        output files (prefix, output, tblout, domtblout, pfamtblout)

    Raises
    ------
    ValueError
        If threshold_type is invalid (when using model thresholds),
        or if seq_threshold / domain_threshold are missing
        (when not using model thresholds)
    ExternalToolError, ResourceError
    """
    # validate threshold settings before touching the file system,
    # so invalid arguments fail fast without side effects; either use
    # model-specific threshold, or custom bitscore/E-value thresholds
    if use_model_threshold:
        threshold_choices = ("cut_ga", "cut_nc", "cut_tc")
        if threshold_type not in threshold_choices:
            raise ValueError(
                "Invalid model threshold, valid choices are: " +
                ", ".join(threshold_choices)
            )

        threshold_args = ["--" + threshold_type]
    else:
        if seq_threshold is None or domain_threshold is None:
            raise ValueError(
                "Must define sequence- and domain-level reporting "
                "thresholds, or use gathering threshold instead."
            )

        if use_bitscores:
            threshold_args = [
                "-T", str(seq_threshold),
                "--domT", str(domain_threshold),
            ]
        else:
            threshold_args = [
                "-E", str(seq_threshold),
                "--domE", str(domain_threshold),
            ]

    verify_resources(
        "Input file does not exist or is empty",
        query, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmscanResult(
        prefix,
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout",
        prefix + ".pfamtblout"
    )

    cmd = [
        binary,
        "-o", result.output,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--pfamtblout", result.pfamtblout,
        "--notextw",
        "--acc",
    ]

    # number of CPUs (only set explicitly if requested)
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    cmd += threshold_args

    # positional arguments: HMM database, then query sequences
    cmd += [database, query]

    # return code is not inspected here
    _, stdout, stderr = run(cmd)

    # also check we actually created a table with hits
    verify_resources(
        "hmmscan did not return results: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.domtblout
        ),
        result.domtblout
    )

    return result


def _read_hmmer_table(filename, column_names):
    """
    Parse a HMMER file in (dom)tbl format into
    a pandas DataFrame.

    (Why this is necessary: cannot easily split on
    whitespace with pandas because of last column
    that contains whitespace both in header and rows)

    Parameters
    ----------
    filename : str
        Path of (dom)tbl file
    column_names : list of str
        Columns in the respective format
        (different for tbl and domtbl)

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed (dom)tbl
    """
    res = []
    num_splits = len(column_names) - 1

    with open(filename) as f:
        for line in f:
            if line.startswith("#"):
                continue

            fields = line.rstrip().split(maxsplit=num_splits)
            res.append(fields)

    # at the moment, all fields in dataframe are strings, even
    # if numeric. To convert to numbers, cheap trick is to store
    # to csv file and let pandas guess the types, rather than
    # going through convert_objects (deprecated) or to_numeric
    # (more effort)
    tempfile = temp()
    pd.DataFrame(
        res, columns=column_names
    ).to_csv(tempfile, index=False)

    return pd.read_csv(tempfile)


def read_hmmer_tbl(filename):
    """
    Read a HMMER tbl file into DataFrame.

    Parameters
    ----------
    filename : str
        Path of tbl file

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed tbl. Has 19 columns: target and
        query name/accession, full-sequence E-value/score/bias,
        best-domain E-value/score/bias, eight domain number
        statistics (domain_exp ... domain_inc), and a free-text
        description as the last column.
    """
    # column order must match the order of fields in the tbl file,
    # since rows are split positionally; description comes last
    # because it may itself contain whitespace
    column_names = [
        "target_name", "target_accession",
        "query_name", "query_accession",
        "full_Evalue", "full_score", "full_bias",
        "best_domain_Evalue", "best_domain_score",
        "best_domain_bias",
        "domain_exp", "domain_reg", "domain_clu",
        "domain_ov", "domain_env", "domain_dom",
        "domain_rep", "domain_inc",
        "description"
    ]

    return _read_hmmer_table(filename, column_names)


def read_hmmer_domtbl(filename):
    """
    Read a HMMER domtbl file into DataFrame.

    Parameters
    ----------
    filename : str
        Path of domtbl file

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed domtbl. Has 23 columns: target and
        query name/accession/length, full-sequence
        E-value/score/bias, hit number and total hit number,
        per-domain E-values/score/bias, HMM / alignment / envelope
        coordinates (from, to), acc, and a free-text description
        as the last column.
    """
    # column order must match the order of fields in the domtbl
    # file, since rows are split positionally; description comes
    # last because it may itself contain whitespace
    column_names = [
        "target_name", "target_accession", "target_len",
        "query_name", "query_accession", "query_len",
        "full_Evalue", "full_score", "full_bias",
        "hit_number", "total_hit_number",
        "domain_c_Evalue", "domain_i_Evalue",
        "domain_score", "domain_bias",
        "hmm_from", "hmm_to",
        "ali_from", "ali_to",
        "env_from", "env_to",
        "acc", "description"
    ]

    return _read_hmmer_table(filename, column_names)


def run_hhfilter(input_file, output_file, threshold=95,
                 columns="a2m", binary="hhfilter"):
    """
    Redundancy-reduce a sequence alignment using hhfilter
    from the HHsuite alignment suite.

    Parameters
    ----------
    input_file : str
        Path to input alignment in A2M/FASTA format
    output_file : str
        Path to output alignment (will be in A3M format).
        Folder structure in the path will be created if
        not existing.
    threshold : int, optional (default: 95)
        Sequence identity threshold for maximum pairwise
        identity (between 0 and 100)
    columns : {"first", "a2m"}, optional (default: "a2m")
        Definition of match columns (based on first sequence
        or upper-case columns (a2m))
    binary : str (default: "hhfilter")
        Path to hhfilter binary

    Returns
    -------
    str
        output_file

    Raises
    ------
    ResourceError
        If output alignment is non-existent/empty
    ValueError
        Upon invalid value of columns parameter
    """
    # validate arguments before touching the file system
    if columns not in ("first", "a2m"):
        raise ValueError(
            "Invalid column selection: {}".format(columns)
        )

    verify_resources(
        "Alignment file does not exist or is empty",
        input_file
    )

    create_prefix_folders(output_file)

    cmd = [
        binary,
        "-i", input_file,
        "-o", output_file,
        "-id", str(threshold),
        "-M", columns,
        "-v", "2"
    ]

    # return code is not inspected here
    _, stdout, stderr = run(cmd)

    # check that a non-empty output alignment was created
    verify_resources(
        "hhfilter returned empty alignment: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, output_file
        ),
        output_file
    )

    return output_file