# Source code for evcouplings.align.tools

"""
Wrappers for running external sequence alignment tools

Authors:
  Thomas A. Hopf
  Anna G. Green - run_hmmbuild, run_hmmsearch
  Chan Kang - run_hmmbuild, run_hmmsearch
"""

from collections import namedtuple
import pandas as pd
from evcouplings.utils.system import (
    run, create_prefix_folders, verify_resources, temp
)
from evcouplings.utils.config import check_required


# Record of the file paths associated with one hmmbuild run;
# this is the value handed back by run_hmmbuild
HmmbuildResult = namedtuple(
    "HmmbuildResult", "prefix hmmfile output"
)


def run_hmmbuild(alignment_file, prefix, cpu=None,
                 stdout_redirect=None, symfrac=None,
                 binary="hmmbuild"):
    """
    Profile HMM construction from multiple sequence alignments
    Refer to HMMER documentation for details.

    http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf

    Parameters
    ----------
    alignment_file : str
        File containing the multiple sequence alignment. Can be in
        Stockholm, a2m, or clustal formats, or any other format
        recognized by hmmer. Please note that ALL POSITIONS
        above the symfrac cutoff will be used in HMM
        construction (if the alignment contains columns that are
        insertions relative to the query sequence, this may be
        problematic for structure comparison)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    cpu : int, optional (default: None)
        Number of CPUs to use (passed as --cpu). If None, the
        flag is omitted and hmmbuild's own default applies.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    symfrac : float, optional (default: None)
        Range 0.0 - 1.0, passed as --symfrac. Per the HMMER user
        guide, alignment columns whose (weighted) fraction of
        residues, i.e. non-gap symbols, reaches this value are
        used as consensus columns of the HMM. If None, the flag
        is omitted and the HMMER internal default (0.5) applies.
        (Note: the fraction is calculated after HMMER's internal
        sequence weighting)
    binary : str (default: "hmmbuild")
        Path to hmmbuild binary (put in PATH for
        default to work)

    Returns
    -------
    HmmbuildResult
        namedtuple with fields corresponding to the different
        output files (prefix, hmmfile, output)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty",
        alignment_file
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmbuildResult(
        prefix,
        prefix + ".hmm",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
    )

    cmd = [binary, "-o", result.output]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # consensus column threshold
    if symfrac is not None:
        cmd += ["--symfrac", str(symfrac)]

    # positional arguments: output HMM first, then input alignment
    cmd += [result.hmmfile, alignment_file]

    _, stdout, stderr = run(cmd)

    # also check we actually created a non-empty HMM profile
    verify_resources(
        "hmmbuild returned empty HMM profile: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.hmmfile
        ),
        result.hmmfile
    )

    return result
# output fields for storing results of a hmmsearch run
# (returned by run_hmmsearch)
HmmsearchResult = namedtuple(
    "HmmsearchResult",
    ["prefix", "alignment", "output", "tblout", "domtblout"]
)
def run_hmmsearch(hmmfile, database, prefix,
                  use_bitscores, domain_threshold, seq_threshold,
                  nobias=False, cpu=None,
                  stdout_redirect=None, binary="hmmsearch"):
    """
    Search profile(s) against a sequence database.
    Refer to HMMER documentation for details.

    http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf

    Parameters
    ----------
    hmmfile : str
        File containing the profile(s)
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50). Also used as the
        domain-level reporting threshold.
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50). Also used as the
        sequence-level reporting threshold.
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search (passed as --cpu).
        If None, the flag is omitted and hmmsearch's own
        default applies.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    binary : str (default: "hmmsearch")
        Path to hmmsearch binary (put in PATH for
        default to work)

    Returns
    -------
    HmmsearchResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty",
        hmmfile, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = HmmsearchResult(
        prefix,
        prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout"
    )

    cmd = [
        binary,
        "-o", result.output,
        "-A", result.alignment,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--noali",
        "--notextw"
    ]

    # reporting thresholds are set accordingly to
    # inclusion threshold to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold),
            "--domT", str(domain_threshold),
            "--incT", str(seq_threshold),
            "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold),
            "--domE", str(domain_threshold),
            "--incE", str(seq_threshold),
            "--incdomE", str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # positional arguments: profile first, then sequence database
    cmd += [hmmfile, database]

    # NOTE: output files are not verified after the run in this
    # function; the return value of run() is not needed here
    run(cmd)

    return result
# output fields for storing results of a jackhmmer run
# (returned by run_jackhmmer)
JackhmmerResult = namedtuple(
    "JackhmmerResult",
    ["prefix", "alignment", "output", "tblout", "domtblout"]
)
def run_jackhmmer(query, database, prefix,
                  use_bitscores, domain_threshold, seq_threshold,
                  iterations=5, nobias=False, cpu=None,
                  stdout_redirect=None, checkpoints_hmm=False,
                  checkpoints_ali=False, binary="jackhmmer"):
    """
    Run jackhmmer sequence search against target database.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50). Also used as the
        domain-level reporting threshold.
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50). Also used as the
        sequence-level reporting threshold.
    iterations : int, optional (default: 5)
        Number of jackhmmer search iterations (passed as -N)
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search (passed as --cpu).
        If None, the flag is omitted and jackhmmer's own
        default applies.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    checkpoints_hmm : bool, optional (default: False)
        Store checkpoint HMMs for each iteration (passes
        "--chkhmm prefix"; per the HMMER user guide the files
        are named <prefix>-<iter>.hmm)
    checkpoints_ali : bool, optional (default: False)
        Store checkpoint alignments for each iteration (passes
        "--chkali prefix"; per the HMMER user guide the files
        are named <prefix>-<iter>.sto)
    binary : str (default: "jackhmmer")
        Path to jackhmmer binary (put in PATH for
        default to work)

    Returns
    -------
    JackhmmerResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty",
        query, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = JackhmmerResult(
        prefix,
        prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout"
    )

    cmd = [
        binary,
        "-N", str(iterations),
        "-o", result.output,
        "-A", result.alignment,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--noali",
        "--notextw"
    ]

    # reporting thresholds are set accordingly to
    # inclusion threshold to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold),
            "--domT", str(domain_threshold),
            "--incT", str(seq_threshold),
            "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold),
            "--domE", str(domain_threshold),
            "--incE", str(seq_threshold),
            "--incdomE", str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # save checkpoints for alignments and HMMs?
    if checkpoints_ali:
        cmd += ["--chkali", prefix]
    if checkpoints_hmm:
        cmd += ["--chkhmm", prefix]

    # positional arguments: query first, then sequence database
    cmd += [query, database]

    _, stdout, stderr = run(cmd)

    # also check we actually created some sort of alignment
    verify_resources(
        "jackhmmer returned empty alignment: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.alignment
        ),
        result.alignment
    )

    return result
# Record of the file paths associated with one hmmscan run;
# this is the value handed back by run_hmmscan
HmmscanResult = namedtuple(
    "HmmscanResult",
    "prefix output tblout domtblout pfamtblout"
)
def run_hmmscan(query, database, prefix,
                use_model_threshold=True, threshold_type="cut_ga",
                use_bitscores=True, domain_threshold=None,
                seq_threshold=None, nobias=False, cpu=None,
                stdout_redirect=None, binary="hmmscan"):
    """
    Run hmmscan of HMMs in database against sequences in query
    to identify matches of these HMMs.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence(s)
    database : str
        File containing HMM database (prepared with hmmpress)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_model_threshold : bool (default: True)
        Use model-specific thresholds from HMM database rather
        than global bitscore/E-value thresholds (use_bitscores,
        domain_threshold and seq_threshold are overridden by
        this flag).
    threshold_type : {"cut_ga", "cut_nc", "cut_tc"} (default: "cut_ga")
        Use gathering (default), noise or trusted cutoff
        to define scan hits. Only used if use_model_threshold
        is True. Please refer to HMMER manual for details.
    use_bitscores : bool
        Use bitscore thresholds rather than E-values.
        Overridden by use_model_threshold flag.
    domain_threshold : int or float or str
        Reporting threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50). Required if
        use_model_threshold is False.
    seq_threshold : int or float or str
        Reporting threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50). Required if
        use_model_threshold is False.
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search (passed as --cpu).
        If None, the flag is omitted and hmmscan's own
        default applies.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    binary : str (default: "hmmscan")
        Path to hmmscan binary (put in PATH for
        default to work)

    Returns
    -------
    HmmscanResult
        namedtuple with fields corresponding to the different
        output files (prefix, output, tblout, domtblout, pfamtblout)

    Raises
    ------
    ValueError
        If threshold_type is invalid, or if custom thresholds
        are requested but seq_threshold / domain_threshold
        are not both given
    ExternalToolError, ResourceError
    """
    # validate arguments up front, before checking input files
    # or creating any folders; either use model-specific
    # threshold, or custom bitscore/E-value thresholds
    if use_model_threshold:
        valid_threshold_types = ["cut_ga", "cut_nc", "cut_tc"]
        if threshold_type not in valid_threshold_types:
            raise ValueError(
                "Invalid model threshold, valid choices are: " +
                ", ".join(valid_threshold_types)
            )

        threshold_args = ["--" + threshold_type]
    else:
        if seq_threshold is None or domain_threshold is None:
            raise ValueError(
                "Must define sequence- and domain-level reporting "
                "thresholds, or use gathering threshold instead."
            )

        if use_bitscores:
            threshold_args = [
                "-T", str(seq_threshold),
                "--domT", str(domain_threshold),
            ]
        else:
            threshold_args = [
                "-E", str(seq_threshold),
                "--domE", str(domain_threshold),
            ]

    verify_resources(
        "Input file does not exist or is empty",
        query, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the function.
    result = HmmscanResult(
        prefix,
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout",
        prefix + ".pfamtblout"
    )

    cmd = [
        binary,
        "-o", result.output,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--pfamtblout", result.pfamtblout,
        "--notextw",
        "--acc",
    ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    cmd += threshold_args

    # positional arguments: HMM database first, then query sequences
    cmd += [database, query]

    _, stdout, stderr = run(cmd)

    # also check we actually created a table with hits
    verify_resources(
        "hmmscan did not return results: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.domtblout
        ),
        result.domtblout
    )

    return result
def _read_hmmer_table(filename, column_names):
    """
    Parse a HMMER file in (dom)tbl format into
    a pandas DataFrame.

    (Why this is necessary: cannot easily split on
    whitespace with pandas because of last column
    that contains whitespace both in header and rows)

    Parameters
    ----------
    filename : str
        Path of (dom)tbl file
    column_names : list of str
        Columns in the respective format
        (different for tbl and domtbl)

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed (dom)tbl. Column types are
        inferred by pandas (numeric where possible).
    """
    from io import StringIO

    # split on whitespace at most len(column_names) - 1 times, so
    # that the last column keeps any whitespace it contains
    num_splits = len(column_names) - 1

    rows = []
    with open(filename) as f:
        for line in f:
            # skip comment/header lines, and blank lines (which
            # would otherwise turn into rows without any fields)
            if line.startswith("#") or not line.strip():
                continue

            rows.append(
                line.rstrip().split(maxsplit=num_splits)
            )

    # at the moment, all fields are strings, even if numeric.
    # To convert to numbers, cheap trick is to serialize to csv
    # and let pandas guess the types when reading it back,
    # rather than going through convert_objects (deprecated) or
    # to_numeric (more effort). An in-memory buffer is used for
    # the round trip so no temporary file has to be written.
    buffer = StringIO()
    pd.DataFrame(
        rows, columns=column_names
    ).to_csv(buffer, index=False)

    buffer.seek(0)
    return pd.read_csv(buffer)
def read_hmmer_tbl(filename):
    """
    Read a HMMER tbl file into DataFrame.

    Parameters
    ----------
    filename : str
        Path of tbl file

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed tbl
    """
    # column layout of the per-target table; the final column
    # (description) is free text and may contain whitespace
    column_names = [
        "target_name", "target_accession",
        "query_name", "query_accession",
        "full_Evalue", "full_score", "full_bias",
        "best_domain_Evalue", "best_domain_score", "best_domain_bias",
        "domain_exp", "domain_reg", "domain_clu", "domain_ov",
        "domain_env", "domain_dom", "domain_rep", "domain_inc",
        "description",
    ]

    return _read_hmmer_table(filename, column_names)
def read_hmmer_domtbl(filename):
    """
    Read a HMMER domtbl file into DataFrame.

    Parameters
    ----------
    filename : str
        Path of domtbl file

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed domtbl
    """
    # column layout of the per-domain table; the final column
    # (description) is free text and may contain whitespace
    column_names = [
        "target_name", "target_accession", "target_len",
        "query_name", "query_accession", "query_len",
        "full_Evalue", "full_score", "full_bias",
        "hit_number", "total_hit_number",
        "domain_c_Evalue", "domain_i_Evalue",
        "domain_score", "domain_bias",
        "hmm_from", "hmm_to",
        "ali_from", "ali_to",
        "env_from", "env_to",
        "acc", "description",
    ]

    return _read_hmmer_table(filename, column_names)
def run_hhfilter(input_file, output_file, threshold=95,
                 columns="a2m", binary="hhfilter"):
    """
    Redundancy-reduce a sequence alignment using hhfilter
    from the HHsuite alignment suite.

    Parameters
    ----------
    input_file : str
        Path to input alignment in A2M/FASTA format
    output_file : str
        Path to output alignment (will be in A3M format)
    threshold : int, optional (default: 95)
        Sequence identity threshold for maximum pairwise
        identity (between 0 and 100)
    columns : {"first", "a2m"}, optional (default: "a2m")
        Definition of match columns (based on first sequence
        or upper-case columns (a2m))
    binary : str, optional (default: "hhfilter")
        Path to hhfilter binary (put in PATH for
        default to work)

    Returns
    -------
    str
        output_file

    Raises
    ------
    ResourceError
        If output alignment is non-existent/empty
    ValueError
        Upon invalid value of columns parameter
    """
    # reject invalid arguments before touching the filesystem
    if columns not in ["first", "a2m"]:
        raise ValueError(
            "Invalid column selection: {}".format(columns)
        )

    verify_resources(
        "Alignment file does not exist or is empty",
        input_file
    )

    create_prefix_folders(output_file)

    cmd = [
        binary,
        "-i", input_file,
        "-o", output_file,
        "-id", str(threshold),
        "-M", columns,
        "-v", "2"
    ]

    _, stdout, stderr = run(cmd)

    # also check we actually created a non-empty alignment
    verify_resources(
        "hhfilter returned empty alignment: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, output_file
        ),
        output_file
    )

    return output_file