Source code for evcouplings.compare.sifts

"""
Uniprot to PDB structure identification and
index mapping using the SIFTS database
(https://www.ebi.ac.uk/pdbe/docs/sifts/)

This functionality is centered around the
pdb_chain_uniprot.csv table available from SIFTS.
(ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_uniprot.csv.gz)

Authors:
  Thomas A. Hopf
  Anna G. Green (find_homologs)
  Chan Kang (find_homologs)
"""

from os import path
from collections import OrderedDict
from copy import deepcopy

import pandas as pd
import requests

from evcouplings.align.alignment import (
    Alignment, read_fasta, parse_header
)
from evcouplings.align.protocol import (
    jackhmmer_search, hmmbuild_and_search
)
from evcouplings.align.tools import read_hmmer_domtbl
from evcouplings.compare.mapping import map_indices
from evcouplings.utils.system import (
    get_urllib, ResourceError, valid_file, tempdir, temp
)
from evcouplings.utils.config import (
    parse_config, check_required, InvalidParameterError
)
from evcouplings.utils.helpers import range_overlap

UNIPROT_MAPPING_URL = "https://www.uniprot.org/mapping/"
SIFTS_URL = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_segments_observed.csv.gz"
SIFTS_REST_API = "http://www.ebi.ac.uk/pdbe/api/mappings/uniprot_segments/{}"

# TODO: make this default parametrization more explicit (e.g. a config file in repository)
# these parameters are fed as a default into SIFTS.by_alignment so that the method can be
# easily used without a configuration file/any further setup
HMMER_CONFIG = """
prefix:
sequence_id:
sequence_file:
region:
first_index: 1

use_bitscores: True
domain_threshold: 0.5
sequence_threshold: 0.5
iterations: 1
database: sequence_database

extract_annotation: False
cpu: 1
nobias: False
reuse_alignment: False
checkpoints_hmm: False
checkpoints_ali: False

# database
jackhmmer: jackhmmer
sequence_database:
sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta
"""


def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"):
    """
    Fetch data from UniProt ID mapping service
    (e.g. download set of sequences)

    Parameters
    ----------
    ids : list(str)
        List of UniProt identifiers for which to
        retrieve mapping
    from_ : str, optional (default: "ACC")
        Source identifier (i.e. contained in "ids" list)
    to : str, optional (default: "ACC")
        Target identifier (to which source should be mapped)
    format : str, optional (default: "fasta")
        Output format to request from Uniprot server

    Returns
    -------
    str:
        Response from UniProt server
    """
    params = {
        "from": from_,
        "to": to,
        "format": format,
        "query": " ".join(ids)
    }

    url = UNIPROT_MAPPING_URL
    r = requests.post(url, data=params)

    if r.status_code != requests.codes.ok:
        raise ResourceError(
            "Invalid status code ({}) for URL: {}".format(
                r.status_code, url
            )
        )

    return r.text
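
# Example (hedged usage sketch, not part of the original module; the UniProt
# accessions below are placeholders, and the legacy UniProt mapping endpoint
# must be reachable):
#
#     fasta_text = fetch_uniprot_mapping(["P0A7Y4", "P62593"])
#     print(fasta_text.splitlines()[0])   # first FASTA header of the response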
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
    """
    Identify homologs using jackhmmer or hmmbuild/hmmsearch

    Parameters
    ----------
    pdb_alignment_method : {"jackhmmer", "hmmsearch"}, optional (default: "jackhmmer")
        Sequence alignment method used for searching the PDB
    **kwargs
        Passed into jackhmmer / hmmbuild_and_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """
    # load default configuration
    config = parse_config(HMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    if config["prefix"] is None:
        config["prefix"] = path.join(tempdir(), "compare")

    check_required(
        config, ["prefix"]
    )

    # run hmmsearch (possibly preceded by hmmbuild)
    if pdb_alignment_method == "hmmsearch":
        # set up config to run hmmbuild_and_search on the unfiltered alignment file
        updated_config = deepcopy(config)
        updated_config["alignment_file"] = config.get("raw_focus_alignment_file")
        ar = hmmbuild_and_search(**updated_config)

        # For hmmbuild and search, we have to read the raw focus alignment file
        # to guarantee that the query sequence is present
        with open(ar["raw_focus_alignment_file"]) as a:
            ali = Alignment.from_file(a, "fasta")

    # run jackhmmer against sequence database
    # at this point we have already checked to ensure
    # that the input is either jackhmmer or hmmsearch
    elif pdb_alignment_method == "jackhmmer":
        ar = jackhmmer_search(**config)

        with open(ar["raw_alignment_file"]) as a:
            ali = Alignment.from_file(a, "stockholm")

        # write alignment as FASTA file for easier checking by hand,
        # if necessary
        with open(config["prefix"] + "_raw.fasta", "w") as f:
            ali.write(f)
    else:
        raise InvalidParameterError(
            "Invalid pdb_alignment_method selected. Valid options are: " +
            ", ".join(["jackhmmer", "hmmsearch"])
        )

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1])
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2])

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
            "hmm_from": "hmm_start",
            "hmm_to": "hmm_end",
        }
    )

    hits.loc[:, "alignment_start"] = pd.to_numeric(hits.alignment_start).astype(int)
    hits.loc[:, "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int)

    hits.loc[:, "alignment_id"] = (
        hits.target_name + "/" +
        hits.alignment_start.astype(str) + "-" +
        hits.alignment_end.astype(str)
    )

    hits = hits.loc[
        :, ["alignment_id", "uniprot_ac", "uniprot_id",
            "alignment_start", "alignment_end",
            "bitscore", "e_value"]
    ]

    return ali, hits
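
# Example (hedged usage sketch, not part of the original module; sequence
# identifier and file paths are placeholders, and the jackhmmer binary must be
# available on the path; other HMMER_CONFIG keys keep their defaults):
#
#     ali, hits = find_homologs(
#         sequence_id="RNH_ECOLI",
#         sequence_file="rnh_ecoli.fasta",
#         sequence_database="sifts_uniprot_sequences.fasta",
#         prefix="output/rnh_vs_pdb"
#     )
#     # ali: alignment of PDB-covered homologs of the query;
#     # hits: one row per domain hit with bitscore, e_value and aligned range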
class SIFTSResult:
    """
    Store results of SIFTS structure/mapping identification.

    (Full class defined for easy modification of fields)
    """
    def __init__(self, hits, mapping):
        """
        Create new SIFTS structure / mapping record.

        Parameters
        ----------
        hits : pandas.DataFrame
            Table with identified PDB chains
        mapping : dict
            Mapping from seqres to Uniprot numbering
            for each PDB chain
            (indexed by mapping_index column in hits dataframe)
        """
        self.hits = hits
        self.mapping = mapping
class SIFTS:
    """
    Provide Uniprot to PDB mapping data and functions
    starting from SIFTS mapping table.
    """
    def __init__(self, sifts_table_file, sequence_file=None):
        """
        Create new SIFTS mapper from mapping table.

        Note that creation of the mapping files, if not existing,
        takes a while.

        Parameters
        ----------
        sifts_table_file : str
            Path to *corrected* SIFTS pdb_chain_uniprot.csv.
            To generate this file, point to an empty file path.
        sequence_file : str, optional (default: None)
            Path to file containing all UniProt sequences
            in SIFTS (used for homology-based identification
            of structures).
            Note: This file can be created using the
            create_sequence_file() method.
        """
        # test if table exists, if not, download and modify
        if not valid_file(sifts_table_file):
            self._create_mapping_table(sifts_table_file)

        self.table = pd.read_csv(
            sifts_table_file, comment="#"
        )

        # final table still has some entries where lengths do not match,
        # remove these
        self.table = self.table.query(
            "(resseq_end - resseq_start) == (uniprot_end - uniprot_start)"
        )

        self.sequence_file = sequence_file

        # if path for sequence file given, but not there, create
        if sequence_file is not None and not valid_file(sequence_file):
            self.create_sequence_file(sequence_file)

        # add Uniprot ID column if we have sequence mapping
        # from FASTA file
        if self.sequence_file is not None:
            self._add_uniprot_ids()

    def _create_mapping_table(self, sifts_table_file):
        """
        Create modified SIFTS mapping table (based on
        file at SIFTS_URL). For some of the entries,
        the Uniprot sequence ranges do not map to a
        SEQRES sequence range of the same length.
        These PDB IDs will be entirely replaced by a
        segment-based mapping extracted from the SIFTS
        REST API.

        Parameters
        ----------
        sifts_table_file : str
            Path where computed table will be stored
        """
        def extract_rows(M, pdb_id):
            res = []

            M = M[pdb_id.lower()]["UniProt"]

            for uniprot_ac, Ms in M.items():
                for x in Ms["mappings"]:
                    res.append({
                        "pdb_id": pdb_id,
                        "pdb_chain": x["chain_id"],
                        "uniprot_ac": uniprot_ac,
                        "resseq_start": x["start"]["residue_number"],
                        "resseq_end": x["end"]["residue_number"],
                        "coord_start": (
                            str(x["start"]["author_residue_number"]) +
                            x["start"]["author_insertion_code"].replace(" ", "")
                        ),
                        "coord_end": (
                            str(x["end"]["author_residue_number"]) +
                            x["end"]["author_insertion_code"].replace(" ", "")
                        ),
                        "uniprot_start": x["unp_start"],
                        "uniprot_end": x["unp_end"],
                    })

            return res

        # download SIFTS table (gzip-compressed csv) to temp file
        temp_download_file = temp()
        get_urllib(SIFTS_URL, temp_download_file)

        # load table and rename columns for internal use, in case SIFTS
        # ever decides to rename theirs
        table = pd.read_csv(
            temp_download_file, comment="#",
            compression="gzip"
        ).rename(
            columns={
                "PDB": "pdb_id",
                "CHAIN": "pdb_chain",
                "SP_PRIMARY": "uniprot_ac",
                "RES_BEG": "resseq_start",
                "RES_END": "resseq_end",
                "PDB_BEG": "coord_start",
                "PDB_END": "coord_end",
                "SP_BEG": "uniprot_start",
                "SP_END": "uniprot_end",
            }
        )

        # TODO: remove the following if new segment-based table proves a robust solution
        """
        # this block disabled for now due to use of new table
        # based on observed UniProt segments
        # - can probably be removed eventually

        # identify problematic PDB IDs
        problematic_ids = table.query(
            "(resseq_end - resseq_start) != (uniprot_end - uniprot_start)"
        ).pdb_id.unique()

        # collect new mappings from segment based REST API
        res = []
        for i, pdb_id in enumerate(problematic_ids):
            r = requests.get(
                SIFTS_REST_API.format(pdb_id.lower())
            )
            mapping = json.loads(r.text)
            res += extract_rows(mapping, pdb_id)

        # remove bad PDB IDs from table and add new mapping
        new_table = table.loc[~table.pdb_id.isin(problematic_ids)]

        # also disabled due to use of new table based on observed
        # UniProt segments - can probably be removed eventually
        new_table = new_table.append(
            pd.DataFrame(res).loc[:, table.columns]
        )
        """

        # save for later reuse
        table.to_csv(sifts_table_file, index=False)

    def _add_uniprot_ids(self):
        """
        Add Uniprot ID column to SIFTS table based on
        AC to ID mapping extracted from sequence database
        """
        # iterate through headers in sequence file and store
        # AC to ID mapping
        ac_to_id = {}
        with open(self.sequence_file) as f:
            for seq_id, _ in read_fasta(f):
                _, ac, id_ = seq_id.split(" ")[0].split("|")
                ac_to_id[ac] = id_

        # add column to dataframe
        self.table.loc[:, "uniprot_id"] = self.table.loc[:, "uniprot_ac"].map(ac_to_id)
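
    # Example (hedged usage sketch, not part of the original module; file paths
    # are placeholders - if the CSV does not exist yet, it is derived from the
    # SIFTS download, which takes a while):
    #
    #     s = SIFTS(
    #         "data/pdb_chain_uniprot_corrected.csv",
    #         sequence_file="data/sifts_uniprot_sequences.fasta"
    #     )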
    def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100):
        """
        Create FASTA sequence file containing all UniProt
        sequences of proteins in SIFTS. This file is required
        for homology-based structure identification and
        index remapping.
        This function will also automatically associate the
        sequence file with the SIFTS object.

        Parameters
        ----------
        output_file : str
            Path at which to store sequence file
        chunk_size : int, optional (default: 1000)
            Retrieve sequences from UniProt in chunks of this size
            (too large chunks cause the mapping service to stall)
        max_retries : int, optional (default: 100)
            Allow this many retries when fetching sequences
            from UniProt ID mapping service, which unfortunately
            often suffers from connection failures.
        """
        ids = self.table.uniprot_ac.unique().tolist()

        # retrieve sequences in chunks since ID mapping service
        # tends to fail on large requests
        id_chunks = [
            ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)
        ]

        # store individual retrieved chunks as list of strings
        seq_chunks = []

        # keep track of how many retries were necessary and
        # abort if number exceeds max_retries
        num_retries = 0

        for ch in id_chunks:
            # fetch sequence chunk;
            # if there is a problem, retry as long as we stay within
            # maximum number of retries
            while True:
                try:
                    seqs = fetch_uniprot_mapping(ch)
                    break
                except requests.ConnectionError as e:
                    # count as failed try
                    num_retries += 1

                    # if we retried too often, abort
                    if num_retries > max_retries:
                        raise ResourceError(
                            "Could not fetch sequences for SIFTS mapping tables from UniProt since "
                            "maximum number of retries after connection errors was exceeded. Retry "
                            "at a later time, or call SIFTS.create_sequence_file() with a higher value "
                            "for max_retries."
                        ) from e

            # rename identifiers in sequence file, so
            # we can circumvent Uniprot sequence identifiers
            # being prefixed by hmmer if a hit has exactly the
            # same identifier as the query sequence
            seqs = seqs.replace(
                ">sp|", ">evsp|",
            ).replace(
                ">tr|", ">evtr|",
            )

            assert seqs.endswith("\n")

            # store for writing
            seq_chunks.append(seqs)

        # store sequences to FASTA file in one go at the end
        with open(output_file, "w") as f:
            f.write("".join(seq_chunks))

        self.sequence_file = output_file

        # add Uniprot ID column to SIFTS table
        self._add_uniprot_ids()
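
    # Example (hedged usage sketch, not part of the original module; the output
    # path is a placeholder and the download may take a long time):
    #
    #     s.create_sequence_file("data/sifts_uniprot_sequences.fasta")
    #     # afterwards s.sequence_file is set and a uniprot_id column is added to s.table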
    def _create_sequence_file(self, output_file):
        """
        Create FASTA sequence file containing all UniProt
        sequences of proteins in SIFTS. This file is required
        for homology-based structure identification and
        index remapping.
        This function will also automatically associate the
        sequence file with the SIFTS object.

        Note: this would be the nicer function, but unfortunately
        the UniProt server frequently closes the connection when
        running it.

        Parameters
        ----------
        output_file : str
            Path at which to store sequence file
        """
        # fetch all the sequences
        seqs = fetch_uniprot_mapping(
            self.table.uniprot_ac.unique().tolist()
        )

        # then store to FASTA file
        with open(output_file, "w") as f:
            f.write(seqs)

        self.sequence_file = output_file

    def _finalize_hits(self, hit_segments):
        """
        Create final hit/mapping record from
        table of segments in PDB chains in
        SIFTS file.

        Parameters
        ----------
        hit_segments : pd.DataFrame
            Subset of self.table that will be
            turned into final mapping record

        Returns
        -------
        SIFTSResult
            Identified hits plus index mappings
            to Uniprot
        """
        # compile final set of hits
        hits = []

        # compile mapping from Uniprot to seqres for
        # each final hit
        mappings = {}

        # go through all SIFTS segments per PDB chain
        for i, ((pdb_id, pdb_chain), chain_grp) in enumerate(
            hit_segments.groupby(["pdb_id", "pdb_chain"])
        ):
            # put segments together in one segment-based
            # mapping for chain; this will be used by pdb.Chain.remap()
            mapping = {
                (r["resseq_start"], r["resseq_end"]): (r["uniprot_start"], r["uniprot_end"])
                for j, r in chain_grp.iterrows()
            }

            # append current hit and mapping
            hits.append([pdb_id, pdb_chain, i])
            mappings[i] = mapping

        # create final hit representation as DataFrame
        hits_df = pd.DataFrame(
            hits, columns=["pdb_id", "pdb_chain", "mapping_index"]
        )

        return SIFTSResult(hits_df, mappings)
    def by_pdb_id(self, pdb_id, pdb_chain=None, uniprot_id=None):
        """
        Find structures and mapping by PDB id
        and chain name

        Parameters
        ----------
        pdb_id : str
            4-letter PDB identifier
        pdb_chain : str, optional (default: None)
            PDB chain name (if not given, all chains
            for PDB entry will be returned)
        uniprot_id : str, optional (default: None)
            Filter to keep only this Uniprot accession
            number or identifier (necessary for chimeras,
            or multi-chain complexes with different proteins)

        Returns
        -------
        SIFTSResult
            Identified hits plus index mappings
            to Uniprot

        Raises
        ------
        ValueError
            If selected segments in PDB file do not
            unambiguously map to one Uniprot entry
        """
        pdb_id = pdb_id.lower()
        query = "pdb_id == @pdb_id"

        # filter by PDB chain if selected
        if pdb_chain is not None:
            query += " and pdb_chain == @pdb_chain"

        # filter by UniProt AC/ID if selected
        # (to remove chimeras)
        if uniprot_id is not None:
            if "uniprot_id" in self.table.columns:
                query += (
                    " and (uniprot_ac == @uniprot_id or "
                    "uniprot_id == @uniprot_id)"
                )
            else:
                query += " and uniprot_ac == @uniprot_id"

        x = self.table.query(query)

        # check we only have one protein (might not
        # be the case with multiple chains, or with
        # chimeras)
        if len(x.uniprot_ac.unique()) > 1:
            id_list = ", ".join(x.uniprot_ac.unique())
            if "uniprot_id" in self.table.columns:
                id_list += " or " + ", ".join(x.uniprot_id.unique())

            raise ValueError(
                "Multiple Uniprot sequences on chains, "
                "please disambiguate using uniprot_id "
                "parameter: {}".format(id_list)
            )

        # create hit and mapping result
        return self._finalize_hits(x)
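
    # Example (hedged usage sketch, not part of the original module; the PDB
    # identifier and chain are placeholders):
    #
    #     result = s.by_pdb_id("2gff", pdb_chain="A")
    #     print(result.hits)         # columns pdb_id / pdb_chain / mapping_index
    #     print(result.mapping[0])   # {(resseq_start, resseq_end): (uniprot_start, uniprot_end), ...}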
    def by_uniprot_id(self, uniprot_id, reduce_chains=False):
        """
        Find structures and mapping by Uniprot accession number.

        Parameters
        ----------
        uniprot_id : str
            Find PDB structures for this Uniprot accession
            number. If sequence_file was given while creating
            the SIFTS object, Uniprot identifiers can also be
            used.
        reduce_chains : bool, optional (default: False)
            If True, keep only the first chain per PDB ID
            (i.e. remove redundant occurrences of the same
            protein in PDB structures). Should be set to False
            to identify homomultimeric contacts.

        Returns
        -------
        SIFTSResult
            Record of hits and mappings found for this
            Uniprot protein. See by_pdb_id() for detailed
            explanation of fields.
        """
        query = "uniprot_ac == @uniprot_id"

        if "uniprot_id" in self.table.columns:
            query += " or uniprot_id == @uniprot_id"

        x = self.table.query(query)

        hit_table = self._finalize_hits(x)

        # only retain one chain if this option is active
        if reduce_chains:
            hit_table.hits = hit_table.hits.groupby(
                "pdb_id"
            ).first().reset_index()

        return hit_table
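
    # Example (hedged usage sketch, not part of the original module; the
    # accession is a placeholder):
    #
    #     result = s.by_uniprot_id("P0A7Y4", reduce_chains=True)
    #     # one row per PDB entry containing the protein, plus segment mappings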
    def by_alignment(self, min_overlap=20, reduce_chains=False, **kwargs):
        """
        Find structures by sequence alignment between
        query sequence and sequences in PDB.

        Parameters
        ----------
        min_overlap : int, optional (default: 20)
            Require at least this many aligned positions
            with the target structure
        reduce_chains : bool, optional (default: False)
            If True, keep only the first chain per PDB ID
            (i.e. remove redundant occurrences of the same
            protein in PDB structures). Should be set to False
            to identify homomultimeric contacts.
        **kwargs
            Defines the behaviour of the find_homologs() function
            used to find homologs by sequence alignment:

            - which alignment method is used
              (pdb_alignment_method: {"jackhmmer", "hmmsearch"},
              default: "jackhmmer"),
            - parameters passed into the protocol for the selected
              alignment method (evcouplings.align.jackhmmer_search or
              evcouplings.align.hmmbuild_and_search).

            Default parameters are set in the HMMER_CONFIG string in
            this module; other parameters will need to be overridden.
            These minimally are:

            - for pdb_alignment_method == "jackhmmer":
              - sequence_id : str, identifier of target sequence
              - jackhmmer : str, path to jackhmmer binary if not on path
            - for pdb_alignment_method == "hmmsearch":
              - sequence_id : str, identifier of target sequence
              - raw_focus_alignment_file : str, path to input alignment file
              - hmmbuild : str, path to hmmbuild binary if not on path
              - hmmsearch : str, path to search binary if not on path

            Additionally, if "prefix" is given, individual mappings will
            be saved to files suffixed by the respective key in the
            mapping table.

        Returns
        -------
        SIFTSResult
            Record of hits and mappings found for this
            query sequence by alignment. See by_pdb_id()
            for detailed explanation of fields.
        """
        def _create_mapping(r):
            _, query_start, query_end = parse_header(ali.ids[0])

            # create mapping from query into PDB Uniprot sequence
            # A_i will be query sequence indices, A_j Uniprot sequence indices
            m = map_indices(
                ali[0], query_start, query_end,
                ali[r["alignment_id"]],
                r["alignment_start"], r["alignment_end"]
            )

            # create mapping from PDB Uniprot into seqres numbering
            # j will be Uniprot sequence index, k seqres index
            n = pd.DataFrame(
                {
                    "j": list(range(r["uniprot_start"], r["uniprot_end"] + 1)),
                    "k": list(range(r["resseq_start"], r["resseq_end"] + 1)),
                }
            )

            # need to convert to strings since other mapping has indices as strings
            n.loc[:, "j"] = n.j.astype(str)
            n.loc[:, "k"] = n.k.astype(str)

            # join over Uniprot indices (i.e. j);
            # get rid of any position that is not aligned
            mn = m.merge(n, on="j", how="inner").dropna()

            # extract final mapping from seqres (k) to query (i)
            map_ = dict(
                zip(mn.k, mn.i)
            )

            return map_, mn

        if self.sequence_file is None:
            raise ValueError(
                "Need to have SIFTS sequence file. "
                "Create using create_sequence_file() "
                "method or constructor."
            )

        ali, hits = find_homologs(
            sequence_database=self.sequence_file,
            **kwargs
        )

        # merge with internal table to identify overlap of
        # aligned regions and regions with structural coverage
        hits = hits.merge(
            self.table, on="uniprot_ac", suffixes=("", "_")
        )

        # add 1 to end of range since overlap function treats
        # ends as exclusive, while ends here are inclusive
        hits.loc[:, "overlap"] = [
            range_overlap(
                (r["uniprot_start"], r["uniprot_end"] + 1),
                (r["alignment_start"], r["alignment_end"] + 1)
            ) for i, r in hits.iterrows()
        ]

        # collect complete index mappings in here...
        mappings = {}
        # ... as well as dataframe rows for assignment of hit to mapping
        mapping_rows = []

        # complication: if there are multiple segments per hit and chain, we should
        # reduce these into a single mapping (even though split mappings
        # are possible in principle) so we can count unique number of hits etc.
        hit_columns = ["alignment_id", "pdb_id", "pdb_chain"]

        for i, (hit, grp) in enumerate(
            hits.groupby(hit_columns)
        ):
            agg_mapping = {}
            agg_df = pd.DataFrame()

            # go through each segment
            for j, r in grp.iterrows():
                # compute mapping for that particular segment
                map_j, map_j_df = _create_mapping(r)

                # add to overall mapping dictionary for this hit
                agg_mapping.update(map_j)
                agg_df = agg_df.append(map_j_df)

            # store assignment of group to mapping index
            mapping_rows.append(
                list(hit) + [i, len(grp) > 1]
            )

            mappings[i] = agg_mapping

            # store index mappings if filename prefix is given
            prefix = kwargs.get("prefix", None)
            if prefix is not None:
                agg_df = agg_df.rename(
                    columns={
                        "j": "uniprot_of_pdb_index",
                        "A_j": "uniprot_of_pdb_residue",
                        "k": "pdb_seqres_index",
                    }
                )
                agg_df.to_csv(
                    "{}_mapping{}.csv".format(prefix, i), index=False
                )

        # create dataframe from mapping rows
        mapping_df = pd.DataFrame(
            mapping_rows, columns=hit_columns + [
                "mapping_index", "grouped_segments",
            ]
        )

        # split residue name into residue number and (potential) insertion code
        # for proper sorting
        def _split_insertion_code(res):
            # should not happen but just to be safe
            assert len(res) >= 1

            # check if we have an insertion code or not
            if res[-1].isalpha():
                return int(res[:-1]), res[-1]
            else:
                return int(res), ""

        # now group again, to aggregate full hit dataframe;
        # note that coord_start and coord_end are strings that may contain
        # insertion codes so need to treat accordingly
        def _agg_type(x):
            if x == "overlap":
                return "sum"
            elif x == "coord_start":
                return lambda l: sorted(l, key=_split_insertion_code)[0]
            elif x == "coord_end":
                return lambda l: sorted(l, key=_split_insertion_code)[-1]
            elif x.endswith("_start"):
                return "min"
            elif x.endswith("_end"):
                return "max"
            else:
                return "first"

        agg_types = OrderedDict(
            [(c, _agg_type(c)) for c in hits.columns if c not in hit_columns]
        )

        # only aggregate if we have anything to aggregate,
        # otherwise pandas drops the index columns
        # alignment_id, pdb_id, pdb_chain and things go
        # horribly wrong in the following join
        if len(hits) > 0:
            hits_grouped = hits.groupby(
                hit_columns
            ).agg(agg_types).reset_index()
        else:
            hits_grouped = hits

        # join with mapping information
        hits_grouped = hits_grouped.merge(
            mapping_df, on=hit_columns
        )

        # remove hits with too little residue coverage
        hits_grouped = hits_grouped.query("overlap >= @min_overlap")

        hits_grouped.loc[:, "bitscore"] = pd.to_numeric(
            hits_grouped.loc[:, "bitscore"], errors="coerce"
        )
        hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False)

        # if requested, only keep one chain per PDB;
        # sort by score before this to keep best hit
        if reduce_chains:
            hits_grouped = hits_grouped.groupby("pdb_id").first().reset_index()
            # sort again, just to be sure...
            hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False)

        # remove any zombie mappings we did not keep in table
        mappings = {
            idx: map_ for idx, map_ in mappings.items()
            if idx in hits_grouped.mapping_index.values
        }

        return SIFTSResult(hits_grouped, mappings)
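
# Example (hedged usage sketch, not part of the original module; identifiers,
# paths and binary locations are placeholders, and the SIFTS object must have
# been created with, or given, a sequence_file):
#
#     result = s.by_alignment(
#         min_overlap=30,
#         reduce_chains=True,
#         sequence_id="RNH_ECOLI",
#         sequence_file="rnh_ecoli.fasta",
#         jackhmmer="jackhmmer",
#         prefix="output/rnh_pdb_search"
#     )
#     # result.hits is sorted by bitscore; result.mapping[i] maps PDB seqres
#     # positions to positions in the query sequence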