reciprologs

#!/usr/bin/env python3  
# the above sources Python from $PATH
##!/usr/local/bin/python3
##!/usr/bin/python3
# the above uses specific Python version; allows script name in top

# authorship information
__author__ = 'Graham E. Larue'
__maintainer__ = "Graham E. Larue"
__email__ = 'egrahamlarue@gmail.com'
__license__ = 'GPL'

"""
usage: reciprologs [-h] [-p PARALLEL_PROCESSES] [-q PERCENTAGE] [--chain]
                   [--subset subset_1 [subset_2 ...]] [--ignore_same_id]
                   [--ignore_same_prefix <prefix_delimiter>] [-o [OUTPUT]]
                   [-d path [path ...]] [-b BLAST_FILE] [--overwrite]
                   [--one_to_one] [--logging] [--no_hash_tag]
                   file_1 file_2 ... [file_1 file_2 ... ...]
                   {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx}

Find reciprocal best hits between two or more files. Any unrecognized
arguments will be passed along to the chosen alignment program.

positional arguments:
  file_1 file_2 ...     files to use to build reciprolog sets (space
                        separated)
  {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx}
                        type of alignment program to run

optional arguments:
  -h, --help            show this help message and exit
  -p PARALLEL_PROCESSES, --parallel_processes PARALLEL_PROCESSES
                        run the alignment step using multiple parallel
                        processes (default: 1)
  -q PERCENTAGE, --query_percentage_threshold PERCENTAGE
                        require a specified fraction of the query length to
                        match in order for a hit to qualify (lowest
                        allowable percentage) (default: None)
  --chain               cluster reciprologs without requiring all-by-all
                        pairwise relationships, e.g. A-B, A-C, A-D --> A-B-
                        C-D (default: False)
  --subset subset_1 [subset_2 ...]
                        Files containing subsets of headers to be used as
                        queries for each input file. Supplied in the same
                        order as the input files; one header per line. To
                        omit a subset file for a given input file, provide
                        "." as the argument, e.g. for three input files
                        with only 1 & 3 with subsets: --subset subset_1 .
                        subset_2 (default: None)
  --ignore_same_id      ignore hits where both query and subject have
                        identical IDs (default: False)
  --ignore_same_prefix <prefix_delimiter>
                        ignore hits where both query and subject have
                        identical prefixes, where the prefix for each ID is
                        delimited by the specified <prefix_delimiter>
                        (default: None)
  -o [OUTPUT], --output [OUTPUT]
                        output filename (use flag without argument for
                        auto-naming) (default: stdout)
  -d path [path ...], --alignment_source_directory path [path ...]
                        check for existing alignment files to use in this
                        directory first (default: None)
  -b BLAST_FILE, --blast_file BLAST_FILE
                        aggregated BLAST output to use (both directions)
                        (default: None)
  --overwrite           overwrite existing output files (instead of using
                        them to bypass alignment step) (default: False)
  --one_to_one          remove any many-to-one reciprolog relationships in
                        each pairwise set, such that each member of each
                        pairwise comparison is only present exactly one
                        time in output (default: False)
  --logging             output a log of best-hit choice criteria (default:
                        False)
  --no_hash_tag         do not auto-tag output files with MD5 hashes of
                        source files (default: False)

NOTE: Depends on palign

"""
import sys
import subprocess
import os
import time
import argparse
import re
import shutil
from operator import itemgetter
from multiprocessing import cpu_count
from collections import defaultdict
from itertools import combinations, permutations
from biogl import fasta_parse, get_runtime
from hashlib import md5

# use networkx library for fast ortholog clustering if available
try:
    import networkx as nx
    USE_GRAPH = True
except ModuleNotFoundError:
    USE_GRAPH = False


def parse_blast_line(bl, *args):
    """
    Pulls selected fields out of one line of tab-separated BLAST
    output. Names accepted in $args: query, subject, length, e, bitscore

    When exactly one field is requested its bare value is returned;
    otherwise a list of values (possibly empty) is returned in the
    order the fields were requested.

    """
    fields = bl.strip().split("\t")
    # zero-based columns 0, 1, 3, 10 and 11 of the tabular output
    query, subject = fields[0], fields[1]
    length, e_value, bitscore = fields[3], fields[10], fields[11]
    lookup = {
        "query": query,
        "subject": subject,
        # original author's note: lengths seem to be off by 1 in BLAST output
        "length": int(length) - 1,
        "e": float(e_value),
        "bitscore": float(bitscore)
    }
    selected = [lookup[name] for name in args]

    return selected[0] if len(selected) == 1 else selected


def is_better(challenger, defender, seq_lengths=None):
    """
    Compares attributes of two dictionaries of BLAST
    hits for a given query to determine which is better.

    Criteria are applied in order, each one only being consulted when
    the previous one is an exact tie:
        1. bitscore ('score' key, higher wins)
        2. e-value ('evalue' key, lower wins)
        3. subject length, looked up by 'name' in >seq_lengths< (longer
           wins); skipped when >seq_lengths< is None or either name is
           missing from it

    Returns a tuple of (challenger, reason) if the challenger is
    better, where reason is one of 'bitscore', 'e-value' or 'length';
    otherwise returns False.

    """
    challenger_score = challenger['score']
    defender_score = defender['score']
    # criteria: bitscore
    if challenger_score > defender_score:
        return challenger, 'bitscore'
    if challenger_score != defender_score:
        # lower (or incomparable) bitscore never wins
        return False

    # criteria --> e-value (lower is better)
    challenger_evalue = challenger['evalue']
    defender_evalue = defender['evalue']
    if challenger_evalue < defender_evalue:
        return challenger, 'e-value'
    if challenger_evalue != defender_evalue:
        # a worse e-value must not be rescued by the length tiebreaker
        return False

    # criteria --> length; only usable if subject lengths were provided
    if seq_lengths is None:
        return False
    try:
        challenger_is_longer = (
            seq_lengths[challenger['name']] > seq_lengths[defender['name']]
        )
    except KeyError:
        return False
    if challenger_is_longer:
        return challenger, 'length'

    return False


def get_prefix(seq_id, delimiter):
    """
    Returns the first non-empty piece of >seq_id< after splitting it
    once on the regular expression >delimiter<.

    """
    pieces = re.split(delimiter, seq_id, maxsplit=1)
    # drop empty pieces, e.g. from an ID that starts with the delimiter
    non_empty = list(filter(None, pieces))

    return non_empty[0]


def get_top_hits(
    blast,
    paralogs=False,
    query_match=None,
    seq_lengths=None,
    ignore_same_id=False,
    ignore_same_prefix=False,
    query_list=None):
    """
    Scans the tabular alignment file >blast< and keeps the single best
    hit for every query, as judged by is_better().

    Arguments:
        blast: path to the alignment output file; lines starting with
            '#' and blank lines are skipped
        paralogs: if True, hits where query == subject are skipped
        query_match: optional dictionary of {query: length} which also
            contains a 'query_match_threshold' key; hits whose aligned
            length is a smaller percentage of the query length than
            that threshold are skipped
        seq_lengths: optional dictionary of subject lengths, passed to
            is_better() for tie-breaking
        ignore_same_id: if True, hits where query == subject are skipped
        ignore_same_prefix: delimiter (regular expression) used to
            extract a prefix from each ID via get_prefix(); hits whose
            query and subject prefixes match are skipped. Both None and
            False disable this filter.
        query_list: optional collection of query IDs; when non-empty,
            lines for any other query are skipped

    Returns a tuple of (results, win_ledger), where results maps each
    query to a dictionary with keys 'name', 'score', 'evalue' and
    'length' describing its best hit, and win_ledger maps each query
    whose best hit was replaced at least once to its current 'best'
    subject and a set of 'losers' as (subject, reason) tuples.

    """
    results = {}
    # dictionary to store tie-broken matches
    win_ledger = defaultdict(lambda: defaultdict(set))
    # the signature default is False while callers may pass None; treat
    # both as "filter disabled" so False is never used as a delimiter
    filter_prefixes = (
        ignore_same_prefix is not None and ignore_same_prefix is not False)
    with open(blast) as blst:
        for l in blst:
            if l.startswith("#") or not l.strip():
                continue
            (q, s, score, length, evalue) = parse_blast_line(
                l, "query", "subject", "bitscore", "length", "e")
            if query_list and q not in query_list:
                continue

            # do not consider hits to self if BLASTing against self,
            # but allow query/subject names to be the same
            if q == s and (paralogs is True or ignore_same_id is True):
                continue
            if filter_prefixes:
                prefix = ignore_same_prefix
                if get_prefix(q, prefix) == get_prefix(s, prefix):
                    continue
            if query_match:
                # use query_match dictionary to compare query lengths to
                # match lengths to exclude matches where query percentage
                # is below query_match_threshold key
                fraction = (length / query_match[q]) * 100
                if fraction < query_match['query_match_threshold']:
                    continue

            challenger = {
                'name': s,
                'score': score,
                'evalue': evalue,
                'length': length
            }
            if q not in results:
                # first qualifying hit for this query
                results[q] = challenger
                continue

            defender = results[q]
            challenger_wins = is_better(challenger, defender, seq_lengths)
            if challenger_wins:  # new hit is better
                reason = challenger_wins[1]
                win_ledger[q]['losers'].add((defender['name'], reason))
                win_ledger[q]['best'] = s
                results[q] = challenger

    return results, win_ledger


def get_reciprocals(d1, d2, tag1, tag2):
    """
    Finds reciprocal best hits between two dictionaries of top
    BLAST hits.

    Each result is a tuple of the two (tag, name) members in sorted
    order followed by a tuple of their two bitscores (also sorted).
    The results are returned as a sorted list with duplicates removed.

    """
    found = set()
    # check both directions, as the original permutation-based loop did
    directions = (
        ((d1, tag1), (d2, tag2)),
        ((d2, tag2), (d1, tag1)),
    )
    for (queries, query_tag), (subjects, subject_tag) in directions:
        for query_id, hit in queries.items():
            subject_id = hit["name"]
            forward_score = hit["score"]
            if subject_id not in subjects:
                continue
            reverse_hit = subjects[subject_id]
            if reverse_hit["name"] != query_id:
                # best hit does not refer back to the query
                continue
            reverse_score = reverse_hit["score"]
            members = sorted(
                [(query_tag, query_id), (subject_tag, subject_id)])
            scores = tuple(sorted([forward_score, reverse_score]))
            found.add((*members, scores))

    return sorted(found)


def clean_reciprologs(reciprologs, subset_index=None):
    """
    Strips the file tags from every (tag, name) member of each group.

    If >subset_index< is provided (and non-empty), groups in which no
    member's name is found in the subset for that member's tag are
    dropped.

    """
    def touches_subset(group):
        return any(
            member[1] in subset_index[member[0]] for member in group)

    return [
        [member[1] for member in group]  # remove file tags from tuples
        for group in reciprologs
        if not subset_index or touches_subset(group)
    ]


def file_md5(fn, buffer_size=65536):
    """
    Returns the hexadecimal MD5 digest of the contents of file >fn<,
    reading it in binary chunks of >buffer_size< bytes.

    """
    digest = md5()
    with open(fn, 'rb') as handle:
        # read() returns an empty bytes object once the file is exhausted
        for chunk in iter(lambda: handle.read(buffer_size), b''):
            digest.update(chunk)

    return digest.hexdigest()


def abbreviate(name, delimiter=".", use_hash=True, keep_path=False):
    """
    Shortens a filename to the portion of its basename preceding the
    first >delimiter<.

    With >use_hash<, the delimiter and the first five characters of the
    file's MD5 hash are appended. With >keep_path<, the absolute path
    of the file's parent directory is prepended.

    """
    # basename in case of non-local file path
    short = os.path.basename(name).split(delimiter, 1)[0]
    if use_hash is True:  # use shortened md5 hash to uniqueify name
        digest = file_md5(name)[:5]
        short = short + delimiter + digest

    if keep_path is True:
        parent = os.path.dirname(os.path.abspath(name))
        short = os.path.join(parent, short)

    return short


def unique_filenames(*file_list, skip=None, use_hash=True, keep_path=False):
    """
    Returns a list of short names, one per file in >file_list<.

    When >skip< is provided, files not listed in it are shortened with
    abbreviate() and the rest keep their basenames; if the resulting
    names are not all unique, plain basenames are used for every file.
    When >skip< is None, plain basenames are used throughout.

    With >keep_path<, each name is prefixed with the directory portion
    of the corresponding input path.

    """
    basenames = [os.path.basename(f) for f in file_list]
    if skip is None:
        names = basenames
    else:
        names = [
            base if f in skip else abbreviate(f, use_hash=use_hash)
            for f, base in zip(file_list, basenames)
        ]
        if len(set(names)) != len(names):  # not all are unique
            names = basenames

    if keep_path is True:  # add parent directory paths
        names = [
            os.path.join(os.path.dirname(f), short)
            for f, short in zip(file_list, names)
        ]

    return names


def concatenate(outname, file_list, clean=True):
    """
    Writes the contents of every file in >file_list<, in order, to
    >outname<. If >clean< is truthy, the source files are deleted
    afterward.

    """
    with open(outname, 'w') as merged:
        for source in file_list:
            with open(source) as part:
                merged.writelines(part)
    if not clean:
        return
    for source in file_list:
        os.remove(source)


def parse_run_type(align_type_arg):
    """
    Translates a run-type argument into a tuple of
    (aligner family, alignment mode), e.g. 'diamondp' becomes
    ('diamond', 'blastp'). Raises KeyError for unknown run types.

    """
    blast_modes = ('blastn', 'blastp', 'blastx', 'tblastn', 'tblastx')
    type_map = {mode: ('blast', mode) for mode in blast_modes}
    type_map['diamondp'] = ('diamond', 'blastp')
    type_map['diamondx'] = ('diamond', 'blastx')

    return type_map[align_type_arg]


def aggregate_dict_chained(ortho_dict):
    """
    Collapses a dictionary of pairwise relationships into chained
    clusters, repeating merge passes until no value in the result is
    itself a key of the result.

    Each cluster is keyed by the first of its members encountered when
    iterating over >ortho_dict<, and that key is excluded from its own
    value set.

    IN (keys shown in iteration order):
    defaultdict(set,
            {'a': {'b', 'c', 'd'},
             'b': {'a', 'c', 'e', 'f'},
             'c': {'a', 'b', 'e', 'f', 'g'},
             'd': {'a'},
             'e': {'b', 'c'},
             'f': {'b', 'c'},
             'g': {'c'},
             'z': {'w', 'x', 'y'},
             'w': {'z'},
             'x': {'z'},
             'y': {'z'}})
    OUT:
    defaultdict(set, {'a': {'b', 'c', 'd', 'e', 'f', 'g'}, 'z': {'w', 'x', 'y'}})

    """
    while True:
        changed = False
        # a set keeps the membership checks below constant-time
        processed = set()
        master = defaultdict(set)
        for k, v in ortho_dict.items():
            if k in processed:
                continue
            processed.add(k)
            for v2 in v:
                if v2 == k:
                    continue
                master[k].add(v2)
                processed.add(v2)
                if v2 not in ortho_dict:
                    continue
                # v2 has relationships of its own: pull them in and
                # schedule another pass to resolve any new members
                changed = True
                master[k].update(ortho_dict[v2])
        if not changed:
            return master
        ortho_dict = master


def aggregate_orthos_chained(orthos, use_graph=False):
    """
    Clusters pairwise relationships into chained groups, either via
    graph_cluster() (when >use_graph< is truthy) or via
    make_ortho_dict() and aggregate_dict_chained().

    IN:
    [
        [('a', 'b'), ('a', 'c'), ('a', 'd')],
        [('b', 'c'), ('b', 'e'), ('b', 'f')],
        [('c', 'e'), ('c', 'f'), ('c', 'g')],
        [('z', 'x'), ('z', 'y'), ('z', 'w')]
    ]
    OUT:
    [['a', 'b', 'c', 'd', 'e', 'f', 'g'], ['w', 'x', 'y', 'z']]

    """
    if use_graph:
        return sorted(graph_cluster(orthos, chain=True))

    merged = aggregate_dict_chained(make_ortho_dict(*orthos))

    # each cluster is its key plus everything chained to that key
    return sorted(
        sorted([*members, anchor]) for anchor, members in merged.items()
    )


def aggregate_orthos_strict(orthos, use_graph=False):
    """
    Clusters pairwise relationships into groups, either via
    graph_cluster() (only when >use_graph< is exactly True) or via
    make_ortho_dict() and all_by_all_orthos(). The ordering of the
    returned groups is whatever the chosen helper produces.

    IN:
    [
        [('a', 'b'), ('a', 'c'), ('a', 'd')],
        [('b', 'c'), ('b', 'e'), ('b', 'f')],
        [('c', 'e'), ('c', 'f'), ('c', 'g')],
        [('z', 'x'), ('z', 'y'), ('z', 'w')]
    ]
    OUT:
    [
        ['x', 'z'],
        ['y', 'z'],
        ['w', 'z'],
        ['a', 'd'],
        ['c', 'g'],
        ['b', 'c', 'f'],
        ['b', 'c', 'e'],
        ['a', 'b', 'c']
    ]

    """
    if use_graph is not True:
        return all_by_all_orthos(make_ortho_dict(*orthos))

    return graph_cluster(orthos)


def all_by_all_orthos(ortho_dict):
    """
    Builds groups in which every member is related to every other
    member according to >ortho_dict< (as judged by
    every_member_match()), skipping any group already contained in a
    previously accepted group.

    Returns a sorted list of sorted member lists.

    """
    accepted = []
    for anchor, partners in ortho_dict.items():
        # go backward in size and cull subsets as we go
        for size in reversed(range(1, len(partners) + 1)):
            for subset in combinations(partners, size):
                candidate = {*subset, anchor}
                if len(candidate) == 1 or candidate in accepted:
                    continue
                if not every_member_match(candidate, ortho_dict):
                    continue
                if any(group.issuperset(candidate) for group in accepted):
                    continue
                accepted.append(candidate)

    return sorted(sorted(group) for group in accepted)


def every_member_match(members, m_dict):
    """
    Returns True if every member of >members< is present in the
    >m_dict< entry of every other member, otherwise False.

    Also returns True as soon as a member is found that has no other
    members to compare against (e.g. a single-member group).

    """
    for member in members:
        partners = [other for other in members if other != member]
        if not partners:
            return True
        for partner in partners:
            if member not in m_dict[partner]:
                return False

    return True


def make_ortho_dict(*orthos):
    """
    Builds a two-way lookup from lists of pairs, such that each member
    of a pair maps to the set of all members it was paired with.

    IN:
    [
        [('a', 'b'), ('a', 'c'), ('a', 'd')],
        [('b', 'c'), ('b', 'e'), ('b', 'f')],
        [('c', 'e'), ('c', 'f'), ('c', 'g')],
        [('z', 'x'), ('z', 'y'), ('z', 'w')]
    ]
    OUT:
    defaultdict(set,
            {'a': {'b', 'c', 'd'},
             'b': {'a', 'c', 'e', 'f'},
             'c': {'a', 'b', 'e', 'f', 'g'},
             'd': {'a'},
             'e': {'b', 'c'},
             'f': {'b', 'c'},
             'g': {'c'},
             'w': {'z'},
             'x': {'z'},
             'y': {'z'},
             'z': {'w', 'x', 'y'}})

    """
    collector = defaultdict(set)
    # every ordered combination of two members within each pair
    links = (
        link
        for o_list in orthos
        for pair in o_list
        for link in permutations(pair, 2)
    )
    for source, target in links:
        collector[source].add(target)

    return collector


def names_from_blastfile(blast_fn):
    """
    Extracts the query and subject names from an alignment filename of
    the form <query>-vs-<subject>.<run type>, using the first match of
    the filename pattern.

    """
    file_pattern = re.compile(r'(.+)-vs-(.+)\.t?blast[npx]')
    first_match = file_pattern.findall(blast_fn)[0]
    query_fn, subject_fn = first_match

    return query_fn, subject_fn


def make_subset(fasta, output_fn, keep_file=None, keep_list=None):
    """
    Writes the records of >fasta< whose header (up to the first
    whitespace) is in the keep set to >output_fn<.

    The keep set is the union of >keep_list< and the stripped lines of
    >keep_file<. If that union has more entries than >keep_file<
    provided, >keep_file< is rewritten with the combined set. Exits
    with status 1 if neither >keep_file< nor >keep_list< is provided.

    Returns a tuple of (output_fn, number of records written).

    """
    if not keep_file and not keep_list:
        print(
            '[!] Cannot make subset for {} - aborting'.format(fasta),
            file=sys.stderr
        )
        sys.exit(1)

    headers_from_list = set()
    headers_from_file = set()
    if keep_list is not None:
        for header in keep_list:
            headers_from_list.add(header)
    if keep_file is not None:
        with open(keep_file) as listing:
            for line in listing:
                headers_from_file.add(line.strip())

    # get the union of the two sets of headers to include any new additions
    wanted = headers_from_list | headers_from_file

    # write combined set to new <keep_file> if there are entries in
    # <keep_list> that were not in the original <keep_file>
    added = len(wanted) - len(headers_from_file)
    if keep_file is not None and added > 0:
        print(
            '[#] Updating {} with {} new entries'.format(keep_file, added),
            file=sys.stderr
        )
        with open(keep_file, 'w') as listing:
            for header in wanted:
                listing.write(header + '\n')

    written = 0
    with open(output_fn, 'w') as out:
        for header, seq in fasta_parse(fasta, trim_header=False):
            if header.split()[0] not in wanted:
                continue
            out.write('>{}\n{}\n'.format(header, seq))
            written += 1

    return output_fn, written


def list_hash(string_list, length=3, sort_first=True):
    """
    Returns the first >length< characters of the hexadecimal MD5
    digest of all strings in >string_list<, fed to the hash in sorted
    order when >sort_first< is True and in the given order otherwise.

    """
    ordered = sorted(string_list) if sort_first is True else string_list
    digest = md5()
    for item in ordered:
        digest.update(item.encode())

    return digest.hexdigest()[:length]
    

def subset_name(fn, file_tag='subset', use_hash=True, keep_path=False):
    """
    Returns a FASTA filename built from the abbreviated form of >fn<,
    followed by '_<file_tag>' when >file_tag< is a non-empty string,
    and ending in '.fa'.

    """
    suffix = ''
    if type(file_tag) is str and file_tag:
        suffix = '_{}'.format(file_tag)
    stem = abbreviate(fn, use_hash=use_hash, keep_path=keep_path)

    return '{}{}.fa'.format(stem, suffix)


def align(aligner, query, subject, run_type, output_name, extra_args=None):
    """
    Runs >aligner< as a subprocess with the query, subject and run
    type as positional arguments, followed by '--output_name' and
    >output_name<, followed by any >extra_args<.

    Returns the result object produced by subprocess.run().

    """
    command = [
        aligner, query, subject, run_type, '--output_name', output_name
    ]
    if extra_args is not None:
        command = command + extra_args

    return subprocess.run(command)


def alignment_filenames(query, subject, run_type, use_hash=True):
    """
    Returns a dictionary with the 'forward' (query vs. subject) and
    'reverse' (subject vs. query) alignment output filenames for the
    given run type.

    """
    # when subsetting is used, this hack will produce more obvious filenames
    # that reflect which alignments were of subsets and which weren't
    special_case_tags = ['_residual', '_subset']
    no_abbrev = [
        f for f in (query, subject)
        if any(tag in f for tag in special_case_tags)
    ]

    name_template = '{}-vs-{}.{}'
    forward = unique_filenames(
        query, subject, skip=no_abbrev, use_hash=use_hash)
    reverse = unique_filenames(
        subject, query, skip=no_abbrev, use_hash=use_hash)

    return {
        'forward': name_template.format(*forward, run_type),
        'reverse': name_template.format(*reverse, run_type),
    }


def seq_lengths(fasta):
    """
    Returns a dictionary mapping each header yielded by fasta_parse()
    for >fasta< to the length of its sequence.

    """
    return {header: len(seq) for header, seq in fasta_parse(fasta)}


def remove_many_to_one(pairs):
    """
    Each element of >pairs< is a tuple: (hitA, hitB, (scoreX, scoreY))

    Takes a list of paired reciprocal hits (plus scores) and filters it
    such that each member of each pair only occurs once, i.e. it removes
    any many-to-one hits, using the bitscores in the last element of
    each tuple.

    Pairs are considered from the highest average score to the lowest
    (earlier pairs first on ties) and a pair is kept only if neither of
    its members belongs to an already-kept pair. Only kept pairs can
    exclude other pairs, so a pair which loses out does not go on to
    eliminate unrelated lower-scoring pairs. Pairs whose two members
    are identical are always removed.

    Returns the kept pairs in their original order; each removed pair
    is reported on stderr.

    """
    pairs = list(pairs)
    averages = [sum(scores) / 2 for _, _, scores in pairs]
    # sorted() is stable (also with reverse=True), so for equal scores
    # the pair appearing first in >pairs< is considered first
    ranked = sorted(
        range(len(pairs)), key=lambda index: averages[index], reverse=True)

    claimed = set()
    kept = set()
    for index in ranked:
        a, b = pairs[index][0], pairs[index][1]
        if a == b or a in claimed or b in claimed:
            continue
        claimed.update((a, b))
        kept.add(index)

    filtered = []
    for index, pair in enumerate(pairs):
        if index in kept:
            filtered.append(pair)
        else:
            # str() so that non-string members cannot break the message
            names = [str(name) for name in pair[0:2]]
            print('Removed: {}'.format('\t'.join(names)), file=sys.stderr)

    return filtered


def graph_cluster(pairwise_sets, chain=False):
    """
    Builds a networkx graph with one edge per pair in >pairwise_sets<
    and clusters it using nx.connected_components() when >chain< is
    truthy, or nx.find_cliques() otherwise.

    Returns a list of sorted member lists, ordered by cluster size.

    """
    graph = nx.Graph()
    all_pairs = (pair for p_set in pairwise_sets for pair in p_set)
    for pair in all_pairs:
        graph.add_edge(*pair)

    find_clusters = nx.connected_components if chain else nx.find_cliques
    clusters = [sorted(members) for members in find_clusters(graph)]
    clusters.sort(key=len)

    return clusters


def subset_size_check(input_file, subset, kept_n):
    """
    Reports the number of records kept in a subset on stderr, or
    exits with status 1 if the subset is empty.

    Arguments:
        input_file: name of the source file, used in the size report
        subset: name of the subset, used in the abort message
        kept_n: number of records in the subset

    """
    if kept_n == 0:
        # diagnostics go to stderr so they cannot end up in results
        # written to stdout
        print(
            '[!] Subset size of {} = 0; aborting.'.format(subset),
            file=sys.stderr
        )
        sys.exit(1)

    print(
        '[#] Subset size for {}: {}'.format(input_file, kept_n),
        file=sys.stderr
    )


def log_ledger(q, s, run_type, win_ledger, use_hash=True):
    """
    Writes the contents of >win_ledger< to a log file named after the
    abbreviated query and subject filenames and the run type.

    For each query (in sorted order), a header line with the winning
    subject and the query is written, followed by one tab-separated
    line per entry in that query's 'losers' set.

    """
    query_tag = abbreviate(q, use_hash=use_hash)
    subject_tag = abbreviate(s, use_hash=use_hash)
    ledger_file = '{}-{}.{}.log'.format(query_tag, subject_tag, run_type)
    with open(ledger_file, 'w') as log:
        for query in sorted(win_ledger):
            info = win_ledger[query]
            winner = info['best']
            losers = info['losers']
            log.write('>{}\t[{}]\n'.format(winner, query))
            for loser in sorted(losers):
                log.write('\t'.join(loser) + '\n')


def get_alignments(
    pairs_index,
    aligner,
    run_type,
    extra,
    overwrite=False,
    file_dirs=None,
    blast_file=None,
    use_hash=True
):
    """
    <pairs_index> is a dictionary of the form {label: (q, s), ...}

    Returns a dictionary of {label: alignment filename}. If
    <blast_file> is provided, it is used as the alignment file for
    every label. Otherwise the expected output file is looked for in
    each of <file_dirs> (if any) and then under its bare name; the
    aligner is only run if the file is not found or <overwrite> is
    True. Exits if the aligner returns a non-zero code.

    """
    alignment_index = {}
    for alignment_key, (q, s) in pairs_index.items():
        if blast_file:
            alignment_index[alignment_key] = blast_file
            continue

        names = alignment_filenames(q, s, run_type, use_hash=use_hash)
        align_fn = names['forward']
        # prefer a pre-existing file from the first directory that has one
        for d in file_dirs or []:
            if align_fn in os.listdir(d):
                align_fn = os.path.join(os.path.abspath(d), align_fn)
                break

        must_run = overwrite is True or not os.path.isfile(align_fn)
        if must_run:
            alignment = align(
                aligner, q, s, run_type, align_fn, extra_args=extra)
            if alignment.returncode != 0:
                sys.exit(
                    '[!] ERROR: alignment failed: {} (return code: {})'
                    .format(align_fn, alignment.returncode)
                )
        else:
            print(
                "[#] Using existing output '{}'".format(align_fn),
                file=sys.stderr
            )
        alignment_index[alignment_key] = align_fn

    return alignment_index


def align_residuals(alignments, aligner, residual_index, args, overwrite=False, use_hash=True):
    """
    For every (query, subject) key in >alignments< whose query has an
    entry in >residual_index<, writes a 'residual' subset of the query
    file and aligns it against the subject, unless the output file
    already exists and >overwrite< is not True.

    The run type and extra aligner arguments are read from
    >args.run_type< and >args.extra<. Returns a dictionary of
    {(query, subject): alignment filename} for the processed keys.

    """
    run_type = args.run_type
    extra = args.extra
    alignment_index = {}
    for aln_key in sorted(alignments, key=sorted):
        q, s = aln_key
        if q not in residual_index:
            continue
        r_sub_fn = subset_name(q, file_tag='residual', use_hash=use_hash)
        r_sub, kept_n = make_subset(
            q, r_sub_fn, keep_list=residual_index[q])
        subset_size_check(q, r_sub, kept_n)
        # NOTE(review): the output name is derived from the original
        # query file rather than the residual subset - confirm that
        # this is intended
        names = alignment_filenames(q, s, run_type, use_hash=use_hash)
        align_fn = names['forward']
        must_run = overwrite is True or not os.path.isfile(align_fn)
        if must_run:
            residual_aln = align(
                aligner, r_sub, s, run_type, align_fn, extra_args=extra)
            if residual_aln.returncode != 0:
                sys.exit(
                    '[!] ERROR: alignment failed: {} (return code: {})'
                    .format(align_fn, residual_aln.returncode)
                )
        else:
            print(
                "[#] Using existing output '{}'".format(align_fn),
                file=sys.stderr
            )
        alignment_index[aln_key] = align_fn

    return alignment_index

        
def build_pair_index(
    input_files,
    subset_index,
    subset_tag=None,
    subset_only=False,
    use_hash=True
):
    """
    Builds a dictionary of {(q, s): (query file, s)} for every ordered
    pair of the sorted >input_files<.

    When >subset_index< holds a non-empty subset for q (and q != s),
    the query file is a subset file of q, which is created the first
    time q is encountered. Pairs without such a subset are left out
    entirely if >subset_only< is True.

    """
    pairs = {}
    made = set()
    for q, s in permutations(sorted(input_files), 2):
        alignment_key = (q, s)
        has_subset = subset_index and subset_index.get(q) and q != s
        if not has_subset:
            if subset_only is True:
                continue
            pairs[alignment_key] = (q, s)
            continue

        # subset files are getting tagged twice—probably need to move
        # tagging to a different function or something
        sub_set = subset_index[q]
        hash_tag = f'{subset_tag}_h{list_hash(sub_set)}'
        sub_fn = subset_name(q, file_tag=hash_tag, use_hash=use_hash)

        # only make each subset file once per call to avoid
        # remaking recent files unnecessarily
        if q in made:
            sub_q = sub_fn
        else:
            made.add(q)
            sub_q, kept_n = make_subset(q, sub_fn, keep_list=sub_set)
            subset_size_check(q, sub_q, kept_n)
        pairs[alignment_key] = (sub_q, s)

    return pairs


def get_residual_hits(hit_index, subset_index):
    """
    For every (q, s) pair in >hit_index< where s has a subset in
    >subset_index<, collects the names of the best hits in s that were
    not themselves used as queries in the reverse (s, q) direction.

    Returns a defaultdict of {s: set of residual names}; the running
    size of each set is reported on stderr.

    """
    residual_index = defaultdict(set)
    for (q, s), fwd_hits in hit_index.items():
        if not subset_index or subset_index[s] is None:
            continue
        reverse_queries = set(hit_index[(s, q)])
        forward_subjects = {hit['name'] for hit in fwd_hits.values()}
        residual_index[s].update(forward_subjects - reverse_queries)
        print(f'residual length for {s}: {len(residual_index[s])}', file=sys.stderr)  ###!!!

    return residual_index


def build_hit_indices(alignments, args):
    """
    Builds a dictionary of {(q, s): top hits} by running
    get_top_hits() on the alignment file for every (q, s) key of
    >alignments<.

    Reads >args.query_percentage_threshold<, >args.blast_file<,
    >args.ignore_same_id< and >args.ignore_same_prefix<. If
    >args.blast_file< is set, it is used in place of every individual
    alignment file.

    """
    query_percentage = args.query_percentage_threshold
    blast_file = args.blast_file
    length_index = {}
    hit_index = {}
    ordered = sorted(alignments.items(), key=lambda item: sorted(item[0]))
    for (q, s), aln in ordered:
        source = blast_file if blast_file else aln
        # set flag if both files are the same
        paralogs = q == s
        if paralogs and (q, s) in hit_index:
            continue
        # add the lengths of all sequences to an index for later
        # tie-breaking of best hits
        for fasta in (q, s):
            if fasta in length_index:
                continue
            lengths = seq_lengths(fasta)
            # this is needed for the weird get_top_hits() API - might
            # be better as a separate arg in the future...
            lengths['query_match_threshold'] = query_percentage
            length_index[fasta] = lengths
        # get sets of query IDs to filter alignment lines to
        # relevant hits (matters in case of aggregate alignment file)
        q_list = set(length_index[q])
        q_lengths = length_index[q] if query_percentage is not None else {}
        top_hits, _ = get_top_hits(
            source,
            paralogs,
            query_match=q_lengths,
            seq_lengths=length_index[s],
            ignore_same_id=args.ignore_same_id,
            ignore_same_prefix=args.ignore_same_prefix,
            query_list=q_list
        )
        hit_index[(q, s)] = top_hits

    return hit_index


def get_subset_set(subset_list_file):
    """
    Read >subset_list_file< and return the set of its lines, each with
    leading and trailing whitespace removed.

    """
    with open(subset_list_file) as handle:
        return {line.strip() for line in handle}


# command-line interface; arguments argparse does not recognize are
# passed along to the alignment program (see the parser description)
parser = argparse.ArgumentParser(
    description=(
        'Find reciprocal best hits between two or more files. '
        'Any unrecognized arguments will be passed along to the chosen '
        'alignment program.'),
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# positional arguments
parser.add_argument(
    'input_files',
    metavar='file_1 file_2 ...',
    help='files to use to build reciprolog sets (space separated)',
    nargs='+'
)
parser.add_argument(
    'run_type',
    choices=[
        'diamondp',
        'diamondx',
        'blastn',
        'blastp',
        'blastx',
        'tblastn',
        'tblastx'
    ],
    help='type of alignment program to run'
)

# optional arguments
parser.add_argument(
    '-p',
    '--parallel_processes',
    help=(
        'run the alignment step using multiple parallel processes'),
    type=int,
    default=1
)
parser.add_argument(
    '-q',
    '--query_percentage_threshold',
    metavar='PERCENTAGE',
    help=(
        'require a specified fraction of the query length to match in '
        'order for a hit to qualify (lowest allowable percentage)'),
    type=float,
    default=None
)
parser.add_argument(
    '--chain',
    action='store_true',
    help=(
        'cluster reciprologs without requiring all-by-all pairwise '
        'relationships, e.g. A-B, A-C, A-D --> A-B-C-D')
)
parser.add_argument(
    '--subset',
    metavar=('subset_1', 'subset_2'),
    nargs='+',
    help=(
        'Files containing subsets of headers to be used as queries for each '
        'input file. Supplied in the same order as the input files; one header '
        'per line. To omit a subset file for a given input file, '
        'provide "." as the argument, e.g. for three input files with only 1 & '
        '3 with subsets: --subset subset_1 . subset_2'
    )
)
parser.add_argument(
    '--ignore_same_id',
    action='store_true',
    help='ignore hits where both query and subject have identical IDs'
)
parser.add_argument(
    '--ignore_same_prefix',
    metavar='<prefix_delimiter>',
    help=(
        'ignore hits where both query and subject have identical prefixes, '
        'where the prefix for each ID is delimited by the specified '
        '<prefix_delimiter>')
)
# -o omitted     -> 'stdout'
# -o alone       -> None (triggers auto-naming of the output file)
# -o <filename>  -> that filename
parser.add_argument(
    '-o',
    '--output',
    help=(
        'output filename (use flag without argument for auto-naming)'
    ),
    nargs='?',
    default='stdout'
)
parser.add_argument(
    '-d',
    '--alignment_source_directory',
    help='check for existing alignment files to use in this directory first',
    nargs='+',
    metavar='path'
)
parser.add_argument(
    '-b',
    '--blast_file',
    help='aggregated BLAST output to use (both directions)'
)
parser.add_argument(
    '--overwrite',
    help=(
        'overwrite existing output files '
        '(instead of using them to bypass alignment step)'
    ),
    action='store_true'
)
parser.add_argument(
    '--one_to_one',
    help=(
        'remove any many-to-one reciprolog relationships in each pairwise '
        'set, such that each member of each pairwise comparison is only '
        'present exactly one time in output'),
    action='store_true'
)
parser.add_argument(
    '--logging',
    help='output a log of best-hit choice criteria',
    action='store_true'
)
parser.add_argument(
    '--no_hash_tag',
    help='do not auto-tag output files with MD5 hashes of source files',
    action='store_true'
)

# reference time handed to get_runtime() for the final job report
t_start = time.time()

# arguments the parser does not recognize are collected in EXTRA_ARGS
args, EXTRA_ARGS = parser.parse_known_args()

# input files (at least two are needed to form a pair)
INPUT_FILES = args.input_files
if len(INPUT_FILES) < 2:
    sys.exit('error: too few files specified (need >1)')
SUBSET = args.subset

# alignment settings
ALIGN_PROG = 'palign'
RUN_TYPE = args.run_type
PARALLEL = args.parallel_processes
BLAST_FILE = args.blast_file
ALIGNMENT_SOURCE_DIRS = args.alignment_source_directory
OVERWRITE = args.overwrite

# hit filtering and clustering behaviour
QUERY_PERCENTAGE = args.query_percentage_threshold
IGNORE_SAME_ID = args.ignore_same_id
IGNORE_SAME_PREFIX = args.ignore_same_prefix
ONE_TO_ONE = args.one_to_one
CHAIN = args.chain

# output and reporting
OUTPUT_FILE = args.output
LOGGING = args.logging
USE_HASHES = not args.no_hash_tag

# USE_GRAPH is set outside this section of the file
if not USE_GRAPH:
    print(
        '[!] networkx library not found; will use brute-force method instead',
        file=sys.stderr)

# collect the flags/options to pass to the subsequent subprocess calls;
# >optional< is the same list object as EXTRA_ARGS, so everything added
# here is also visible through EXTRA_ARGS
call_options = {'-p': PARALLEL}
optional = EXTRA_ARGS
for k, v in call_options.items():
    if not v:
        # options without a value are left out
        continue
    optional += [str(k), str(v)]

# if --subset, there must be exactly one subset argument per input file
if SUBSET and len(SUBSET) != len(INPUT_FILES):
    sys.exit(
        '[!] Must supply as many arguments to --subset as there are input files'
    )

# subset_index maps each input file to the set of headers read from its
# subset file; input files whose subset argument is "." keep an empty
# set, and subset_index itself is None when --subset was not used
if SUBSET:
    subset_index = {f: set() for f in INPUT_FILES}
    for f, s in zip(INPUT_FILES, SUBSET):
        if s == '.':
            continue
        subset_index[f] = get_subset_set(s)
else:
    subset_index = None


# keyword options shared by both alignment passes below
alignment_kwargs = {
    'overwrite': OVERWRITE,
    'file_dirs': ALIGNMENT_SOURCE_DIRS,
    'blast_file': BLAST_FILE,
    'use_hash': USE_HASHES,
}

# first pass: pair index built from the input files and any subsets
alignment_pairs = build_pair_index(
    INPUT_FILES,
    subset_index,
    subset_tag='subset',
    subset_only=False,
    use_hash=USE_HASHES
)
alignments = get_alignments(
    alignment_pairs, ALIGN_PROG, RUN_TYPE, EXTRA_ARGS, **alignment_kwargs)

hit_index = build_hit_indices(alignments, args)
residual_index = get_residual_hits(hit_index, subset_index)

# second pass: repeat the alignment and hit indexing for the residual
# sequences reported by get_residual_hits(), then merge the resulting
# hits into the main hit index
if residual_index:
    residual_pairs = build_pair_index(
        INPUT_FILES,
        residual_index,
        subset_tag='residual',
        subset_only=True,
        use_hash=USE_HASHES
    )
    residual_alignments = get_alignments(
        residual_pairs, ALIGN_PROG, RUN_TYPE, EXTRA_ARGS, **alignment_kwargs)

    residual_hit_index = build_hit_indices(residual_alignments, args)

    for k, v in residual_hit_index.items():
        hit_index[k].update(v)

# get all unique pairs (e.g. (0, 1) but not (1, 0)),
# to then iterate over only the forward pairs and get
# reciprocal hits for each pair; the pairs are sorted so that the
# order of the pairwise sets is the same on every run (iteration order
# of a set of string tuples changes with hash randomization)
unique_pairs = sorted({tuple(sorted(e)) for e in hit_index})

pairwise_reciprolog_sets = []
for p in unique_pairs:
    # the reverse-direction key is the same pair in descending order
    rv_key = tuple(sorted(p, reverse=True))
    top_fwd = hit_index[p]
    top_rv = hit_index[rv_key]
    q_name, s_name = p
    reciprolog_set = get_reciprocals(top_fwd, top_rv, q_name, s_name)
    if ONE_TO_ONE:
        reciprolog_set = remove_many_to_one(reciprolog_set)

    # remove score info (keep only the first two fields of each entry)
    reciprolog_set = [tuple(x[0:2]) for x in reciprolog_set]
    pairwise_reciprolog_sets.append(reciprolog_set)

# combine the pairwise sets into groups spanning the input files
if CHAIN:
    reciprologs = aggregate_orthos_chained(pairwise_reciprolog_sets, USE_GRAPH)
else:
    reciprologs = aggregate_orthos_strict(pairwise_reciprolog_sets, USE_GRAPH)

reciprologs = clean_reciprologs(reciprologs, subset_index=subset_index)

basename = '-'.join(sorted([abbreviate(f, use_hash=USE_HASHES) for f in INPUT_FILES]))

# out_string is appended to the final report; it stays empty when
# nothing is written or when the results go to stdout
out_string = ''
if len(reciprologs) > 0:
    if not OUTPUT_FILE:
        # -o was given without a filename: auto-name the output file
        OUTPUT_FILE = '{}.{}.reciprologs'.format(basename, RUN_TYPE)

    # one tab-separated line per reciprolog group
    lines = ('\t'.join(group) + '\n' for group in reciprologs)
    if OUTPUT_FILE == 'stdout':
        sys.stdout.writelines(lines)
    else:
        # the context manager closes the file even if a write fails
        with open(OUTPUT_FILE, 'w') as out:
            out.writelines(lines)
        out_string = ': {}'.format(OUTPUT_FILE)

runtime = get_runtime(t_start)

print(
    '[#] Job finished in {}; {} reciprolog sets found{}'
    .format(runtime, len(reciprologs), out_string),
    file=sys.stderr)

sys.exit(0)