Skip to content

Commit

Permalink
upgrade dbCAN to latest and update filtering parameters #890
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer committed Apr 1, 2023
1 parent 98c1472 commit 036c9d6
Show file tree
Hide file tree
Showing 2 changed files with 354 additions and 249 deletions.
157 changes: 92 additions & 65 deletions funannotate/aux_scripts/hmmer_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
import subprocess
from natsort import natsorted
import funannotate.library as lib

# Biopython's SearchIO historically warned about its experimental API on
# import; suppress warnings just for this import so script output stays clean.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from Bio import SearchIO


def PfamHmmer(input):
    """Run hmmsearch of one protein FASTA file against the Pfam-A HMM database.

    Writes a domtblout result file next to the input, named <base>.pfam.txt.
    Relies on module globals FUNDB (database directory) and FNULL (devnull
    handle used to silence hmmsearch's stdout/stderr).
    """
    HMM = os.path.join(FUNDB, "Pfam-A.hmm")
    base = os.path.basename(input).split(".fa")[0]
    pfam_out = os.path.join(os.path.dirname(input), base + ".pfam.txt")
    # --cut_ga applies Pfam's curated per-model gathering thresholds
    # instead of a single global E-value cutoff.
    cmd = ["hmmsearch", "--domtblout", pfam_out, "--cpu", "1", "--cut_ga", HMM, input]
    subprocess.call(cmd, stdout=FNULL, stderr=FNULL)


Expand All @@ -32,17 +32,17 @@ def safe_run(*args, **kwargs):
def combineHmmerOutputs(inputList, output):
    """Concatenate multiple HMMER domtblout files into a single parseable file.

    A domtblout file has three '#' header lines, data rows, and trailing '#'
    footer lines.  Biopython's SearchIO needs one well-formed file, so write
    the header (taken from the first input) once, then every data row from
    every input, then the footer once.
    """
    # Collect all comment lines from the first file: first three are the
    # column header, the remainder are the footer.
    allHeadFoot = []
    with open(inputList[0], "r") as infile:
        for line in infile:
            if line.startswith("#"):
                allHeadFoot.append(line)
    with open(output, "w") as out:
        for x in allHeadFoot[:3]:
            out.write(x)
        for file in inputList:
            with open(file, "r") as resultin:
                for line in resultin:
                    # skip per-file headers/footers and blank lines
                    if line.startswith("#") or line.startswith("\n"):
                        continue
                    out.write(line)
        for y in allHeadFoot[3:]:
            out.write(y)
Expand All @@ -52,44 +52,48 @@ def combineHmmerOutputs(inputList, output):
def multiPFAMsearch(inputList, cpus, tmpdir, output):
    """Run PfamHmmer on many FASTA chunks in parallel, then merge and filter.

    inputList -- protein FASTA chunk files (one hmmsearch process per file)
    cpus      -- number of concurrent workers
    tmpdir    -- directory holding the chunks and per-chunk .pfam.txt results
    output    -- annotation file written as 'query\\tdb_xref\\tPFAM:accession'

    Side effects: writes pfam.txt (merged domtblout) and pfam.filtered.txt
    (tab-delimited, coverage-filtered hits) next to tmpdir.
    """
    pfam_results = os.path.join(os.path.dirname(tmpdir), "pfam.txt")
    pfam_filtered = os.path.join(os.path.dirname(tmpdir), "pfam.filtered.txt")
    lib.runMultiProgress(safe_run, inputList, cpus, progress=False)

    # grab per-chunk results and combine; tricky because each file carries
    # its own domtblout header and footer
    resultList = [
        os.path.join(tmpdir, f)
        for f in os.listdir(tmpdir)
        if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith(".pfam.txt")
    ]
    combineHmmerOutputs(resultList, pfam_results)

    # parse merged results; keep hits covering at least half of the HMM
    with open(output, "w") as out, open(pfam_filtered, "w") as filtered, open(
        pfam_results, "r"
    ) as results:
        for qresult in SearchIO.parse(results, "hmmsearch3-domtab"):
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                for i in range(0, num_hits):
                    hit_evalue = hits[i].evalue
                    query = hits[i].id
                    # drop the Pfam accession version suffix (PF00001.21 -> PF00001)
                    pfam = qresult.accession.split(".")[0]
                    hmmLen = qresult.seq_len
                    hmm_aln = int(hits[i].hsps[0].hit_end) - int(
                        hits[i].hsps[0].hit_start
                    )
                    coverage = hmm_aln / float(hmmLen)
                    if coverage < 0.50:  # coverage needs to be at least 50%
                        continue
                    filtered.write(
                        "%s\t%s\t%s\t%f\n" % (query, pfam, hit_evalue, coverage)
                    )
                    out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam))


def dbCANHmmer(input):
    """Run hmmscan of one protein FASTA file against the dbCAN (CAZyme) HMMs.

    Writes a domtblout result file next to the input, named <base>.dbcan.txt.
    Relies on module globals FUNDB (database directory) and FNULL (devnull
    handle used to silence hmmscan's stdout/stderr).
    """
    HMM = os.path.join(FUNDB, "dbCAN.hmm")
    base = os.path.basename(input).split(".fa")[0]
    outfiles = os.path.join(os.path.dirname(input), base + ".dbcan.txt")
    # -E 1e-15: relaxed in this commit from 1e-17 to match updated dbCAN
    # filtering recommendations; final per-hit filtering happens downstream.
    cmd = ["hmmscan", "--domtblout", outfiles, "--cpu", "1", "-E", "1e-15", HMM, input]
    subprocess.call(cmd, stdout=FNULL, stderr=FNULL)


Expand All @@ -103,20 +107,24 @@ def safe_run2(*args, **kwargs):

def dbCANsearch(inputList, cpus, evalue, tmpdir, output):
    """Run dbCANHmmer on many FASTA chunks in parallel, then merge and filter.

    inputList -- protein FASTA chunk files (one hmmscan process per file)
    cpus      -- number of concurrent workers
    evalue    -- per-hit E-value cutoff applied while parsing
    tmpdir    -- directory holding the chunks and per-chunk .dbcan.txt results
    output    -- per-query annotation file with simplified CAZy family names

    Side effects: writes dbCAN.txt (merged domtblout) and dbCAN.filtered.txt
    (tab-delimited, coverage/E-value-filtered hits) inside tmpdir.
    """
    dbCAN_out = os.path.join(tmpdir, "dbCAN.txt")
    dbCAN_filtered = os.path.join(tmpdir, "dbCAN.filtered.txt")
    lib.runMultiProgress(safe_run2, inputList, cpus, progress=False)
    # now grab per-chunk results and combine into one domtblout
    resultList = [
        os.path.join(tmpdir, f)
        for f in os.listdir(tmpdir)
        if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith(".dbcan.txt")
    ]
    combineHmmerOutputs(resultList, dbCAN_out)

    # parse merged results; keep hits passing the E-value and coverage cutoffs
    Results = {}
    with open(dbCAN_filtered, "w") as filtered:
        filtered.write(
            "#HMM_family\tHMM_len\tQuery_ID\tQuery_len\tE-value\tHMM_start\tHMM_end\tQuery_start\tQuery_end\tCoverage\n"
        )
        with open(dbCAN_out, "r") as results:
            for qresult in SearchIO.parse(results, "hmmscan3-domtab"):
                query_length = qresult.seq_len
                hits = qresult.hits
                # NOTE(review): the next four lines were hidden in the diff
                # view; reconstructed by symmetry with multiPFAMsearch —
                # TODO confirm against the full file.
                num_hits = len(hits)
                if num_hits > 0:
                    for i in range(0, num_hits):
                        hit_evalue = hits[i].evalue
                        if hit_evalue > evalue:
                            continue
                        hit = hits[i].id
                        hmmLen = hits[i].seq_len
                        hmm_aln = int(hits[i].hsps[0].hit_end) - int(
                            hits[i].hsps[0].hit_start
                        )
                        coverage = hmm_aln / float(hmmLen)
                        # relaxed in this commit from 0.45 to 0.35 HMM coverage
                        if coverage < 0.35:
                            continue
                        query = hits[i].query_id
                        filtered.write(
                            "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\n"
                            % (
                                hit,
                                hmmLen,
                                query,
                                query_length,
                                hit_evalue,
                                hits[i].hsps[0].hit_start,
                                hits[i].hsps[0].hit_end,
                                hits[i].hsps[0].query_start,
                                hits[i].hsps[0].query_end,
                                coverage,
                            )
                        )
                        if query not in Results:
                            Results[query] = [hit]
                        else:
                            Results[query].append(hit)
    # run through results and simplify subdomain hits, e.g. GH5_12 -> GH5
    with open(output, "w") as out:
        for k, v in natsorted(Results.items()):
            simplified = []
            for x in v:
                if "_" in x:
                    cazy, subdomain = x.rsplit("_", 1)
                    if cazy not in simplified:
                        simplified.append(cazy)
                else:
                    # NOTE(review): this else-branch and the final write were
                    # hidden in the diff view; reconstructed as deduplicating
                    # whole-family names and emitting one line per query —
                    # TODO confirm output format against the full file.
                    if x not in simplified:
                        simplified.append(x)
            out.write("%s\tnote\tCAZy:%s\n" % (k, ";".join(simplified)))
Expand All @@ -167,27 +184,37 @@ def __init__(self, prog):


# Command-line interface.  MyFormatter is defined earlier in this file.
parser = argparse.ArgumentParser(
    prog="hmmer_parallel.py",
    description="""Run hmmer3 multithreaded.""",
    epilog="""Written by Jon Palmer (2019) nextgenusfs@gmail.com""",
    formatter_class=MyFormatter,
)
parser.add_argument(
    "-i", "--input", required=True, help="folder of protein fasta files"
)
parser.add_argument(
    "-m",
    "--method",
    default="pfam",
    choices=["pfam", "cazy"],
    help="database to search",
)
parser.add_argument("-d", "--db", required=True, help="location of HMM database")
# help text fixed: this is the worker count, not the database location
parser.add_argument(
    "-c", "--cpus", default=1, type=int, help="number of CPUs to use"
)
parser.add_argument("-o", "--out", required=True, help="output file")
args = parser.parse_args()

# Module-level globals consumed by the worker functions (PfamHmmer /
# dbCANHmmer); the `global` statement is a no-op at module scope but kept
# for documentation of intent.
global FUNDB, FNULL
FUNDB = args.db
FNULL = open(os.devnull, "w")
# every regular file in the input folder is treated as a protein FASTA chunk
splitProts = [
    os.path.join(args.input, f)
    for f in os.listdir(args.input)
    if os.path.isfile(os.path.join(args.input, f))
]
if args.method == "pfam":
    multiPFAMsearch(splitProts, args.cpus, args.input, args.out)
elif args.method == "cazy":
    # NOTE(review): 1e-17 parse-time cutoff is stricter than the 1e-15
    # passed to hmmscan in dbCANHmmer — appears intentional in this commit.
    dbCANsearch(splitProts, args.cpus, 1e-17, args.input, args.out)

0 comments on commit 036c9d6

Please sign in to comment.