Skip to content

Commit

Permalink
fixed support for gzip input FASTA files
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed Apr 14, 2022
1 parent da43034 commit 40720cc
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 21 deletions.
6 changes: 6 additions & 0 deletions checkm/VERSION
@@ -1,3 +1,9 @@
1.1.9
- fixed support for gzip input files

1.1.8
- updated setup.py to include long description and fixed PyPi URL

1.1.7
- added Jie Zhu's (alienzj) PR to allow input files to be specified via a file
- added Jeremy Jacobson's (jjacobson95) PR to fix prettytable import issue
Expand Down
32 changes: 20 additions & 12 deletions checkm/prodigal.py
Expand Up @@ -25,6 +25,7 @@
import subprocess
import logging
import shutil
import tempfile

import numpy as np

Expand Down Expand Up @@ -54,18 +55,20 @@ def run(self, query, bNucORFs=True):

prodigal_input = query

# decompress gzip input files
if prodigal_input.endswith('.gz'):
tmp_dir = tempfile.mkdtemp()
prodigal_input = os.path.join(tmp_dir, os.path.basename(prodigal_input[0:-3]) + '.fna')
writeFasta(seqs, prodigal_input)

# gather statistics about query file
seqs = readFasta(prodigal_input)
totalBases = 0
for seqId, seq in seqs.items():
totalBases += len(seq)

# decompress gzip input files
tmp_dir = None
if prodigal_input.endswith('.gz'):
tmp_dir = tempfile.mkdtemp()
prodigal_input = os.path.join(
tmp_dir, os.path.basename(prodigal_input[0:-3]))
writeFasta(seqs, prodigal_input)

# call ORFs with different translation tables and select the one with the highest coding density
tableCodingDensity = {}
for translationTable in [4, 11]:
Expand Down Expand Up @@ -119,10 +122,13 @@ def run(self, query, bNucORFs=True):
if (tableCodingDensity[4] - tableCodingDensity[11] > 0.05) and tableCodingDensity[4] > 0.7:
bestTranslationTable = 4

shutil.copyfile(self.aaGeneFile + '.' + str(bestTranslationTable), self.aaGeneFile)
shutil.copyfile(self.gffFile + '.' + str(bestTranslationTable), self.gffFile)
shutil.copyfile(self.aaGeneFile + '.' +
str(bestTranslationTable), self.aaGeneFile)
shutil.copyfile(self.gffFile + '.' +
str(bestTranslationTable), self.gffFile)
if bNucORFs:
shutil.copyfile(self.ntGeneFile + '.' + str(bestTranslationTable), self.ntGeneFile)
shutil.copyfile(self.ntGeneFile + '.' +
str(bestTranslationTable), self.ntGeneFile)

# clean up redundant prodigal results
for translationTable in [4, 11]:
Expand All @@ -131,7 +137,7 @@ def run(self, query, bNucORFs=True):
if bNucORFs:
os.remove(self.ntGeneFile + '.' + str(translationTable))

if prodigal_input.endswith('.gz'):
if tmp_dir:
shutil.rmtree(tmp_dir)

return bestTranslationTable
Expand All @@ -153,7 +159,8 @@ def checkForProdigal(self):
# Assume that a successful prodigal -h returns 0 and anything
# else returns something non-zero
try:
subprocess.call(['prodigal', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
subprocess.call(
['prodigal', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
except:
self.logger.error("Make sure prodigal is on your system path.")
sys.exit(1)
Expand Down Expand Up @@ -205,7 +212,8 @@ def __parseGFF(self, filename):
lineSplit = line.split(';')
for token in lineSplit:
if 'transl_table' in token:
self.translationTable = int(token[token.find('=') + 1:])
self.translationTable = int(
token[token.find('=') + 1:])

if line[0] == '#' or line.strip() == '"':
# work around for Prodigal having lines with just a
Expand Down
21 changes: 13 additions & 8 deletions checkm/util/seqUtils.py
Expand Up @@ -138,11 +138,13 @@ def checkNuclotideSeqs(seq_files):
for seq_file in seq_files:
if os.stat(seq_file).st_size == 0:
continue

if not isNucleotide(seq_file):
logger = logging.getLogger('timestamp')
logger.warning('Expected all files to contain sequences in nucleotide space.')
logger.warning('File %s appears to contain amino acids sequences.' % seq_file)
logger.warning(
'Expected all files to contain sequences in nucleotide space.')
logger.warning(
'File %s appears to contain amino acids sequences.' % seq_file)

return True

Expand All @@ -164,11 +166,13 @@ def checkProteinSeqs(seq_files):
for seq_file in seq_files:
if os.stat(seq_file).st_size == 0:
continue

if isNucleotide(seq_file):
logger = logging.getLogger('timestamp')
logger.warning('Expected all files to contain sequences in amino acid space.')
logger.warning('File %s appears to contain nucleotide sequences.' % seq_file)
logger.warning(
'Expected all files to contain sequences in amino acid space.')
logger.warning(
'File %s appears to contain nucleotide sequences.' % seq_file)

return True

Expand All @@ -182,7 +186,7 @@ def readFasta(fastaFile, trimHeader=True):
openFile = open

seqs = {}
for line in openFile(fastaFile):
for line in openFile(fastaFile, 'rt'):
# skip blank lines
if not line.strip():
continue
Expand All @@ -198,7 +202,8 @@ def readFasta(fastaFile, trimHeader=True):

for seqId, seq in seqs.items():
seqs[seqId] = ''.join(seq)
except:
except Exception as e:
print(e)
logger = logging.getLogger('timestamp')
logger.error("Failed to process sequence file: {}".format(fastaFile))
sys.exit(1)
Expand Down
12 changes: 11 additions & 1 deletion setup.py
Expand Up @@ -3,11 +3,18 @@
import os
from setuptools import setup


def version():
setupDir = os.path.dirname(os.path.realpath(__file__))
versionFile = open(os.path.join(setupDir, 'checkm', 'VERSION'))
return versionFile.readline().strip()


def readme():
with open('README.md') as f:
return f.read()


setup(
name='checkm-genome',
version=version(),
Expand All @@ -17,11 +24,14 @@ def version():
scripts=['bin/checkm'],
package_data={'checkm': ['VERSION', 'DATA_CONFIG']},
include_package_data=True,
url='http://pypi.python.org/pypi/checkm/',
url='http://pypi.python.org/pypi/checkm-genome/',
license='GPL3',
description='Assess the quality of putative genome bins.',
long_description=readme(),
long_description_content_type='text/markdown',
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'Natural Language :: English',
'Programming Language :: Python :: 3.6',
Expand Down

0 comments on commit 40720cc

Please sign in to comment.