fixed support for gzip input FASTA files

Ecogenomics · Apr 14, 2022 · 40720cc · 40720cc
1 parent da43034
commit 40720cc
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 21 deletions.
diff --git a/checkm/VERSION b/checkm/VERSION
@@ -1,3 +1,9 @@
+1.1.9
+- fixed support for gzip input files
+
+1.1.8
+- updated setup.py to include long description and fixed PyPi URL
+
 1.1.7
 - added Jie Zhu's (alienzj) PR to allow input files to be specified via a file
 - added Jeremy Jacobson's (jjacobson95) PR to fix prettytable import issue

diff --git a/checkm/prodigal.py b/checkm/prodigal.py
@@ -25,6 +25,7 @@
 import subprocess
 import logging
 import shutil
+import tempfile
 
 import numpy as np
 
@@ -54,18 +55,20 @@ def run(self, query, bNucORFs=True):
 
         prodigal_input = query
 
-        # decompress gzip input files
-        if prodigal_input.endswith('.gz'):
-            tmp_dir = tempfile.mkdtemp()
-            prodigal_input = os.path.join(tmp_dir, os.path.basename(prodigal_input[0:-3]) + '.fna')
-            writeFasta(seqs, prodigal_input)
-
         # gather statistics about query file
         seqs = readFasta(prodigal_input)
         totalBases = 0
         for seqId, seq in seqs.items():
             totalBases += len(seq)
 
+        # decompress gzip input files
+        tmp_dir = None
+        if prodigal_input.endswith('.gz'):
+            tmp_dir = tempfile.mkdtemp()
+            prodigal_input = os.path.join(
+                tmp_dir, os.path.basename(prodigal_input[0:-3]))
+            writeFasta(seqs, prodigal_input)
+
         # call ORFs with different translation tables and select the one with the highest coding density
         tableCodingDensity = {}
         for translationTable in [4, 11]:
@@ -119,10 +122,13 @@ def run(self, query, bNucORFs=True):
         if (tableCodingDensity[4] - tableCodingDensity[11] > 0.05) and tableCodingDensity[4] > 0.7:
             bestTranslationTable = 4
 
-        shutil.copyfile(self.aaGeneFile + '.' + str(bestTranslationTable), self.aaGeneFile)
-        shutil.copyfile(self.gffFile + '.' + str(bestTranslationTable), self.gffFile)
+        shutil.copyfile(self.aaGeneFile + '.' +
+                        str(bestTranslationTable), self.aaGeneFile)
+        shutil.copyfile(self.gffFile + '.' +
+                        str(bestTranslationTable), self.gffFile)
         if bNucORFs:
-            shutil.copyfile(self.ntGeneFile + '.' + str(bestTranslationTable), self.ntGeneFile)
+            shutil.copyfile(self.ntGeneFile + '.' +
+                            str(bestTranslationTable), self.ntGeneFile)
 
         # clean up redundant prodigal results
         for translationTable in [4, 11]:
@@ -131,7 +137,7 @@ def run(self, query, bNucORFs=True):
             if bNucORFs:
                 os.remove(self.ntGeneFile + '.' + str(translationTable))
 
-        if prodigal_input.endswith('.gz'):
+        if tmp_dir:
             shutil.rmtree(tmp_dir)
 
         return bestTranslationTable
@@ -153,7 +159,8 @@ def checkForProdigal(self):
         # Assume that a successful prodigal -h returns 0 and anything
         # else returns something non-zero
         try:
-            subprocess.call(['prodigal', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
+            subprocess.call(
+                ['prodigal', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
         except:
             self.logger.error("Make sure prodigal is on your system path.")
             sys.exit(1)
@@ -205,7 +212,8 @@ def __parseGFF(self, filename):
                 lineSplit = line.split(';')
                 for token in lineSplit:
                     if 'transl_table' in token:
-                        self.translationTable = int(token[token.find('=') + 1:])
+                        self.translationTable = int(
+                            token[token.find('=') + 1:])
 
             if line[0] == '#' or line.strip() == '"':
                 # work around for Prodigal having lines with just a

diff --git a/checkm/util/seqUtils.py b/checkm/util/seqUtils.py
@@ -138,11 +138,13 @@ def checkNuclotideSeqs(seq_files):
     for seq_file in seq_files:
         if os.stat(seq_file).st_size == 0:
             continue
-            
+
         if not isNucleotide(seq_file):
             logger = logging.getLogger('timestamp')
-            logger.warning('Expected all files to contain sequences in nucleotide space.')
-            logger.warning('File %s appears to contain amino acids sequences.' % seq_file)
+            logger.warning(
+                'Expected all files to contain sequences in nucleotide space.')
+            logger.warning(
+                'File %s appears to contain amino acids sequences.' % seq_file)
 
     return True
 
@@ -164,11 +166,13 @@ def checkProteinSeqs(seq_files):
     for seq_file in seq_files:
         if os.stat(seq_file).st_size == 0:
             continue
-        
+
         if isNucleotide(seq_file):
             logger = logging.getLogger('timestamp')
-            logger.warning('Expected all files to contain sequences in amino acid space.')
-            logger.warning('File %s appears to contain nucleotide sequences.' % seq_file)
+            logger.warning(
+                'Expected all files to contain sequences in amino acid space.')
+            logger.warning(
+                'File %s appears to contain nucleotide sequences.' % seq_file)
 
     return True
 
@@ -182,7 +186,7 @@ def readFasta(fastaFile, trimHeader=True):
             openFile = open
 
         seqs = {}
-        for line in openFile(fastaFile):
+        for line in openFile(fastaFile, 'rt'):
             # skip blank lines
             if not line.strip():
                 continue
@@ -198,7 +202,8 @@ def readFasta(fastaFile, trimHeader=True):
 
         for seqId, seq in seqs.items():
             seqs[seqId] = ''.join(seq)
-    except:
+    except Exception as e:
+        print(e)
         logger = logging.getLogger('timestamp')
         logger.error("Failed to process sequence file: {}".format(fastaFile))
         sys.exit(1)

diff --git a/setup.py b/setup.py
@@ -3,11 +3,18 @@
 import os
 from setuptools import setup
 
+
 def version():
     setupDir = os.path.dirname(os.path.realpath(__file__))
     versionFile = open(os.path.join(setupDir, 'checkm', 'VERSION'))
     return versionFile.readline().strip()
 
+
+def readme():
+    with open('README.md') as f:
+        return f.read()
+
+
 setup(
     name='checkm-genome',
     version=version(),
@@ -17,11 +24,14 @@ def version():
     scripts=['bin/checkm'],
     package_data={'checkm': ['VERSION', 'DATA_CONFIG']},
     include_package_data=True,
-    url='http://pypi.python.org/pypi/checkm/',
+    url='http://pypi.python.org/pypi/checkm-genome/',
     license='GPL3',
     description='Assess the quality of putative genome bins.',
+    long_description=readme(),
+    long_description_content_type='text/markdown',
     classifiers=[
         'Development Status :: 5 - Production/Stable',
+        'Intended Audience :: Science/Research',
         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
         'Natural Language :: English',
         'Programming Language :: Python :: 3.6',