Fixed bug preventing running of checkm test. Moving to PEP8 compliance.

Ecogenomics · Dec 2, 2014 · 21c4b69 · 21c4b69
1 parent 1ce279d
commit 21c4b69
Show file tree

Hide file tree

Showing 8 changed files with 158 additions and 128 deletions.
diff --git a/MANIFEST b/MANIFEST
@@ -11,6 +11,7 @@ checkm/aminoAcidIdentity.py
 checkm/binComparer.py
 checkm/binStatistics.py
 checkm/binTools.py
+checkm/binUnion.py
 checkm/checkmData.py
 checkm/common.py
 checkm/coverage.py

diff --git a/bin/checkm b/bin/checkm
diff --git a/checkm/binTools.py b/checkm/binTools.py
@@ -22,15 +22,17 @@
 import os
 import sys
 import logging
+import gzip
 
 import numpy as np
 
 from common import binIdFromFilename, checkFileExists, readDistribution, findNearest
-from checkm.util.seqUtils import readFasta, readFastaSeqIds, writeFasta, baseCount
+from checkm.util.seqUtils import readFasta, writeFasta, baseCount
 from checkm.genomicSignatures import GenomicSignatures
 from checkm.prodigal import ProdigalGeneFeatureParser
 from checkm.defaultValues import DefaultValues
 
+
 class BinTools():
     """Functions for exploring and modifying bins."""
     def __init__(self, threads=1):
@@ -104,17 +106,33 @@ def removeOutliers(self, binFile, outlierFile, outputFile):
     def unique(self, binFiles):
         """Check if sequences are assigned to multiple bins."""
 
-        # read seq ids from all bins
+        # read sequence IDs from all bins,
+        # while checking for duplicate sequences within a bin
         binSeqs = {}
         for f in binFiles:
             binId = binIdFromFilename(f)
-            binSeqs[binId] = readFastaSeqIds(f)
+
+            if f.endswith('.gz'):
+                openFile = gzip.open
+            else:
+                openFile = open
+
+            seqIds = set()
+            for line in openFile(f):
+                if line[0] == '>':
+                    seqId = line[1:].split(None, 1)[0]
+
+                    if seqId in seqIds:
+                        print '  [Warning] Sequence %s found multiple times in bin %s.' % (seqId, binId)
+                    seqIds.add(seqId)
+
+            binSeqs[binId] = seqIds
 
         # check for sequences assigned to multiple bins
         bDuplicates = False
         binIds = binSeqs.keys()
         for i in xrange(0, len(binIds)):
-            for j in xrange(i+1, len(binIds)):
+            for j in xrange(i + 1, len(binIds)):
                 seqInter = set(binSeqs[binIds[i]]).intersection(set(binSeqs[binIds[j]]))
 
                 if len(seqInter) > 0:
@@ -171,11 +189,12 @@ def binTetraSig(self, seqs, tetraSigs):
         for _, seq in seqs.iteritems():
             binSize += len(seq)
 
-        binSig = None
+        bInit = True
         for seqId, seq in seqs.iteritems():
             weightedTetraSig = tetraSigs[seqId] * (float(len(seq)) / binSize)
-            if binSig == None:
+            if bInit:
                 binSig = weightedTetraSig
+                bInit = False
             else:
                 binSig += weightedTetraSig
 
@@ -224,21 +243,21 @@ def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, rep
             if not os.path.exists(gffFile):
                 self.logger.error('  [Error] Missing gene feature file (%s). This plot if not compatible with the --genes option.\n' % DefaultValues.PRODIGAL_GFF)
                 sys.exit()
-            
+
             prodigalParser = ProdigalGeneFeatureParser(gffFile)
             meanCD, deltaCDs, CDs = self.codingDensityDist(seqs, prodigalParser)
 
             # find keys into GC and CD distributions
             closestGC = findNearest(np.array(gcBounds.keys()), meanGC)
             sampleSeqLen = gcBounds[closestGC].keys()[0]
             d = gcBounds[closestGC][sampleSeqLen]
-            gcLowerBoundKey = findNearest(d.keys(), (100 - distribution)/2.0)
-            gcUpperBoundKey = findNearest(d.keys(), (100 + distribution)/2.0)
+            gcLowerBoundKey = findNearest(d.keys(), (100 - distribution) / 2.0)
+            gcUpperBoundKey = findNearest(d.keys(), (100 + distribution) / 2.0)
 
             closestCD = findNearest(np.array(cdBounds.keys()), meanCD)
             sampleSeqLen = cdBounds[closestCD].keys()[0]
             d = cdBounds[closestCD][sampleSeqLen]
-            cdLowerBoundKey = findNearest(d.keys(), (100 - distribution)/2.0)
+            cdLowerBoundKey = findNearest(d.keys(), (100 - distribution) / 2.0)
 
             tdBoundKey = findNearest(tdBounds[tdBounds.keys()[0]].keys(), distribution)
 
@@ -269,8 +288,8 @@ def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, rep
 
                 if (reportType == 'any' and len(outlyingDists) >= 1) or (reportType == 'all' and len(outlyingDists) == 3):
                     fout.write(binId + '\t' + seqId + '\t%d' % len(seq) + '\t' + ','.join(outlyingDists))
-                    fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index]*100, meanGC*100, (meanGC+gcLowerBound)*100, (meanGC+gcUpperBound)*100))
-                    fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index]*100, meanCD*100, (meanCD+cdLowerBound)*100))
+                    fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index] * 100, meanGC * 100, (meanGC + gcLowerBound) * 100, (meanGC + gcUpperBound) * 100))
+                    fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index] * 100, meanCD * 100, (meanCD + cdLowerBound) * 100))
                     fout.write('\t%.3f\t%.3f\t%.3f' % (deltaTDs[index], meanTD, tdBound) + '\n')
 
                 index += 1

diff --git a/checkm/main.py b/checkm/main.py
@@ -1263,6 +1263,7 @@ def parseOptions(self, options):
         elif(options.subparser_name == 'bin_union'):
             self.binUnion(options)
         elif(options.subparser_name == 'test'):
+            options.bCalledGenes = False
             self.test(options)
         else:
             self.logger.error('  [Error] Unknown CheckM command: ' + options.subparser_name + '\n')

diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py
@@ -36,6 +36,7 @@
 
 from checkm.util.pfam import PFAM
 
+
 class ResultsParser():
     """Parse output of Prodigal+HMMER run and derived statistics."""
     def __init__(self, binIdToModels):
@@ -49,10 +50,10 @@ def analyseResults(self,
                        binStatsFile,
                        seqStatsFile,
                        hmmTableFile,
-                       bIgnoreThresholds = False,
-                       evalueThreshold = DefaultValues.E_VAL,
-                       lengthThreshold = DefaultValues.LENGTH,
-                       bSkipOrfCorrection = False,
+                       bIgnoreThresholds=False,
+                       evalueThreshold=DefaultValues.E_VAL,
+                       lengthThreshold=DefaultValues.LENGTH,
+                       bSkipOrfCorrection=False,
                        ):
         """Parse results in the output directory."""
 
@@ -72,12 +73,12 @@ def cacheResults(self, outDir, binIdToBinMarkerSets, bIndividualMarkers):
 
     def parseBinHits(self, outDir,
                      hmmTableFile,
-                     bSkipOrfCorrection = False,
-                     bIgnoreThresholds = False,
-                     evalueThreshold = DefaultValues.E_VAL,
-                     lengthThreshold = DefaultValues.LENGTH,
-                     binStats = None,
-                     seqStats = None):
+                     bSkipOrfCorrection=False,
+                     bIgnoreThresholds=False,
+                     evalueThreshold=DefaultValues.E_VAL,
+                     lengthThreshold=DefaultValues.LENGTH,
+                     binStats=None,
+                     seqStats=None):
         """ Parse HMM hits for each bin."""
         if not self.models:
             self.logger.error('  [Error] Models must be parsed before identifying HMM hits.')
@@ -89,7 +90,7 @@ def parseBinHits(self, outDir,
         for binId in self.models:
             if self.logger.getEffectiveLevel() <= logging.INFO:
                 numBinsProcessed += 1
-                statusStr = '    Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed)*100/len(self.models))
+                statusStr = '    Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed) * 100 / len(self.models))
                 sys.stderr.write('%s\r' % statusStr)
                 sys.stderr.flush()
 
@@ -202,22 +203,22 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipOrfCorrection):
                 resultsManager.identifyOrfErrors()
 
         except IOError as detail:
-            sys.stderr.write(str(detail)+"\n")
+            sys.stderr.write(str(detail) + "\n")
 
-    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles = None):
+    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
         """Get header for requested output table."""
 
         # keep count of single, double, triple genes etc...
         if outputFormat == 1:
-            header = ['Bin Id','Marker lineage', '# genomes', '# markers','# marker sets','0','1','2','3','4','5+','Completeness','Contamination','Strain heterogeneity']
+            header = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
         elif outputFormat == 2:
             header = ['Bin Id']
             header += ['Marker lineage', '# genomes', '# markers', '# marker sets']
-            header += ['Completeness','Contamination', 'Strain heterogeneity']
+            header += ['Completeness', 'Contamination', 'Strain heterogeneity']
             header += ['Genome size (bp)', '# ambiguous bases', '# scaffolds', '# contigs', 'N50 (scaffolds)', 'N50 (contigs)', 'Longest scaffold (bp)', 'Longest contig (bp)']
             header += ['GC', 'GC std (scaffolds > 1kbp)']
             header += ['Coding density', 'Translation table', '# predicted genes']
-            header += ['0','1','2','3','4','5+']
+            header += ['0', '1', '2', '3', '4', '5+']
 
             if coverageBinProfiles != None:
                 for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]:
@@ -227,19 +228,19 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles = None):
                 self.logger.error('  [Error] Labeling error: GC std (scaffolds > 1kbp)')
                 sys.exit()
         elif outputFormat == 3:
-            header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers','# marker sets','0','1','2','3','4','5+','Completeness','Contamination','Strain heterogeneity']
+            header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
         elif outputFormat == 4:
             header = None
         elif outputFormat == 5:
-            header = ['Bin Id','Marker Id','Gene Id']
+            header = ['Bin Id', 'Marker Id', 'Gene Id']
         elif outputFormat == 6:
-            header = ['Bin Id','Marker Id','Gene Ids']
+            header = ['Bin Id', 'Marker Id', 'Gene Ids']
         elif outputFormat == 7:
-            header = ['Bin Id','Marker Id','Gene Ids']
+            header = ['Bin Id', 'Marker Id', 'Gene Ids']
         elif outputFormat == 8:
-            header = ['Bin Id','Gene Id','{Marker Id, Start position, End position}']
+            header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
         elif outputFormat == 9:
-            header = ['Scaffold Id','Bin Id','Length','# contigs','GC','# ORFs','Coding density','Marker Ids']
+            header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
 
         return header
 
@@ -258,7 +259,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
         if bTabTable or outputFormat not in prettyTableFormats:
             bTabTable = True
             pTable = None
-            
+
             if header != None:
                 print('\t'.join(header))
         else:
@@ -272,21 +273,22 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
         for binId in sorted(self.results.keys()):
             self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)
 
-        if not bTabTable :
-            if outputFormat in [1,2]:
+        if not bTabTable:
+            if outputFormat in [1, 2]:
                 print(pTable.get_string(sortby='Completeness', reversesort=True))
             else:
                 print(pTable.get_string())
 
         # restore stdout
         restoreStdOut(outFile, oldStdOut)
 
+
 class ResultsManager():
     """Store all the results for a single bin"""
     def __init__(self, binId, models,
-                 bIgnoreThresholds = False,
-                 evalueThreshold = DefaultValues.E_VAL,
-                 lengthThreshold = DefaultValues.LENGTH,
+                 bIgnoreThresholds=False,
+                 evalueThreshold=DefaultValues.E_VAL,
+                 lengthThreshold=DefaultValues.LENGTH,
                  binStats=None,
                  scaffoldStats=None):
         self.binId = binId
@@ -325,7 +327,7 @@ def vetHit(self, hit):
                 return False
 
             alignment_length = float(hit.ali_to - hit.ali_from)
-            length_perc = alignment_length/float(hit.query_length)
+            length_perc = alignment_length / float(hit.query_length)
             if length_perc >= self.lengthThreshold:
                 return True
 
@@ -365,15 +367,15 @@ def identifyOrfErrors(self):
                     scaffoldIdI = orfI[0:orfI.rfind('_')]
 
                     bCombined = False
-                    for j in xrange(i+1, len(hits)):
+                    for j in xrange(i + 1, len(hits)):
                         orfJ = hits[j].target_name
                         scaffoldIdJ = orfJ[0:orfJ.rfind('_')]
 
                         # check if hits are on adjacent ORFs
                         if scaffoldIdI == scaffoldIdJ:
                             try:
-                                orfNumI = int(orfI[orfI.rfind('_')+1:])
-                                orfNumJ = int(orfJ[orfJ.rfind('_')+1:])
+                                orfNumI = int(orfI[orfI.rfind('_') + 1:])
+                                orfNumJ = int(orfJ[orfJ.rfind('_') + 1:])
                             except:
                                 # it appears called genes are not labeled
                                 # according to the prodigal format, so
@@ -387,13 +389,13 @@ def identifyOrfErrors(self):
 
                                 sJ = hits[j].hmm_from
                                 eJ = hits[j].hmm_to
-                                
-                                if (sI <= sJ and eI > sJ) or (sJ <= sI and eJ > sI):                                   
+
+                                if (sI <= sJ and eI > sJ) or (sJ <= sI and eJ > sI):
                                     # models overlap so treat these as unique hits
                                     # which may represent an assembly error or a true
                                     # gene duplication
                                     pass
-                                else:                                    
+                                else:
                                     # combine the two hits
                                     bCombined = True
                                     break
@@ -420,7 +422,7 @@ def identifyOrfErrors(self):
                         break
 
             self.markerHits[markerId] = hits
-            
+
     def countUniqueHits(self):
         """Determine number of unique and multiple hits."""
         uniqueHits = 0
@@ -430,7 +432,7 @@ def countUniqueHits(self):
                 uniqueHits += 1
             elif len(hits) > 1:
                 multiCopyHits += 1
-                
+
         return uniqueHits, multiCopyHits
 
     def hitsToMarkerGene(self, markerSet):
@@ -455,7 +457,7 @@ def geneCountsForSelectedMarkerSet(self, binMarkerSets, bIndividualMarkers):
     def geneCounts(self, markerSet, markerHits, bIndividualMarkers):
         """ Determine number of marker genes with 0-5 hits
             as well as the total completeness and contamination."""
-        geneCounts = [0]*6
+        geneCounts = [0] * 6
         multiCopyCount = 0
         for marker in markerSet.getMarkerGenes():
             if marker in markerHits:
@@ -614,7 +616,7 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):
 
         return summary
 
-    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles = None, table = None):
+    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
         """Print out information about bin."""
         if outputFormat == 1:
             selectedMarkerSet = binMarkerSets.selectedMarkerSet()
@@ -642,8 +644,8 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                 row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'],
                                                  self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                                                  self.binStats['Longest scaffold'], self.binStats['Longest contig'])
-                row += '\t%.1f\t%.2f' % (self.binStats['GC']*100, self.binStats['GC std']*100)
-                row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density']*100, self.binStats['Translation table'], self.binStats['# predicted genes'])
+                row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100)
+                row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes'])
                 row += '\t' + '\t'.join([str(data[i]) for i in xrange(6)])
 
                 if coverageBinProfiles:
@@ -657,8 +659,8 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                 row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'],
                                                  self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
                                                  self.binStats['Longest scaffold'], self.binStats['Longest contig']])
-                row.extend([self.binStats['GC']*100, self.binStats['GC std']*100])
-                row.extend([self.binStats['Coding density']*100, self.binStats['Translation table'], self.binStats['# predicted genes']])
+                row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
+                row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']])
                 row.extend(data[0:6])
 
                 if coverageBinProfiles:
@@ -688,12 +690,12 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
             for marker in data:
                 row += '\t' + marker
             print(row)
-            
+
             row = self.binId
             for count in data.values():
                 row += '\t' + str(count)
             print(row)
-            
+
             print()
         elif outputFormat == 5:
             # tabular of bin_id, marker, contig_id
@@ -733,7 +735,7 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                     scaffoldsWithMultipleHits = set()
                     for i in xrange(0, len(hitList)):
                         scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')]
-                        for j in xrange(i+1, len(hitList)):
+                        for j in xrange(i + 1, len(hitList)):
                             if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]:
                                 scaffoldsWithMultipleHits.add(hitList[i].target_name)
                                 scaffoldsWithMultipleHits.add(hitList[j].target_name)