Skip to content

Commit

Permalink
Fixed bug preventing running of checkm test. Moving to PEP8 compliance.
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed Dec 2, 2014
1 parent 1ce279d commit 21c4b69
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 128 deletions.
1 change: 1 addition & 0 deletions MANIFEST
Expand Up @@ -11,6 +11,7 @@ checkm/aminoAcidIdentity.py
checkm/binComparer.py
checkm/binStatistics.py
checkm/binTools.py
checkm/binUnion.py
checkm/checkmData.py
checkm/common.py
checkm/coverage.py
Expand Down
136 changes: 67 additions & 69 deletions bin/checkm

Large diffs are not rendered by default.

43 changes: 31 additions & 12 deletions checkm/binTools.py
Expand Up @@ -22,15 +22,17 @@
import os
import sys
import logging
import gzip

import numpy as np

from common import binIdFromFilename, checkFileExists, readDistribution, findNearest
from checkm.util.seqUtils import readFasta, readFastaSeqIds, writeFasta, baseCount
from checkm.util.seqUtils import readFasta, writeFasta, baseCount
from checkm.genomicSignatures import GenomicSignatures
from checkm.prodigal import ProdigalGeneFeatureParser
from checkm.defaultValues import DefaultValues


class BinTools():
"""Functions for exploring and modifying bins."""
def __init__(self, threads=1):
Expand Down Expand Up @@ -104,17 +106,33 @@ def removeOutliers(self, binFile, outlierFile, outputFile):
def unique(self, binFiles):
"""Check if sequences are assigned to multiple bins."""

# read seq ids from all bins
# read sequence IDs from all bins,
# while checking for duplicate sequences within a bin
binSeqs = {}
for f in binFiles:
binId = binIdFromFilename(f)
binSeqs[binId] = readFastaSeqIds(f)

if f.endswith('.gz'):
openFile = gzip.open
else:
openFile = open

seqIds = set()
for line in openFile(f):
if line[0] == '>':
seqId = line[1:].split(None, 1)[0]

if seqId in seqIds:
print ' [Warning] Sequence %s found multiple times in bin %s.' % (seqId, binId)
seqIds.add(seqId)

binSeqs[binId] = seqIds

# check for sequences assigned to multiple bins
bDuplicates = False
binIds = binSeqs.keys()
for i in xrange(0, len(binIds)):
for j in xrange(i+1, len(binIds)):
for j in xrange(i + 1, len(binIds)):
seqInter = set(binSeqs[binIds[i]]).intersection(set(binSeqs[binIds[j]]))

if len(seqInter) > 0:
Expand Down Expand Up @@ -171,11 +189,12 @@ def binTetraSig(self, seqs, tetraSigs):
for _, seq in seqs.iteritems():
binSize += len(seq)

binSig = None
bInit = True
for seqId, seq in seqs.iteritems():
weightedTetraSig = tetraSigs[seqId] * (float(len(seq)) / binSize)
if binSig == None:
if bInit:
binSig = weightedTetraSig
bInit = False
else:
binSig += weightedTetraSig

Expand Down Expand Up @@ -224,21 +243,21 @@ def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, rep
if not os.path.exists(gffFile):
self.logger.error(' [Error] Missing gene feature file (%s). This plot if not compatible with the --genes option.\n' % DefaultValues.PRODIGAL_GFF)
sys.exit()

prodigalParser = ProdigalGeneFeatureParser(gffFile)
meanCD, deltaCDs, CDs = self.codingDensityDist(seqs, prodigalParser)

# find keys into GC and CD distributions
closestGC = findNearest(np.array(gcBounds.keys()), meanGC)
sampleSeqLen = gcBounds[closestGC].keys()[0]
d = gcBounds[closestGC][sampleSeqLen]
gcLowerBoundKey = findNearest(d.keys(), (100 - distribution)/2.0)
gcUpperBoundKey = findNearest(d.keys(), (100 + distribution)/2.0)
gcLowerBoundKey = findNearest(d.keys(), (100 - distribution) / 2.0)
gcUpperBoundKey = findNearest(d.keys(), (100 + distribution) / 2.0)

closestCD = findNearest(np.array(cdBounds.keys()), meanCD)
sampleSeqLen = cdBounds[closestCD].keys()[0]
d = cdBounds[closestCD][sampleSeqLen]
cdLowerBoundKey = findNearest(d.keys(), (100 - distribution)/2.0)
cdLowerBoundKey = findNearest(d.keys(), (100 - distribution) / 2.0)

tdBoundKey = findNearest(tdBounds[tdBounds.keys()[0]].keys(), distribution)

Expand Down Expand Up @@ -269,8 +288,8 @@ def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, rep

if (reportType == 'any' and len(outlyingDists) >= 1) or (reportType == 'all' and len(outlyingDists) == 3):
fout.write(binId + '\t' + seqId + '\t%d' % len(seq) + '\t' + ','.join(outlyingDists))
fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index]*100, meanGC*100, (meanGC+gcLowerBound)*100, (meanGC+gcUpperBound)*100))
fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index]*100, meanCD*100, (meanCD+cdLowerBound)*100))
fout.write('\t%.1f\t%.1f\t%.1f\t%.1f' % (seqGC[index] * 100, meanGC * 100, (meanGC + gcLowerBound) * 100, (meanGC + gcUpperBound) * 100))
fout.write('\t%.1f\t%.1f\t%.1f' % (CDs[index] * 100, meanCD * 100, (meanCD + cdLowerBound) * 100))
fout.write('\t%.3f\t%.3f\t%.3f' % (deltaTDs[index], meanTD, tdBound) + '\n')

index += 1
Expand Down
1 change: 1 addition & 0 deletions checkm/main.py
Expand Up @@ -1263,6 +1263,7 @@ def parseOptions(self, options):
elif(options.subparser_name == 'bin_union'):
self.binUnion(options)
elif(options.subparser_name == 'test'):
options.bCalledGenes = False
self.test(options)
else:
self.logger.error(' [Error] Unknown CheckM command: ' + options.subparser_name + '\n')
Expand Down
94 changes: 48 additions & 46 deletions checkm/resultsParser.py
Expand Up @@ -36,6 +36,7 @@

from checkm.util.pfam import PFAM


class ResultsParser():
"""Parse output of Prodigal+HMMER run and derived statistics."""
def __init__(self, binIdToModels):
Expand All @@ -49,10 +50,10 @@ def analyseResults(self,
binStatsFile,
seqStatsFile,
hmmTableFile,
bIgnoreThresholds = False,
evalueThreshold = DefaultValues.E_VAL,
lengthThreshold = DefaultValues.LENGTH,
bSkipOrfCorrection = False,
bIgnoreThresholds=False,
evalueThreshold=DefaultValues.E_VAL,
lengthThreshold=DefaultValues.LENGTH,
bSkipOrfCorrection=False,
):
"""Parse results in the output directory."""

Expand All @@ -72,12 +73,12 @@ def cacheResults(self, outDir, binIdToBinMarkerSets, bIndividualMarkers):

def parseBinHits(self, outDir,
hmmTableFile,
bSkipOrfCorrection = False,
bIgnoreThresholds = False,
evalueThreshold = DefaultValues.E_VAL,
lengthThreshold = DefaultValues.LENGTH,
binStats = None,
seqStats = None):
bSkipOrfCorrection=False,
bIgnoreThresholds=False,
evalueThreshold=DefaultValues.E_VAL,
lengthThreshold=DefaultValues.LENGTH,
binStats=None,
seqStats=None):
""" Parse HMM hits for each bin."""
if not self.models:
self.logger.error(' [Error] Models must be parsed before identifying HMM hits.')
Expand All @@ -89,7 +90,7 @@ def parseBinHits(self, outDir,
for binId in self.models:
if self.logger.getEffectiveLevel() <= logging.INFO:
numBinsProcessed += 1
statusStr = ' Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed)*100/len(self.models))
statusStr = ' Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed) * 100 / len(self.models))
sys.stderr.write('%s\r' % statusStr)
sys.stderr.flush()

Expand Down Expand Up @@ -202,22 +203,22 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipOrfCorrection):
resultsManager.identifyOrfErrors()

except IOError as detail:
sys.stderr.write(str(detail)+"\n")
sys.stderr.write(str(detail) + "\n")

def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles = None):
def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
"""Get header for requested output table."""

# keep count of single, double, triple genes etc...
if outputFormat == 1:
header = ['Bin Id','Marker lineage', '# genomes', '# markers','# marker sets','0','1','2','3','4','5+','Completeness','Contamination','Strain heterogeneity']
header = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
elif outputFormat == 2:
header = ['Bin Id']
header += ['Marker lineage', '# genomes', '# markers', '# marker sets']
header += ['Completeness','Contamination', 'Strain heterogeneity']
header += ['Completeness', 'Contamination', 'Strain heterogeneity']
header += ['Genome size (bp)', '# ambiguous bases', '# scaffolds', '# contigs', 'N50 (scaffolds)', 'N50 (contigs)', 'Longest scaffold (bp)', 'Longest contig (bp)']
header += ['GC', 'GC std (scaffolds > 1kbp)']
header += ['Coding density', 'Translation table', '# predicted genes']
header += ['0','1','2','3','4','5+']
header += ['0', '1', '2', '3', '4', '5+']

if coverageBinProfiles != None:
for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]:
Expand All @@ -227,19 +228,19 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles = None):
self.logger.error(' [Error] Labeling error: GC std (scaffolds > 1kbp)')
sys.exit()
elif outputFormat == 3:
header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers','# marker sets','0','1','2','3','4','5+','Completeness','Contamination','Strain heterogeneity']
header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
elif outputFormat == 4:
header = None
elif outputFormat == 5:
header = ['Bin Id','Marker Id','Gene Id']
header = ['Bin Id', 'Marker Id', 'Gene Id']
elif outputFormat == 6:
header = ['Bin Id','Marker Id','Gene Ids']
header = ['Bin Id', 'Marker Id', 'Gene Ids']
elif outputFormat == 7:
header = ['Bin Id','Marker Id','Gene Ids']
header = ['Bin Id', 'Marker Id', 'Gene Ids']
elif outputFormat == 8:
header = ['Bin Id','Gene Id','{Marker Id, Start position, End position}']
header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
elif outputFormat == 9:
header = ['Scaffold Id','Bin Id','Length','# contigs','GC','# ORFs','Coding density','Marker Ids']
header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']

return header

Expand All @@ -258,7 +259,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
if bTabTable or outputFormat not in prettyTableFormats:
bTabTable = True
pTable = None

if header != None:
print('\t'.join(header))
else:
Expand All @@ -272,21 +273,22 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
for binId in sorted(self.results.keys()):
self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)

if not bTabTable :
if outputFormat in [1,2]:
if not bTabTable:
if outputFormat in [1, 2]:
print(pTable.get_string(sortby='Completeness', reversesort=True))
else:
print(pTable.get_string())

# restore stdout
restoreStdOut(outFile, oldStdOut)


class ResultsManager():
"""Store all the results for a single bin"""
def __init__(self, binId, models,
bIgnoreThresholds = False,
evalueThreshold = DefaultValues.E_VAL,
lengthThreshold = DefaultValues.LENGTH,
bIgnoreThresholds=False,
evalueThreshold=DefaultValues.E_VAL,
lengthThreshold=DefaultValues.LENGTH,
binStats=None,
scaffoldStats=None):
self.binId = binId
Expand Down Expand Up @@ -325,7 +327,7 @@ def vetHit(self, hit):
return False

alignment_length = float(hit.ali_to - hit.ali_from)
length_perc = alignment_length/float(hit.query_length)
length_perc = alignment_length / float(hit.query_length)
if length_perc >= self.lengthThreshold:
return True

Expand Down Expand Up @@ -365,15 +367,15 @@ def identifyOrfErrors(self):
scaffoldIdI = orfI[0:orfI.rfind('_')]

bCombined = False
for j in xrange(i+1, len(hits)):
for j in xrange(i + 1, len(hits)):
orfJ = hits[j].target_name
scaffoldIdJ = orfJ[0:orfJ.rfind('_')]

# check if hits are on adjacent ORFs
if scaffoldIdI == scaffoldIdJ:
try:
orfNumI = int(orfI[orfI.rfind('_')+1:])
orfNumJ = int(orfJ[orfJ.rfind('_')+1:])
orfNumI = int(orfI[orfI.rfind('_') + 1:])
orfNumJ = int(orfJ[orfJ.rfind('_') + 1:])
except:
# it appears called genes are not labeled
# according to the prodigal format, so
Expand All @@ -387,13 +389,13 @@ def identifyOrfErrors(self):

sJ = hits[j].hmm_from
eJ = hits[j].hmm_to
if (sI <= sJ and eI > sJ) or (sJ <= sI and eJ > sI):

if (sI <= sJ and eI > sJ) or (sJ <= sI and eJ > sI):
# models overlap so treat these as unique hits
# which may represent an assembly error or a true
# gene duplication
pass
else:
else:
# combine the two hits
bCombined = True
break
Expand All @@ -420,7 +422,7 @@ def identifyOrfErrors(self):
break

self.markerHits[markerId] = hits

def countUniqueHits(self):
"""Determine number of unique and multiple hits."""
uniqueHits = 0
Expand All @@ -430,7 +432,7 @@ def countUniqueHits(self):
uniqueHits += 1
elif len(hits) > 1:
multiCopyHits += 1

return uniqueHits, multiCopyHits

def hitsToMarkerGene(self, markerSet):
Expand All @@ -455,7 +457,7 @@ def geneCountsForSelectedMarkerSet(self, binMarkerSets, bIndividualMarkers):
def geneCounts(self, markerSet, markerHits, bIndividualMarkers):
""" Determine number of marker genes with 0-5 hits
as well as the total completeness and contamination."""
geneCounts = [0]*6
geneCounts = [0] * 6
multiCopyCount = 0
for marker in markerSet.getMarkerGenes():
if marker in markerHits:
Expand Down Expand Up @@ -614,7 +616,7 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):

return summary

def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles = None, table = None):
def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
"""Print out information about bin."""
if outputFormat == 1:
selectedMarkerSet = binMarkerSets.selectedMarkerSet()
Expand Down Expand Up @@ -642,8 +644,8 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'],
self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
self.binStats['Longest scaffold'], self.binStats['Longest contig'])
row += '\t%.1f\t%.2f' % (self.binStats['GC']*100, self.binStats['GC std']*100)
row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density']*100, self.binStats['Translation table'], self.binStats['# predicted genes'])
row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100)
row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes'])
row += '\t' + '\t'.join([str(data[i]) for i in xrange(6)])

if coverageBinProfiles:
Expand All @@ -657,8 +659,8 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'],
self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
self.binStats['Longest scaffold'], self.binStats['Longest contig']])
row.extend([self.binStats['GC']*100, self.binStats['GC std']*100])
row.extend([self.binStats['Coding density']*100, self.binStats['Translation table'], self.binStats['# predicted genes']])
row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']])
row.extend(data[0:6])

if coverageBinProfiles:
Expand Down Expand Up @@ -688,12 +690,12 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
for marker in data:
row += '\t' + marker
print(row)

row = self.binId
for count in data.values():
row += '\t' + str(count)
print(row)

print()
elif outputFormat == 5:
# tabular of bin_id, marker, contig_id
Expand Down Expand Up @@ -733,7 +735,7 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
scaffoldsWithMultipleHits = set()
for i in xrange(0, len(hitList)):
scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')]
for j in xrange(i+1, len(hitList)):
for j in xrange(i + 1, len(hitList)):
if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]:
scaffoldsWithMultipleHits.add(hitList[i].target_name)
scaffoldsWithMultipleHits.add(hitList[j].target_name)
Expand Down

0 comments on commit 21c4b69

Please sign in to comment.