diff --git a/checkm/VERSION b/checkm/VERSION index 4b6e586..ed4962f 100755 --- a/checkm/VERSION +++ b/checkm/VERSION @@ -1,3 +1,9 @@ +1.2.0 +- modified how bin IDs are identified to improve output tables + +1.1.11 +- fixed error with inverse_transformed being deprecated in newer versions of Matplotlib + 1.1.10 - fixed bug with missing bCalledGenes flag that was impacting a number of commands diff --git a/checkm/common.py b/checkm/common.py index 881dd6d..b5ea808 100755 --- a/checkm/common.py +++ b/checkm/common.py @@ -95,9 +95,12 @@ def checkBinInputExists(binInput, bCalledGenes): have_error = True if have_error: logger = logging.getLogger("timestamp") - logger.error('Input table format is not correct: ' + binInput + '\n') - logger.error('If input is nucleotide contigs, please supply 2 column at least: [genome_id, genome_contigs_file]') - logger.error('If input is gene, please supply 3 column at least: [genome_id, genome_contigs_file, genome_genes_file]') + logger.error('Input table format is not correct: ' + + binInput + '\n') + logger.error( + 'If input is nucleotide contigs, please supply 2 column at least: [genome_id, genome_contigs_file]') + logger.error( + 'If input is gene, please supply 3 column at least: [genome_id, genome_contigs_file, genome_genes_file]') def checkFileExists(inputFile): @@ -132,7 +135,16 @@ def makeSurePathExists(path): def binIdFromFilename(filename): """Extract bin id from bin filename.""" + binId = os.path.basename(filename) + + # remove common compression extensions + compress_ext = ['.gz', '.bz2', '.rar', '.tgz', '.zip', '.7z'] + for ext in compress_ext: + if binId.endswith(ext): + binId = binId[0:len(binId) - len(ext)] + break + binId = os.path.splitext(binId)[0] return binId diff --git a/checkm/main.py b/checkm/main.py index 47be221..ab00eb2 100755 --- a/checkm/main.py +++ b/checkm/main.py @@ -294,7 +294,6 @@ def taxonSet(self, options, db=None): options.rank, options.taxon, options.marker_file) if bValidSet: - self.logger.info('Marker set written to: ' + options.marker_file) self.timeKeeper.printTimeStamp() @@ -422,7 +421,6 @@ def qa(self, options): aai.run(options.aai_strain, options.analyze_dir, options.alignment_file) # get HMM file for each bin - markerSetParser = MarkerSetParser(options.threads) hmmModelInfoFile = os.path.join( diff --git a/checkm/plot/AbstractPlot.py b/checkm/plot/AbstractPlot.py index dabc347..00d2d21 100755 --- a/checkm/plot/AbstractPlot.py +++ b/checkm/plot/AbstractPlot.py @@ -34,6 +34,7 @@ class AbstractPlot(FigureCanvas): ''' Abstract base class for plotting. ''' + def __init__(self, options): self.options = options @@ -60,7 +61,8 @@ def __init__(self, options): def savePlot(self, filename, dpi=300): imgFormat = filename[filename.rfind('.') + 1:len(filename)] if imgFormat in ['png', 'pdf', 'ps', 'eps', 'svg']: - self.fig.savefig(filename, format=imgFormat, dpi=dpi, facecolor='white', edgecolor='white', bbox_inches='tight') + self.fig.savefig(filename, format=imgFormat, dpi=dpi, + facecolor='white', edgecolor='white', bbox_inches='tight') else: pass @@ -72,20 +74,22 @@ def labelExtents(self, xLabels, xFontSize, xRotation, yLabels, yFontSize, yRotat tempAxes.set_xticks(np.arange(len(xLabels))) tempAxes.set_yticks(np.arange(len(yLabels))) - xText = tempAxes.set_xticklabels(xLabels, size=xFontSize, rotation=xRotation) - yText = tempAxes.set_yticklabels(yLabels, size=yFontSize, rotation=yRotation) + xText = tempAxes.set_xticklabels( + xLabels, size=xFontSize, rotation=xRotation) + yText = tempAxes.set_yticklabels( + yLabels, size=yFontSize, rotation=yRotation) bboxes = [] for label in xText: bbox = label.get_window_extent(self.get_renderer()) - bboxi = bbox.inverse_transformed(self.fig.transFigure) + bboxi = bbox.transformed(self.fig.transFigure.inverted()) bboxes.append(bboxi) xLabelBounds = mtransforms.Bbox.union(bboxes) bboxes = [] for label in yText: bbox = label.get_window_extent(self.get_renderer()) - bboxi = bbox.inverse_transformed(self.fig.transFigure) + bboxi = bbox.transformed(self.fig.transFigure.inverted()) bboxes.append(bboxi) yLabelBounds = mtransforms.Bbox.union(bboxes) @@ -98,12 +102,14 @@ def xLabelExtents(self, labels, fontSize, rotation=0): tempAxes = self.fig.add_axes([0, 0, 1.0, 1.0]) tempAxes.set_xticks(np.arange(len(labels))) - xLabels = tempAxes.set_xticklabels(labels, size=fontSize, rotation=rotation) + xLabels = tempAxes.set_xticklabels( + labels, size=fontSize, rotation=rotation) bboxes = [] for label in xLabels: bbox = label.get_window_extent(self.get_renderer()) - bboxi = bbox.inverse_transformed(self.fig.transFigure) + + bboxi = bbox.transformed(self.fig.transFigure.inverted()) bboxes.append(bboxi) xLabelBounds = mtransforms.Bbox.union(bboxes) @@ -116,12 +122,13 @@ def yLabelExtents(self, labels, fontSize, rotation=0): tempAxes = self.fig.add_axes([0, 0, 1.0, 1.0]) tempAxes.set_yticks(np.arange(len(labels))) - yLabels = tempAxes.set_yticklabels(labels, size=fontSize, rotation=rotation) + yLabels = tempAxes.set_yticklabels( + labels, size=fontSize, rotation=rotation) bboxes = [] for label in yLabels: bbox = label.get_window_extent(self.get_renderer()) - bboxi = bbox.inverse_transformed(self.fig.transFigure) + bboxi = bbox.transformed(self.fig.transFigure.inverted()) bboxes.append(bboxi) yLabelBounds = mtransforms.Bbox.union(bboxes) @@ -172,5 +179,5 @@ def boundingBox(self, data, ax, label, bBoundingBoxes, bLabels): if bLabels: ax.annotate(label, xy=(min(data[:, 0]), max(data[:, 1])), xytext=(0, 0), - textcoords='offset points', ha='right', va='bottom', - bbox=dict(boxstyle='round,pad=0.5', fc=(0.5, 0.5, 0.5), alpha=0.1)) + textcoords='offset points', ha='right', va='bottom', + bbox=dict(boxstyle='round,pad=0.5', fc=(0.5, 0.5, 0.5), alpha=0.1)) diff --git a/checkm/plot/codingDensityPlots.py b/checkm/plot/codingDensityPlots.py index 8759451..a760960 100755 --- a/checkm/plot/codingDensityPlots.py +++ b/checkm/plot/codingDensityPlots.py @@ -37,7 +37,7 @@ class CodingDensityPlots(AbstractPlot): def __init__(self, options): AbstractPlot.__init__(self, options) - + self.logger = logging.getLogger('timestamp') def plot(self, fastaFile, distributionsToPlot): @@ -55,9 +55,11 @@ def plot(self, fastaFile, distributionsToPlot): def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD): # parse Prodigal output - gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF) + gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename( + fastaFile), DefaultValues.PRODIGAL_GFF) if not os.path.exists(gffFile): - self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF) + self.logger.error( + 'Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF) sys.exit(1) prodigalParser = ProdigalGeneFeatureParser(gffFile) @@ -87,7 +89,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD): end += self.options.cd_window_size if len(data) == 0: - axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size) + axesHist.set_xlabel( + '[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size) return # Histogram plot @@ -100,7 +103,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD): axesHist.hist(data, bins=bins, density=True, color=(0.5, 0.5, 0.5)) axesHist.set_xlabel('% coding density') - axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)') + axesHist.set_ylabel( + '% windows (' + str(self.options.cd_window_size) + ' bp)') # Prettify plot for a in axesHist.yaxis.majorTicks: @@ -128,8 +132,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD): meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser) # Delta-CD vs sequence length plot - axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r') - axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100)) + axesDeltaCD.scatter(deltaCDs, seqLens, c=abs( + deltaCDs), s=10, lw=0.5, ec='black', cmap='gray_r') + axesDeltaCD.set_xlabel( + r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100)) axesDeltaCD.set_ylabel('Sequence length (kbp)') _, yMaxSeqs = axesDeltaCD.get_ylim() @@ -142,8 +148,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD): # find closest distribution values sampleSeqLen = list(dist[closestCD].keys())[0] d = dist[closestCD][sampleSeqLen] - cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0) - cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0) + cdLowerBoundKey = findNearest( + list(d.keys()), (100 - distToPlot) / 2.0) + cdUpperBoundKey = findNearest( + list(d.keys()), (100 + distToPlot) / 2.0) xL = [] xU = [] @@ -168,15 +176,17 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD): axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs]) # draw vertical line at x=0 - axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0) + yticks = axesDeltaCD.get_yticks() + axesDeltaCD.vlines(0, 0, yticks[-1], linestyle='dashed', + color=self.axesColour, zorder=0) # Change sequence lengths from bp to kbp - yticks = axesDeltaCD.get_yticks() kbpLabels = [] for seqLen in yticks: label = '%.1f' % (float(seqLen) / 1000) label = label.replace('.0', '') # remove trailing zero kbpLabels.append(label) + axesDeltaCD.set_yticks(yticks) axesDeltaCD.set_yticklabels(kbpLabels) # Prettify plot diff --git a/checkm/plot/gcPlots.py b/checkm/plot/gcPlots.py index 4b5b2cc..f3ce8c3 100755 --- a/checkm/plot/gcPlots.py +++ b/checkm/plot/gcPlots.py @@ -75,7 +75,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC): end += self.options.gc_window_size if len(data) == 0: - axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size) + axesHist.set_xlabel( + '[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size) return # Histogram plot @@ -88,7 +89,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC): axesHist.hist(data, bins=bins, density=True, color=(0.5, 0.5, 0.5)) axesHist.set_xlabel('% GC') - axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)') + axesHist.set_ylabel( + '% windows (' + str(self.options.gc_window_size) + ' bp)') # Prettify plot for a in axesHist.yaxis.majorTicks: @@ -116,8 +118,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC): meanGC, deltaGCs, _ = binTools.gcDist(seqs) # Delta-GC vs Sequence length plot - axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap='gray_r') - axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100)) + axesDeltaGC.scatter(deltaGCs, seqLens, c=abs( + deltaGCs), s=10, lw=0.5, ec='black', cmap='gray_r') + axesDeltaGC.set_xlabel( + r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100)) axesDeltaGC.set_ylabel('Sequence length (kbp)') _, yMaxSeqs = axesDeltaGC.get_ylim() @@ -130,8 +134,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC): # find closest distribution values sampleSeqLen = list(dist[closestGC].keys())[0] d = dist[closestGC][sampleSeqLen] - gcLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0) - gcUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0) + gcLowerBoundKey = findNearest( + list(d.keys()), (100 - distToPlot) / 2.0) + gcUpperBoundKey = findNearest( + list(d.keys()), (100 + distToPlot) / 2.0) xL = [] xU = [] @@ -156,15 +162,17 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC): axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs]) # draw vertical line at x=0 - axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0) + yticks = axesDeltaGC.get_yticks() + axesDeltaGC.vlines(0, 0, yticks[-1], linestyle='dashed', + color=self.axesColour, zorder=0) # Change sequence lengths from bp to kbp - yticks = axesDeltaGC.get_yticks() kbpLabels = [] for seqLen in yticks: label = '%.1f' % (float(seqLen) / 1000) label = label.replace('.0', '') # remove trailing zero kbpLabels.append(label) + axesDeltaGC.set_yticks(yticks) axesDeltaGC.set_yticklabels(kbpLabels) # Prettify plot diff --git a/checkm/plot/tetraDistPlots.py b/checkm/plot/tetraDistPlots.py index 0deb190..53299f1 100755 --- a/checkm/plot/tetraDistPlots.py +++ b/checkm/plot/tetraDistPlots.py @@ -41,7 +41,8 @@ def plot(self, fastaFile, tetraSigs, distributionsToPlot): axesHist = self.fig.add_subplot(121) axesDeltaTD = self.fig.add_subplot(122) - self.plotOnAxes(fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD) + self.plotOnAxes(fastaFile, tetraSigs, + distributionsToPlot, axesHist, axesDeltaTD) self.fig.tight_layout(pad=1, w_pad=1) self.draw() @@ -78,7 +79,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe end += self.options.td_window_size if len(data) == 0: - axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.td_window_size) + axesHist.set_xlabel( + '[Error] No seqs >= %d, the specified window size' % self.options.td_window_size) return deltaTDs = np.array(deltaTDs) @@ -93,7 +95,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe axesHist.hist(data, bins=bins, density=True, color=(0.5, 0.5, 0.5)) axesHist.set_xlabel(r'$\Delta$ TD') - axesHist.set_ylabel('% windows (' + str(self.options.td_window_size) + ' bp)') + axesHist.set_ylabel( + '% windows (' + str(self.options.td_window_size) + ' bp)') # Prettify plot for a in axesHist.yaxis.majorTicks: @@ -117,10 +120,12 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe spine.set_color(self.axesColour) # get CD bin statistics - meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig) + meanTD, deltaTDs = binTools.tetraDiffDist( + seqs, genomicSig, tetraSigs, binSig) # Delta-TD vs Sequence length plot - axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap='gray_r') + axesDeltaTD.scatter(deltaTDs, seqLens, c=abs( + deltaTDs), s=10, lw=0.5, ec='black', cmap='gray_r') axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD) axesDeltaTD.set_ylabel('Sequence length (kbp)') @@ -129,7 +134,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe # plot reference distributions for distToPlot in distributionsToPlot: - boundKey = findNearest(list(dist[list(dist.keys())[0]].keys()), distToPlot) + boundKey = findNearest( + list(dist[list(dist.keys())[0]].keys()), distToPlot) x = [] y = [] @@ -150,7 +156,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe if j == len(x) - 1: x[j] = x[i] else: - x[j] = (x[j - 1] + x[j + 1]) / 2 # interpolate values from neighbours + # interpolate values from neighbours + x[j] = (x[j - 1] + x[j + 1]) / 2 if x[j] > x[i]: x[j] = x[i] @@ -164,15 +171,17 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs]) # draw vertical line at x=0 - axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0) + yticks = axesDeltaTD.get_yticks() + axesDeltaTD.vlines(0, 0, yticks[-1], linestyle='dashed', + color=self.axesColour, zorder=0) # Change sequence lengths from bp to kbp - yticks = axesDeltaTD.get_yticks() kbpLabels = [] for seqLen in yticks: label = '%.1f' % (float(seqLen) / 1000) label = label.replace('.0', '') # remove trailing zero kbpLabels.append(label) + axesDeltaTD.set_yticks(yticks) axesDeltaTD.set_yticklabels(kbpLabels) # Prettify plot diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py index ba32dbc..d51ac87 100755 --- a/checkm/resultsParser.py +++ b/checkm/resultsParser.py @@ -40,6 +40,7 @@ class ResultsParser(): """Parse output of Prodigal+HMMER run and derived statistics.""" + def __init__(self, binIdToModels): self.logger = logging.getLogger('timestamp') @@ -62,14 +63,17 @@ def analyseResults(self, binStats = self.parseBinStats(outDir, binStatsFile) # get hits to each bin - self.parseBinHits(outDir, hmmTableFile, bSkipAdjCorrection, bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats) + self.parseBinHits(outDir, hmmTableFile, bSkipAdjCorrection, bIgnoreThresholds, + evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats) return binStats def cacheResults(self, outDir, binIdToBinMarkerSets, bIndividualMarkers): # cache critical results to file - self.__writeBinStatsExt(outDir, binIdToBinMarkerSets, bIndividualMarkers) - self.__writeMarkerGeneStats(outDir, binIdToBinMarkerSets, bIndividualMarkers) + self.__writeBinStatsExt( + outDir, binIdToBinMarkerSets, bIndividualMarkers) + self.__writeMarkerGeneStats( + outDir, binIdToBinMarkerSets, bIndividualMarkers) def parseBinHits(self, outDir, hmmTableFile, @@ -80,9 +84,10 @@ def parseBinHits(self, outDir, bSkipPseudoGeneCorrection=False, binStats=None): """ Parse HMM hits for each bin.""" - + if not self.models: - self.logger.error('Models must be parsed before identifying HMM hits.') + self.logger.error( + 'Models must be parsed before identifying HMM hits.') sys.exit(1) self.logger.info('Parsing HMM hits to marker genes:') @@ -91,41 +96,51 @@ def parseBinHits(self, outDir, for binId in self.models: if self.logger.getEffectiveLevel() <= logging.INFO: numBinsProcessed += 1 - statusStr = ' Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed) * 100 / len(self.models)) + statusStr = ' Finished parsing hits for %d of %d (%.2f%%) bins.' % ( + numBinsProcessed, len(self.models), float(numBinsProcessed) * 100 / len(self.models)) sys.stderr.write('%s\r' % statusStr) sys.stderr.flush() if binStats != None: - resultsManager = ResultsManager(binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats[binId]) + resultsManager = ResultsManager( + binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats[binId]) elif binStats == None: - resultsManager = ResultsManager(binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection) + resultsManager = ResultsManager( + binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection) else: - self.logger.error('Invalid parameter settings for binStats and seqStats.') + self.logger.error( + 'Invalid parameter settings for binStats and seqStats.') sys.exit(1) hmmerTableFile = os.path.join(outDir, 'bins', binId, hmmTableFile) - self.parseHmmerResults(hmmerTableFile, resultsManager, bSkipAdjCorrection) + self.parseHmmerResults( + hmmerTableFile, resultsManager, bSkipAdjCorrection) self.results[binId] = resultsManager if self.logger.getEffectiveLevel() <= logging.INFO: sys.stderr.write('\n') def __writeBinStatsExt(self, directory, binIdToBinMarkerSets, bIndividualMarkers): - binStatsExtFile = os.path.join(directory, 'storage', DefaultValues.BIN_STATS_EXT_OUT) + binStatsExtFile = os.path.join( + directory, 'storage', DefaultValues.BIN_STATS_EXT_OUT) fout = open(binStatsExtFile, 'w') for binId in self.results: - binStatsExt = self.results[binId].getSummary(binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=2) - binStatsExt.update(self.results[binId].geneCopyNumber(binIdToBinMarkerSets[binId])) + binStatsExt = self.results[binId].getSummary( + binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=2) + binStatsExt.update(self.results[binId].geneCopyNumber( + binIdToBinMarkerSets[binId])) fout.write(binId + '\t' + str(binStatsExt) + '\n') fout.close() def __writeMarkerGeneStats(self, directory, binIdToBinMarkerSets, bIndividualMarkers): - markerGenesFile = os.path.join(directory, 'storage', DefaultValues.MARKER_GENE_STATS) + markerGenesFile = os.path.join( + directory, 'storage', DefaultValues.MARKER_GENE_STATS) fout = open(markerGenesFile, 'w') for binId in self.results: - markerGenesHits = self.results[binId].getSummary(binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=8) + markerGenesHits = self.results[binId].getSummary( + binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=8) fout.write(binId + '\t' + str(markerGenesHits) + '\n') fout.close() @@ -145,7 +160,8 @@ def parseBinStats(self, resultsFolder, binStatsFile): def parseBinStatsExt(self, resultsFolder): """Read bin statistics from file.""" - binStatsExtFile = os.path.join(resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT) + binStatsExtFile = os.path.join( + resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT) checkFileExists(binStatsExtFile) @@ -159,7 +175,8 @@ def parseBinStatsExt(self, resultsFolder): def parseMarkerGeneStats(self, resultsFolder): """Read bin statistics from file.""" - markerGeneStatsFile = os.path.join(resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS) + markerGeneStatsFile = os.path.join( + resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS) checkFileExists(markerGeneStatsFile) @@ -189,7 +206,8 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection): # retain only best hit to PFAM clan pfam = PFAM(DefaultValues.PFAM_CLAN_FILE) - resultsManager.markerHits = pfam.filterHitsFromSameClan(resultsManager.markerHits) + resultsManager.markerHits = pfam.filterHitsFromSameClan( + resultsManager.markerHits) # correct for errors in ORF calling if not bSkipAdjCorrection: @@ -203,27 +221,34 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, tab # keep count of single, double, triple genes etc... if outputFormat == 1: - header = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity'] + header = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', + '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity'] elif outputFormat == 2: header = ['Bin Id'] - header += ['Marker lineage', '# genomes', '# markers', '# marker sets'] + header += ['Marker lineage', '# genomes', + '# markers', '# marker sets'] header += ['Completeness', 'Contamination', 'Strain heterogeneity'] - header += ['Genome size (bp)', '# ambiguous bases', '# scaffolds', '# contigs'] - header += ['N50 (scaffolds)', 'N50 (contigs)', 'Mean scaffold length (bp)', 'Mean contig length (bp)'] + header += ['Genome size (bp)', '# ambiguous bases', + '# scaffolds', '# contigs'] + header += ['N50 (scaffolds)', 'N50 (contigs)', + 'Mean scaffold length (bp)', 'Mean contig length (bp)'] header += ['Longest scaffold (bp)', 'Longest contig (bp)'] header += ['GC', 'GC std (scaffolds > 1kbp)'] - header += ['Coding density', 'Translation table', '# predicted genes'] + header += ['Coding density', + 'Translation table', '# predicted genes'] header += ['0', '1', '2', '3', '4', '5+'] if coverageBinProfiles != None: for bamId in coverageBinProfiles[list(coverageBinProfiles.keys())[0]]: - header += ['Coverage (' + bamId + ')', 'Coverage std (' + bamId + ')'] + header += ['Coverage (' + bamId + ')', + 'Coverage std (' + bamId + ')'] if DefaultValues.MIN_SEQ_LEN_GC_STD != 1000: self.logger.error('Labeling error: GC std (scaffolds > 1kbp)') sys.exit(1) elif outputFormat == 3: - header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity'] + header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', + '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity'] elif outputFormat == 4: header = None elif outputFormat == 5: @@ -233,14 +258,17 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, tab elif outputFormat == 7: header = ['Bin Id', 'Marker Id', 'Gene Ids'] elif outputFormat == 8: - header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}'] + header = ['Bin Id', 'Gene Id', + '{Marker Id, Start position, End position}'] elif outputFormat == 9: if table is not None: - header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence'] + header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', + 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence'] else: header = " " elif outputFormat == 10: - header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids'] + header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', + 'GC', '# ORFs', 'Coding density', 'Marker Ids'] return header @@ -255,7 +283,8 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke prettyTableFormats = [1, 2, 3, 9] - header = self.__getHeader(outputFormat, binIdToBinMarkerSets[list(binIdToBinMarkerSets.keys())[0]], coverageBinProfiles, bTabTable) + header = self.__getHeader(outputFormat, binIdToBinMarkerSets[list( + binIdToBinMarkerSets.keys())[0]], coverageBinProfiles, bTabTable) if bTabTable or outputFormat not in prettyTableFormats: bTabTable = True pTable = None @@ -272,7 +301,8 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke seqsReported = 0 for binId in sorted(self.results.keys()): - seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder) + seqsReported += self.results[binId].printSummary( + outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder) if outputFormat in [6, 7] and seqsReported == 0: print('[No marker genes satisfied the reporting criteria.]') @@ -291,6 +321,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke class ResultsManager(): """Store all the results for a single bin""" + def __init__(self, binId, models, bIgnoreThresholds=False, evalueThreshold=DefaultValues.E_VAL, @@ -361,7 +392,8 @@ def addHit(self, hit): else: if previousHitToORF.dom_score < hit.dom_score: self.markerHits[hit.query_accession].append(hit) - self.markerHits[hit.query_accession].remove(previousHitToORF) + self.markerHits[hit.query_accession].remove( + previousHitToORF) else: self.markerHits[hit.query_accession] = [hit] @@ -420,16 +452,21 @@ def identifyAdjacentMarkerGenes(self): # produce concatenated label indicating the two genes being combined orfA, orfB = sorted([orfI, orfJ]) - newHit.target_name = DefaultValues.SEQ_CONCAT_CHAR.join([orfA, orfB]) + newHit.target_name = DefaultValues.SEQ_CONCAT_CHAR.join([ + orfA, orfB]) - newHit.target_length = hits[i].target_length + hits[j].target_length - newHit.hmm_from = min(hits[i].hmm_from, hits[j].hmm_from) + newHit.target_length = hits[i].target_length + \ + hits[j].target_length + newHit.hmm_from = min( + hits[i].hmm_from, hits[j].hmm_from) newHit.hmm_to = min(hits[i].hmm_to, hits[j].hmm_to) - newHit.ali_from = min(hits[i].ali_from, hits[j].ali_from) + newHit.ali_from = min( + hits[i].ali_from, hits[j].ali_from) newHit.ali_to = min(hits[i].ali_to, hits[j].ali_to) - newHit.env_from = min(hits[i].env_from, hits[j].env_from) + newHit.env_from = min( + hits[i].env_from, hits[j].env_from) newHit.env_to = min(hits[i].env_to, hits[j].env_to) hits.remove(hits[j]) @@ -468,7 +505,8 @@ def geneCountsForSelectedMarkerSet(self, binMarkerSets, bIndividualMarkers): """Report number of times each marker gene was found along with a completeness and contamination estimation for the selected marker set associated with a bin.""" - geneCounts = self.geneCounts(binMarkerSets.selectedMarkerSet(), self.markerHits, bIndividualMarkers) + geneCounts = self.geneCounts( + binMarkerSets.selectedMarkerSet(), self.markerHits, bIndividualMarkers) return geneCounts @@ -490,7 +528,8 @@ def geneCounts(self, markerSet, markerHits, bIndividualMarkers): geneCounts[markerCount] += 1 - percComp, percCont = markerSet.genomeCheck(markerHits, bIndividualMarkers) + percComp, percCont = markerSet.genomeCheck( + markerHits, bIndividualMarkers) geneCounts.append(percComp) geneCounts.append(percCont) @@ -518,7 +557,8 @@ def geneCopyNumber(self, binMarkerSets): if len(self.markerHits[marker]) >= 5: geneCopyNumber['GCN5+'].append(markerId) else: - geneCopyNumber['GCN' + str(len(self.markerHits[marker]))].append(markerId) + geneCopyNumber['GCN' + + str(len(self.markerHits[marker]))].append(markerId) else: geneCopyNumber['GCN0'].append(markerId) @@ -530,7 +570,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1): if outputFormat == 1: selectedMarkerSet = binMarkerSets.selectedMarkerSet() - data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) + data = self.geneCountsForSelectedMarkerSet( + binMarkerSets, bIndividualMarkers) summary['marker lineage'] = selectedMarkerSet.lineageStr summary['# genomes'] = selectedMarkerSet.numGenomes @@ -546,7 +587,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1): summary['Contamination'] = data[7] elif outputFormat == 2: selectedMarkerSet = binMarkerSets.selectedMarkerSet() - data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) + data = self.geneCountsForSelectedMarkerSet( + binMarkerSets, bIndividualMarkers) summary['marker lineage'] = selectedMarkerSet.lineageStr summary['# genomes'] = selectedMarkerSet.numGenomes @@ -620,19 +662,20 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1): continue for hit in hit_list: - genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit] + genesWithMarkers[hit.target_name] = genesWithMarkers.get( + hit.target_name, []) + [hit] for geneId, hits in genesWithMarkers.items(): summary[geneId] = {} for hit in hits: - summary[geneId][hit.query_accession] = summary[geneId].get(hit.query_accession, []) + [[hit.ali_from, hit.ali_to]] + summary[geneId][hit.query_accession] = summary[geneId].get( + hit.query_accession, []) + [[hit.ali_from, hit.ali_to]] else: print("Unknown output format: ", outputFormat) return summary def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None): - """Print out information about bin.""" if outputFormat == 1: selectedMarkerSet = binMarkerSets.selectedMarkerSet() @@ -641,18 +684,22 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov if selectedMarkerSet.UID != '0': lineageStr += ' (' + str(selectedMarkerSet.UID) + ')' - data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) + data = self.geneCountsForSelectedMarkerSet( + binMarkerSets, bIndividualMarkers) row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, lineageStr, - selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(), - "\t".join([str(data[i]) for i in range(6)]), - data[6], - data[7], - aai.aaiMeanBinHetero.get(self.binId, 0.0) - ) + selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(), + "\t".join( + [str(data[i]) for i in range(6)]), + data[6], + data[7], + aai.aaiMeanBinHetero.get( + self.binId, 0.0) + ) if table == None: print(row) else: - table.add_row([self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)]) + table.add_row([self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers( + ), selectedMarkerSet.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)]) elif outputFormat == 2: selectedMarkerSet = binMarkerSets.selectedMarkerSet() @@ -660,38 +707,50 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov if selectedMarkerSet.UID != '0': lineageStr += ' (' + str(selectedMarkerSet.UID) + ')' - data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) + data = self.geneCountsForSelectedMarkerSet( + binMarkerSets, bIndividualMarkers) if table == None: row = self.binId - row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()) - row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)) + row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, + selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()) + row += '\t%0.2f\t%0.2f\t%0.2f' % ( + data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)) row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (self.binStats['Genome size'], self.binStats['# ambiguous bases'], - self.binStats['# scaffolds'], self.binStats['# contigs'], - self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'], - self.binStats['Mean scaffold length'], self.binStats['Mean contig length'], - self.binStats['Longest scaffold'], self.binStats['Longest contig']) - row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100) - row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']) + self.binStats['# scaffolds'], self.binStats['# contigs'], + self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'], + self.binStats['Mean scaffold length'], self.binStats['Mean contig length'], + self.binStats['Longest scaffold'], self.binStats['Longest contig']) + row += '\t%.1f\t%.2f' % (self.binStats['GC'] + * 100, self.binStats['GC std'] * 100) + row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, + self.binStats['Translation table'], self.binStats['# predicted genes']) row += '\t' + '\t'.join([str(data[i]) for i in range(6)]) if coverageBinProfiles: if self.binId in coverageBinProfiles: for _, coverageStats in coverageBinProfiles[self.binId].items(): - row += '\t%.2f\t%.2f' % (coverageStats[0], coverageStats[1]) + row += '\t%.2f\t%.2f' % ( + coverageStats[0], coverageStats[1]) else: for bamId in coverageBinProfiles[list(coverageBinProfiles.keys())[0]]: row += '\t%.2f\t%.2f' % (0, 0) print(row) else: - row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] - row.extend([data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)]) + row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, + selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] + row.extend( + [data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)]) row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'], - self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'], - int(self.binStats['Mean scaffold length']), int(self.binStats['Mean contig length']), - self.binStats['Longest scaffold'], self.binStats['Longest contig']]) - row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100]) - row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']]) + self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats[ + 'N50 (contigs)'], + int(self.binStats['Mean scaffold length']), int( + self.binStats['Mean contig length']), + self.binStats['Longest scaffold'], self.binStats['Longest contig']]) + row.extend([self.binStats['GC'] * 100, + self.binStats['GC std'] * 100]) + row.extend([self.binStats['Coding density'] * 100, + self.binStats['Translation table'], self.binStats['# predicted genes']]) row.extend(data[0:6]) if coverageBinProfiles: @@ -700,28 +759,32 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov row.extend(coverageStats) else: for bamId in coverageBinProfiles[list(coverageBinProfiles.keys())[0]]: - row.extend([0,0]) + row.extend([0, 0]) table.add_row(row) elif outputFormat == 3: for ms in binMarkerSets.markerSetIter(): data = self.geneCounts(ms, self.markerHits, bIndividualMarkers) row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, ms.UID, ms.lineageStr, ms.numGenomes, - ms.numMarkers(), ms.numSets(), - "\t".join([str(data[i]) for i in range(6)]), - data[6], - data[7], - aai.aaiMeanBinHetero.get(self.binId, 0.0) - ) + ms.numMarkers(), ms.numSets(), + "\t".join( + [str(data[i]) for i in range(6)]), + data[6], + data[7], + aai.aaiMeanBinHetero.get( + self.binId, 0.0) + ) if table == None: print(row) else: - table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers(), ms.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)]) + table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers( + ), ms.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)]) elif outputFormat == 4: selectedMarkerSet = binMarkerSets.selectedMarkerSet() data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet()) - row = "Node Id: %s; Marker lineage: %s" % (selectedMarkerSet.UID, selectedMarkerSet.lineageStr) + row = "Node Id: %s; Marker lineage: %s" % ( + selectedMarkerSet.UID, selectedMarkerSet.lineageStr) for marker in data: row += '\t' + marker print(row) @@ -775,15 +838,19 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov if len(hitList) >= 2: scaffoldsWithMultipleHits = set() for i in range(0, len(hitList)): - scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')] + scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind( + '_')] for j in range(i + 1, len(hitList)): if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]: - scaffoldsWithMultipleHits.add(hitList[i].target_name) - scaffoldsWithMultipleHits.add(hitList[j].target_name) + scaffoldsWithMultipleHits.add( + hitList[i].target_name) + scaffoldsWithMultipleHits.add( + hitList[j].target_name) if len(scaffoldsWithMultipleHits) >= 2: print(self.binId, marker, sep='\t', end='\t') - print(','.join(sorted(list(scaffoldsWithMultipleHits))), end='\n') + print( + ','.join(sorted(list(scaffoldsWithMultipleHits))), end='\n') seqsReported += 1 return seqsReported @@ -798,12 +865,14 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov continue for hit in hit_list: - genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit] + genesWithMarkers[hit.target_name] = genesWithMarkers.get( + hit.target_name, []) + [hit] for geneId, hits in genesWithMarkers.items(): rowStr = self.binId + '\t' + geneId for hit in hits: - rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to) + rowStr += '\t' + hit.query_accession + ',' + \ + str(hit.ali_from) + ',' + str(hit.ali_to) print(rowStr) # Hunter Cameron, May 29, 2015 - print a fasta of marker genes @@ -812,7 +881,8 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov # check for the analyze folder for later use if anaFolder is None: - raise ValueError("AnaFolder must not be None for outputFormat 9") + raise ValueError( + "AnaFolder must not be None for outputFormat 9") # ## build a dict to link target_names with marker gene alignment information markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() @@ -824,13 +894,14 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov for hit in hit_list: name = hit.target_name hitInfo[name] = { - "marker": marker, - "ali_from": str(hit.ali_from), - "ali_to": str(hit.ali_to) - } + "marker": marker, + "ali_from": str(hit.ali_from), + "ali_to": str(hit.ali_to) + } # ## Open genes.faa and print the ones that were found with some descriptive info in the header - path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"]) + path_to_genes = "/".join([anaFolder, "bins", + self.binId, "genes.faa"]) # get only the seqs we need and their information as a dict seqs = readFasta(path_to_genes, trimHeader=False) @@ -863,32 +934,34 @@ def sort_header(header): # if table output not specified, print FASTA if table != None: gene_info = "geneId={};start={};end={};strand={};protlen={}".format( - gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header]))) + gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header]))) marker_info = "marker={};mstart={};mend={}".format( - hitInfo[gene_name]["marker"], - hitInfo[gene_name]["ali_from"], - hitInfo[gene_name]["ali_to"]) + hitInfo[gene_name]["marker"], + hitInfo[gene_name]["ali_from"], + hitInfo[gene_name]["ali_to"]) # new header will be the bin name, contig name, gene info, and marker info separated by spaces - new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info]) + new_header = ">" + \ + " ".join([self.binId, contig_name, + gene_info, marker_info]) print(new_header, seqs[header], sep="\n") # otherwise, print a table else: print("\t".join([ - self.binId, - contig_name, - gene_num, - gene_start, - gene_end, - gene_strand, - str(len(seqs[header])), - hitInfo[gene_name]["marker"], - hitInfo[gene_name]["ali_from"], - hitInfo[gene_name]["ali_to"], - seqs[header] - ])) + self.binId, + contig_name, + gene_num, + gene_start, + gene_end, + gene_strand, + str(len(seqs[header])), + hitInfo[gene_name]["marker"], + hitInfo[gene_name]["ali_from"], + hitInfo[gene_name]["ali_to"], + seqs[header] + ])) else: self.logger.error("Unknown output format: %d", outputFormat) diff --git a/setup.py b/setup.py index 58c68f0..a4af6ec 100755 --- a/setup.py +++ b/setup.py @@ -38,11 +38,11 @@ def readme(): 'Topic :: Scientific/Engineering :: Bio-Informatics', ], install_requires=[ - "numpy >= 1.13.1", - "scipy >= 0.19.1", - "matplotlib >= 2.1.0", - "pysam >= 0.12.0.1", - "dendropy >= 4.4.0", + "numpy >= 1.21.3", + "scipy >= 1.7.3", + "matplotlib >= 3.5.1", + "pysam >= 0.19.0", + "dendropy >= 4.5.2", "setuptools"], zip_safe=False )