From da430343dfe23524d511634726c6cfbd183b66c1 Mon Sep 17 00:00:00 2001 From: Donovan Parks Date: Sun, 10 Apr 2022 07:52:17 -0700 Subject: [PATCH] - accepted PRs allowing bins to be specified via an input table, fixing import error with prettytable, and enhancing identification of bins via their extension --- checkm/VERSION | 7 +++++- checkm/main.py | 62 +++++++++++++++++++++++++++++++++-------------- checkm/profile.py | 15 ++++++++---- 3 files changed, 60 insertions(+), 24 deletions(-) diff --git a/checkm/VERSION b/checkm/VERSION index 370eb8b..fd9b658 100755 --- a/checkm/VERSION +++ b/checkm/VERSION @@ -1,3 +1,8 @@ +1.1.7 +- added Jie Zhu's (alienzj) PR to allow input files to be specified via a file +- added Jeremy Jacobson's (jjacobson95) PR to fix prettytable import issue +- added fix by Jie Li (jili6t) regarding more stringent matching of bin extensions + 1.1.6 - allow CheckM data path to be set through a CHECKM_DATA_PATH environmental variable @@ -5,7 +10,7 @@ - fixed small bug with maxibor's PR 1.1.4 -- added maxibor's PR to allow path to DATA_CONFIG file to be set +- added Maxime Borry's (maxibor) PR to allow path to DATA_CONFIG file to be set 1.1.3 - fixed error resulting from incorrect GFF files produced by Prodigal on FASTA files with Windows style line endings diff --git a/checkm/main.py b/checkm/main.py index 626a2e0..7b47b61 100755 --- a/checkm/main.py +++ b/checkm/main.py @@ -86,6 +86,9 @@ def binFiles(self, binInput, binExtension, bCalledGenes): isInputDir = True if binInput is not None: if os.path.isdir(binInput): + if binExtension[0] != '.': + binExtension = '.' + binExtension + all_files = os.listdir(binInput) for f in all_files: if f.endswith(binExtension): @@ -119,21 +122,24 @@ def binFiles(self, binInput, binExtension, bCalledGenes): "No bins found. Check the extension (-x) used to identify bins.") else: self.logger.error( - "No binsfound. Check the bins input table to identify bins turely exists") + "No bins found. Check the bins input table to verify bins exists.") sys.exit(1) if len(binIDs) != len(binFiles): - self.logger.error("There are redundant bin ID, please check and update it") + self.logger.error( + "There are redundant bin IDs, please check and update.") sys.exit(1) return sorted(binFiles) def tree(self, options): """Tree command""" + self.logger.info( '[CheckM - tree] Placing bins in reference genome tree.') - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) if not options.bCalledGenes: if not checkNuclotideSeqs(binFiles): @@ -195,6 +201,7 @@ def tree(self, options): def treeQA(self, options): """QA command""" + self.logger.info( '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.') @@ -225,6 +232,7 @@ def treeQA(self, options): def lineageSet(self, options, db=None): """Lineage set command""" + self.logger.info( '[CheckM - lineage_set] Inferring lineage-specific marker sets.') @@ -262,6 +270,7 @@ def lineageSet(self, options, db=None): def taxonList(self, options, db=None): """Lineage set command""" + self.logger.info( '[CheckM - taxon_list] Listing available taxonomic-specific marker sets.') @@ -272,6 +281,7 @@ def taxonList(self, options, db=None): def taxonSet(self, options, db=None): """Taxon set command""" + self.logger.info( '[CheckM - taxon_set] Generate taxonomic-specific marker set.') @@ -291,10 +301,12 @@ def taxonSet(self, options, db=None): def analyze(self, options, db=None): """Analyze command""" + self.logger.info( '[CheckM - analyze] Identifying marker genes in bins.') - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) if not options.bCalledGenes: if not checkNuclotideSeqs(binFiles): @@ -459,7 +471,8 @@ def gcPlot(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) plots = GcPlots(options) filesProcessed = 1 @@ -486,7 +499,8 @@ def codingDensityPlot(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) plots = CodingDensityPlots(options) filesProcessed = 1 @@ -513,7 +527,8 @@ def tetraDistPlot(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) genomicSignatures = GenomicSignatures(K=4, threads=1) tetraSigs = genomicSignatures.read(options.tetra_profile) @@ -543,7 +558,8 @@ def distributionPlots(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) genomicSignatures = GenomicSignatures(K=4, threads=1) tetraSigs = genomicSignatures.read(options.tetra_profile) @@ -574,7 +590,8 @@ def gcBiasPlot(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) coverageWindows = CoverageWindows(options.threads) coverageProfile = coverageWindows.run( @@ -605,7 +622,8 @@ def nxPlot(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) nx = NxPlot(options) filesProcessed = 1 @@ -632,7 +650,8 @@ def lengthHistogram(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(options.output_dir) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) plot = LengthHistogram(options) filesProcessed = 1 @@ -660,7 +679,8 @@ def markerPlot(self, options): makeSurePathExists(options.output_dir) # generate plot for each bin - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) resultsParser = ResultsParser(None) markerGeneStats = resultsParser.parseMarkerGeneStats( @@ -697,7 +717,8 @@ def unbinned(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) unbinned = Unbinned() unbinned.run(binFiles, options.seq_file, options.output_seq_file, @@ -719,7 +740,8 @@ def coverage(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) makeSurePathExists(os.path.dirname(options.output_file)) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) coverage = Coverage(options.threads) coverage.run(binFiles, options.bam_files, options.output_file, options.all_reads, @@ -771,7 +793,8 @@ def merge(self, options): checkBinInputExists(options.bin_input, options.bCalledGenes) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) if not options.bCalledGenes: if not checkNuclotideSeqs(binFiles): @@ -831,7 +854,8 @@ def outliers(self, options): checkFileExists(options.tetra_profile) makeSurePathExists(os.path.dirname(options.output_file)) - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) binTools = BinTools() binTools.identifyOutliers(options.results_dir, @@ -881,7 +905,8 @@ def unique(self, options): self.logger.info( '[CheckM - unique] Ensuring no sequences are assigned to multiple bins.') - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) binTools = BinTools() binTools.unique(binFiles) @@ -894,7 +919,8 @@ def ssuFinder(self, options): self.logger.info( '[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.') - binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes) + binFiles = self.binFiles( + options.bin_input, options.extension, options.bCalledGenes) checkFileExists(options.seq_file) makeSurePathExists(options.output_dir) diff --git a/checkm/profile.py b/checkm/profile.py index 2f24e21..59dd007 100755 --- a/checkm/profile.py +++ b/checkm/profile.py @@ -61,8 +61,10 @@ def run(self, coverageFile, outFile, bTabTable): bamId = lineSplit[i] mappedReads = int(lineSplit[i + 2]) - totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads - readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(bamId, 0) + mappedReads + totalMappedReads[bamId] = totalMappedReads.get( + bamId, 0) + mappedReads + readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get( + bamId, 0) + mappedReads # calculate percentage of mapped reads to binned populations perMappedReads = {} @@ -73,7 +75,8 @@ def run(self, coverageFile, outFile, bTabTable): normBinCoverage[binId] = {} for bamId in bamIds: - perMR = float(readsMappedToBin[binId][bamId]) / totalMappedReads[bamId] + perMR = float( + readsMappedToBin[binId][bamId]) / totalMappedReads[bamId] perMappedReads[binId][bamId] = perMR if binId == DefaultValues.UNBINNED: @@ -81,7 +84,8 @@ def run(self, coverageFile, outFile, bTabTable): normCoverage = perMR / binSize[binId] normBinCoverage[binId][bamId] = normCoverage - sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage + sumNormBinCoverage[bamId] = sumNormBinCoverage.get( + bamId, 0) + normCoverage for binId, bamIds in normBinCoverage.items(): for bamId in bamIds: @@ -131,7 +135,8 @@ def run(self, coverageFile, outFile, bTabTable): row += [unbinnedPercentage * 100.0] else: row += [normBinCoverage[binId][bamId] * 100.0] - row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)] + row += [normBinCoverage[binId][bamId] * + 100.0 * (1.0 - unbinnedPercentage)] if bTabTable: print('\t'.join(list(map(str, row))))