From da430343dfe23524d511634726c6cfbd183b66c1 Mon Sep 17 00:00:00 2001
From: Donovan Parks <donovan.parks@gmail.com>
Date: Sun, 10 Apr 2022 07:52:17 -0700
Subject: [PATCH] Accepted PRs allowing bins to be specified via an input
 table, fixing the import error with prettytable, and enhancing
 identification of bins via their extension

---
 checkm/VERSION    |  7 +++++-
 checkm/main.py    | 62 +++++++++++++++++++++++++++++++++--------------
 checkm/profile.py | 15 ++++++++----
 3 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/checkm/VERSION b/checkm/VERSION
index 370eb8b..fd9b658 100755
--- a/checkm/VERSION
+++ b/checkm/VERSION
@@ -1,3 +1,8 @@
+1.1.7
+- added Jie Zhu's (alienzj) PR to allow bins to be specified via an input table file
+- added Jeremy Jacobson's (jjacobson95) PR to fix prettytable import issue
+- added fix by Jie Li (jili6t) regarding more stringent matching of bin extensions
+
 1.1.6
 - allow CheckM data path to be set through a CHECKM_DATA_PATH environmental variable
 
@@ -5,7 +10,7 @@
 - fixed small bug with maxibor's PR
 
 1.1.4
-- added maxibor's PR to allow path to DATA_CONFIG file to be set 
+- added Maxime Borry's (maxibor) PR to allow the path to the DATA_CONFIG file to be set
 
 1.1.3
 - fixed error resulting from incorrect GFF files produced by Prodigal on FASTA files with Windows style line endings
diff --git a/checkm/main.py b/checkm/main.py
index 626a2e0..7b47b61 100755
--- a/checkm/main.py
+++ b/checkm/main.py
@@ -86,6 +86,9 @@ def binFiles(self, binInput, binExtension, bCalledGenes):
         isInputDir = True
         if binInput is not None:
             if os.path.isdir(binInput):
+                if binExtension[0] != '.':
+                    binExtension = '.' + binExtension
+
                 all_files = os.listdir(binInput)
                 for f in all_files:
                     if f.endswith(binExtension):
@@ -119,21 +122,24 @@ def binFiles(self, binInput, binExtension, bCalledGenes):
                     "No bins found. Check the extension (-x) used to identify bins.")
             else:
                 self.logger.error(
-                    "No binsfound. Check the bins input table to identify bins turely exists")
+                    "No bins found. Check the bins input table to verify bins exists.")
             sys.exit(1)
 
         if len(binIDs) != len(binFiles):
-            self.logger.error("There are redundant bin ID, please check and update it")
+            self.logger.error(
+                "There are redundant bin IDs, please check and update.")
             sys.exit(1)
 
         return sorted(binFiles)
 
     def tree(self, options):
         """Tree command"""
+
         self.logger.info(
             '[CheckM - tree] Placing bins in reference genome tree.')
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         if not options.bCalledGenes:
             if not checkNuclotideSeqs(binFiles):
@@ -195,6 +201,7 @@ def tree(self, options):
 
     def treeQA(self, options):
         """QA command"""
+
         self.logger.info(
             '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.')
 
@@ -225,6 +232,7 @@ def treeQA(self, options):
 
     def lineageSet(self, options, db=None):
         """Lineage set command"""
+
         self.logger.info(
             '[CheckM - lineage_set] Inferring lineage-specific marker sets.')
 
@@ -262,6 +270,7 @@ def lineageSet(self, options, db=None):
 
     def taxonList(self, options, db=None):
         """Lineage set command"""
+
         self.logger.info(
             '[CheckM - taxon_list] Listing available taxonomic-specific marker sets.')
 
@@ -272,6 +281,7 @@ def taxonList(self, options, db=None):
 
     def taxonSet(self, options, db=None):
         """Taxon set command"""
+
         self.logger.info(
             '[CheckM - taxon_set] Generate taxonomic-specific marker set.')
 
@@ -291,10 +301,12 @@ def taxonSet(self, options, db=None):
 
     def analyze(self, options, db=None):
         """Analyze command"""
+
         self.logger.info(
             '[CheckM - analyze] Identifying marker genes in bins.')
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         if not options.bCalledGenes:
             if not checkNuclotideSeqs(binFiles):
@@ -459,7 +471,8 @@ def gcPlot(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         plots = GcPlots(options)
         filesProcessed = 1
@@ -486,7 +499,8 @@ def codingDensityPlot(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         plots = CodingDensityPlots(options)
         filesProcessed = 1
@@ -513,7 +527,8 @@ def tetraDistPlot(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         genomicSignatures = GenomicSignatures(K=4, threads=1)
         tetraSigs = genomicSignatures.read(options.tetra_profile)
@@ -543,7 +558,8 @@ def distributionPlots(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         genomicSignatures = GenomicSignatures(K=4, threads=1)
         tetraSigs = genomicSignatures.read(options.tetra_profile)
@@ -574,7 +590,8 @@ def gcBiasPlot(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         coverageWindows = CoverageWindows(options.threads)
         coverageProfile = coverageWindows.run(
@@ -605,7 +622,8 @@ def nxPlot(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         nx = NxPlot(options)
         filesProcessed = 1
@@ -632,7 +650,8 @@ def lengthHistogram(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(options.output_dir)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         plot = LengthHistogram(options)
         filesProcessed = 1
@@ -660,7 +679,8 @@ def markerPlot(self, options):
         makeSurePathExists(options.output_dir)
 
         # generate plot for each bin
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         resultsParser = ResultsParser(None)
         markerGeneStats = resultsParser.parseMarkerGeneStats(
@@ -697,7 +717,8 @@ def unbinned(self, options):
 
         checkBinInputExists(options.bin_input, options.bCalledGenes)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         unbinned = Unbinned()
         unbinned.run(binFiles, options.seq_file, options.output_seq_file,
@@ -719,7 +740,8 @@ def coverage(self, options):
         checkBinInputExists(options.bin_input, options.bCalledGenes)
         makeSurePathExists(os.path.dirname(options.output_file))
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         coverage = Coverage(options.threads)
         coverage.run(binFiles, options.bam_files, options.output_file, options.all_reads,
@@ -771,7 +793,8 @@ def merge(self, options):
 
         checkBinInputExists(options.bin_input, options.bCalledGenes)
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         if not options.bCalledGenes:
             if not checkNuclotideSeqs(binFiles):
@@ -831,7 +854,8 @@ def outliers(self, options):
         checkFileExists(options.tetra_profile)
         makeSurePathExists(os.path.dirname(options.output_file))
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         binTools = BinTools()
         binTools.identifyOutliers(options.results_dir,
@@ -881,7 +905,8 @@ def unique(self, options):
         self.logger.info(
             '[CheckM - unique] Ensuring no sequences are assigned to multiple bins.')
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         binTools = BinTools()
         binTools.unique(binFiles)
@@ -894,7 +919,8 @@ def ssuFinder(self, options):
         self.logger.info(
             '[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.')
 
-        binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
+        binFiles = self.binFiles(
+            options.bin_input, options.extension, options.bCalledGenes)
 
         checkFileExists(options.seq_file)
         makeSurePathExists(options.output_dir)
diff --git a/checkm/profile.py b/checkm/profile.py
index 2f24e21..59dd007 100755
--- a/checkm/profile.py
+++ b/checkm/profile.py
@@ -61,8 +61,10 @@ def run(self, coverageFile, outFile, bTabTable):
                 bamId = lineSplit[i]
                 mappedReads = int(lineSplit[i + 2])
 
-                totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads
-                readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(bamId, 0) + mappedReads
+                totalMappedReads[bamId] = totalMappedReads.get(
+                    bamId, 0) + mappedReads
+                readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(
+                    bamId, 0) + mappedReads
 
         # calculate percentage of mapped reads to binned populations
         perMappedReads = {}
@@ -73,7 +75,8 @@ def run(self, coverageFile, outFile, bTabTable):
             normBinCoverage[binId] = {}
 
             for bamId in bamIds:
-                perMR = float(readsMappedToBin[binId][bamId]) / totalMappedReads[bamId]
+                perMR = float(
+                    readsMappedToBin[binId][bamId]) / totalMappedReads[bamId]
                 perMappedReads[binId][bamId] = perMR
 
                 if binId == DefaultValues.UNBINNED:
@@ -81,7 +84,8 @@ def run(self, coverageFile, outFile, bTabTable):
 
                 normCoverage = perMR / binSize[binId]
                 normBinCoverage[binId][bamId] = normCoverage
-                sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage
+                sumNormBinCoverage[bamId] = sumNormBinCoverage.get(
+                    bamId, 0) + normCoverage
 
         for binId, bamIds in normBinCoverage.items():
             for bamId in bamIds:
@@ -131,7 +135,8 @@ def run(self, coverageFile, outFile, bTabTable):
                     row += [unbinnedPercentage * 100.0]
                 else:
                     row += [normBinCoverage[binId][bamId] * 100.0]
-                    row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)]
+                    row += [normBinCoverage[binId][bamId] *
+                            100.0 * (1.0 - unbinnedPercentage)]
 
             if bTabTable:
                 print('\t'.join(list(map(str, row))))