Skip to content

Commit

Permalink
- accepted PRs allowing bins to be specified via an input table, fixing import error with prettytable, and enhancing identification of bins via their extension
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed Apr 10, 2022
1 parent 3638c2d commit da43034
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 24 deletions.
7 changes: 6 additions & 1 deletion checkm/VERSION
@@ -1,11 +1,16 @@
1.1.7
- added Jie Zhu's (alienzj) PR to allow input bins to be specified via an input table file
- added Jeremy Jacobson's (jjacobson95) PR to fix prettytable import issue
- added fix by Jie Li (jili6t) regarding more stringent matching of bin extensions

1.1.6
- allow CheckM data path to be set through the CHECKM_DATA_PATH environment variable

1.1.5
- fixed small bug with maxibor's PR

1.1.4
- added maxibor's PR to allow path to DATA_CONFIG file to be set
- added Maxime Borry's (maxibor) PR to allow path to DATA_CONFIG file to be set

1.1.3
- fixed error resulting from incorrect GFF files produced by Prodigal on FASTA files with Windows-style line endings
Expand Down
62 changes: 44 additions & 18 deletions checkm/main.py
Expand Up @@ -86,6 +86,9 @@ def binFiles(self, binInput, binExtension, bCalledGenes):
isInputDir = True
if binInput is not None:
if os.path.isdir(binInput):
if binExtension[0] != '.':
binExtension = '.' + binExtension

all_files = os.listdir(binInput)
for f in all_files:
if f.endswith(binExtension):
Expand Down Expand Up @@ -119,21 +122,24 @@ def binFiles(self, binInput, binExtension, bCalledGenes):
"No bins found. Check the extension (-x) used to identify bins.")
else:
self.logger.error(
"No binsfound. Check the bins input table to identify bins turely exists")
"No bins found. Check the bins input table to verify bins exists.")
sys.exit(1)

if len(binIDs) != len(binFiles):
self.logger.error("There are redundant bin ID, please check and update it")
self.logger.error(
"There are redundant bin IDs, please check and update.")
sys.exit(1)

return sorted(binFiles)

def tree(self, options):
"""Tree command"""

self.logger.info(
'[CheckM - tree] Placing bins in reference genome tree.')

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

if not options.bCalledGenes:
if not checkNuclotideSeqs(binFiles):
Expand Down Expand Up @@ -195,6 +201,7 @@ def tree(self, options):

def treeQA(self, options):
"""QA command"""

self.logger.info(
'[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.')

Expand Down Expand Up @@ -225,6 +232,7 @@ def treeQA(self, options):

def lineageSet(self, options, db=None):
"""Lineage set command"""

self.logger.info(
'[CheckM - lineage_set] Inferring lineage-specific marker sets.')

Expand Down Expand Up @@ -262,6 +270,7 @@ def lineageSet(self, options, db=None):

def taxonList(self, options, db=None):
"""Taxon list command"""

self.logger.info(
'[CheckM - taxon_list] Listing available taxonomic-specific marker sets.')

Expand All @@ -272,6 +281,7 @@ def taxonList(self, options, db=None):

def taxonSet(self, options, db=None):
"""Taxon set command"""

self.logger.info(
'[CheckM - taxon_set] Generate taxonomic-specific marker set.')

Expand All @@ -291,10 +301,12 @@ def taxonSet(self, options, db=None):

def analyze(self, options, db=None):
"""Analyze command"""

self.logger.info(
'[CheckM - analyze] Identifying marker genes in bins.')

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

if not options.bCalledGenes:
if not checkNuclotideSeqs(binFiles):
Expand Down Expand Up @@ -459,7 +471,8 @@ def gcPlot(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

plots = GcPlots(options)
filesProcessed = 1
Expand All @@ -486,7 +499,8 @@ def codingDensityPlot(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

plots = CodingDensityPlots(options)
filesProcessed = 1
Expand All @@ -513,7 +527,8 @@ def tetraDistPlot(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

genomicSignatures = GenomicSignatures(K=4, threads=1)
tetraSigs = genomicSignatures.read(options.tetra_profile)
Expand Down Expand Up @@ -543,7 +558,8 @@ def distributionPlots(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

genomicSignatures = GenomicSignatures(K=4, threads=1)
tetraSigs = genomicSignatures.read(options.tetra_profile)
Expand Down Expand Up @@ -574,7 +590,8 @@ def gcBiasPlot(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

coverageWindows = CoverageWindows(options.threads)
coverageProfile = coverageWindows.run(
Expand Down Expand Up @@ -605,7 +622,8 @@ def nxPlot(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

nx = NxPlot(options)
filesProcessed = 1
Expand All @@ -632,7 +650,8 @@ def lengthHistogram(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(options.output_dir)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

plot = LengthHistogram(options)
filesProcessed = 1
Expand Down Expand Up @@ -660,7 +679,8 @@ def markerPlot(self, options):
makeSurePathExists(options.output_dir)

# generate plot for each bin
binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

resultsParser = ResultsParser(None)
markerGeneStats = resultsParser.parseMarkerGeneStats(
Expand Down Expand Up @@ -697,7 +717,8 @@ def unbinned(self, options):

checkBinInputExists(options.bin_input, options.bCalledGenes)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

unbinned = Unbinned()
unbinned.run(binFiles, options.seq_file, options.output_seq_file,
Expand All @@ -719,7 +740,8 @@ def coverage(self, options):
checkBinInputExists(options.bin_input, options.bCalledGenes)
makeSurePathExists(os.path.dirname(options.output_file))

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

coverage = Coverage(options.threads)
coverage.run(binFiles, options.bam_files, options.output_file, options.all_reads,
Expand Down Expand Up @@ -771,7 +793,8 @@ def merge(self, options):

checkBinInputExists(options.bin_input, options.bCalledGenes)

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

if not options.bCalledGenes:
if not checkNuclotideSeqs(binFiles):
Expand Down Expand Up @@ -831,7 +854,8 @@ def outliers(self, options):
checkFileExists(options.tetra_profile)
makeSurePathExists(os.path.dirname(options.output_file))

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

binTools = BinTools()
binTools.identifyOutliers(options.results_dir,
Expand Down Expand Up @@ -881,7 +905,8 @@ def unique(self, options):
self.logger.info(
'[CheckM - unique] Ensuring no sequences are assigned to multiple bins.')

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

binTools = BinTools()
binTools.unique(binFiles)
Expand All @@ -894,7 +919,8 @@ def ssuFinder(self, options):
self.logger.info(
'[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.')

binFiles = self.binFiles(options.bin_input, options.extension, options.bCalledGenes)
binFiles = self.binFiles(
options.bin_input, options.extension, options.bCalledGenes)

checkFileExists(options.seq_file)
makeSurePathExists(options.output_dir)
Expand Down
15 changes: 10 additions & 5 deletions checkm/profile.py
Expand Up @@ -61,8 +61,10 @@ def run(self, coverageFile, outFile, bTabTable):
bamId = lineSplit[i]
mappedReads = int(lineSplit[i + 2])

totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads
readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(bamId, 0) + mappedReads
totalMappedReads[bamId] = totalMappedReads.get(
bamId, 0) + mappedReads
readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(
bamId, 0) + mappedReads

# calculate percentage of mapped reads to binned populations
perMappedReads = {}
Expand All @@ -73,15 +75,17 @@ def run(self, coverageFile, outFile, bTabTable):
normBinCoverage[binId] = {}

for bamId in bamIds:
perMR = float(readsMappedToBin[binId][bamId]) / totalMappedReads[bamId]
perMR = float(
readsMappedToBin[binId][bamId]) / totalMappedReads[bamId]
perMappedReads[binId][bamId] = perMR

if binId == DefaultValues.UNBINNED:
continue

normCoverage = perMR / binSize[binId]
normBinCoverage[binId][bamId] = normCoverage
sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage
sumNormBinCoverage[bamId] = sumNormBinCoverage.get(
bamId, 0) + normCoverage

for binId, bamIds in normBinCoverage.items():
for bamId in bamIds:
Expand Down Expand Up @@ -131,7 +135,8 @@ def run(self, coverageFile, outFile, bTabTable):
row += [unbinnedPercentage * 100.0]
else:
row += [normBinCoverage[binId][bamId] * 100.0]
row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)]
row += [normBinCoverage[binId][bamId] *
100.0 * (1.0 - unbinnedPercentage)]

if bTabTable:
print('\t'.join(list(map(str, row))))
Expand Down

0 comments on commit da43034

Please sign in to comment.