Skip to content

Commit

Permalink
- fixed CheckM data setRoot command to work from command line (thanks to cbt for PR)
Browse files Browse the repository at this point in the history

- added logging to files
- improved reporting of CheckM results to include timestamp
- removed "--force-overwrite" option and now allow output directories to already exist
  • Loading branch information
donovan-h-parks committed Aug 8, 2019
1 parent 4c7be0e commit 1cda8cf
Show file tree
Hide file tree
Showing 33 changed files with 564 additions and 589 deletions.
113 changes: 58 additions & 55 deletions bin/checkm

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion checkm/DATA_CONFIG
@@ -1 +1 @@
{"dataRoot": "/srv/whitlam/bio/db/checkm_data/1.0.0", "remoteManifestURL": "https://data.ace.uq.edu.au/public/CheckM_databases/", "manifestType": "CheckM", "remoteManifestName": ".dmanifest", "localManifestName": ".dmanifest"}
{"dataRoot": "/srv/db/checkm_data/1.0.0", "remoteManifestURL": "https://data.ace.uq.edu.au/public/CheckM_databases/", "manifestType": "CheckM", "remoteManifestName": ".dmanifest", "localManifestName": ".dmanifest"}
6 changes: 6 additions & 0 deletions checkm/VERSION
@@ -1,3 +1,9 @@
1.0.15
- fixed CheckM data setRoot command to work from command line (thanks to cbt for PR)
- added logging to files
- improved reporting of CheckM results to include timestamp
- removed "--force-overwrite" option and now allow output directories to already exist

1.0.14
- fixed bug with test command missing "--force-overwrite" option

Expand Down
4 changes: 2 additions & 2 deletions checkm/aminoAcidIdentity.py
Expand Up @@ -32,15 +32,15 @@
class AminoAcidIdentity():
"""Calculate AAI between sequences aligned to an HMM."""
def __init__(self):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')
self.aaiRawScores = defaultdict(dict)
self.aaiHetero = defaultdict(dict)
self.aaiMeanBinHetero = {}

def run(self, aaiStrainThreshold, outDir, alignmentOutputFile):
"""Calculate AAI between input alignments."""

self.logger.info(' Calculating AAI between multi-copy marker genes.')
self.logger.info('Calculating AAI between multi-copy marker genes.')

if alignmentOutputFile:
fout = open(alignmentOutputFile, 'w')
Expand Down
17 changes: 8 additions & 9 deletions checkm/binComparer.py
Expand Up @@ -28,7 +28,7 @@

class BinComparer(object):
def __init__(self):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

def __readBins(self, binFiles):
bins = {}
Expand Down Expand Up @@ -62,7 +62,7 @@ def __binningStats(self, bins, seqLens):

def report(self, binFiles1, binFiles2, seqFile, outputFile):
# determine total number of sequences
self.logger.info(' Reading sequences.')
self.logger.info('Reading sequences.')
seqs = readFasta(seqFile)

seqLens = {}
Expand Down Expand Up @@ -95,19 +95,18 @@ def report(self, binFiles1, binFiles2, seqFile, outputFile):
binStats2 = sorted(binStats2.iteritems(), key=lambda x: x[1][1], reverse=True)

# report summary results
self.logger.info(' Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases) / 1e6))
self.logger.info(' # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K) / 1e6))
self.logger.info(' # seqs > 5 kbp = %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K) / 1e6))
self.logger.info('')
self.logger.info(' Binned seqs statistics:')
self.logger.info(' 1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
self.logger.info('Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases) / 1e6))
self.logger.info(' # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K) / 1e6))
self.logger.info(' # seqs > 5 kbp = %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K) / 1e6))
self.logger.info('Binned seqs statistics:')
self.logger.info(' 1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
% (len(bins1),
totalUniqueBinnedSeqs1,
float(totalUniqueBinnedSeqs1) * 100 / len(seqs),
float(totalUniqueBinnedBases1) / 1e6,
float(totalUniqueBinnedBases1) * 100 / totalBases,
numRepeats1))
self.logger.info(' 2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
self.logger.info(' 2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
% (len(bins2),
totalUniqueBinnedSeqs2,
float(totalUniqueBinnedSeqs2) * 100 / len(seqs),
Expand Down
4 changes: 2 additions & 2 deletions checkm/binStatistics.py
Expand Up @@ -57,14 +57,14 @@ def __init__(self, threads=1):
Number of cpus to use.
"""

self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')
self.totalThreads = threads

def calculate(self, binFiles, outDir, binStatsFile):
"""Calculate statistics for each putative genome bin."""

# process each bin
self.logger.info(" Calculating genome statistics for %d bins with %d threads:" % (len(binFiles), self.totalThreads))
self.logger.info("Calculating genome statistics for %d bins with %d threads:" % (len(binFiles), self.totalThreads))

workerQueue = mp.Queue()
writerQueue = mp.Queue()
Expand Down
7 changes: 3 additions & 4 deletions checkm/binTools.py
Expand Up @@ -36,7 +36,7 @@
class BinTools():
"""Functions for exploring and modifying bins."""
def __init__(self, threads=1):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

def __removeSeqs(self, seqs, seqsToRemove):
"""Remove sequences. """
Expand Down Expand Up @@ -211,7 +211,7 @@ def tetraDiffDist(self, seqs, genomicSig, tetraSigs, binSig):
def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, reportType, outputFile):
"""Identify sequences that are outliers."""

self.logger.info(' Reading reference distributions.')
self.logger.info('Reading reference distributions.')
gcBounds = readDistribution('gc_dist')
cdBounds = readDistribution('cd_dist')
tdBounds = readDistribution('td_dist')
Expand All @@ -222,13 +222,12 @@ def identifyOutliers(self, outDir, binFiles, tetraProfileFile, distribution, rep
fout.write('\tSequence CD\tMean bin CD\tLower CD bound (%s%%)' % distribution)
fout.write('\tSequence TD\tMean bin TD\tUpper TD bound (%s%%)\n' % distribution)

self.logger.info('')
processedBins = 0
for binFile in binFiles:
binId = binIdFromFilename(binFile)

processedBins += 1
self.logger.info(' Finding outliers in %s (%d of %d).' % (binId, processedBins, len(binFiles)))
self.logger.info('Finding outliers in %s (%d of %d).' % (binId, processedBins, len(binFiles)))

seqs = readFasta(binFile)

Expand Down
10 changes: 4 additions & 6 deletions checkm/binUnion.py
Expand Up @@ -76,7 +76,7 @@ def contamination(self, binId):

class BinUnion(object):
def __init__(self):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

def report(self, binFolders, binFileSets, checkmQaTsvs, unionBinOutputFile, multiplyBinnedContigsFile, minCompleteness=0, maxContamination=0):
# Read QA files
Expand All @@ -87,14 +87,13 @@ def report(self, binFolders, binFileSets, checkmQaTsvs, unionBinOutputFile, mult
numBinsBinningWise = [0] * len(binFolders)
for candidate in bestCandidates:
numBinsBinningWise[candidate.binningIndex] += 1
self.logger.info("")

for binIndex, numBins in enumerate(numBinsBinningWise):
self.logger.info(" Kept %i out of %i bins from %s" % (numBins,
self.logger.info("Kept %i out of %i bins from %s" % (numBins,
len(binFileSets[binIndex]),
binFolders[binIndex]
))

self.logger.info("")
with open(multiplyBinnedContigsFile, 'w') as multiplyBinnedOutput:
self.printMultiplyBinnedContigs(bestCandidates, multiplyBinnedOutput)

Expand All @@ -103,8 +102,7 @@ def report(self, binFolders, binFileSets, checkmQaTsvs, unionBinOutputFile, mult
out.write(candidate.binFile)
out.write("\n")

self.logger.info("")
self.logger.info(" Wrote %i bins to %s" % (len(bestCandidates),
self.logger.info("Wrote %i bins to %s" % (len(bestCandidates),
unionBinOutputFile,
))

Expand Down
15 changes: 9 additions & 6 deletions checkm/checkmData.py
Expand Up @@ -35,7 +35,7 @@ class DBConfig(object):
the DATA_CONFIG file as an object.
"""
def __init__(self):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')
self.configFile = os.path.abspath(resource_filename('checkm', 'DATA_CONFIG'))
self.values = self.getConfig()

Expand Down Expand Up @@ -97,11 +97,14 @@ def checkPermissions(self):
class DBManager(mm.ManifestManager):

"""Manage all aspects of data location and version control."""
def __init__(self):
def __init__(self, set_path=None):
mm.ManifestManager.__init__(self, timeout=15)
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')
self.config = DBConfig() # load inbuilt configuration
self.type = self.config.values["manifestType"]

if set_path:
self.setRoot(set_path)

# check that the data root is legit
manifestFile = os.path.join(self.config.values["dataRoot"], mm.__MANIFEST__)
Expand All @@ -110,9 +113,9 @@ def __init__(self):

if self.config.values["dataRoot"] == "":
# no data folder set.
print "It seems that the CheckM data folder has not been set yet or has been removed. Running: 'checkm data setRoot'."
print ("It seems that the CheckM data folder has not been set yet or has been removed. Please run 'checkm data setRoot'.")
if not self.setRoot():
print "Sorry, CheckM cannot run without a valid data folder."
print("Sorry, CheckM cannot run without a valid data folder.")

def runAction(self, action):
"""Main entry point for the updating code"""
Expand Down Expand Up @@ -159,7 +162,7 @@ def confirmPath(self, path=None):
if(minimal):
path = raw_input("Please specify a location or type 'abort' to stop trying: \n")
else:
path = raw_input("Where should CheckM store it's data?\n" \
path = raw_input("Where should CheckM store its data?\n" \
"Please specify a location or type 'abort' to stop trying: \n")

if path.upper() == "ABORT":
Expand Down
24 changes: 12 additions & 12 deletions checkm/common.py
Expand Up @@ -69,24 +69,24 @@ def checkEmptyDir(inputDir):
# check if directory is empty
files = os.listdir(inputDir)
if len(files) != 0:
logger = logging.getLogger()
logger.error(' [Error] Output directory must be empty: ' + inputDir + '\n')
logger = logging.getLogger('timestamp')
logger.error('Output directory must be empty: ' + inputDir + '\n')
sys.exit(1)


def checkFileExists(inputFile):
"""Check if file exists."""
if not os.path.exists(inputFile):
logger = logging.getLogger()
logger.error(' [Error] Input file does not exists: ' + inputFile + '\n')
logger = logging.getLogger('timestamp')
logger.error('Input file does not exists: ' + inputFile + '\n')
sys.exit(1)


def checkDirExists(inputDir):
"""Check if directory exists."""
if not os.path.exists(inputDir):
logger = logging.getLogger()
logger.error(' [Error] Input directory does not exists: ' + inputDir + '\n')
logger = logging.getLogger('timestamp')
logger.error('Input directory does not exists: ' + inputDir + '\n')
sys.exit(1)


Expand All @@ -99,8 +99,8 @@ def makeSurePathExists(path):
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
logger = logging.getLogger()
logger.error(' [Error] Specified path does not exist: ' + path + '\n')
logger = logging.getLogger('timestamp')
logger.error('Specified path does not exist: ' + path + '\n')
sys.exit(1)


Expand All @@ -120,8 +120,8 @@ def reassignStdOut(outFile):
# redirect stdout to a file
sys.stdout = open(outFile, 'w')
except:
logger = logging.getLogger()
logger.error(" [Error] Error diverting stdout to file: " + outFile)
logger = logging.getLogger('timestamp')
logger.error("Error diverting stdout to file: " + outFile)
sys.exit(1)

return oldStdOut
Expand All @@ -135,6 +135,6 @@ def restoreStdOut(outFile, oldStdOut):
sys.stdout.close()
sys.stdout = oldStdOut
except:
logger = logging.getLogger()
logger.error(" [Error] Error restoring stdout ", outFile)
logger = logging.getLogger('timestamp')
logger.error("Error restoring stdout ", outFile)
sys.exit(1)
10 changes: 5 additions & 5 deletions checkm/coverage.py
Expand Up @@ -46,15 +46,15 @@ def __init__(self, seqLen, mappedReads, coverage):
class Coverage():
"""Calculate coverage of all sequences."""
def __init__(self, threads):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

self.totalThreads = threads

def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
"""Calculate coverage of sequences for each BAM file."""

# determine bin assignment of each sequence
self.logger.info(' Determining bin assignment of each sequence.')
self.logger.info('Determining bin assignment of each sequence.')

seqIdToBinId = {}
seqIdToSeqLen = {}
Expand All @@ -67,7 +67,7 @@ def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPe
seqIdToSeqLen[seqId] = len(seq)

# process each fasta file
self.logger.info(" Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))
self.logger.info("Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

# make sure all BAM files are sorted
self.numFiles = len(bamFiles)
Expand All @@ -81,13 +81,13 @@ def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPe
numFilesStarted = 0
for bamFile in bamFiles:
numFilesStarted += 1
self.logger.info(' Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))
self.logger.info('Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

coverageInfo[bamFile] = mp.Manager().dict()
coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

# redirect output
self.logger.info(' Writing coverage information to file.')
self.logger.info('Writing coverage information to file.')
oldStdOut = reassignStdOut(outFile)

header = 'Sequence Id\tBin Id\tSequence length (bp)'
Expand Down
4 changes: 2 additions & 2 deletions checkm/coverageWindows.py
Expand Up @@ -85,7 +85,7 @@ def __init__(self, seqLen, mappedReads, coverage):
class CoverageWindows():
"""Calculate coverage of all sequences."""
def __init__(self, threads):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

self.totalThreads = threads

Expand All @@ -98,7 +98,7 @@ def run(self, binFiles, bamFile, bAllReads, minAlignPer, maxEditDistPer, windowS
sys.exit(1)

# calculate coverage of each BAM file
self.logger.info(' Calculating coverage of windows.')
self.logger.info('Calculating coverage of windows.')
coverageInfo = mp.Manager().dict()
coverageInfo = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, windowSize, coverageInfo)

Expand Down
2 changes: 1 addition & 1 deletion checkm/defaultValues.py
Expand Up @@ -95,4 +95,4 @@ class DefaultValues():

UNBINNED = 'unbinned'

MIN_SEQ_LEN_GC_STD = 1000
MIN_SEQ_LEN_GC_STD = 1000
4 changes: 2 additions & 2 deletions checkm/genomicSignatures.py
Expand Up @@ -33,7 +33,7 @@

class GenomicSignatures(object):
def __init__(self, K, threads):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

self.K = K
self.compl = maketrans('ACGT', 'TGCA')
Expand Down Expand Up @@ -151,7 +151,7 @@ def seqSignature(self, seq):
def calculate(self, seqFile, outputFile):
"""Calculate genomic signature of each sequence."""

self.logger.info(' Determining tetranucleotide signature of each sequence.')
self.logger.info('Determining tetranucleotide signature of each sequence.')

# process each sequence in parallel
workerQueue = mp.Queue()
Expand Down
2 changes: 1 addition & 1 deletion checkm/hmmer.py
Expand Up @@ -41,7 +41,7 @@ class HMMMERModeError(BaseException):
class HMMERRunner():
"""Wrapper for running HMMER3."""
def __init__(self, mode="dom"):
self.logger = logging.getLogger()
self.logger = logging.getLogger('timestamp')

# make sure HMMER is installed
self.checkForHMMER()
Expand Down

0 comments on commit 1cda8cf

Please sign in to comment.