Skip to content

Commit

Permalink
Modified how bin IDs are identified to improve output tables
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed Apr 30, 2022
1 parent d74bb68 commit 1343a1c
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 156 deletions.
6 changes: 6 additions & 0 deletions checkm/VERSION
@@ -1,3 +1,9 @@
1.2.0
- modified how bin IDs are identified to improve output tables

1.1.11
- fixed error with inverse_transformed being deprecated in newer versions of Matplotlib

1.1.10
- fixed bug with missing bCalledGenes flag that was impacting a number of commands

Expand Down
18 changes: 15 additions & 3 deletions checkm/common.py
Expand Up @@ -95,9 +95,12 @@ def checkBinInputExists(binInput, bCalledGenes):
have_error = True
if have_error:
logger = logging.getLogger("timestamp")
logger.error('Input table format is not correct: ' + binInput + '\n')
logger.error('If input is nucleotide contigs, please supply 2 column at least: [genome_id, genome_contigs_file]')
logger.error('If input is gene, please supply 3 column at least: [genome_id, genome_contigs_file, genome_genes_file]')
logger.error('Input table format is not correct: ' +
binInput + '\n')
logger.error(
'If input is nucleotide contigs, please supply 2 column at least: [genome_id, genome_contigs_file]')
logger.error(
'If input is gene, please supply 3 column at least: [genome_id, genome_contigs_file, genome_genes_file]')


def checkFileExists(inputFile):
Expand Down Expand Up @@ -132,7 +135,16 @@ def makeSurePathExists(path):

def binIdFromFilename(filename):
"""Extract bin id from bin filename."""

binId = os.path.basename(filename)

# remove common compression extensions
compress_ext = ['.gz', '.bz2', '.rar', '.tgz', '.zip', '.7z']
for ext in compress_ext:
if binId.endswith(ext):
binId = binId[0:len(binId) - len(ext)]
break

binId = os.path.splitext(binId)[0]

return binId
Expand Down
2 changes: 0 additions & 2 deletions checkm/main.py
Expand Up @@ -294,7 +294,6 @@ def taxonSet(self, options, db=None):
options.rank, options.taxon, options.marker_file)

if bValidSet:

self.logger.info('Marker set written to: ' + options.marker_file)

self.timeKeeper.printTimeStamp()
Expand Down Expand Up @@ -422,7 +421,6 @@ def qa(self, options):
aai.run(options.aai_strain, options.analyze_dir, options.alignment_file)

# get HMM file for each bin

markerSetParser = MarkerSetParser(options.threads)

hmmModelInfoFile = os.path.join(
Expand Down
29 changes: 18 additions & 11 deletions checkm/plot/AbstractPlot.py
Expand Up @@ -34,6 +34,7 @@ class AbstractPlot(FigureCanvas):
'''
Abstract base class for plotting.
'''

def __init__(self, options):
self.options = options

Expand All @@ -60,7 +61,8 @@ def __init__(self, options):
def savePlot(self, filename, dpi=300):
imgFormat = filename[filename.rfind('.') + 1:len(filename)]
if imgFormat in ['png', 'pdf', 'ps', 'eps', 'svg']:
self.fig.savefig(filename, format=imgFormat, dpi=dpi, facecolor='white', edgecolor='white', bbox_inches='tight')
self.fig.savefig(filename, format=imgFormat, dpi=dpi,
facecolor='white', edgecolor='white', bbox_inches='tight')
else:
pass

Expand All @@ -72,20 +74,22 @@ def labelExtents(self, xLabels, xFontSize, xRotation, yLabels, yFontSize, yRotat
tempAxes.set_xticks(np.arange(len(xLabels)))
tempAxes.set_yticks(np.arange(len(yLabels)))

xText = tempAxes.set_xticklabels(xLabels, size=xFontSize, rotation=xRotation)
yText = tempAxes.set_yticklabels(yLabels, size=yFontSize, rotation=yRotation)
xText = tempAxes.set_xticklabels(
xLabels, size=xFontSize, rotation=xRotation)
yText = tempAxes.set_yticklabels(
yLabels, size=yFontSize, rotation=yRotation)

bboxes = []
for label in xText:
bbox = label.get_window_extent(self.get_renderer())
bboxi = bbox.inverse_transformed(self.fig.transFigure)
bboxi = bbox.transformed(self.fig.transFigure.inverted())
bboxes.append(bboxi)
xLabelBounds = mtransforms.Bbox.union(bboxes)

bboxes = []
for label in yText:
bbox = label.get_window_extent(self.get_renderer())
bboxi = bbox.inverse_transformed(self.fig.transFigure)
bboxi = bbox.transformed(self.fig.transFigure.inverted())
bboxes.append(bboxi)
yLabelBounds = mtransforms.Bbox.union(bboxes)

Expand All @@ -98,12 +102,14 @@ def xLabelExtents(self, labels, fontSize, rotation=0):

tempAxes = self.fig.add_axes([0, 0, 1.0, 1.0])
tempAxes.set_xticks(np.arange(len(labels)))
xLabels = tempAxes.set_xticklabels(labels, size=fontSize, rotation=rotation)
xLabels = tempAxes.set_xticklabels(
labels, size=fontSize, rotation=rotation)

bboxes = []
for label in xLabels:
bbox = label.get_window_extent(self.get_renderer())
bboxi = bbox.inverse_transformed(self.fig.transFigure)

bboxi = bbox.transformed(self.fig.transFigure.inverted())
bboxes.append(bboxi)
xLabelBounds = mtransforms.Bbox.union(bboxes)

Expand All @@ -116,12 +122,13 @@ def yLabelExtents(self, labels, fontSize, rotation=0):

tempAxes = self.fig.add_axes([0, 0, 1.0, 1.0])
tempAxes.set_yticks(np.arange(len(labels)))
yLabels = tempAxes.set_yticklabels(labels, size=fontSize, rotation=rotation)
yLabels = tempAxes.set_yticklabels(
labels, size=fontSize, rotation=rotation)

bboxes = []
for label in yLabels:
bbox = label.get_window_extent(self.get_renderer())
bboxi = bbox.inverse_transformed(self.fig.transFigure)
bboxi = bbox.transformed(self.fig.transFigure.inverted())
bboxes.append(bboxi)
yLabelBounds = mtransforms.Bbox.union(bboxes)

Expand Down Expand Up @@ -172,5 +179,5 @@ def boundingBox(self, data, ax, label, bBoundingBoxes, bLabels):

if bLabels:
ax.annotate(label, xy=(min(data[:, 0]), max(data[:, 1])), xytext=(0, 0),
textcoords='offset points', ha='right', va='bottom',
bbox=dict(boxstyle='round,pad=0.5', fc=(0.5, 0.5, 0.5), alpha=0.1))
textcoords='offset points', ha='right', va='bottom',
bbox=dict(boxstyle='round,pad=0.5', fc=(0.5, 0.5, 0.5), alpha=0.1))
32 changes: 21 additions & 11 deletions checkm/plot/codingDensityPlots.py
Expand Up @@ -37,7 +37,7 @@
class CodingDensityPlots(AbstractPlot):
def __init__(self, options):
AbstractPlot.__init__(self, options)

self.logger = logging.getLogger('timestamp')

def plot(self, fastaFile, distributionsToPlot):
Expand All @@ -55,9 +55,11 @@ def plot(self, fastaFile, distributionsToPlot):

def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
# parse Prodigal output
gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(fastaFile), DefaultValues.PRODIGAL_GFF)
gffFile = os.path.join(self.options.results_dir, 'bins', binIdFromFilename(
fastaFile), DefaultValues.PRODIGAL_GFF)
if not os.path.exists(gffFile):
self.logger.error('Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
self.logger.error(
'Missing gene feature file (%s). This plot if not compatible with the --genes option.' % DefaultValues.PRODIGAL_GFF)
sys.exit(1)

prodigalParser = ProdigalGeneFeatureParser(gffFile)
Expand Down Expand Up @@ -87,7 +89,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
end += self.options.cd_window_size

if len(data) == 0:
axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
axesHist.set_xlabel(
'[Error] No seqs >= %d, the specified window size' % self.options.cd_window_size)
return

# Histogram plot
Expand All @@ -100,7 +103,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):

axesHist.hist(data, bins=bins, density=True, color=(0.5, 0.5, 0.5))
axesHist.set_xlabel('% coding density')
axesHist.set_ylabel('% windows (' + str(self.options.cd_window_size) + ' bp)')
axesHist.set_ylabel(
'% windows (' + str(self.options.cd_window_size) + ' bp)')

# Prettify plot
for a in axesHist.yaxis.majorTicks:
Expand Down Expand Up @@ -128,8 +132,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
meanCD, deltaCDs, _ = binTools.codingDensityDist(seqs, prodigalParser)

# Delta-CD vs sequence length plot
axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(deltaCDs), s=10, lw=0.5, cmap='gray_r')
axesDeltaCD.set_xlabel(r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
axesDeltaCD.scatter(deltaCDs, seqLens, c=abs(
deltaCDs), s=10, lw=0.5, ec='black', cmap='gray_r')
axesDeltaCD.set_xlabel(
r'$\Delta$ CD (mean coding density = %.1f%%)' % (meanCD * 100))
axesDeltaCD.set_ylabel('Sequence length (kbp)')

_, yMaxSeqs = axesDeltaCD.get_ylim()
Expand All @@ -142,8 +148,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
# find closest distribution values
sampleSeqLen = list(dist[closestCD].keys())[0]
d = dist[closestCD][sampleSeqLen]
cdLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
cdUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)
cdLowerBoundKey = findNearest(
list(d.keys()), (100 - distToPlot) / 2.0)
cdUpperBoundKey = findNearest(
list(d.keys()), (100 + distToPlot) / 2.0)

xL = []
xU = []
Expand All @@ -168,15 +176,17 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaCD):
axesDeltaCD.set_xlim([xMinSeqs, xMaxSeqs])

# draw vertical line at x=0
axesDeltaCD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)
yticks = axesDeltaCD.get_yticks()
axesDeltaCD.vlines(0, 0, yticks[-1], linestyle='dashed',
color=self.axesColour, zorder=0)

# Change sequence lengths from bp to kbp
yticks = axesDeltaCD.get_yticks()
kbpLabels = []
for seqLen in yticks:
label = '%.1f' % (float(seqLen) / 1000)
label = label.replace('.0', '') # remove trailing zero
kbpLabels.append(label)
axesDeltaCD.set_yticks(yticks)
axesDeltaCD.set_yticklabels(kbpLabels)

# Prettify plot
Expand Down
24 changes: 16 additions & 8 deletions checkm/plot/gcPlots.py
Expand Up @@ -75,7 +75,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
end += self.options.gc_window_size

if len(data) == 0:
axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
axesHist.set_xlabel(
'[Error] No seqs >= %d, the specified window size' % self.options.gc_window_size)
return

# Histogram plot
Expand All @@ -88,7 +89,8 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):

axesHist.hist(data, bins=bins, density=True, color=(0.5, 0.5, 0.5))
axesHist.set_xlabel('% GC')
axesHist.set_ylabel('% windows (' + str(self.options.gc_window_size) + ' bp)')
axesHist.set_ylabel(
'% windows (' + str(self.options.gc_window_size) + ' bp)')

# Prettify plot
for a in axesHist.yaxis.majorTicks:
Expand Down Expand Up @@ -116,8 +118,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
meanGC, deltaGCs, _ = binTools.gcDist(seqs)

# Delta-GC vs Sequence length plot
axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(deltaGCs), s=10, lw=0.5, cmap='gray_r')
axesDeltaGC.set_xlabel(r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
axesDeltaGC.scatter(deltaGCs, seqLens, c=abs(
deltaGCs), s=10, lw=0.5, ec='black', cmap='gray_r')
axesDeltaGC.set_xlabel(
r'$\Delta$ GC (mean GC = %.1f%%)' % (meanGC * 100))
axesDeltaGC.set_ylabel('Sequence length (kbp)')

_, yMaxSeqs = axesDeltaGC.get_ylim()
Expand All @@ -130,8 +134,10 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
# find closest distribution values
sampleSeqLen = list(dist[closestGC].keys())[0]
d = dist[closestGC][sampleSeqLen]
gcLowerBoundKey = findNearest(list(d.keys()), (100 - distToPlot) / 2.0)
gcUpperBoundKey = findNearest(list(d.keys()), (100 + distToPlot) / 2.0)
gcLowerBoundKey = findNearest(
list(d.keys()), (100 - distToPlot) / 2.0)
gcUpperBoundKey = findNearest(
list(d.keys()), (100 + distToPlot) / 2.0)

xL = []
xU = []
Expand All @@ -156,15 +162,17 @@ def plotOnAxes(self, fastaFile, distributionsToPlot, axesHist, axesDeltaGC):
axesDeltaGC.set_xlim([xMinSeqs, xMaxSeqs])

# draw vertical line at x=0
axesDeltaGC.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)
yticks = axesDeltaGC.get_yticks()
axesDeltaGC.vlines(0, 0, yticks[-1], linestyle='dashed',
color=self.axesColour, zorder=0)

# Change sequence lengths from bp to kbp
yticks = axesDeltaGC.get_yticks()
kbpLabels = []
for seqLen in yticks:
label = '%.1f' % (float(seqLen) / 1000)
label = label.replace('.0', '') # remove trailing zero
kbpLabels.append(label)
axesDeltaGC.set_yticks(yticks)
axesDeltaGC.set_yticklabels(kbpLabels)

# Prettify plot
Expand Down
27 changes: 18 additions & 9 deletions checkm/plot/tetraDistPlots.py
Expand Up @@ -41,7 +41,8 @@ def plot(self, fastaFile, tetraSigs, distributionsToPlot):
axesHist = self.fig.add_subplot(121)
axesDeltaTD = self.fig.add_subplot(122)

self.plotOnAxes(fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDeltaTD)
self.plotOnAxes(fastaFile, tetraSigs,
distributionsToPlot, axesHist, axesDeltaTD)

self.fig.tight_layout(pad=1, w_pad=1)
self.draw()
Expand Down Expand Up @@ -78,7 +79,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe
end += self.options.td_window_size

if len(data) == 0:
axesHist.set_xlabel('[Error] No seqs >= %d, the specified window size' % self.options.td_window_size)
axesHist.set_xlabel(
'[Error] No seqs >= %d, the specified window size' % self.options.td_window_size)
return

deltaTDs = np.array(deltaTDs)
Expand All @@ -93,7 +95,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe

axesHist.hist(data, bins=bins, density=True, color=(0.5, 0.5, 0.5))
axesHist.set_xlabel(r'$\Delta$ TD')
axesHist.set_ylabel('% windows (' + str(self.options.td_window_size) + ' bp)')
axesHist.set_ylabel(
'% windows (' + str(self.options.td_window_size) + ' bp)')

# Prettify plot
for a in axesHist.yaxis.majorTicks:
Expand All @@ -117,10 +120,12 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe
spine.set_color(self.axesColour)

# get CD bin statistics
meanTD, deltaTDs = binTools.tetraDiffDist(seqs, genomicSig, tetraSigs, binSig)
meanTD, deltaTDs = binTools.tetraDiffDist(
seqs, genomicSig, tetraSigs, binSig)

# Delta-TD vs Sequence length plot
axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(deltaTDs), s=10, lw=0.5, cmap='gray_r')
axesDeltaTD.scatter(deltaTDs, seqLens, c=abs(
deltaTDs), s=10, lw=0.5, ec='black', cmap='gray_r')
axesDeltaTD.set_xlabel(r'$\Delta$ TD (mean TD = %.2f)' % meanTD)
axesDeltaTD.set_ylabel('Sequence length (kbp)')

Expand All @@ -129,7 +134,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe

# plot reference distributions
for distToPlot in distributionsToPlot:
boundKey = findNearest(list(dist[list(dist.keys())[0]].keys()), distToPlot)
boundKey = findNearest(
list(dist[list(dist.keys())[0]].keys()), distToPlot)

x = []
y = []
Expand All @@ -150,7 +156,8 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe
if j == len(x) - 1:
x[j] = x[i]
else:
x[j] = (x[j - 1] + x[j + 1]) / 2 # interpolate values from neighbours
# interpolate values from neighbours
x[j] = (x[j - 1] + x[j + 1]) / 2

if x[j] > x[i]:
x[j] = x[i]
Expand All @@ -164,15 +171,17 @@ def plotOnAxes(self, fastaFile, tetraSigs, distributionsToPlot, axesHist, axesDe
axesDeltaTD.set_xlim([xMinSeqs, xMaxSeqs])

# draw vertical line at x=0
axesDeltaTD.vlines(0, 0, yMaxSeqs, linestyle='dashed', color=self.axesColour, zorder=0)
yticks = axesDeltaTD.get_yticks()
axesDeltaTD.vlines(0, 0, yticks[-1], linestyle='dashed',
color=self.axesColour, zorder=0)

# Change sequence lengths from bp to kbp
yticks = axesDeltaTD.get_yticks()
kbpLabels = []
for seqLen in yticks:
label = '%.1f' % (float(seqLen) / 1000)
label = label.replace('.0', '') # remove trailing zero
kbpLabels.append(label)
axesDeltaTD.set_yticks(yticks)
axesDeltaTD.set_yticklabels(kbpLabels)

# Prettify plot
Expand Down

0 comments on commit 1343a1c

Please sign in to comment.