diff --git a/.gitignore b/.gitignore index e8dfd11..04721f2 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,6 @@ c-concoct/vbgmm.c .screenrc #ignore test folder tests/nose_tmp_output +tests/test_data/integration_test_data #ignore doc builds doc/build/* diff --git a/.travis.yml b/.travis.yml index 980ae3f..f90fb7f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,24 @@ language: python python: - "2.7" - - "3.4" + - "3.5" # Since we are using system_site_packages, we are only able to use # the default python versions, see: # https://docs.travis-ci.com/user/languages/python/#travis-ci-uses-isolated-virtualenvs # command to install dependencies virtualenv: system_site_packages: true +services: + - xvfb before_install: - pip install --upgrade pip - sudo apt-get update -qq - - sudo apt-get install -qq build-essential libgsl0-dev bedtools mummer + - sudo apt-get install -qq build-essential libgsl0-dev bedtools mummer samtools - "export DISPLAY=:99.0" - - "sh -e /etc/init.d/xvfb start" - pip install -r requirements.txt + - wget https://github.com/BinPro/integration_test_data/archive/v1.0.tar.gz + - mkdir tests/test_data/integration_test_data + - tar -xvzf v1.0.tar.gz -C tests/test_data/integration_test_data --strip-components=1 install: - python setup.py install # command to run tests diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1d01474 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,34 @@ +# Changelog + +A list of changes per version. +When changing something in the develop branch, it should be added here. + +## [v1.1.0] 2019-08-02 + +### `Changed` + + - [#236](https://github.com/BinPro/CONCOCT/pull/236) - Always add suffix to contigs at cutup, even when they are not cut. + - [#254](https://github.com/BinPro/CONCOCT/pull/254) - Slight cleanup of concoct refine + - [#258](https://github.com/BinPro/CONCOCT/pull/258) - New suffices (.concoct_part_XX) are now used for contig parts + - [#261](https://github.com/BinPro/CONCOCT/pull/261) - Epsilon argument removed as it was not working and is not very useful + - [#262](https://github.com/BinPro/CONCOCT/pull/262) - Rewrote documentation, including installation instructions + - [#264](https://github.com/BinPro/CONCOCT/pull/264) - `concoct_part_` suffix is enforced in subcontig for coverage script + - [#264](https://github.com/BinPro/CONCOCT/pull/264) - Header line is enforced for input for `merge_cutup_clustering.py` and `extract_fasta_bins.py` + - [#267](https://github.com/BinPro/CONCOCT/pull/267) - Updated documentation + +### `Added` + + - [#253](https://github.com/BinPro/CONCOCT/pull/253) - A dockerfile useful to test the conda installation + - [#258](https://github.com/BinPro/CONCOCT/pull/258) - Tests for all fundamental scripts, including a new integration test data repository + - [#259](https://github.com/BinPro/CONCOCT/pull/259) - This changelog + - [#262](https://github.com/BinPro/CONCOCT/pull/262) - Added documentation for the core scripts used with concoct + - [#265](https://github.com/BinPro/CONCOCT/pull/265) - A warning is now printed when concoct runs in single threaded mode + +### `Fixed` + + - [#230](https://github.com/BinPro/CONCOCT/pull/230) - Enable at least single threaded installation on Mac OSX + - [#231](https://github.com/BinPro/CONCOCT/pull/231) - Replace pandas .ix with .loc to fix deprecation warnings + - [#246](https://github.com/BinPro/CONCOCT/pull/246) - Limit some dependency version numbers for python 2 + - [#254](https://github.com/BinPro/CONCOCT/pull/254) - Concoct refine now works 
with python 3 + - [#258](https://github.com/BinPro/CONCOCT/pull/258) - Seed tests now working again + - [#260](https://github.com/BinPro/CONCOCT/pull/260) - Fix the dockerfile build by adding integration test data diff --git a/Dockerfile b/Dockerfile index ccca028..9465745 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,12 @@ -# Docker for CONCOCT (http://github.com/BinPro/CONCOCT) v1.0.0 -# VERSION 1.0.0 +# Docker for CONCOCT (http://github.com/BinPro/CONCOCT) v1.1.0 +# VERSION 1.1.0 # # This docker creates and sets up an Ubuntu environment with all -# dependencies for CONCOCT v1.0.0 installed. +# dependencies for CONCOCT v1.1.0 installed. # # To login to the docker with a shared directory from the host do: # -# docker run -v /my/host/shared/directory:/my/docker/location -i -t alneberg/concoct_1.0.0 /bin/bash +# docker run -v /my/host/shared/directory:/my/docker/location -i -t alneberg/concoct_1.1.0 /bin/bash # FROM ubuntu:18.04 @@ -14,18 +14,17 @@ COPY . /opt/CONCOCT # Get basic ubuntu packages needed RUN apt-get update -qq -RUN apt-get install -qq wget build-essential libgsl0-dev git zip unzip bedtools python-pip +RUN apt-get install -qq wget build-essential libgsl0-dev git zip unzip bedtools python-pip samtools RUN pip install --upgrade pip -# Install python dependencies and fetch and install CONCOCT 1.0.0 +RUN wget --no-check-certificate https://github.com/BinPro/integration_test_data/archive/v1.1.tar.gz +RUN mkdir /opt/CONCOCT/tests/test_data/integration_test_data +RUN tar -xvzf v1.1.tar.gz -C /opt/CONCOCT/tests/test_data/integration_test_data --strip-components=1 + +# Install python dependencies and fetch and install CONCOCT 1.1.0 RUN cd /opt/CONCOCT;\ pip install -r requirements.txt;\ - -# wget --no-check-certificate https://github.com/BinPro/CONCOCT/archive/1.0.0.tar.gz;\ -# tar xf 1.0.0.tar.gz;\ -# cd CONCOCT-1.0.0;\ -# python setup.py install RUN cd /opt/CONCOCT/;\ python setup.py install diff --git a/README.md b/README.md index 624e329..d6156d0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -## CONCOCT 1.0.0 [![Build Status](https://travis-ci.org/BinPro/CONCOCT.png?branch=master)](https://travis-ci.org/BinPro/CONCOCT) +## CONCOCT 1.1.0 [![Build Status](https://travis-ci.org/BinPro/CONCOCT.png?branch=master)](https://travis-ci.org/BinPro/CONCOCT) A program for unsupervised binning of metagenomic contigs by using nucleotide composition, coverage data in multiple samples and linkage data from paired end reads. 
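
For orientation on the bin/concoct change that follows: the clustering entry point now forwards the --iterations value through vbgmm.fit down to the C layer. Below is a minimal sketch of the updated call shape, assuming a compiled vbgmm extension is importable; the array shape and the numeric arguments are illustrative placeholders, not values taken from the patch:

    import numpy as np
    import vbgmm  # CONCOCT's compiled Cython extension

    # PCA-transformed composition/coverage matrix (random data, illustration only)
    transform_filter = np.random.rand(1000, 5)

    assign = vbgmm.fit(
        np.copy(transform_filter, order='C'),  # must be a C-contiguous double array
        400,  # initial number of clusters (args.clusters)
        11,   # random seed (args.seed)
        4,    # OpenMP threads (args.threads)
        500,  # max VB iterations (args.iterations); values < 1 fall back to DEF_MAX_ITER in C
    )

The returned assign holds one integer cluster label per input row, which Output.write_assign then writes to the clustering output file.
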
diff --git a/bin/concoct b/bin/concoct index 4236001..2a71516 100755 --- a/bin/concoct +++ b/bin/concoct @@ -25,7 +25,7 @@ def main(args): sys.exit(-1) if cov is not None: - joined = composition.join(cov.ix[:,cov_range[0]:cov_range[1]],how="inner") + joined = composition.join(cov.loc[:,cov_range[0]:cov_range[1]],how="inner") else: joined = composition @@ -61,12 +61,12 @@ def main(args): logging.info('PCA transformed data.') - logging.info('Will call vbgmm with parameters: %s, %s, %s, %s' % (Output.CONCOCT_PATH, args.clusters, args.length_threshold, args.threads)) + logging.info('Will call vbgmm with parameters: %s, %s, %s, %s, %s' % (Output.CONCOCT_PATH, args.clusters, args.length_threshold, args.threads,args.iterations)) N_contigs = transform_filter.shape[0] assign = np.zeros(N_contigs, dtype=np.int32) - assign = vbgmm.fit(np.copy(transform_filter,order='C'), int(args.clusters), int(args.seed), int(args.threads)) + assign = vbgmm.fit(np.copy(transform_filter,order='C'), int(args.clusters), int(args.seed), int(args.threads),int(args.iterations)) Output.write_assign( @@ -85,6 +85,8 @@ if __name__=="__main__": else: args.pca_components = args.total_percentage_pca/100.0 + if args.threads == 1: + logging.warning("CONCOCT is running in single threaded mode. Please, consider adjusting the --threads parameter.") results = main(args) - print("CONCOCT Finished, the log shows how it went.", file=sys.stderr) + logging.info("CONCOCT Finished, the log shows how it went.") diff --git a/bin/concoct_refine b/bin/concoct_refine index 1cd4122..a584b80 100755 --- a/bin/concoct_refine +++ b/bin/concoct_refine @@ -4,8 +4,6 @@ DESC="""A script that iterates over concoct results and reruns the concoct algor for clusters where the median SCG presence is at least 2.""" -import sys -import logging import vbgmm import numpy as np import argparse @@ -13,30 +11,12 @@ import pandas as p from sklearn.decomposition import PCA -from concoct.transform import perform_pca - -def main(argv): - parser = argparse.ArgumentParser(description=DESC) - - parser.add_argument("cluster_file", help="string specifying cluster file") - - parser.add_argument("original_data", help="string original but transformed data file") - - parser.add_argument("scg_file", help="string specifying scg frequency file") - - parser.add_argument('-e','--expansion_factor',default=2, type=int, - help=("number of clusters to expand by")) - - parser.add_argument('-t','--threads',default=1, type=int, - help=("number of threads to use defaults to one")) - - args = parser.parse_args() - - clusters = p.read_csv(args.cluster_file, header=None, index_col=0) +def main(args): + clusters = p.read_csv(args.cluster_file, header=None, index_col=0) original_data = p.read_csv(args.original_data, header=0, index_col=0) - original_data_matrix = original_data.as_matrix() + original_data_matrix = original_data.values() scg_freq = p.read_csv(args.scg_file, header=0, index_col=0) @@ -64,8 +44,8 @@ def main(argv): transform_k = pca_object.transform(slice_k) NK = med_scgs[k]*args.expansion_factor - print "Run CONCOCT for " + str(k) + "with " + str(NK) + "clusters" + " using " + str(args.threads) + "threads" - assigns = vbgmm.fit(np.copy(transform_k,order='C'),int(NK),int(args.threads)) + print("Run CONCOCT for " + str(k) + "with " + str(NK) + "clusters" + " using " + str(args.threads) + "threads") + assigns = vbgmm.fit(np.copy(transform_k,order='C'), int(NK), args.seed, args.threads) kK = np.max(assigns) + 1 @@ -77,6 +57,26 @@ def main(argv): new_assign_df = 
p.DataFrame(new_clusters_matrix,index=original_data.index) new_assign_df.to_csv("clustering_refine.csv") + if __name__ == "__main__": - main(sys.argv[1:]) + parser = argparse.ArgumentParser(description=DESC) + + parser.add_argument("cluster_file", help="string specifying cluster file") + + parser.add_argument("original_data", help="string original but transformed data file") + + parser.add_argument("scg_file", help="string specifying scg frequency file") + + parser.add_argument('-e','--expansion_factor',default=2, type=int, + help=("number of clusters to expand by")) + + parser.add_argument('-s', '--seed' , default=11, type=int, + help=("The seed used for algorithm result reproducibility.")) + + parser.add_argument('-t','--threads',default=1, type=int, + help=("number of threads to use defaults to one")) + + args = parser.parse_args() + + main(args) diff --git a/c-concoct/c_vbgmm_fit.c b/c-concoct/c_vbgmm_fit.c index c5a9a1a..0d844c6 100644 --- a/c-concoct/c_vbgmm_fit.c +++ b/c-concoct/c_vbgmm_fit.c @@ -34,11 +34,16 @@ /*User includes*/ #include "c_vbgmm_fit.h" -void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads) +void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads, int nIter) { int debug = 0; int bAssign = 0; - driverMP(adX, nN, nD, anAssign, nK, seed, DEF_MAX_ITER, DEF_EPSILON, debug, bAssign, nThreads); + + if (nIter < 1){ + nIter = DEF_MAX_ITER; + } + + driverMP(adX, nN, nD, anAssign, nK, seed, nIter, DEF_EPSILON, debug, bAssign, nThreads); return; } diff --git a/c-concoct/c_vbgmm_fit.h b/c-concoct/c_vbgmm_fit.h index 40e7cda..5573031 100644 --- a/c-concoct/c_vbgmm_fit.h +++ b/c-concoct/c_vbgmm_fit.h @@ -121,7 +121,7 @@ typedef struct s_Cluster /*user defines*/ -void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads); +void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads, int nIter); int driverMP(double *adX, int nN, int nD, int *anAssign, int nKStart, unsigned long lSeed, int nMaxIter, double dEpsilon, int debug, int bAssign, int nThreads); diff --git a/c-concoct/vbgmm.pyx b/c-concoct/vbgmm.pyx index 8043334..687a406 100644 --- a/c-concoct/vbgmm.pyx +++ b/c-concoct/vbgmm.pyx @@ -12,11 +12,11 @@ import numpy as np cimport numpy as np # declare the interface to the C code -cdef extern void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads) +cdef extern void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads, int nIter) @cython.boundscheck(False) @cython.wraparound(False) -def fit(np.ndarray[double, ndim=2, mode="c"] xarray not None, nClusters, seed, threads): +def fit(np.ndarray[double, ndim=2, mode="c"] xarray not None, nClusters, seed, threads, piter): """ fit (xarray, nClusters, seed, threads) @@ -26,18 +26,20 @@ def fit(np.ndarray[double, ndim=2, mode="c"] xarray not None, nClusters, seed, t param: nClusters -- an int, number of start clusters param: seed -- an int, the random seed param: threads -- int, the number of threads to use - + param: piter -- int, the number of VB iterations to use """ - cdef int nN, nD, nK, nThreads + cdef int nN, nD, nK, nThreads, nIter nN, nD = xarray.shape[0], xarray.shape[1] nK = nClusters + nIter = piter + nThreads = threads cdef np.ndarray[int, ndim=1,mode="c"] assign = np.zeros((nN), dtype=np.intc) - c_vbgmm_fit (&xarray[0,0], nN, nD, nK, seed, &assign[0], nThreads) + c_vbgmm_fit (&xarray[0,0], nN, nD, nK, seed, 
&assign[0], nThreads, nIter) return assign diff --git a/concoct/input.py b/concoct/input.py index 2e0c392..c5a5b0a 100644 --- a/concoct/input.py +++ b/concoct/input.py @@ -87,32 +87,32 @@ def load_coverage(cov_file, contig_lengths, no_cov_normalization, add_total_cove cov_range = (cov.columns[0],cov.columns[-1]) # Adding pseudo count - cov.ix[:,cov_range[0]:cov_range[1]] = cov.ix[:,cov_range[0]:cov_range[1]].add( + cov.loc[:,cov_range[0]:cov_range[1]] = cov.loc[:,cov_range[0]:cov_range[1]].add( (read_length/contig_lengths), axis='index') if not no_cov_normalization: #Normalize per sample first - cov.ix[:,cov_range[0]:cov_range[1]] = \ - _normalize_per_sample(cov.ix[:,cov_range[0]:cov_range[1]]) + cov.loc[:,cov_range[0]:cov_range[1]] = \ + _normalize_per_sample(cov.loc[:,cov_range[0]:cov_range[1]]) temp_cov_range = None # Total coverage should be calculated after per sample normalization if add_total_coverage: - cov['total_coverage'] = cov.ix[:,cov_range[0]:cov_range[1]].sum(axis=1) + cov['total_coverage'] = cov.loc[:,cov_range[0]:cov_range[1]].sum(axis=1) temp_cov_range = (cov_range[0],'total_coverage') if not no_cov_normalization: # Normalize contigs next - cov.ix[:,cov_range[0]:cov_range[1]] = \ - _normalize_per_contig(cov.ix[:,cov_range[0]:cov_range[1]]) + cov.loc[:,cov_range[0]:cov_range[1]] = \ + _normalize_per_contig(cov.loc[:,cov_range[0]:cov_range[1]]) if temp_cov_range: cov_range = temp_cov_range # Log transform - cov.ix[:,cov_range[0]:cov_range[1]] = np.log( - cov.ix[:,cov_range[0]:cov_range[1]]) + cov.loc[:,cov_range[0]:cov_range[1]] = np.log( + cov.loc[:,cov_range[0]:cov_range[1]]) logging.info('Successfully loaded coverage data.') return cov, cov_range diff --git a/concoct/output.py b/concoct/output.py index 5799d77..4499502 100644 --- a/concoct/output.py +++ b/concoct/output.py @@ -52,6 +52,11 @@ def __init__(self,basename,args): "PCA_components_data_gt{0}.csv" self.LOG_FILE_BASE = self.CONCOCT_PATH + 'log.txt' + # Reset any previous logging handlers, see: + # https://stackoverflow.com/a/49202811 + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + logging.basicConfig( filename=self.LOG_FILE_BASE, level=logging.INFO, diff --git a/concoct/parser.py b/concoct/parser.py index 50ab55d..36fb106 100644 --- a/concoct/parser.py +++ b/concoct/parser.py @@ -68,9 +68,6 @@ def arguments(): parser.add_argument('-i','--iterations',type=int, default=500, help=('Specify maximum number of iterations for the VBGMM. ' 'Default value is 500')) - parser.add_argument('-e','--epsilon',type=float, default=1.0e-6, - help=('Specify the epsilon for VBGMM. ' - 'Default value is 1.0e-6')) parser.add_argument('--no_cov_normalization', default=False, action="store_true", help=("By default the coverage is normalized with regards to samples, " "then normalized with regards of contigs and finally log transformed. " diff --git a/doc/Dockerfile.template b/doc/Dockerfile.template deleted file mode 100644 index c36e414..0000000 --- a/doc/Dockerfile.template +++ /dev/null @@ -1,99 +0,0 @@ -# Docker for CONCOCT (http://github.com/BinPro/CONCOCT) v{{version}} -# VERSION {{version}} -# -# This docker creates and sets up an Ubuntu environment with all -# dependencies for CONCOCT v{{version}} installed. 
-# -# To login to the docker with a shared directory from the host do: -# -# sudo docker run -v /my/host/shared/directory:/my/docker/location -i -t binnisb/concoct_{{version}} /bin/bash -# - -FROM ubuntu:13.10 -MAINTAINER CONCOCT developer group, concoct-support@lists.sourceforge.net - -ENV PATH /opt/miniconda/bin:$PATH -ENV PATH /opt/velvet_1.2.10:$PATH - -# Get basic ubuntu packages needed -RUN apt-get update -qq -RUN apt-get install -qq wget build-essential libgsl0-dev git zip unzip - -# Set up Miniconda environment for python2 -RUN cd /opt;\ - wget http://repo.continuum.io/miniconda/Miniconda-3.3.0-Linux-x86_64.sh -O miniconda.sh;\ - chmod +x miniconda.sh;\ - ./miniconda.sh -p /opt/miniconda -b;\ - conda update --yes conda;\ - conda install --yes python=2.7 - -# Velvet for assembly -RUN apt-get install -qq zlib1g-dev -RUN cd /opt;\ - wget www.ebi.ac.uk/~zerbino/velvet/velvet_1.2.10.tgz -O velvet.tgz;\ - tar xf velvet.tgz;\ - cd velvet_1.2.10;\ - sed -i "s/MAXKMERLENGTH=31/MAXKMERLENGTH=128/" Makefile ;\ - make - -# Bedtools2.17 -RUN apt-get install -qq bedtools - -# Picard tools 1.118 -# To get fuse to work, I need the following (Issue here: https://github.com/dotcloud/docker/issues/514, -# solution here: https://gist.github.com/henrik-muehe/6155333). -ENV MRKDUP /opt/picard-tools-1.118/MarkDuplicates.jar -RUN apt-get install -qq libfuse2 openjdk-7-jre-headless -RUN cd /tmp ; apt-get download fuse -RUN cd /tmp ; dpkg-deb -x fuse_* . -RUN cd /tmp ; dpkg-deb -e fuse_* -RUN cd /tmp ; rm fuse_*.deb -RUN cd /tmp ; echo -en '#!/bin/bash\nexit 0\n' > DEBIAN/postinst -RUN cd /tmp ; dpkg-deb -b . /fuse.deb -RUN cd /tmp ; dpkg -i /fuse.deb -RUN cd /opt;\ - wget "http://downloads.sourceforge.net/project/picard/picard-tools/1.118/picard-tools-1.118.zip?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Fpicard%2Ffiles%2Fpicard-tools%2F1.118%2F&ts=1396879817&use_mirror=freefr" -O picard-tools-1.118.zip;\ - unzip picard-tools-1.118.zip - -# Samtools 0.1.19 -RUN apt-get install -qq samtools - -# Bowtie2.1.0 -RUN apt-get install -qq bowtie2 - -# Parallel 20130622-1 -RUN apt-get install -qq parallel - - - -# Install prodigal 2.60 -RUN cd /opt;\ - wget --no-check-certificate https://prodigal.googlecode.com/files/Prodigal-2.60.tar.gz;\ - tar xf Prodigal-2.60.tar.gz;\ - cd Prodigal-2.60;\ - make;\ - ln -s /opt/Prodigal-2.60/prodigal /bin/prodigal - -# Install R -RUN apt-get install -qq r-base - -# Install R packages -RUN cd /opt;\ - RREPO='"http://cran.rstudio.com/"';\ - printf "install.packages(\"ggplot2\", repo=$RREPO)\ninstall.packages(\"reshape\",repo=$RREPO)\ninstall.packages(\"gplots\",repo=$RREPO)\ninstall.packages(\"ellipse\",repo=$RREPO)\ninstall.packages(\"grid\",repo=$RREPO)\ninstall.packages(\"getopt\",repo=$RREPO)" > dep.R;\ - Rscript dep.R - -# Install python dependencies and fetch and install CONCOCT {{version}} -RUN cd /opt;\ - conda update --yes conda;\ - conda install --yes python=2.7 atlas cython numpy scipy biopython pandas pip scikit-learn pysam;\ - pip install bcbio-gff;\ - wget --no-check-certificate https://github.com/BinPro/CONCOCT/archive/{{version}}.tar.gz;\ - tar xf {{version}}.tar.gz;\ - cd CONCOCT-{{version}};\ - python setup.py install - -ENV CONCOCT /opt/CONCOCT-{{version}} -ENV CONCOCT_TEST /opt/Data/CONCOCT-test-data -ENV CONCOCT_EXAMPLE /opt/Data/CONCOCT-complete-example - diff --git a/doc/Makefile b/doc/Makefile index 8d13c7b..5fe1972 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -19,7 +19,7 @@ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) sou # 
the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext help: @echo "Please use \`make ' where is one of" @@ -30,6 +30,7 @@ help: @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @@ -45,6 +46,7 @@ help: @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* @@ -89,6 +91,14 @@ qthelp: @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/CONCOCT.qhc" +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @@ -166,6 +176,11 @@ doctest: @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo diff --git a/doc/dockerfile_develop b/doc/dockerfile_develop new file mode 100644 index 0000000..28ec4d1 --- /dev/null +++ b/doc/dockerfile_develop @@ -0,0 +1,36 @@ +# Dockerfile to test conda install of CONCOCT (http://github.com/BinPro/CONCOCT) +# +# This docker creates and sets up an Ubuntu environment to test +# the conda installation process +# +# Run with command: +# docker build . -f doc/dockerfile_develop + +FROM ubuntu:latest +COPY . 
/opt/CONCOCT + +# Get basic ubuntu packages needed +RUN apt-get update -qq +RUN apt-get install -qq wget + +# Install conda +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh; \ +bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda + +ENV PATH="/root/miniconda/bin:${PATH}" + +RUN conda config --add channels defaults +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge + +# Install python dependencies and fetch and install CONCOCT +RUN conda create -n concoct_env python=3 concoct + +RUN cd /opt/CONCOCT;\ + conda run -n concoct_env \ + concoct -t 1 --composition_file tests/test_data/composition.fa \ + --coverage_file tests/test_data/coverage -b test_out_t1; \ + conda run -n concoct_env \ + concoct -t 4 --composition_file tests/test_data/composition.fa \ + --coverage_file tests/test_data/coverage -b test_out_t4; + diff --git a/doc/generate_dockerfile.py b/doc/generate_dockerfile.py deleted file mode 100644 index 93fced2..0000000 --- a/doc/generate_dockerfile.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python -import os -from argparse import ArgumentParser -import jinja2 -import sys - -def main(args): - with open(args.template) as tf: - t = jinja2.Template(tf.read()) - - with open(args.Dockerfile, 'w') as df: - df.write(t.render(version=args.version)) - -if __name__ == "__main__": - # Argumentparser only to add --help option - parser = ArgumentParser(description=("Generates the Dockerfile for the given release by changing the " - " version number in the Dockerfile template.")) - parser.add_argument("template", help="Path to the Dockerfile template") - parser.add_argument("Dockerfile", help="Path to where new dockerfile will be printed") - parser.add_argument("version", help=("Version number for current release, " - " need to be present as a tag on github.")) - - args = parser.parse_args() - - main(args) diff --git a/doc/requirements.txt b/doc/requirements.txt index 4e636cf..3a2e71d 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,5 +1,6 @@ # The requirements to build the documentation -Sphinx==1.3.1 -mock==1.0.1 -sphinxcontrib-programoutput==0.8 +Sphinx>=1.3.1 +mock>=1.0.1 +sphinxcontrib-programoutput>=0.8 sphinx-rtd-theme>=0.1.6 +conf diff --git a/doc/source/cmd_options.rst b/doc/source/cmd_options.rst new file mode 100644 index 0000000..2261a4a --- /dev/null +++ b/doc/source/cmd_options.rst @@ -0,0 +1,10 @@ + +Command Line Options +==================== + +CONCOCT uses several command line options to control the clustering, here is a +complete documentation of these. These can also be viewed by typing ``concoct +-h`` on the command line: + +.. program-output:: (echo 'import conf'; tail -n+2 ../../concoct/parser.py; echo 'args=arguments()') | python - --help + :shell: diff --git a/doc/source/complete_example.rst b/doc/source/complete_example.rst deleted file mode 100644 index 747f27a..0000000 --- a/doc/source/complete_example.rst +++ /dev/null @@ -1,8 +0,0 @@ -Complete Example V1.0 -===================== - -We'd like to here give you a complete example walk through. However, the -examples that were here previously were so outdated that they were directly -unhelpful. Hopefully a new version of this page will appear here within a -not so distant future. 
- diff --git a/doc/source/conf.py b/doc/source/conf.py index c65ba27..d15b111 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,7 +1,8 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # CONCOCT documentation build configuration file, created by -# sphinx-quickstart on Wed Jul 23 15:17:29 2014. +# sphinx-quickstart on Thu Aug 1 11:22:50 2019. # # This file is execfile()d with the current directory set to its # containing dir. @@ -14,6 +15,7 @@ import sys import os +import shlex # Add readthedocs.org theme # on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org @@ -41,14 +43,17 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.pngmath', + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', 'sphinxcontrib.programoutput', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -# The suffix of source filenames. +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The encoding of source files. @@ -59,20 +64,24 @@ # General information about the project. project = 'CONCOCT' -copyright = '2014, Johannes Alneberg, Brynjar Smari Bjarnason, Ino de Bruijn, Melanie Schirmer, Joshua Quick, Umer Z. Ijaz, Nicholas J. Loman, Anders F. Andersson, Christopher Quince' +copyright = '2014-2019, Johannes Alneberg, Brynjar Smari Bjarnason, Ino de Bruijn, Melanie Schirmer, Joshua Quick, Umer Z. Ijaz, Nicholas J. Loman, Anders F. Andersson, Christopher Quince' +author = 'Johannes Alneberg, Brynjar Smari Bjarnason, Ino de Bruijn, Melanie Schirmer, Joshua Quick, Umer Z. Ijaz, Nicholas J. Loman, Anders F. Andersson, Christopher Quince' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '1.0' +version = '1.1' # The full version, including alpha/beta/rc tags. -release = '1.0.0' +release = '1.1.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -108,6 +117,9 @@ # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + # -- Options for HTML output ---------------------------------------------- @@ -190,10 +202,23 @@ # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. 
+#html_search_scorer = 'scorer.js' + # Output file base name for HTML help builder. htmlhelp_basename = 'CONCOCTdoc' - # -- Options for LaTeX output --------------------------------------------- latex_elements = { @@ -205,13 +230,16 @@ # Additional stuff for the LaTeX preamble. #'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'CONCOCT.tex', 'CONCOCT Documentation', + (master_doc, 'CONCOCT.tex', 'CONCOCT Documentation', 'Johannes Alneberg, Brynjar Smari Bjarnason, Ino de Bruijn, Melanie Schirmer, Joshua Quick, Umer Z. Ijaz, Nicholas J. Loman, Anders F. Andersson, Christopher Quince', 'manual'), ] @@ -241,8 +269,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'concoct', 'CONCOCT Documentation', - ['Johannes Alneberg, Brynjar Smari Bjarnason, Ino de Bruijn, Melanie Schirmer, Joshua Quick, Umer Z. Ijaz, Nicholas J. Loman, Anders F. Andersson, Christopher Quince'], 1) + (master_doc, 'concoct', 'CONCOCT Documentation', + [author], 1) ] # If true, show URL addresses after external links. @@ -255,8 +283,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'CONCOCT', 'CONCOCT Documentation', - 'Johannes Alneberg, Brynjar Smari Bjarnason, Ino de Bruijn, Melanie Schirmer, Joshua Quick, Umer Z. Ijaz, Nicholas J. Loman, Anders F. Andersson, Christopher Quince', 'CONCOCT', 'One line description of project.', + (master_doc, 'CONCOCT', 'CONCOCT Documentation', + author, 'CONCOCT', 'One line description of project.', 'Miscellaneous'), ] @@ -286,4 +314,5 @@ def __getattr__(cls, name): MOCK_MODULES = ['pygtk', 'gtk', 'gobject', 'numpy', 'pandas', 'Bio', 'concoct', 'concoct.utils', 'concoct.output', 'concoct.parser', 'concoct.cluster', 'concoct.input', 'concoct.transform', 'vbgmm'] -sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) +update_d = {mod_name: Mock() for mod_name in MOCK_MODULES} +sys.modules.update(update_d) diff --git a/doc/source/index.rst b/doc/source/index.rst index 85ba6f5..3255976 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1,5 +1,5 @@ .. CONCOCT documentation master file, created by - sphinx-quickstart on Wed Jul 23 15:17:29 2014. + sphinx-quickstart on Thu Aug 1 11:22:50 2019. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. @@ -7,12 +7,9 @@ CONCOCT's documentation ======================= CONCOCT "bins" metagenomic contigs. Metagenomic binning is the process of clustering sequences into clusters corresponding to operational taxonomic units of some level. -For any known issues with CONCOCT check the issue tracker: -https://github.com/BinPro/CONCOCT/issues - Features -------- -CONCOCT does unsupervised binning of metagenomic contigs by using nucleotide composition - kmer frequencies - and coverage data for multiple samples. +CONCOCT does unsupervised binning of metagenomic contigs by using nucleotide composition - kmer frequencies - and coverage data for multiple samples. CONCOCT can accurately (up to species level) bin metagenomic contigs. For optimal performance: - Map several samples against your assembled contigs. 
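
The "nucleotide composition - kmer frequencies" signal mentioned in the features text above can be made concrete with a toy sketch. This is not CONCOCT's actual implementation, only an illustration of what a per-contig composition vector looks like, under the assumption of a plain A/C/G/T sequence:

    from collections import Counter
    from itertools import product

    def kmer_frequencies(seq, k=4):
        """Toy per-contig composition vector: normalized counts of all 4**k k-mers."""
        counts = Counter(seq[i:i + k] for i in range(len(seq) - k + 1))
        total = max(sum(counts.values()), 1)
        # Fixed-order vector over every possible k-mer, so contigs are comparable
        return [counts["".join(kmer)] / total for kmer in product("ACGT", repeat=k)]

    print(len(kmer_frequencies("ACGTACGTACGGTTAACC")))  # 256 tetramer frequencies
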
@@ -32,7 +29,21 @@ Contribute Support ------- If you are having issues, please let us know. -We have a mailing list located at: concoct-support@lists.sourceforge.net which you can subscribe to `here `__. +We have a discussion thread on gitter: + +.. image:: https://img.shields.io/badge/gitter-%20join%20chat%20%E2%86%92-4fb99a.svg?style=flat-square + :alt: Join the chat at gitter.im/BinPro/CONCOCT + :target: https://gitter.im/BinPro/CONCOCT + +Known Issues +------------ + + - Contig names consisting of digits only are not allowed. Please rename your contigs in both the fasta and the coverage table before proceeding. + - Contig sequences can only contain letters A,C,G or T. For example Ns are currently not allowed. + - Contigs need to be cut up prior to binning. This is covered in the :doc:`usage` page. + +For a more up to date list of reported issues, check the issue tracker: +https://github.com/BinPro/CONCOCT/issues Licence ------- @@ -40,11 +51,19 @@ FreeBSD Contents: --------- + .. toctree:: :maxdepth: 2 self installation usage - complete_example + cmd_options scripts/index + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 039d5ee..2009a39 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -1,188 +1,68 @@ Installation ============ -Dependencies ------------- - -Fundamental dependencies -~~~~~~~~~~~~~~~~~~~~~~~~ - -:: - - python version 2.7 or version 3 - gcc - C compiler - gsl - GNU Scientific Library - gslcblas - GNU Scientific Library BLAS library - gomp - GNU OpenMP implementation - - -These items are prerequisities for the installation of concoct as -described below. The installation procedure varies on different systems, -and described in this README is only how to proceed with a linux -(ubuntu) distribution. - -We recommend using miniconda to install python. -A c-compiler, e.g. ``gcc``, is needed to compile -the c parts of concoct that uses the GNU Scientific Library ``gsl``. For -linux (ubuntu) this is installed through: - -:: - - apt-get install build-essential libgsl0-dev libgomp1 - -Making it work on Mac OSX -~~~~~~~~~~~~~~~~~~~~~~~~~ -A bit of a hack. You have been warned: - -:: - - conda install llvm gcc libgcc pip - export CC=/Users/johannes.alneberg/miniconda3/envs/concoct_py3/bin/gcc - export CXX=/Users/johannes.alneberg/miniconda3/envs/concoct_py3/bin/g++ - conda install gsl - pip install -r requirements.txt - unset CC - unset CXX - pip install pysam +With Bioconda [Recommended] +--------------------------- - -Python packages -~~~~~~~~~~~~~~~ +The easiest and recommended way to install concoct is through `Bioconda `_ and `conda `_ in an isolated environment: :: - cython>=0.19.2 - numpy>=1.7.1 - scipy>=0.12.0 - pandas>=0.11.0 - biopython>=1.62b - scikit-learn>=0.13.1 - -These are the python packages that need to be installed in order to run -concoct. If you follow the installation instructions below, these will -be installed automatically, but are listed here for transparency. + conda config --add channels defaults + conda config --add channels bioconda + conda config --add channels conda-forge -Optional dependencies -~~~~~~~~~~~~~~~~~~~~~ + conda create -n concoct_env python=3 concoct -- For assembly, use your favorite, here is one - - `Megahit `__ +Note for Mac OSX users +~~~~~~~~~~~~~~~~~~~~~~ +Currently concoct on Mac OSX can only run in single threaded mode, which drastically increases the runtime. 
+However, the Mac OSX installation of concoct can still be useful for testing purposes and is possible to install through conda as shown above. -- To create the input table (containing average coverage per sample and - contig) - - `BEDTools `__ version - >= 2.15.0 (only genomeCoverageBed) - - `Picard `__ - tools version >= 1.110 - - `samtools `__ version >= 0.1.18 - - `bowtie2 `__ - version >= 2.1.0 - - `GNU parallel `__ version - >= 20130422 - - Python packages: ``pysam>=0.6`` +Manual Installation +------------------- -- For validation of clustering using single-copy core genes we recommend using: +The conda installation should be enough for most users. +However, if you want to modify the source code, a manual installation might be needed. +An example of a manual installation on an Ubuntu system can be seen in the `Travis CI config file `_. - - `CheckM `__ -If you want to install these dependencies on your own server, you can -take a look at `doc/Dockerfile.all\_dep `__ for -ideas on how to install them. -Installation +Using Docker ------------ -Here we describe two recommended ways of getting concoct to run on your -computer/server. The first option, using Anaconda, should work for any -\*nix (e.g. Mac OS X or Linux) system even where you do not have 'sudo' -rights (e.g. on a common computer cluster). The second option is -suitable for a linux computer where you have root privileges and you -prefer to use a virtual machine where all dependencies to run concoct -are included. Docker does also run on Mac OS X through a virtual machine. -For more information check out the `Docker documentation `__. - -Using Anaconda -~~~~~~~~~~~~~~ - -This instruction shows how to install all dependencies (except the -'Fundamental dependencies' and the 'Optional dependencies' listed above) -using an Anaconda environment. Anaconda is a tool to isolate your python -installation, which allows you to have multiple parallel installations -using different versions of different packages, and gives you a very -convenient and fast way to install the most common scientific python -packages. Anaconda is free but not open source, you can download -Anaconda `here `__. -Installation instructions can be found -`here `__. - -After installing Anaconda, create a new environment that will contain -the concoct installation: - -:: - - conda create -n concoct_env python=2.7 - -After choosing to proceed, run the suggested command: - -:: - - source activate concoct_env +We provide a Docker image: +binpro/concoct\_latest which contains CONCOCT and its dependencies for a basic workflow. -then install the concoct dependencies into this environment: +Assuming DOcker is installed, the following command will then download the image from the Docker image +index, map the Data folder to the image and log you into the docker image. :: - conda install cython numpy scipy biopython pandas pip scikit-learn + docker run -v /home/USER/Data:/opt/Data -i -t binpro/concoct_latest bash -Finally, download the CONCOCT distribution from -https://github.com/BinPro/CONCOCT/releases (stable) and extract the -files, or clone the repository with github (potentially unstable). 
-Resolve all dependencies, see above and then execute within the CONCOCT -directory: +To test concoct you can then do: :: - python setup.py install - -Using Docker -~~~~~~~~~~~~ - -If you have root access to a machine where you want to install concoct -and you have storage for roughly 2G "virtual machine" then Docker -provides a very nice way to get a Docker image with concoct and its -dependencies installed. This way the only thing you install on your host -system is Docker, the rest is contained in an Docker image. This allows -you to install and run programs in that image without it affecting your -host system. You should `get to know Docker -here `__. You need to `get -Docker installed `__ and -specially if you have -`Ubuntu `__. -When Docker is installed you need to download and log into the concoct -image. - -We provide a Docker image: + $ cd /opt/CONCOCT_latest + $ nosetests -binpro/concoct\_latest contains CONCOCT and all its dependencies for the -:doc:`complete_example` with the exception of -the SCG evaluation. +Which should execute all tests without errors. -The following command will then download the image from the Docker image -index, map the Data folder to the image and log you into the docker -image. -:: +Other Programs Needed +--------------------- - sudo docker run -v /home/USER/Data:/opt/Data -i -t binpro/concoct_latest bash +- For assembly, use your favourite. Here is a good one: -To test concoct you can then do: + - `Megahit `__ -:: - $ cd /opt/CONCOCT_latest - $ nosetests +- To create the input bam files, choose your favourite aligner, for example bowtie2 or bwa. +- For validation of clustering using single-copy core genes we recommend using: -Which should execute all tests without errors. + - `CheckM `__ diff --git a/doc/source/scripts/concoct_coverage_table.rst b/doc/source/scripts/concoct_coverage_table.rst new file mode 100644 index 0000000..79647c2 --- /dev/null +++ b/doc/source/scripts/concoct_coverage_table.rst @@ -0,0 +1,20 @@ +========================= +concoct_coverage_table.py +========================= + +Usage +===== +The usage and help documentation of ``concoct_coverage_table.py`` can be seen by +running ``concoct_coverage_table.py -h``: + +.. program-output:: cat ../../scripts/concoct_coverage_table.py | sed 's/import argparse/import argparse, conf/' | python - --help + :shell: + +Example +======= +An example of how to run ``concoct_coverage_table.py``:: + + concoct_coverage_table.py contigs_10K.bed mapping/Sample*.sorted.bam > coverage_table.tsv + +This creates a coverage table suitable as input for concoct as the `coverage_file` parameter. +The ``contigs_10K.bed`` file is created from the ``cut_up_fasta.py`` script and the ``bam``-files needs to be sorted and indexed. diff --git a/doc/source/scripts/cut_up_fasta.rst b/doc/source/scripts/cut_up_fasta.rst new file mode 100644 index 0000000..5a7006d --- /dev/null +++ b/doc/source/scripts/cut_up_fasta.rst @@ -0,0 +1,21 @@ +====================== +cut_up_fasta.py +====================== + +Usage +===== +The usage and help documentation of ``cut_up_fasta.py`` can be seen by +running ``cut_up_fasta.py -h``: + +.. program-output:: cat ../../scripts/cut_up_fasta.py | sed 's/import argparse/import argparse, conf/' | python - --help + :shell: + +Example +======= +An example of how to run ``cut_up_fasta.py``:: + + cut_up_fasta.py original_contigs.fa -c 10000 -o 0 --merge_last -b contigs_10K.bed > contigs_10K.fa + +This creates a fasta file and a BED file. 
+The fasta file ``contigs_10K.fa`` contains the original contigs cut up into parts of length exactly 10K, except for the last contig part which is between 10K and 20K long. +The BED file ``contigs_10K.bed`` contains a list of the contig parts created with coordinates in the original contigs. diff --git a/doc/source/scripts/dnadiff_dist_matrix.rst b/doc/source/scripts/dnadiff_dist_matrix.rst index 23fafe2..8372773 100644 --- a/doc/source/scripts/dnadiff_dist_matrix.rst +++ b/doc/source/scripts/dnadiff_dist_matrix.rst @@ -13,7 +13,7 @@ running ``pyhton dnadiff_dist_matrix -h``: Example ======= An example of how to run ``dnadiff_dist_matrix`` on the test data:: - + cd CONCOCT/scripts python dnadiff_dist_matrix.py test_dnadiff_out tests/test_data/bins/sample*.fa diff --git a/doc/source/scripts/extract_fasta_bins.rst b/doc/source/scripts/extract_fasta_bins.rst new file mode 100644 index 0000000..816f33f --- /dev/null +++ b/doc/source/scripts/extract_fasta_bins.rst @@ -0,0 +1,21 @@ +====================== +extract_fasta_bins.py +====================== + +Usage +===== +The usage and help documentation of ``extract_fasta_bins.py`` can be seen by +running ``extract_fasta_bins.py -h``: + +.. program-output:: cat ../../scripts/extract_fasta_bins.py | sed 's/import argparse/import argparse, conf/' | python - --help + :shell: + +Example +======= +An example of how to run ``extract_fasta_bins.py``:: + + mkdir concoct_output/fasta_bins + extract_fasta_bins.py original_contigs.fa concoct_output/clustering_merged.csv --output_path concoct_output/fasta_bins + +This creates a fasta file for each cluster assigned by concoct. +The clusters assigned need not to be complete or uncontaminated and should be investigated closer with e.g. CheckM. diff --git a/doc/source/scripts/extract_scg_bins.rst b/doc/source/scripts/extract_scg_bins.rst index b43e380..2151287 100644 --- a/doc/source/scripts/extract_scg_bins.rst +++ b/doc/source/scripts/extract_scg_bins.rst @@ -1,6 +1,6 @@ -====================== -extract_scg_bins.py -====================== +================================ +[Deprecated] extract_scg_bins.py +================================ Usage ===== @@ -13,7 +13,7 @@ running ``pyhton extract_scg_bins -h``: Example ======= An example of how to run ``extract_scg_bins`` on the test data:: - + cd CONCOCT/scripts/tests/test_data python extract_scg_bins.py \ --output_folder test_extract_scg_bins_out \ @@ -26,7 +26,7 @@ An example of how to run ``extract_scg_bins`` on the test data:: --groups gt300 gt500 This results in the following output files in the folder ``test_extraxt_scg_bins_out/``:: - + $ ls test_extract_scg_bins_out/ sample0_gt300_bin2.fa sample0_gt500_bin2.fa diff --git a/doc/source/scripts/index.rst b/doc/source/scripts/index.rst index dcbfb0a..7593b43 100644 --- a/doc/source/scripts/index.rst +++ b/doc/source/scripts/index.rst @@ -1,24 +1,28 @@ CONCOCT Scripts -================================================= -The scripts in the ``CONCOCT/scripts`` directory are not fully maintained. They -implement methods that we apply after binning with CONCOCT. Eventually some of -these methods might make it to a package of their own. +=============== +CONCOCT ships with some additional scripts which are very useful to e.g. create input files and to extract output fastas for concoct. 
+These scripts are: -To test all scripts that have tests one could do:: + - ``cut_up_fasta.py`` + - ``concoct_coverage_table.py`` + - ``merge_cutup_clustering.py`` + - ``extract_fasta_bins.py`` - cd CONCOCT/scripts/tests - nosetests +The repository CONCOCT contains additional scripts in the ``CONCOCT/scripts`` directory which are not fully maintained. +They implement methods that we apply after binning with CONCOCT and it might be useful as a starting point or inspiration when creating your own scripts for downstream processing of the output files. +Out of these scripts, the ones documented here are: -Before using a script it would be good to check if its test (in case it has -one) is working for you:: - - cd CONCOCT/scripts/tests - nosetests -s test_script_name + - ``dnadiff_dist_matrix.py`` + - ``extract_scg_bins.py`` [Deprecated] Contents: .. toctree:: :maxdepth: 2 + cut_up_fasta + concoct_coverage_table + merge_cutup_clustering + extract_fasta_bins dnadiff_dist_matrix extract_scg_bins diff --git a/doc/source/scripts/merge_cutup_clustering.rst b/doc/source/scripts/merge_cutup_clustering.rst new file mode 100644 index 0000000..ae882b5 --- /dev/null +++ b/doc/source/scripts/merge_cutup_clustering.rst @@ -0,0 +1,20 @@ +========================= +merge_cutup_clustering.py +========================= + +Usage +===== +The usage and help documentation of ``merge_cutup_clustering.py`` can be seen by +running ``merge_cutup_clustering.py -h``: + +.. program-output:: cat ../../scripts/merge_cutup_clustering.py | sed 's/import argparse/import argparse, conf/' | python - --help + :shell: + +Example +======= +An example of how to run ``merge_cutup_clustering.py``:: + + merge_cutup_clustering.py concoct_output/clustering_gt1000.csv > concoct_output/clustering_merged.csv + +This merges the clustering ``clustering_gt1000.csv`` created by concoct by looking at cluster assignments per contig part and assigning a concensus cluster for the original contig. +The output clustering_merged.csv contains a header line and contig_id and cluster_id per line, separated by a comma. diff --git a/doc/source/usage.rst b/doc/source/usage.rst index a838e63..97e76d5 100644 --- a/doc/source/usage.rst +++ b/doc/source/usage.rst @@ -1,10 +1,35 @@ -Usage -===== +Basic Usage +=========== -CONCOCT uses several command line options to control the clustering, here is a -complete documentation of these. These can also be viewed by typing ``concoct --h`` on the command line: +This guide assumes you have your original contigs assembled into a file ``original_contigs.fa`` and that you have mapped reads from several samples to these contigs into ``.bam`` files. +Note that the assembly can be constructed using either one single sample or several (usually all) samples. +In either case, all sample reads should be mapped against the assembly to achieve the best binning performance. -.. program-output:: (echo 'import conf'; cat ../../concoct/parser.py; echo 'args=arguments()') | python - --help - :shell: +The next step is then to cut contigs into smaller parts:: + + cut_up_fasta.py original_contigs.fa -c 10000 -o 0 --merge_last -b contigs_10K.bed > contigs_10K.fa + + +Generate table with coverage depth information per sample and subcontig. 
+This step assumes the directory 'mapping' contains sorted and indexed bam files where each sample has been mapped against the original contigs:: + + concoct_coverage_table.py contigs_10K.bed mapping/Sample*.sorted.bam > coverage_table.tsv + + +Run concoct:: + + concoct --composition_file contigs_10K.fa --coverage_file coverage_table.tsv -b concoct_output/ + + +Merge subcontig clustering into original contig clustering:: + + merge_cutup_clustering.py concoct_output/clustering_gt1000.csv > concoct_output/clustering_merged.csv + + +Extract bins as individual FASTA:: + + mkdir concoct_output/fasta_bins + extract_fasta_bins.py original_contigs.fa concoct_output/clustering_merged.csv --output_path concoct_output/fasta_bins + +These bins should now be evaluated and filtered for completeness and contamination using for example `CheckM `__ or `BUSCO `__. diff --git a/requirements.txt b/requirements.txt index 90b6bbe..dd16d89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ argparse>=1.2.1 -bcbio-gff>=0.4 -biopython>=1.63 +bcbio-gff>=0.4,<=0.6.4; python_version < '3.0' +bcbio-gff>=0.4; python_version >= '3.0' +biopython>=1.63,<=1.72; python_version < '3.0' +biopython>=1.63; python_version >= '3.0' nose>=1.3.0 numpy>=1.8.0 pandas>=0.13.0 diff --git a/scripts/concoct_coverage_table.py b/scripts/concoct_coverage_table.py index 3c49b35..4710194 100755 --- a/scripts/concoct_coverage_table.py +++ b/scripts/concoct_coverage_table.py @@ -16,7 +16,20 @@ import glob from signal import signal, SIGPIPE, SIG_DFL import pandas as pd - + +def check_bed_file_for_errors(bedfile): + with open(bedfile) as ifh: + for line in ifh: + line = line.strip() + original_id, _, _, cutup_id = line.split('\t') + try: + assert 'concoct_part_' not in original_id + assert 'concoct_part_' in cutup_id + except AssertionError: + sys.stderr.write(("ERROR! Something is wrong with the line:\n'{}'\n" + "Perhaps 'concoct_part_' is misplaced or missing? Exiting!\n").format(line)) + sys.exit(-1) + def generate_input_table(bedfile, bamfiles, samplenames=None): """Reads input files into dictionaries then prints everything in the table format required for running CONCOCT.""" @@ -54,8 +67,9 @@ def generate_input_table(bedfile, bamfiles, samplenames=None): if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("bedfile", help="Contigs BEDFile with four columns representing: 'Contig ID, Start Position, " - "End Position and SubContig ID' respectively. The Subcontig ID is usually the same as the Contig ID for contigs " - "which are not cutup. This file can be generated by the cut_up_fasta.py script.") + "End Position and SubContig ID' respectively. The Subcontig ID must contain the pattern " + "'concoct_part_[0-9]*' while the contigs which are not cutup cannot contain this pattern. " + "This file can be generated by the cut_up_fasta.py script.") parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to the original contigs.") parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr " "of bamfiles. 
Default sample names used are the file names of the bamfiles, excluding the file extension.") @@ -73,5 +87,5 @@ def generate_input_table(bedfile, bamfiles, samplenames=None): # ignore broken pipe error when piping output # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html signal(SIGPIPE,SIG_DFL) - + check_bed_file_for_errors(args.bedfile) generate_input_table(args.bedfile, args.bamfiles, samplenames=samplenames) diff --git a/scripts/cut_up_fasta.py b/scripts/cut_up_fasta.py index 0b48f3c..c4e7f4d 100755 --- a/scripts/cut_up_fasta.py +++ b/scripts/cut_up_fasta.py @@ -17,15 +17,15 @@ def cut_up_fasta(fastfiles, chunk_size, overlap, merge_last, bedoutfile): if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size): i = 0 for split_seq in chunks(record.seq, chunk_size, overlap, merge_last): - print(">%s.%i\n%s" % (record.id, i, split_seq)) + print(">{}.concoct_part_{}\n{}".format(record.id, i, split_seq)) if bedoutfile: - print("{0}\t{2}\t{3}\t{0}.{1}".format(record.id, i, chunk_size*i, chunk_size*i+len(split_seq)), + print("{0}\t{2}\t{3}\t{0}.concoct_part_{1}".format(record.id, i, chunk_size*i, chunk_size*i+len(split_seq)), file=bedoutfile_fh) i = i + 1 else: - print(">%s\n%s" % (record.id, record.seq)) + print(">{}.concoct_part_0\n{}".format(record.id, record.seq)) if bedoutfile: - print("{0}\t0\t{1}\t{0}".format(record.id, len(record.seq)), + print("{0}\t0\t{1}\t{0}.concoct_part_0".format(record.id, len(record.seq)), file=bedoutfile_fh) if bedoutfile: diff --git a/scripts/extract_fasta_bins.py b/scripts/extract_fasta_bins.py index 0b1de09..4c91e6c 100755 --- a/scripts/extract_fasta_bins.py +++ b/scripts/extract_fasta_bins.py @@ -16,7 +16,12 @@ def main(args): for i, seq in enumerate(SeqIO.parse(args.fasta_file, "fasta")): all_seqs[seq.id] = seq df = pd.read_csv(args.cluster_file) - df.columns = ['contig_id', 'cluster_id'] + try: + assert df.columns[0] == 'contig_id' + assert df.columns[1] == 'cluster_id' + except AssertionError: + sys.stderr.write("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. 
Exiting!\n") + sys.exit(-1) cluster_to_contigs = defaultdict(list) for i, row in df.iterrows(): diff --git a/scripts/map-bowtie2-markduplicates.sh b/scripts/map-bowtie2-markduplicates.sh index e258ec4..3f8c07e 100755 --- a/scripts/map-bowtie2-markduplicates.sh +++ b/scripts/map-bowtie2-markduplicates.sh @@ -116,7 +116,7 @@ fi bowtie2 ${BOWTIE2_OPT} -p $THREADS -x $REF -1 $Q1 -2 $Q2 -S $OUTDIR/${RNAME}_${QNAME}.sam samtools faidx $REF samtools view -bt $REF.fai $OUTDIR/${RNAME}_${QNAME}.sam > $OUTDIR/${RNAME}_${QNAME}.bam -samtools sort $OUTDIR/${RNAME}_${QNAME}.bam $OUTDIR/${RNAME}_${QNAME}-s +samtools sort -T $OUTDIR/${RNAME}_${QNAME}.sorted -o $OUTDIR/${RNAME}_${QNAME}-s.bam $OUTDIR/${RNAME}_${QNAME}.bam samtools index $OUTDIR/${RNAME}_${QNAME}-s.bam # Mark duplicates and sort @@ -129,7 +129,7 @@ java -Xms1g -Xmx24g -XX:ParallelGCThreads=$THREADS -XX:MaxPermSize=1g -XX:+CMSCl VALIDATION_STRINGENCY=LENIENT \ MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ REMOVE_DUPLICATES=TRUE -samtools sort $OUTDIR/${RNAME}_${QNAME}-smd.bam $OUTDIR/${RNAME}_${QNAME}-smds +samtools sort -T $OUTDIR/${RNAME}_${QNAME}-smd.sorted -o $OUTDIR/${RNAME}_${QNAME}-smds.bam $OUTDIR/${RNAME}_${QNAME}-smd.bam samtools index $OUTDIR/${RNAME}_${QNAME}-smds.bam # Determine Genome Coverage and mean coverage per contig diff --git a/scripts/merge_cutup_clustering.py b/scripts/merge_cutup_clustering.py index cda9199..6e52c83 100755 --- a/scripts/merge_cutup_clustering.py +++ b/scripts/merge_cutup_clustering.py @@ -12,21 +12,16 @@ import os import argparse from collections import defaultdict, Counter +import re + +CONTIG_PART_EXPR = re.compile("(.*)\.concoct_part_([0-9]*)") def original_contig_name_special(s): n = s.split(".")[-1] try: - int(n) - except: - return s, 0 - # Only small integers are likely to be - # indicating a cutup part. - if int(n) < 1000: - - return ".".join(s.split(".")[:-1]), int(n) - else: - # A large n indicates that the integer - # was part of the original contig + original_id, part_nr = CONTIG_PART_EXPR.match(s).group(1,2) + return original_id, part_nr + except AttributeError: # No matches for concoct_part regex return s, 0 def main(args): @@ -37,15 +32,19 @@ def main(args): for line in ifh: if first: first=False + if 'contig_id' not in line: + sys.stderr.write(("ERROR! The term 'contig_id' was not found on the first row. Please make sure that there " + "is a header line before continuing. 
Exiting\n")) + sys.exit(-1) continue line = line.strip() contig_id, cluster_id = line.split(',') original_contig_name, part_id = original_contig_name_special(contig_id) - + all_originals[original_contig_name][part_id] = cluster_id merged_contigs_stack = [] - + sys.stdout.write("contig_id,cluster_id\n") for original_contig_id, part_ids_d in all_originals.items(): if len(part_ids_d) > 1: @@ -53,9 +52,7 @@ def main(args): cluster_id = c.most_common(1)[0][0] c_string = [(a,b) for a, b in c.items()] if len(c.values()) > 1: - sys.stderr.write("{}\t{}, chosen: {}\n".format(original_contig_id, c_string, cluster_id)) - else: - sys.stderr.write("{}\t{}\n".format(original_contig_id, c_string)) + sys.stderr.write("No consensus cluster for contig {}: {}\t Chosen cluster: {}\n".format(original_contig_id, c_string, cluster_id)) else: cluster_id = list(part_ids_d.values())[0] diff --git a/setup.py b/setup.py index 4f1262d..e35f051 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from setuptools import setup, find_packages import sys, os +from sys import platform from distutils.core import Extension import numpy as np @@ -10,10 +11,15 @@ print("You need to have Cython installed on your system to run setup.py. Sorry!") sys.exit() -version = '1.0.0' +version = '1.1.0' include_dirs_for_concoct = [np.get_include(), '/opt/local/include/'] +extra_compile_args = ['-O3','-std=c99'] +# System clang on MacOS does not recognize the -fopenmp argument +if platform != 'darwin': + extra_compile_args = ['-fopenmp'] + extra_compile_args + setup(name='concoct', version=version, description="Clustering cONtigs with COverage and ComposiTion", @@ -37,7 +43,8 @@ cmdclass = {'build_ext': build_ext}, ext_modules = [ Extension("vbgmm", sources=["./c-concoct/vbgmm.pyx", "./c-concoct/c_vbgmm_fit.c"], - libraries =['gsl', 'gslcblas','gomp'], include_dirs=include_dirs_for_concoct, extra_compile_args = ['-fopenmp','-O3','-std=c99']) + libraries =['gsl', 'gslcblas', 'gomp'], include_dirs=include_dirs_for_concoct, + extra_compile_args = extra_compile_args) ], install_requires=['cython>=0.19.1', 'numpy>=1.7.1', diff --git a/tests/test_cut_up_fasta.py b/tests/test_cut_up_fasta.py new file mode 100644 index 0000000..7df2058 --- /dev/null +++ b/tests/test_cut_up_fasta.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +from nose.tools import assert_equal, assert_true, assert_almost_equal, nottest +from os.path import isdir,isfile +from os import listdir +import os +import sys +import subprocess +from Bio import SeqIO +from collections import defaultdict +import re + +file_path = os.path.realpath(__file__) +test_dir_path = os.path.dirname(file_path) +data_path = os.path.abspath(os.path.join(test_dir_path,"test_data")) +tmp_dir_path = os.path.join(test_dir_path, 'nose_tmp_output') +tmp_basename_dir = os.path.join(tmp_dir_path, '1') +script_path = os.path.join(test_dir_path, '..', 'scripts', 'cut_up_fasta.py') + +CWD = os.getcwd() + +CONTIG_PART_EXPR = re.compile("(.*)\.concoct_part_([0-9]*)") + +class TestCMD(object): + def setUp(self): + """Create temporary dir if necessary, + otherwise clear contents of it""" + if not isdir(tmp_dir_path): + os.mkdir(tmp_dir_path) + self.tearDown() + os.mkdir(tmp_basename_dir) + os.chdir(test_dir_path) + + def tearDown(self): + """remove temporary output files""" + for d in os.listdir(tmp_dir_path): + d_path = os.path.join(tmp_dir_path,d) + try: + os.remove(d_path) + except: + for f in os.listdir(d_path): + f_path = os.path.join(d_path,f) + os.remove(f_path) + os.rmdir(d_path) + assert 
os.listdir(tmp_dir_path) == [] + + + def run_command(self, contigs_file=None, chunk_size=None, overlap_size=None, merge_last=None, bedfile=None, output_file=None): + + call = ["python", script_path, contigs_file, + "--chunk_size", str(chunk_size), + "--overlap_size", str(overlap_size)] + + if merge_last: + call.append("--merge_last") + + if bedfile: + call += ["--bedfile", bedfile] + + call += ['>', output_file] + + self.c = 0 + print(" ".join(call)) + self.op = subprocess.check_output( + " ".join(call) + " 2> /dev/null", + shell=True) + + def file_len(self,fh): + i=0 + with open(fh) as f: + for i, l in enumerate(f): + pass + return i + 1 + + def test_basic(self): + contigs_file = os.path.join(data_path, 'composition.fa') + + output_file = os.path.join(tmp_dir_path, 'contigs_cutup.fa') + + self.run_command( + contigs_file=contigs_file, + chunk_size=200, + overlap_size=0, + merge_last=True, + bedfile=None, + output_file=output_file) + + assert_equal(self.c, 0, + msg = "Command exited with nonzero status") + + result_dict = {} + nr_parts_per_original = defaultdict(int) + part_lengths_per_original = defaultdict(int) + reconstructed_original = defaultdict(str) + + with open(output_file) as fhandle: + for rec in SeqIO.parse(fhandle, "fasta"): + assert_true(".concoct_part_" in rec.id) + original_id, part_nr = CONTIG_PART_EXPR.match(rec.id).group(1,2) + assert_true(".concoct_part_" not in original_id) + nr_parts_per_original[original_id] += 1 + part_lengths_per_original[original_id] += len(rec.seq) + assert_true((len(rec.seq) == 200) or (len(rec.seq) > 200 and len(rec.seq) < 400)) + + reconstructed_original[original_id] += str(rec.seq) + + with open(contigs_file) as fhandle: + for rec in SeqIO.parse(fhandle, "fasta"): + assert_true(rec.id in reconstructed_original.keys()) + assert_equal(str(rec.seq), reconstructed_original[rec.id]) + assert_equal(int(len(rec.seq) / 200), \ + nr_parts_per_original[rec.id]) + + + def test_with_bedfile(self): + bedfile = os.path.join(tmp_dir_path, 'contigs_cutup.bed'), diff --git a/tests/test_gen_input_table.py b/tests/test_gen_input_table.py index 45906ab..ee87433 100644 --- a/tests/test_gen_input_table.py +++ b/tests/test_gen_input_table.py @@ -90,10 +90,10 @@ def test_with_bamfiles(self): new_output = os.path.join(tmp_dir_path, 'inputtable.tsv') df = pd.read_csv(new_output, sep='\t', index_col=0) - assert_almost_equal(df['cov_mean_sample_ten_reads'].ix['contig-75000034'], 10*100.0/1615, 5) - assert_almost_equal(df['cov_mean_sample_ten_reads'].ix['contig-21000001'], 10*100.0/9998, 5) - assert_almost_equal(df['cov_mean_sample_twenty_reads'].ix['contig-75000034'], 20*100.0/1615, 5) - assert_almost_equal(df['cov_mean_sample_twenty_reads'].ix['contig-21000001'], 20*100.0/9998, 5) + assert_almost_equal(df['cov_mean_sample_ten_reads'].loc['contig-75000034'], 10*100.0/1615, 5) + assert_almost_equal(df['cov_mean_sample_ten_reads'].loc['contig-21000001'], 10*100.0/9998, 5) + assert_almost_equal(df['cov_mean_sample_twenty_reads'].loc['contig-75000034'], 20*100.0/1615, 5) + assert_almost_equal(df['cov_mean_sample_twenty_reads'].loc['contig-21000001'], 20*100.0/9998, 5) #assert_equal(new_output, old_output, @@ -111,8 +111,8 @@ def test_with_bedfiles(self): new_output = os.path.join(tmp_dir_path, 'inputtable.tsv') df = pd.read_csv(new_output, sep='\t', index_col=0) - assert_almost_equal(df['cov_mean_sample_ten_reads'].ix['contig-75000034'], 10*100.0/1615, 5) - assert_almost_equal(df['cov_mean_sample_ten_reads'].ix['contig-21000001'], 10*100.0/9998, 5) - 
assert_almost_equal(df['cov_mean_sample_twenty_reads'].ix['contig-75000034'], 20*100.0/1615, 5) - assert_almost_equal(df['cov_mean_sample_twenty_reads'].ix['contig-21000001'], 20*100.0/9998, 5) + assert_almost_equal(df['cov_mean_sample_ten_reads'].loc['contig-75000034'], 10*100.0/1615, 5) + assert_almost_equal(df['cov_mean_sample_ten_reads'].loc['contig-21000001'], 10*100.0/9998, 5) + assert_almost_equal(df['cov_mean_sample_twenty_reads'].loc['contig-75000034'], 20*100.0/1615, 5) + assert_almost_equal(df['cov_mean_sample_twenty_reads'].loc['contig-21000001'], 20*100.0/9998, 5) diff --git a/tests/test_integration.py b/tests/test_integration.py index 10abe99..6549381 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -8,7 +8,6 @@ import pandas as p file_path = os.path.realpath(__file__) -data_path = os.path.abspath(os.path.join(file_path,"..","..","data/")) test_dir_path = os.path.dirname(file_path) tmp_dir_path = test_dir_path + '/nose_tmp_output' tmp_basename_dir = tmp_dir_path + '/1' @@ -186,7 +185,6 @@ def test_logging(self): pca_dimensions = last_dim + 1 assert_equal(pca_dimensions, pca_dimensions_log) - @nottest def test_seed(self): #Test default behaviour, seed = 11 self.run_command() @@ -203,8 +201,8 @@ def test_seed(self): assert_true(first_file == second_file, msg='Clustering outcomes were not the same with same seeds') - #Should be equal to both above since default seed is 11 - self.run_command(tags=["-f","11"]) + #Should be equal to both above since default seed is 1 + self.run_command(tags=["--seed","1"]) first_time = os.path.getmtime(tmp_basename_dir+'/clustering_gt1000.csv') with open(tmp_basename_dir+'/clustering_gt1000.csv','r') as clustering: first_file=clustering.read() @@ -213,15 +211,15 @@ def test_seed(self): assert_true(first_file == second_file, msg='Clustering outcomes were not the same with same seeds') - #Test that 0 gives random seed - self.run_command(tags=['-f','0']) + #Test that 0 gives different seed + self.run_command(tags=['--seed','0']) first_time = os.path.getmtime(tmp_basename_dir+'/clustering_gt1000.csv') with open(tmp_basename_dir+'/clustering_gt1000.csv','r') as clustering: first_file=clustering.read() - #Should give random clustering - self.run_command(tags=['-f','0']) + #Should give different clustering + self.run_command(tags=['--seed','0']) second_time = os.path.getmtime(tmp_basename_dir+'/clustering_gt1000.csv') with open(tmp_basename_dir+'/clustering_gt1000.csv','r') as clustering: second_file=clustering.read() @@ -233,13 +231,13 @@ def test_seed(self): #Test that two differnet seeds give different clustering #Should give clustering 2 - self.run_command(tags=['-f','2']) + self.run_command(tags=['--seed','2']) first_time = os.path.getmtime(tmp_basename_dir+'/clustering_gt1000.csv') with open(tmp_basename_dir+'/clustering_gt1000.csv','r') as clustering: first_file=clustering.read() #Should give clustering 3 - self.run_command(tags=['-f','3']) + self.run_command(tags=['--seed','3']) second_time = os.path.getmtime(tmp_basename_dir+'/clustering_gt1000.csv') with open(tmp_basename_dir+'/clustering_gt1000.csv','r') as clustering: second_file=clustering.read() @@ -251,7 +249,7 @@ def test_seed(self): def test_log_coverage(self): self.run_command() original_coverage_data_path = os.path.join(tmp_basename_dir,'original_data_gt1000.csv') - df = p.io.parsers.read_table(original_coverage_data_path,index_col=0,sep=',') + df = p.io.parsers.read_csv(original_coverage_data_path,index_col=0,sep=',') true_pseudo_cov = -1.3143 calc_pseudo_cov = 
df.sample_1[0] @@ -260,7 +258,7 @@ def test_log_coverage(self): def test_log_coverage_no_cov_normalization(self): self.run_command(tags=["--no_cov_normalization"]) original_coverage_data_path = os.path.join(tmp_basename_dir,'original_data_gt1000.csv') - df = p.io.parsers.read_table(original_coverage_data_path,index_col=0,sep=',') + df = p.io.parsers.read_csv(original_coverage_data_path,index_col=0,sep=',') true_pseudo_cov = -1.8107 calc_pseudo_cov = df.sample_1[0] diff --git a/tests/test_integration_with_scripts.py b/tests/test_integration_with_scripts.py new file mode 100644 index 0000000..2d662d0 --- /dev/null +++ b/tests/test_integration_with_scripts.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +from nose.tools import assert_equal, assert_true, assert_almost_equal, nottest, assert_false +from os.path import isdir,isfile +from os import listdir +import os +import sys +import subprocess +import pandas as p +import glob + +file_path = os.path.realpath(__file__) +data_path = os.path.abspath(os.path.join(file_path, "..", "test_data", "integration_test_data")) +test_dir_path = os.path.dirname(file_path) +tmp_dir_path = test_dir_path + '/nose_tmp_output' + +CWD = os.getcwd() + +class TestCMD(object): + def setUp(self): + """Create temporary dir if necessary, + otherwise clear contents of it""" + if not isdir(tmp_dir_path): + os.mkdir(tmp_dir_path) + self.tearDown() + os.chdir(test_dir_path) + + def tearDown(self): + """remove temporary output files""" + for d in os.listdir(tmp_dir_path): + d_path = os.path.join(tmp_dir_path,d) + try: + os.remove(d_path) + except: + for f in os.listdir(d_path): + f_path = os.path.join(d_path,f) + os.remove(f_path) + os.rmdir(d_path) + assert os.listdir(tmp_dir_path) == [] + + + def run_command(self, call_list): + call_string = " ".join(call_list) + " 2> /dev/null" + self.op = subprocess.check_output( + call_string, + shell=True) + + def run_all(self, cov_file='coverage', comp_file='composition.fa', + tags=[], basename='nose_tmp_output/1'): + + original_contigs = os.path.join(data_path, "velvet_71.fa") + contigs_bed = os.path.join(tmp_dir_path, "contigs.bed") + cutup_contigs = os.path.join(tmp_dir_path, "contigs_c10K.fa") + bam_files = glob.glob(os.path.join(data_path, "map", "*.bam")) + coverage_table = os.path.join(tmp_dir_path, "coverage_table.tsv") + concoct_output = os.path.join(tmp_dir_path, "concoct_output") + "/" + clustering_file = os.path.join(concoct_output, "clustering_gt1000.csv") + merged_clustering = os.path.join(concoct_output, "clustering_merged.csv") + fasta_bins_dir = os.path.join(tmp_dir_path, "fasta_bins") + + cutup_call = ["cut_up_fasta.py", original_contigs, "-c", "10000", "-o", "0", "--merge_last", "-b", contigs_bed, ">", cutup_contigs] + self.run_command(cutup_call) + + coverage_table_call = ["concoct_coverage_table.py", contigs_bed] + bam_files + [">", coverage_table] + self.run_command(coverage_table_call) + + concoct_call = ["concoct", "--composition_file", cutup_contigs, "--coverage_file", coverage_table, "-b", concoct_output] + self.run_command(concoct_call) + + merge_cutup_call = ["merge_cutup_clustering.py", clustering_file, ">", merged_clustering] + self.run_command(merge_cutup_call) + + _ = subprocess.check_output("mkdir {}".format(fasta_bins_dir), shell=True) + + extract_call = ["extract_fasta_bins.py", original_contigs, merged_clustering, "--output_path", fasta_bins_dir] + self.run_command(extract_call) + + def test_directory_creation(self): + self.run_all() + fasta_bins_dir = os.path.join(tmp_dir_path, "fasta_bins") + 
print(os.listdir(fasta_bins_dir)) + assert_true(len(glob.glob(os.path.join(fasta_bins_dir, "*.fa"))) > 2, "Too few bins were created") diff --git a/tests/test_merge_cutup_clustering.py b/tests/test_merge_cutup_clustering.py new file mode 100644 index 0000000..1baede5 --- /dev/null +++ b/tests/test_merge_cutup_clustering.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +from nose.tools import assert_equal, assert_true, assert_almost_equal, nottest +from os.path import isdir,isfile +from os import listdir +import os +import sys +import subprocess +from Bio import SeqIO +from collections import defaultdict +import re + +file_path = os.path.realpath(__file__) +test_dir_path = os.path.dirname(file_path) +data_path = os.path.abspath(os.path.join(test_dir_path,"test_data", "integration_test_data")) +tmp_dir_path = os.path.join(test_dir_path, 'nose_tmp_output') +script_path = os.path.join(test_dir_path, '..', 'scripts', 'merge_cutup_clustering.py') + +CWD = os.getcwd() + +CONTIG_PART_EXPR = re.compile("(.*)\.concoct_part_([0-9]*)") + +class TestCMD(object): + def setUp(self): + """Create temporary dir if necessary, + otherwise clear contents of it""" + if not isdir(tmp_dir_path): + os.mkdir(tmp_dir_path) + self.tearDown() + os.chdir(test_dir_path) + + def tearDown(self): + """remove temporary output files""" + for d in os.listdir(tmp_dir_path): + d_path = os.path.join(tmp_dir_path,d) + try: + os.remove(d_path) + except: + for f in os.listdir(d_path): + f_path = os.path.join(d_path,f) + os.remove(f_path) + os.rmdir(d_path) + assert os.listdir(tmp_dir_path) == [] + + + def run_command(self, clustering_file, output_file): + + call = ["python", script_path, clustering_file] + + call += ['>', output_file] + + self.c = 0 + print(" ".join(call)) + self.op = subprocess.check_output( + " ".join(call) + " 2> /dev/null", + shell=True) + + def file_len(self,fh): + i = 0 + with open(fh) as f: + for i, l in enumerate(f): + pass + return i + 1 + + def test_basic(self): + clustering_file = os.path.join(data_path, 'clustering_gt1000.csv') + output_file = os.path.join(tmp_dir_path, 'clustering_merged.csv') + + self.run_command( + clustering_file=clustering_file, + output_file=output_file + ) + + clusters_per_contig = defaultdict(set) + with open(clustering_file) as fhandle: + first = True + for line in fhandle: + # First line is header + if first: + first = False + continue + line = line.strip() + subcontig_id, cluster_id = line.split(',') + contig_id, part_nr = CONTIG_PART_EXPR.match(subcontig_id).group(1,2) + clusters_per_contig[contig_id].add(cluster_id) + if len(clusters_per_contig[contig_id]) > 1: + print(contig_id) + + assert_equal(self.c, 0, + msg="Command exited with nonzero status") + + assert_equal(self.file_len(output_file), 1+len(clusters_per_contig)) + with open(output_file) as fhandle: + for line in fhandle: + line = line.strip() + contig_id, cluster_id = line.split(',') + if len(clusters_per_contig[contig_id]) == 1: + assert_true(cluster_id in clusters_per_contig[contig_id]) + else: + print(contig_id) diff --git a/tests/test_unittest_input.py b/tests/test_unittest_input.py index 312e6a5..69e9215 100644 --- a/tests/test_unittest_input.py +++ b/tests/test_unittest_input.py @@ -46,7 +46,7 @@ def test_load_composition(self): assert_equal(len(c_len), len(contig_lengths)) # All equal for ix in ids: - assert_equal(c_len.ix[ix], contig_lengths.ix[ix]) + assert_equal(c_len.loc[ix], contig_lengths.loc[ix]) def test__calculate_composition(self): @@ -67,7 +67,7 @@ def test__calculate_composition(self): for seq_id, s 
in seq_strings.items(): c = count_substrings(s, "".join(kmer_s)) - assert_equal(composition.ix[seq_id, feature_mapping[kmer_s]], c+1) + assert_equal(composition.loc[seq_id, feature_mapping[kmer_s]], c+1) # Check that non palindromic kmers works as well: kmer_s = ('A', 'G', 'G', 'G') @@ -75,7 +75,7 @@ def test__calculate_composition(self): for seq_id, s in seq_strings.items(): c_1 = count_substrings(s, "".join(kmer_s)) c_2 = count_substrings(s, "".join(reverse_kmer_s)) - assert_equal(composition.ix[seq_id, feature_mapping[kmer_s]], c_1 + c_2 + 1) + assert_equal(composition.loc[seq_id, feature_mapping[kmer_s]], c_1 + c_2 + 1) def count_substrings(s, subs):
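
The new integration test in tests/test_integration_with_scripts.py (run_all) chains the core scripts end to end with the same flags shown above. The equivalent shell workflow is sketched below for reference; contigs.fa, mapping/*.bam and output/ are placeholder paths, and the chunk size of 10000 simply mirrors the test rather than a required default.

    # cut contigs into ~10 kb parts (every part gets a .concoct_part_N suffix) and record a BED file
    cut_up_fasta.py contigs.fa -c 10000 -o 0 --merge_last -b contigs_10K.bed > contigs_10K.fa
    # build the per-sample coverage table from the BED file and the sorted, indexed BAM files
    concoct_coverage_table.py contigs_10K.bed mapping/*.bam > coverage_table.tsv
    # run concoct on the cut-up contigs
    concoct --composition_file contigs_10K.fa --coverage_file coverage_table.tsv -b output/
    # merge the per-part clustering back to original contig names
    merge_cutup_clustering.py output/clustering_gt1000.csv > output/clustering_merged.csv
    # write one fasta file per bin
    mkdir output/fasta_bins
    extract_fasta_bins.py contigs.fa output/clustering_merged.csv --output_path output/fasta_bins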
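
Both extract_fasta_bins.py and merge_cutup_clustering.py now refuse input whose first row is not a header line. A quick way to confirm that a clustering file is acceptable (clustering_gt1000.csv is the file name used in the tests):

    head -n 1 output/clustering_gt1000.csv
    # expected output: contig_id,cluster_id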
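
The two samtools sort edits in scripts/map-bowtie2-markduplicates.sh follow the samtools >= 1.0 interface, which takes an explicit output file (-o) and a temporary-file prefix (-T) instead of a positional output-name prefix. A minimal before/after sketch, with input.bam as a placeholder:

    # samtools 0.1.x style (previous): positional output prefix, ".bam" appended automatically
    #   samtools sort input.bam input-s
    # samtools >= 1.0 style (current): explicit output file and temp-file prefix
    samtools sort -T input.sorted -o input-s.bam input.bam
    samtools index input-s.bam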