Merge pull request #225 from BinPro/develop

Version 1.0.0

alneberg committed Dec 12, 2018
2 parents 1a7c213 + ff8de11, commit a46214d

Showing 49 changed files with 2,584 additions and 2,841 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -9,8 +9,8 @@ dist/*
 build/*
-c-concoct/build/*
 *.egg*
+c-concoct/build/*
 c-concoct/dist/*
 c-concoct/vbgmm.c
 #ignore screen file
 .screenrc
 #ignore test folder
17 changes: 6 additions & 11 deletions .travis.yml
@@ -1,26 +1,21 @@
 language: python
 python:
-  - "2.7"
-# does not have headers provided, please ask https://launchpad.net/~pypy/+archive/ppa
-# maintainers to fix their pypy-dev package.
+  - "3.4"
+# Since we are using system_site_packages, we are only able to use
+# the default python versions, see:
+# https://docs.travis-ci.com/user/languages/python/#travis-ci-uses-isolated-virtualenvs
 # command to install dependencies
 virtualenv:
   system_site_packages: true
 before_install:
-  #Uses miniconda installation of scientific python packages instead of building from source
-  #or using old versions supplied by apt-get. Source: https://gist.github.com/dan-blanchard/7045057
-  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; else wget http://repo.continuum.io/miniconda/Miniconda3-3.7.3-Linux-x86_64.sh -O miniconda.sh; fi
-  - chmod +x miniconda.sh
-  - ./miniconda.sh -b
-  - export PATH=/home/travis/miniconda3/bin:/home/travis/miniconda2/bin:$PATH
-  - conda update --yes conda
+  - pip install --upgrade pip
   - sudo apt-get update -qq
   - sudo apt-get install -qq build-essential libgsl0-dev bedtools mummer
   - "export DISPLAY=:99.0"
   - "sh -e /etc/init.d/xvfb start"
+  - pip install -r requirements.txt
 install:
-  - conda install --yes python=$TRAVIS_PYTHON_VERSION cython numpy scipy biopython pandas pip scikit-learn docutils sphinx jinja2 seaborn
-  - pip install bcbio-gff
   - python setup.py install
 # command to run tests
 script: nosetests
34 changes: 34 additions & 0 deletions Dockerfile
@@ -0,0 +1,34 @@
# Docker for CONCOCT (http://github.com/BinPro/CONCOCT) v1.0.0
# VERSION 1.0.0
#
# This Dockerfile creates and sets up an Ubuntu environment with all
# dependencies for CONCOCT v1.0.0 installed.
#
# To log in to the container with a shared directory from the host do:
#
# docker run -v /my/host/shared/directory:/my/docker/location -i -t alneberg/concoct_1.0.0 /bin/bash
#
#

FROM ubuntu:18.04
COPY . /opt/CONCOCT

# Get basic ubuntu packages needed
RUN apt-get update -qq
RUN apt-get install -qq wget build-essential libgsl0-dev git zip unzip bedtools python-pip

RUN pip install --upgrade pip

# Install python dependencies and fetch and install CONCOCT 1.0.0
RUN cd /opt/CONCOCT;\
    pip install -r requirements.txt

# wget --no-check-certificate https://github.com/BinPro/CONCOCT/archive/1.0.0.tar.gz;\
# tar xf 1.0.0.tar.gz;\
# cd CONCOCT-1.0.0;\
# python setup.py install

RUN cd /opt/CONCOCT/;\
python setup.py install

RUN cd /opt/CONCOCT/;\
nosetests
39 changes: 32 additions & 7 deletions README.md
@@ -1,12 +1,8 @@
-## CONCOCT 0.4.2 [![Build Status](https://travis-ci.org/BinPro/CONCOCT.png?branch=master)](https://travis-ci.org/BinPro/CONCOCT)
+## CONCOCT 1.0.0 [![Build Status](https://travis-ci.org/BinPro/CONCOCT.png?branch=master)](https://travis-ci.org/BinPro/CONCOCT)
 
-A program for unsupervised binning of metagenomic contigs by using nucleotide composition,
+A program for unsupervised binning of metagenomic contigs by using nucleotide composition,
 coverage data in multiple samples and linkage data from paired end reads.
 
-Warning! This software is to be considered under development. Functionality and the user interface may still change significantly from one version to another.
-If you want to use this software, please stay up to date with the list of known issues:
-https://github.com/BinPro/CONCOCT/issues

## Please Cite ##
If you use CONCOCT in your publication, please cite:

@@ -15,8 +11,37 @@ Johannes Alneberg, Brynjar Smári Bjarnason, Ino de Bruijn, Melanie Schirmer, Jo
## Documentation ##
A comprehensive documentation for concoct is hosted on [readthedocs](https://concoct.readthedocs.org).

## Basic Usage ##
Cut contigs into smaller parts
```bash
cut_up_fasta.py original_contigs.fa -c 10000 -o 0 --merge_last -b contigs_10K.bed > contigs_10K.fa
```
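
For orientation, a minimal sketch of the chunking idea (illustrative only; not the actual `cut_up_fasta.py`, which also emits the BED file, and the `.concoct_part_<n>` naming is assumed here from the companion merge script):

```python
# Illustrative sketch of cutting contigs into ~10 kb parts, with a short
# final remainder merged into the previous chunk (--merge_last behaviour).
# Assumes Biopython; the part naming is an assumption, not a documented contract.
from Bio import SeqIO

def cut_up(records, chunk_size=10000, merge_last=True):
    for rec in records:
        seq = str(rec.seq)
        n_chunks = max(len(seq) // chunk_size, 1)
        for i in range(n_chunks):
            start = i * chunk_size
            # the final chunk absorbs the tail when merge_last is set
            end = len(seq) if (merge_last and i == n_chunks - 1) else start + chunk_size
            yield "%s.concoct_part_%d" % (rec.id, i), seq[start:end]

for part_id, part_seq in cut_up(SeqIO.parse("original_contigs.fa", "fasta")):
    print(">%s\n%s" % (part_id, part_seq))
```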

Generate a table with coverage depth information per sample and subcontig.
This step assumes the directory 'mapping' contains sorted and indexed bam files where each sample has been mapped against the original contigs; a pysam-based preparation sketch follows the command below.
```bash
concoct_coverage_table.py contigs_10K.bed mapping/Sample*.sorted.bam > coverage_table.tsv
```
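
As referenced above, a sketch of that preparation using pysam (pysam is an assumption here; any `samtools sort` and `samtools index` route gives the same result):

```python
# Sort and index each per-sample BAM so concoct_coverage_table.py can read it.
# Assumes pysam is installed and raw BAMs are named mapping/Sample<N>.bam.
import glob
import pysam

for bam in glob.glob("mapping/Sample*.bam"):
    if bam.endswith(".sorted.bam"):  # skip files already processed on a rerun
        continue
    sorted_bam = bam.replace(".bam", ".sorted.bam")
    pysam.sort("-o", sorted_bam, bam)  # wraps `samtools sort`
    pysam.index(sorted_bam)            # wraps `samtools index`
```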

Run concoct
```bash
concoct --composition_file contigs_10K.fa --coverage_file coverage_table.tsv -b concoct_output/
```

Merge subcontig clustering into original contig clustering
```bash
merge_cutup_clustering.py concoct_output/clustering_gt1000.csv > concoct_output/clustering_merged.csv
```
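
Conceptually, the merge is a majority vote over the parts of each original contig; a minimal sketch (the `<contig>.concoct_part_<n>` naming and the `contig_id,cluster_id` header are both assumptions here):

```python
# Illustrative majority-vote merge of subcontig clusters
# (not the actual merge_cutup_clustering.py implementation).
import csv
from collections import Counter, defaultdict

votes = defaultdict(Counter)
with open("concoct_output/clustering_gt1000.csv") as fh:
    for part_id, cluster in csv.reader(fh):
        if part_id == "contig_id":  # skip the header row (assumed name)
            continue
        original_contig = part_id.rsplit(".concoct_part_", 1)[0]
        votes[original_contig][cluster] += 1

with open("concoct_output/clustering_merged.csv", "w") as out:
    out.write("contig_id,cluster_id\n")
    for contig, counter in votes.items():
        out.write("%s,%s\n" % (contig, counter.most_common(1)[0][0]))
```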

Extract bins as individual FASTA files
```bash
mkdir concoct_output/fasta_bins
extract_fasta_bins.py original_contigs.fa concoct_output/clustering_merged.csv --output_path concoct_output/fasta_bins
```
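
A sketch of what the extraction amounts to (illustrative only; assumes Biopython and the directory created by the `mkdir` above):

```python
# Illustrative per-cluster FASTA extraction
# (not the actual extract_fasta_bins.py implementation).
import csv
from Bio import SeqIO

with open("concoct_output/clustering_merged.csv") as fh:
    cluster_of = dict(csv.reader(fh))  # a header row becomes a harmless unused entry

bins = {}
for rec in SeqIO.parse("original_contigs.fa", "fasta"):
    if rec.id in cluster_of:
        bins.setdefault(cluster_of[rec.id], []).append(rec)

for cluster, records in bins.items():
    SeqIO.write(records, "concoct_output/fasta_bins/%s.fa" % cluster, "fasta")
```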

## Support ##
-If you are having issues, please let us know. We have a mailing list located at: concoct-support@lists.sourcefourge.net which you can also subscribe to [here](https://lists.sourceforge.net/lists/listinfo/concoct-support).
 [![Gitter](https://img.shields.io/badge/gitter-%20join%20chat%20%E2%86%92-4fb99a.svg?style=flat-square)](https://gitter.im/BinPro/CONCOCT)
+If you are having trouble running CONCOCT or interpreting any results, please don't hesitate to write a question in our Gitter channel.

## Contribute ##

32 changes: 22 additions & 10 deletions bin/concoct
@@ -1,16 +1,17 @@
 #!/usr/bin/env python
 from __future__ import division
+from __future__ import print_function
 
 import sys
 import logging
-import vbgmm
 import numpy as np
 
 from concoct.output import Output
 from concoct.parser import arguments
 from concoct.input import load_data
 from concoct.transform import perform_pca
 
+import vbgmm

def main(args):
# Initialize output handling
@@ -22,24 +23,25 @@ def main(args):
     if len(composition) < 2:
         logging.error('Not enough contigs pass the threshold filter. Exiting!')
         sys.exit(-1)
 
     if cov is not None:
         joined = composition.join(cov.ix[:,cov_range[0]:cov_range[1]],how="inner")
     else:
         joined = composition
 
     # Fix special case in pca_components
     if args.pca_components == "All":
-        args.pca_components = joined[args.length_threshold_filter].shape[1]
+        args.pca_components = joined.shape[1]
 
     #PCA on the contigs that have kmer count greater than length_threshold
     transform_filter, pca = perform_pca(
         joined,
-        args.pca_components
+        args.pca_components,
+        args.seed
     )
 
     logging.info('Performed PCA, resulted in %s dimensions' % transform_filter.shape[1])
 
     if not args.no_original_data:
         Output.write_original_data(
             joined,
@@ -59,13 +61,23 @@

     logging.info('PCA transformed data.')
 
-    logging.info('Will call vbgmm with parameters: %s, %s, %s' % (Output.CONCOCT_PATH, args.clusters, args.length_threshold))
+    logging.info('Will call vbgmm with parameters: %s, %s, %s, %s' % (Output.CONCOCT_PATH, args.clusters, args.length_threshold, args.threads))
 
+    N_contigs = transform_filter.shape[0]
+    assign = np.zeros(N_contigs, dtype=np.int32)
 
+    assign = vbgmm.fit(np.copy(transform_filter,order='C'), int(args.clusters), int(args.seed), int(args.threads))
 
-    vbgmm.fit(Output.CONCOCT_PATH, args.clusters, args.length_threshold,args.seed,args.iterations,args.epsilon,args.converge_out)
 
+    Output.write_assign(
+        assign,
+        args.length_threshold,
+        joined.index,
+    )
 
+    logging.info("CONCOCT Finished")
 
 
 if __name__=="__main__":
     args = arguments()
     if args.total_percentage_pca == 100:
@@ -75,4 +87,4 @@ if __name__=="__main__":

     results = main(args)
 
-    print >> sys.stderr, "CONCOCT Finished, the log shows how it went."
+    print("CONCOCT Finished, the log shows how it went.", file=sys.stderr)
82 changes: 82 additions & 0 deletions bin/concoct_refine
@@ -0,0 +1,82 @@
#!/usr/bin/env python
from __future__ import division
DESC="""A script that iterates over concoct results and reruns the concoct algorithm
for clusters where the median SCG presence is at least 2."""


import sys
import logging
import vbgmm
import numpy as np
import argparse
import pandas as p

from sklearn.decomposition import PCA

from concoct.transform import perform_pca

def main(argv):
    parser = argparse.ArgumentParser(description=DESC)

    parser.add_argument("cluster_file", help="string specifying cluster file")

    parser.add_argument("original_data", help="string original but transformed data file")

    parser.add_argument("scg_file", help="string specifying scg frequency file")

    parser.add_argument('-e','--expansion_factor',default=2, type=int,
                        help=("number of clusters to expand by"))

    parser.add_argument('-t','--threads',default=1, type=int,
                        help=("number of threads to use defaults to one"))

    args = parser.parse_args()

    clusters = p.read_csv(args.cluster_file, header=None, index_col=0)

    original_data = p.read_csv(args.original_data, header=0, index_col=0)

    original_data_matrix = original_data.as_matrix()

    scg_freq = p.read_csv(args.scg_file, header=0, index_col=0)

    scg_freq_matrix = scg_freq.as_matrix()

    med_scgs = np.median(scg_freq_matrix, axis=1)

    clusters_matrix = clusters.as_matrix()

    cluster_freq = np.bincount(clusters_matrix[:,0])

    K = cluster_freq.shape[0]
    new_clusters_matrix = np.copy(clusters_matrix,order='C')
    nNewK = K - 1
    for k in range(K):
        if med_scgs[k] > 1:

            select = clusters_matrix == k

            slice_k = original_data_matrix[select[:,0],:]

            index_k = np.where(select[:,0])[0]

            pca_object = PCA(n_components=0.90).fit(slice_k)
            transform_k = pca_object.transform(slice_k)

            NK = med_scgs[k]*args.expansion_factor
            print("Run CONCOCT for " + str(k) + " with " + str(NK) + " clusters using " + str(args.threads) + " threads")
            assigns = vbgmm.fit(np.copy(transform_k,order='C'),int(NK),int(args.threads))
            kK = np.max(assigns) + 1

            for a in range(1,kK):
                index_a = index_k[assigns == a]
                new_clusters_matrix[index_a] = nNewK + a

            nNewK = nNewK + kK - 1

    new_assign_df = p.DataFrame(new_clusters_matrix,index=original_data.index)
    new_assign_df.to_csv("clustering_refine.csv")

if __name__ == "__main__":
    main(sys.argv[1:])
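
The renumbering in the loop above keeps sub-cluster 0 under the original cluster id and hands sub-clusters 1..kK-1 fresh ids past the current maximum; a toy trace of that arithmetic:

```python
# Toy trace of concoct_refine's cluster renumbering (same logic as above).
import numpy as np

clusters = np.array([0, 0, 1, 1, 1, 2])  # original assignments, K = 3
nNewK = clusters.max()                   # K - 1 == 2
assigns = np.array([0, 1, 1])            # refit split cluster 1 into two sub-clusters

index_k = np.where(clusters == 1)[0]
kK = np.max(assigns) + 1
for a in range(1, kK):
    clusters[index_k[assigns == a]] = nNewK + a
nNewK = nNewK + kK - 1

print(clusters)  # [0 0 1 3 3 2]: sub-cluster 0 kept id 1, sub-cluster 1 became 3
```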

16 changes: 16 additions & 0 deletions c-concoct/Makefile
@@ -0,0 +1,16 @@
CC = gcc
CFLAGS = -std=c99 -g -I/usr/local/include/
EFLAGS =
EFILE = test_vbgmmfit
LIBS = -lgomp -lpthread -lm -lgsl -lgslcblas -L/usr/local/lib
OBJS = c_vbgmm_fit.o test_vbgmm_fit.o

$(EFILE) : $(OBJS)
	@echo "linking..."
	$(CC) $(EFLAGS) -o $(EFILE) $(OBJS) $(LIBS)

$(OBJS) : c_vbgmm_fit.c c_vbgmm_fit.h
	$(CC) $(CFLAGS) -c $*.c

clean:
	rm -rf *.o test_vbgmmfit
