
Commit c9fe807
Merge pull request #268 from BinPro/develop
Prepare version 1.1.0
alneberg committed Aug 2, 2019
2 parents a46214d + 43ac24d commit c9fe807
Showing 44 changed files with 789 additions and 453 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,5 +15,6 @@ c-concoct/vbgmm.c
.screenrc
#ignore test folder
tests/nose_tmp_output
+tests/test_data/integration_test_data
#ignore doc builds
doc/build/*
10 changes: 7 additions & 3 deletions .travis.yml
@@ -1,20 +1,24 @@
language: python
python:
  - "2.7"
  - "3.4"
  - "3.5"
+# Since we are using system_site_packages, we are only able to use
+# the default python versions, see:
+# https://docs.travis-ci.com/user/languages/python/#travis-ci-uses-isolated-virtualenvs
# command to install dependencies
virtualenv:
  system_site_packages: true
services:
  - xvfb
before_install:
  - pip install --upgrade pip
  - sudo apt-get update -qq
-  - sudo apt-get install -qq build-essential libgsl0-dev bedtools mummer
+  - sudo apt-get install -qq build-essential libgsl0-dev bedtools mummer samtools
-  - "export DISPLAY=:99.0"
-  - "sh -e /etc/init.d/xvfb start"
  - pip install -r requirements.txt
+  - wget https://github.com/BinPro/integration_test_data/archive/v1.0.tar.gz
+  - mkdir tests/test_data/integration_test_data
+  - tar -xvzf v1.0.tar.gz -C tests/test_data/integration_test_data --strip-components=1
install:
- python setup.py install
# command to run tests
34 changes: 34 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,34 @@
# Changelog

A list of changes per version.
Changes made in the develop branch should be added here.

## [v1.1.0] 2019-08-02

### `Changed`

- [#236](https://github.com/BinPro/CONCOCT/pull/236) - Always add a suffix to contigs at cutup, even when they are not cut
- [#254](https://github.com/BinPro/CONCOCT/pull/254) - Slight cleanup of concoct refine
- [#258](https://github.com/BinPro/CONCOCT/pull/258) - New suffixes (.concoct_part_XX) are now used for contig parts
- [#261](https://github.com/BinPro/CONCOCT/pull/261) - Epsilon argument removed as it was not working and is not very useful
- [#262](https://github.com/BinPro/CONCOCT/pull/262) - Rewrote documentation, including installation instructions
- [#264](https://github.com/BinPro/CONCOCT/pull/264) - The `concoct_part_` suffix is now enforced for subcontig names in the coverage script
- [#264](https://github.com/BinPro/CONCOCT/pull/264) - A header line is now required in the input to `merge_cutup_clustering.py` and `extract_fasta_bins.py`
- [#267](https://github.com/BinPro/CONCOCT/pull/267) - Updated documentation

### `Added`

- [#253](https://github.com/BinPro/CONCOCT/pull/253) - A Dockerfile for testing the conda installation
- [#258](https://github.com/BinPro/CONCOCT/pull/258) - Tests for all fundamental scripts, including a new integration test data repository
- [#259](https://github.com/BinPro/CONCOCT/pull/259) - This changelog
- [#262](https://github.com/BinPro/CONCOCT/pull/262) - Added documentation for the core scripts used with concoct
- [#265](https://github.com/BinPro/CONCOCT/pull/265) - A warning is now printed when concoct runs in single-threaded mode

### `Fixed`

- [#230](https://github.com/BinPro/CONCOCT/pull/230) - Enable at least single-threaded installation on Mac OSX
- [#231](https://github.com/BinPro/CONCOCT/pull/231) - Replace pandas .ix with .loc to fix deprecation warnings
- [#246](https://github.com/BinPro/CONCOCT/pull/246) - Limit some dependency version numbers for python 2
- [#254](https://github.com/BinPro/CONCOCT/pull/254) - Concoct refine now works with python 3
- [#258](https://github.com/BinPro/CONCOCT/pull/258) - Seed tests now working again
- [#260](https://github.com/BinPro/CONCOCT/pull/260) - Fix the dockerfile build by adding integration test data
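For orientation, the naming convention from #236/#258 composes subcontig ids as `<contig_id>.concoct_part_<N>`. A minimal sketch of how such names can be built and stripped again; the helper functions are illustrative, not CONCOCT's actual code:

```python
# Illustrative helpers for the .concoct_part_XX subcontig naming scheme.
def cutup_name(contig_id, part_index):
    # Every part gets a suffix, even when the contig was not cut (#236).
    return "{0}.concoct_part_{1}".format(contig_id, part_index)

def original_name(subcontig_id):
    # Strip the part suffix to recover the original contig id.
    original, _, _ = subcontig_id.rpartition('.concoct_part_')
    return original

assert cutup_name('contig_1', 0) == 'contig_1.concoct_part_0'
assert original_name('contig_1.concoct_part_0') == 'contig_1'
```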
21 changes: 10 additions & 11 deletions Dockerfile
@@ -1,31 +1,30 @@
-# Docker for CONCOCT (http://github.com/BinPro/CONCOCT) v1.0.0
-# VERSION 1.0.0
+# Docker for CONCOCT (http://github.com/BinPro/CONCOCT) v1.1.0
+# VERSION 1.1.0
#
# This docker creates and sets up an Ubuntu environment with all
-# dependencies for CONCOCT v1.0.0 installed.
+# dependencies for CONCOCT v1.1.0 installed.
#
# To login to the docker with a shared directory from the host do:
#
-# docker run -v /my/host/shared/directory:/my/docker/location -i -t alneberg/concoct_1.0.0 /bin/bash
+# docker run -v /my/host/shared/directory:/my/docker/location -i -t alneberg/concoct_1.1.0 /bin/bash
#

FROM ubuntu:18.04
COPY . /opt/CONCOCT

# Get basic ubuntu packages needed
RUN apt-get update -qq
-RUN apt-get install -qq wget build-essential libgsl0-dev git zip unzip bedtools python-pip
+RUN apt-get install -qq wget build-essential libgsl0-dev git zip unzip bedtools python-pip samtools

RUN pip install --upgrade pip

-# Install python dependencies and fetch and install CONCOCT 1.0.0
+RUN wget --no-check-certificate https://github.com/BinPro/integration_test_data/archive/v1.1.tar.gz
+RUN mkdir /opt/CONCOCT/tests/test_data/integration_test_data
+RUN tar -xvzf v1.1.tar.gz -C /opt/CONCOCT/tests/test_data/integration_test_data --strip-components=1

+# Install python dependencies and fetch and install CONCOCT 1.1.0
RUN cd /opt/CONCOCT;\
    pip install -r requirements.txt;\

-# wget --no-check-certificate https://github.com/BinPro/CONCOCT/archive/1.0.0.tar.gz;\
-# tar xf 1.0.0.tar.gz;\
-# cd CONCOCT-1.0.0;\
-# python setup.py install

RUN cd /opt/CONCOCT/;\
    python setup.py install
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
-## CONCOCT 1.0.0 [![Build Status](https://travis-ci.org/BinPro/CONCOCT.png?branch=master)](https://travis-ci.org/BinPro/CONCOCT)
+## CONCOCT 1.1.0 [![Build Status](https://travis-ci.org/BinPro/CONCOCT.png?branch=master)](https://travis-ci.org/BinPro/CONCOCT)

A program for unsupervised binning of metagenomic contigs by using nucleotide composition,
coverage data in multiple samples and linkage data from paired end reads.
10 changes: 6 additions & 4 deletions bin/concoct
@@ -25,7 +25,7 @@ def main(args):
        sys.exit(-1)

    if cov is not None:
-        joined = composition.join(cov.ix[:,cov_range[0]:cov_range[1]],how="inner")
+        joined = composition.join(cov.loc[:,cov_range[0]:cov_range[1]],how="inner")
    else:
        joined = composition

@@ -61,12 +61,12 @@ def main(args):

    logging.info('PCA transformed data.')

-    logging.info('Will call vbgmm with parameters: %s, %s, %s, %s' % (Output.CONCOCT_PATH, args.clusters, args.length_threshold, args.threads))
+    logging.info('Will call vbgmm with parameters: %s, %s, %s, %s, %s' % (Output.CONCOCT_PATH, args.clusters, args.length_threshold, args.threads, args.iterations))

    N_contigs = transform_filter.shape[0]
    assign = np.zeros(N_contigs, dtype=np.int32)

-    assign = vbgmm.fit(np.copy(transform_filter,order='C'), int(args.clusters), int(args.seed), int(args.threads))
+    assign = vbgmm.fit(np.copy(transform_filter,order='C'), int(args.clusters), int(args.seed), int(args.threads), int(args.iterations))

    Output.write_assign(
@@ -85,6 +85,8 @@ if __name__=="__main__":
    else:
        args.pca_components = args.total_percentage_pca/100.0

+    if args.threads == 1:
+        logging.warning("CONCOCT is running in single-threaded mode. Please consider adjusting the --threads parameter.")
    results = main(args)

    print("CONCOCT Finished, the log shows how it went.", file=sys.stderr)
    logging.info("CONCOCT Finished, the log shows how it went.")
52 changes: 26 additions & 26 deletions bin/concoct_refine
@@ -4,39 +4,19 @@ DESC="""A script that iterates over concoct results and reruns the concoct algorithm
for clusters where the median SCG presence is at least 2."""


import sys
import logging
import vbgmm
import numpy as np
import argparse
import pandas as p

-from sklearn.decomposition import PCA
+from concoct.transform import perform_pca

-def main(argv):
-    parser = argparse.ArgumentParser(description=DESC)
-
-    parser.add_argument("cluster_file", help="string specifying cluster file")
-
-    parser.add_argument("original_data", help="string original but transformed data file")
-
-    parser.add_argument("scg_file", help="string specifying scg frequency file")
-
-    parser.add_argument('-e','--expansion_factor',default=2, type=int,
-                        help=("number of clusters to expand by"))
-
-    parser.add_argument('-t','--threads',default=1, type=int,
-                        help=("number of threads to use defaults to one"))
-
-    args = parser.parse_args()
-
-    clusters = p.read_csv(args.cluster_file, header=None, index_col=0)
+def main(args):
+    clusters = p.read_csv(args.cluster_file, header=None, index_col=0)

    original_data = p.read_csv(args.original_data, header=0, index_col=0)

-    original_data_matrix = original_data.as_matrix()
+    original_data_matrix = original_data.values

    scg_freq = p.read_csv(args.scg_file, header=0, index_col=0)

@@ -64,8 +44,8 @@ def main(argv):
        transform_k = pca_object.transform(slice_k)

        NK = med_scgs[k]*args.expansion_factor
-        print "Run CONCOCT for " + str(k) + "with " + str(NK) + "clusters" + " using " + str(args.threads) + "threads"
-        assigns = vbgmm.fit(np.copy(transform_k,order='C'),int(NK),int(args.threads))
+        print("Run CONCOCT for " + str(k) + " with " + str(NK) + " clusters using " + str(args.threads) + " threads")
+        assigns = vbgmm.fit(np.copy(transform_k,order='C'), int(NK), args.seed, args.threads, 500)  # 500: max VB iterations, matching the new five-argument vbgmm.fit signature
        kK = np.max(assigns) + 1


@@ -77,6 +57,26 @@

    new_assign_df = p.DataFrame(new_clusters_matrix,index=original_data.index)
    new_assign_df.to_csv("clustering_refine.csv")

if __name__ == "__main__":
-    main(sys.argv[1:])
+    parser = argparse.ArgumentParser(description=DESC)
+
+    parser.add_argument("cluster_file", help="string specifying cluster file")
+
+    parser.add_argument("original_data", help="string original but transformed data file")
+
+    parser.add_argument("scg_file", help="string specifying scg frequency file")
+
+    parser.add_argument('-e','--expansion_factor',default=2, type=int,
+                        help=("number of clusters to expand by"))
+
+    parser.add_argument('-s','--seed',default=11, type=int,
+                        help=("The seed used for algorithm result reproducibility."))
+
+    parser.add_argument('-t','--threads',default=1, type=int,
+                        help=("number of threads to use, defaults to one"))
+
+    args = parser.parse_args()
+
+    main(args)
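One effect of the cleanup is that `main()` no longer parses `sys.argv` itself, so the refine loop can be driven directly from other code, for example a test. A minimal sketch, with hypothetical file names:

```python
# Hypothetical invocation of concoct_refine's main() without the CLI;
# the file names are placeholders for real CONCOCT output files.
import argparse

args = argparse.Namespace(
    cluster_file='clustering_gt1000.csv',
    original_data='original_data_gt1000.csv',
    scg_file='scg_table.tsv',
    expansion_factor=2,
    seed=11,
    threads=1,
)
# main(args)  # would run the refinement exactly as the CLI does
```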

9 changes: 7 additions & 2 deletions c-concoct/c_vbgmm_fit.c
@@ -34,11 +34,16 @@
/*User includes*/
#include "c_vbgmm_fit.h"

-void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads)
+void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads, int nIter)
{
    int debug = 0;
    int bAssign = 0;
-    driverMP(adX, nN, nD, anAssign, nK, seed, DEF_MAX_ITER, DEF_EPSILON, debug, bAssign, nThreads);
+
+    if (nIter < 1){
+        nIter = DEF_MAX_ITER;
+    }
+
+    driverMP(adX, nN, nD, anAssign, nK, seed, nIter, DEF_EPSILON, debug, bAssign, nThreads);

    return;
}
2 changes: 1 addition & 1 deletion c-concoct/c_vbgmm_fit.h
@@ -121,7 +121,7 @@ typedef struct s_Cluster

/*user defines*/

-void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads);
+void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads, int nIter);

int driverMP(double *adX, int nN, int nD, int *anAssign, int nKStart, unsigned long lSeed,
int nMaxIter, double dEpsilon, int debug, int bAssign, int nThreads);
12 changes: 7 additions & 5 deletions c-concoct/vbgmm.pyx
@@ -12,11 +12,11 @@ import numpy as np
cimport numpy as np

# declare the interface to the C code
-cdef extern void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads)
+cdef extern void c_vbgmm_fit (double* adX, int nN, int nD, int nK, int seed, int* anAssign, int nThreads, int nIter)

@cython.boundscheck(False)
@cython.wraparound(False)
-def fit(np.ndarray[double, ndim=2, mode="c"] xarray not None, nClusters, seed, threads):
+def fit(np.ndarray[double, ndim=2, mode="c"] xarray not None, nClusters, seed, threads, piter):
"""
fit (xarray, nClusters, seed, threads)
@@ -26,18 +26,20 @@ def fit(np.ndarray[double, ndim=2, mode="c"] xarray not None, nClusters, seed, t
    param: nClusters -- an int, number of start clusters
    param: seed -- an int, the random seed
    param: threads -- int, the number of threads to use
+    param: piter -- int, the number of VB iterations to use
    """
-    cdef int nN, nD, nK, nThreads
+    cdef int nN, nD, nK, nThreads, nIter

    nN, nD = xarray.shape[0], xarray.shape[1]

    nK = nClusters

+    nIter = piter
+
    nThreads = threads

    cdef np.ndarray[int, ndim=1,mode="c"] assign = np.zeros((nN), dtype=np.intc)

-    c_vbgmm_fit (&xarray[0,0], nN, nD, nK, seed, &assign[0], nThreads)
+    c_vbgmm_fit (&xarray[0,0], nN, nD, nK, seed, &assign[0], nThreads, nIter)

    return assign
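Taken together, these changes thread the iteration cap from Python down to `driverMP`. A minimal sketch of calling the updated binding, on random stand-in data and with the defaults visible elsewhere in this commit (seed 11, 500 iterations):

```python
# Sketch only: assumes the vbgmm extension from c-concoct/ is built.
import numpy as np
import vbgmm

rng = np.random.RandomState(11)
data = rng.rand(100, 5)  # stand-in for the PCA-transformed matrix

assign = vbgmm.fit(
    np.copy(data, order='C'),  # must be a C-contiguous 2-D double array
    10,    # nClusters: number of start clusters
    11,    # seed
    1,     # threads
    500,   # piter: max VB iterations; values < 1 fall back to DEF_MAX_ITER
)
print(assign[:10])  # one integer cluster label per input row
```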
16 changes: 8 additions & 8 deletions concoct/input.py
@@ -87,32 +87,32 @@ def load_coverage(cov_file, contig_lengths, no_cov_normalization, add_total_cove
    cov_range = (cov.columns[0],cov.columns[-1])

    # Adding pseudo count
-    cov.ix[:,cov_range[0]:cov_range[1]] = cov.ix[:,cov_range[0]:cov_range[1]].add(
+    cov.loc[:,cov_range[0]:cov_range[1]] = cov.loc[:,cov_range[0]:cov_range[1]].add(
        (read_length/contig_lengths),
        axis='index')

    if not no_cov_normalization:
        #Normalize per sample first
-        cov.ix[:,cov_range[0]:cov_range[1]] = \
-            _normalize_per_sample(cov.ix[:,cov_range[0]:cov_range[1]])
+        cov.loc[:,cov_range[0]:cov_range[1]] = \
+            _normalize_per_sample(cov.loc[:,cov_range[0]:cov_range[1]])

    temp_cov_range = None
    # Total coverage should be calculated after per sample normalization
    if add_total_coverage:
-        cov['total_coverage'] = cov.ix[:,cov_range[0]:cov_range[1]].sum(axis=1)
+        cov['total_coverage'] = cov.loc[:,cov_range[0]:cov_range[1]].sum(axis=1)
        temp_cov_range = (cov_range[0],'total_coverage')

    if not no_cov_normalization:
        # Normalize contigs next
-        cov.ix[:,cov_range[0]:cov_range[1]] = \
-            _normalize_per_contig(cov.ix[:,cov_range[0]:cov_range[1]])
+        cov.loc[:,cov_range[0]:cov_range[1]] = \
+            _normalize_per_contig(cov.loc[:,cov_range[0]:cov_range[1]])

    if temp_cov_range:
        cov_range = temp_cov_range

    # Log transform
-    cov.ix[:,cov_range[0]:cov_range[1]] = np.log(
-        cov.ix[:,cov_range[0]:cov_range[1]])
+    cov.loc[:,cov_range[0]:cov_range[1]] = np.log(
+        cov.loc[:,cov_range[0]:cov_range[1]])

    logging.info('Successfully loaded coverage data.')
    return cov, cov_range
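Every hunk above is the same mechanical substitution: the deprecated `.ix` indexer is replaced with `.loc`, which slices by label and includes both endpoints, so the existing `cov_range = (first_column, last_column)` tuples keep working unchanged. A small standalone illustration (made-up column names):

```python
# Demonstrates that .loc label slicing is inclusive at both ends,
# mirroring the cov_range usage in load_coverage().
import numpy as np
import pandas as pd

cov = pd.DataFrame(np.arange(9.0).reshape(3, 3),
                   columns=['cov_sample_1', 'cov_sample_2', 'cov_sample_3'])
cov_range = (cov.columns[0], cov.columns[-1])

# Deprecated: cov.ix[:, cov_range[0]:cov_range[1]]
subset = cov.loc[:, cov_range[0]:cov_range[1]]
print(list(subset.columns))  # ['cov_sample_1', 'cov_sample_2', 'cov_sample_3']
```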
5 changes: 5 additions & 0 deletions concoct/output.py
@@ -52,6 +52,11 @@ def __init__(self,basename,args):
"PCA_components_data_gt{0}.csv"
self.LOG_FILE_BASE = self.CONCOCT_PATH + 'log.txt'

# Reset any previous logging handlers, see:
# https://stackoverflow.com/a/49202811
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)

logging.basicConfig(
filename=self.LOG_FILE_BASE,
level=logging.INFO,
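The reset is needed because `logging.basicConfig` does nothing when the root logger already has handlers, so without it a second `Output` instance in the same process would keep writing to the first run's log file. A standalone sketch with hypothetical file names:

```python
# Why the handler reset matters: basicConfig() is a no-op once the
# root logger has a handler attached.
import logging

logging.basicConfig(filename='first_log.txt', level=logging.INFO)
logging.info('recorded in first_log.txt')

# Without this loop the basicConfig() call below would be ignored.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(filename='second_log.txt', level=logging.INFO)
logging.info('now recorded in second_log.txt')
```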
3 changes: 0 additions & 3 deletions concoct/parser.py
@@ -68,9 +68,6 @@ def arguments():
    parser.add_argument('-i','--iterations',type=int, default=500,
                        help=('Specify maximum number of iterations for the VBGMM. '
                              'Default value is 500'))
-    parser.add_argument('-e','--epsilon',type=float, default=1.0e-6,
-                        help=('Specify the epsilon for VBGMM. '
-                              'Default value is 1.0e-6'))
    parser.add_argument('--no_cov_normalization', default=False, action="store_true",
                        help=("By default the coverage is normalized with regards to samples, "
                              "then normalized with regards of contigs and finally log transformed. "
