diff --git a/.circleci/config.yml b/.circleci/config.yml index 3d954f579..2471aaf69 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -21,7 +21,7 @@ jobs: - run: conda config --set quiet true - run: conda install conda-build -c defaults - run: mkdir workspace - - run: conda build devtools/conda-recipe --python=3.7 --no-test --output-folder workspace + - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --no-test --output-folder workspace - persist_to_workspace: root: workspace paths: @@ -46,7 +46,7 @@ jobs: - run: mkdir -p $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS - attach_workspace: at: workspace - - run: conda build devtools/conda-recipe --python=3.7 --test --output-folder workspace + - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --test --output-folder workspace - run: bash <(curl -s https://codecov.io/bash) -f $HOME/coverage.xml - store_test_results: path: /tmp/circleci-test-results diff --git a/devtools/azure-pipelines-linux.yml b/devtools/azure-pipelines-linux.yml index 1ccb6f3df..1fbf0c870 100644 --- a/devtools/azure-pipelines-linux.yml +++ b/devtools/azure-pipelines-linux.yml @@ -6,15 +6,12 @@ jobs: strategy: matrix: - Python36: - CONDA_PY: '3.6' - CONDA_NPY: '1.16' Python37: CONDA_PY: '3.7' CONDA_NPY: '1.18' Python38: CONDA_PY: '3.8' - CONDA_NPY: '1.19' + CONDA_NPY: '1.18' Python39: CONDA_PY: '3.9' CONDA_NPY: '1.19' diff --git a/devtools/azure-pipelines-osx.yml b/devtools/azure-pipelines-osx.yml index cf4d3fe01..67a0e4b97 100644 --- a/devtools/azure-pipelines-osx.yml +++ b/devtools/azure-pipelines-osx.yml @@ -5,12 +5,12 @@ jobs: vmImage: 'macOS-10.14' strategy: matrix: - Python36: - CONDA_PY: '3.6' - CONDA_NPY: '1.17' Python37: CONDA_PY: '3.7' - CONDA_NPY: '1.19' + CONDA_NPY: '1.18' + Python38: + CONDA_PY: '3.8' + CONDA_NPY: '1.18' Python39: CONDA_PY: '3.9' CONDA_NPY: '1.19' diff --git a/devtools/azure-pipelines-win.yml b/devtools/azure-pipelines-win.yml index 2a590b423..d64d53d6e 100644 --- a/devtools/azure-pipelines-win.yml +++ b/devtools/azure-pipelines-win.yml @@ -7,10 +7,10 @@ jobs: matrix: Python37: CONDA_PY: '3.7' - CONDA_NPY: '1.19' + CONDA_NPY: '1.18' Python38: CONDA_PY: '3.8' - CONDA_NPY: '1.19' + CONDA_NPY: '1.18' Python39: CONDA_PY: '3.9' CONDA_NPY: '1.19' diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml index de097b739..85af0c989 100644 --- a/devtools/conda-recipe/meta.yaml +++ b/devtools/conda-recipe/meta.yaml @@ -35,7 +35,8 @@ requirements: - setuptools - pip - pybind11 - - deeptime + - pybind11-abi + - deeptime >=0.3.0 run: - bhmm >=0.6.3 diff --git a/pyemma/_base/serialization/tests/test_cli.py b/pyemma/_base/serialization/tests/test_cli.py index aaf220e5e..2d8bf111f 100644 --- a/pyemma/_base/serialization/tests/test_cli.py +++ b/pyemma/_base/serialization/tests/test_cli.py @@ -15,20 +15,21 @@ # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . - +import shutil import tempfile -import unittest +import pytest +from numpy.testing import assert_ from pyemma._base.serialization.cli import main from pyemma.coordinates import source, tica, cluster_kmeans -class TestListModelCLI(unittest.TestCase): - @classmethod - def setUpClass(cls): +@pytest.fixture +def model_file(): + file = None + try: from pyemma.datasets import get_bpti_test_data - d = get_bpti_test_data() trajs, top = d['trajs'], d['top'] s = source(trajs, top=top) @@ -36,25 +37,22 @@ def setUpClass(cls): t = tica(s, lag=1) c = cluster_kmeans(t) - cls.model_file = tempfile.mktemp() - c.save(cls.model_file, save_streaming_chain=True) - - @classmethod - def tearDownClass(cls): - import os - os.unlink(cls.model_file) - - def test_recursive(self): - """ check the whole chain has been printed""" - from pyemma.util.contexts import Capturing - with Capturing() as out: - main(['--recursive', self.model_file]) - assert out - all_out = '\n'.join(out) - self.assertIn('FeatureReader', all_out) - self.assertIn('TICA', all_out) - self.assertIn('Kmeans', all_out) - - -if __name__ == '__main__': - unittest.main() + file = tempfile.mktemp() + c.save(file, save_streaming_chain=True) + + yield file + finally: + if file is not None: + shutil.rmtree(file, ignore_errors=True) + + +def test_recursive(model_file): + """ check the whole chain has been printed""" + from pyemma.util.contexts import Capturing + with Capturing() as out: + main(['--recursive', model_file]) + assert out + all_out = '\n'.join(out) + assert_('FeatureReader' in all_out) + assert_('TICA' in all_out) + assert_('Kmeans' in all_out) diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py index 36f243171..32862d053 100644 --- a/pyemma/coordinates/api.py +++ b/pyemma/coordinates/api.py @@ -21,6 +21,8 @@ .. currentmodule:: pyemma.coordinates.api """ +from pathlib import Path + import numpy as _np import logging as _logging @@ -632,7 +634,7 @@ def discretizer(reader, return disc -def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None, image_molecules=False, verbose=True): +def save_traj(traj_inp, indexes, outfile, top=None, stride=1, chunksize=None, image_molecules=False, verbose=True): r""" Saves a sequence of frames as a single trajectory. Extracts the specified sequence of time/trajectory indexes from traj_inp @@ -753,6 +755,8 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None, return traj # or to disk as a molecular trajectory file else: + if isinstance(outfile, Path): + outfile = str(outfile.resolve()) traj.save(outfile) if verbose: _logger.info("Created file %s" % outfile) @@ -839,9 +843,12 @@ def save_trajs(traj_inp, indexes, prefix='set_', fmt=None, outfiles=None, # Determine output format of the molecular trajectory file if fmt is None: + fname = traj_inp.filenames[0] + while hasattr(fname, '__getitem__') and not isinstance(fname, (str, bytes)): + fname = fname[0] import os - _, fmt = os.path.splitext(traj_inp.filenames[0]) + _, fmt = os.path.splitext(fname) else: fmt = '.' + fmt diff --git a/pyemma/coordinates/clustering/include/Clustering.h b/pyemma/coordinates/clustering/include/Clustering.h deleted file mode 100644 index 1c0aaca48..000000000 --- a/pyemma/coordinates/clustering/include/Clustering.h +++ /dev/null @@ -1,78 +0,0 @@ -// -// Created by marscher on 4/3/17. -// - -#ifndef PYEMMA_CLUSTERING_H -#define PYEMMA_CLUSTERING_H - -#include -#include -#include - -#include "metric_base.h" - -namespace py = pybind11; - -template -class ClusteringBase { - -public: - enum MetricType { - EUCLIDEAN, MINRMSD - }; - - using np_array = py::array_t; - - ClusteringBase(const std::string& metric_s, std::size_t input_dimension) : input_dimension(input_dimension) { - if (metric_s == "euclidean") { - typedef euclidean_metric eucl; - metric = std::unique_ptr(new eucl(input_dimension)); - _metric_type = MetricType::EUCLIDEAN; - } else if(metric_s == "minRMSD") { - typedef min_rmsd_metric min_rmsd_t; - metric = std::unique_ptr(new min_rmsd_t(input_dimension)); - _metric_type = MetricType::MINRMSD; - } else { - throw std::invalid_argument("metric is not of {'euclidean', 'minRMSD'}"); - } - } - - virtual ~ClusteringBase()= default; - ClusteringBase(const ClusteringBase&) = delete; - ClusteringBase&operator=(const ClusteringBase&) = delete; - ClusteringBase(ClusteringBase&&) noexcept = default; - ClusteringBase&operator=(ClusteringBase&&) noexcept = default; - - std::unique_ptr> metric; - std::size_t input_dimension; - - py::array_t assign_chunk_to_centers(const py::array_t& chunk, - const py::array_t& centers, - unsigned int n_threads) const { - return metric->assign_chunk_to_centers(chunk, centers, n_threads); - } - - /** - * pre-center given centers in place - * @param centers - */ - void precenter_centers(np_array& centers) const { - switch (_metric_type) { - case MetricType::MINRMSD: { - auto ptr = dynamic_cast*>(metric.get()); - ptr->precenter_centers(centers.mutable_data(0), centers.shape(0)); - break; - } - default: { - throw std::runtime_error("precentering is only available for minRMSD metric."); - }; - } - } - -private: - MetricType _metric_type; -}; - - - -#endif //PYEMMA_CLUSTERING_H diff --git a/pyemma/coordinates/clustering/include/bits/kmeans_bits.h b/pyemma/coordinates/clustering/include/bits/kmeans_bits.h deleted file mode 100644 index b94cb3ac9..000000000 --- a/pyemma/coordinates/clustering/include/bits/kmeans_bits.h +++ /dev/null @@ -1,407 +0,0 @@ -// -// Created by marscher on 7/24/17. -// - - -#ifndef PYEMMA_KMEANS_BITS_H_H -#define PYEMMA_KMEANS_BITS_H_H - -#include "kmeans.h" -#include "threading_utils.h" - -#include -#include - -#include -#include - - -template -typename KMeans::np_array -KMeans::cluster(const np_array &np_chunk, const np_array &np_centers, int n_threads) const { - - if (np_chunk.ndim() != 2) { - throw std::runtime_error(R"(Number of dimensions of "chunk" ain't 2.)"); - } - if (np_centers.ndim() != 2) { - throw std::runtime_error(R"(Number of dimensions of "centers" ain't 2.)"); - } - - auto n_frames = static_cast(np_chunk.shape(0)); - auto dim = static_cast(np_chunk.shape(1)); - - if (dim == 0) { - throw std::invalid_argument("chunk dimension must be larger than zero."); - } - - auto chunk = np_chunk.template unchecked<2>(); - auto n_centers = static_cast(np_centers.shape(0)); - auto centers = np_centers.template unchecked<2>(); - - /* initialize centers_counter and new_centers with zeros */ - std::vector shape = {n_centers, dim}; - py::array_t return_new_centers(shape); - auto new_centers = return_new_centers.mutable_unchecked(); - std::fill(return_new_centers.mutable_data(), return_new_centers.mutable_data() + return_new_centers.size(), 0.0); - std::vector centers_counter(n_centers, 0); - - /* do the clustering */ - if (n_threads == 0) { - std::size_t closest_center_index = 0; - for (std::size_t i = 0; i < n_frames; ++i) { - auto mindist = std::numeric_limits::max(); - for(std::size_t j = 0; j < n_centers; ++j) { - auto d = parent_t::metric->compute(&chunk(i, 0), ¢ers(j, 0)); - if(d < mindist) { - mindist = d; - closest_center_index = j; - } - } - centers_counter[closest_center_index]++; - for (std::size_t j = 0; j < dim; j++) { - new_centers(closest_center_index, j) += chunk(i, j); - } - } - } else { -#if defined(USE_OPENMP) - omp_set_num_threads(n_threads); - -#pragma omp parallel for schedule(static, 1) - for (std::size_t i = 0; i < n_frames; ++i) { - std::vector dists(n_centers); - for (std::size_t j = 0; j < n_centers; ++j) { - dists[j] = parent_t::metric->compute(&chunk(i, 0), ¢ers(j, 0)); - } -#pragma omp flush(dists) - -#pragma omp critical(centers_counter) - { - auto closest_center_index = std::distance(dists.begin(), std::min_element(dists.begin(), dists.end())); - { - centers_counter.at(static_cast(closest_center_index))++; - for (std::size_t j = 0; j < dim; j++) { - new_centers(closest_center_index, j) += chunk(i, j); - } - } - } - } -#else - { - std::mutex mutex; - - std::vector threads; - threads.reserve(static_cast(n_threads)); - - std::size_t grainSize = n_frames / n_threads; - - auto worker = [&](std::size_t tid, std::size_t begin, std::size_t end, std::mutex& m) { - for (auto i = begin; i < end; ++i) { - std::size_t argMinDist = 0; - { - dtype minDist = parent_t::metric->compute(&chunk(i, 0), ¢ers(0, 0)); - for (std::size_t j = 1; j < n_centers; ++j) { - auto dist = parent_t::metric->compute(&chunk(i, 0), ¢ers(j, 0)); - if(dist < minDist) { - minDist = dist; - argMinDist = j; - } - } - } - - { - std::unique_lock lock(m); - - centers_counter.at(argMinDist)++; - for (std::size_t j = 0; j < dim; j++) { - new_centers(argMinDist, j) += chunk(i, j); - } - } - } - }; - - for(std::uint8_t i = 0; i < n_threads - 1; ++i) { - threads.emplace_back(worker, i, i*grainSize, (i+1)*grainSize, std::ref(mutex)); - } - threads.emplace_back(worker, n_threads, (n_threads - 1) * grainSize, n_frames, std::ref(mutex)); - } -#endif - } - - auto centers_counter_it = centers_counter.begin(); - for (std::size_t i = 0; i < n_centers; ++i, ++centers_counter_it) { - if (*centers_counter_it == 0) { - for (std::size_t j = 0; j < dim; ++j) { - new_centers(i, j) = centers(i, j); - } - } else { - for (std::size_t j = 0; j < dim; ++j) { - new_centers(i, j) /= static_cast(*centers_counter_it); - } - } - } - return return_new_centers; -} - - -template -typename KMeans::cluster_res KMeans::cluster_loop(const np_array& np_chunk, np_array& np_centers, - int n_threads, int max_iter, float tolerance, - py::object& callback) const { - int it = 0; - bool converged = false; - dtype rel_change = std::numeric_limits::max(); - dtype prev_cost = 0; - do { - np_centers = cluster(np_chunk, np_centers, n_threads); - auto cost = costFunction(np_chunk, np_centers, n_threads); - rel_change = (cost != 0.0) ? std::abs(cost - prev_cost) / cost : 0; - prev_cost = cost; - if(rel_change <= tolerance) { - converged = true; - } else { - if(! callback.is_none()) { - /* Acquire GIL before calling Python code */ - py::gil_scoped_acquire acquire; - callback(); - } - } - - it += 1; - } while(it < max_iter && ! converged); - int res = converged ? 0 : 1; - return std::make_tuple(std::move(np_centers), res, it); -} - -template -dtype KMeans::costFunction(const np_array &np_data, const np_array &np_centers, int n_threads) const { - auto data = np_data.template unchecked<2>(); - auto centers = np_centers.template unchecked<2>(); - - dtype value = 0.0; - auto n_frames = static_cast(np_data.shape(0)); -#ifdef USE_OPENMP - omp_set_num_threads(n_threads); -#endif - -#pragma omp parallel for reduction(+:value) - for (std::size_t i = 0; i < n_frames; i++) { - for (std::size_t r = 0; r < np_centers.shape(0); r++) { - auto l = parent_t::metric->compute(&data(i, 0), ¢ers(r, 0)); - { - value += l; - } - } - } - return value; -} - -template -typename KMeans::np_array KMeans:: -initCentersKMpp(const np_array &np_data, unsigned int random_seed, int n_threads, py::object& callback) const { - if (np_data.shape(0) < k) { - std::stringstream ss; - ss << "not enough data to initialize desired number of centers."; - ss << "Provided frames (" << np_data.shape(0) << ") < n_centers (" << k << ")."; - throw std::invalid_argument(ss.str()); - } - - constexpr auto size_t_max = std::numeric_limits::max(); - - std::size_t centers_found = 0; - std::size_t dim = parent_t::metric->dim; - - if (np_data.ndim() != 2) { - throw std::invalid_argument("input data does not have two dimensions."); - } - - if (np_data.shape(1) != dim) { - throw std::invalid_argument("input dimension of data does not match the requested metric ones."); - } - - auto n_frames = static_cast(np_data.shape(0)); - - /* number of trials before choosing the data point with the best potential */ - size_t n_trials = 2 + (size_t) log(k); - - /* allocate space for the index giving away which point has already been used as a cluster center */ - std::vector taken_points(n_frames, 0); - /* candidates allocations */ - std::vector next_center_candidates(n_trials, size_t_max); - std::vector next_center_candidates_rand(n_trials, 0); - std::vector next_center_candidates_potential(n_trials, 0); - /* allocate space for the array holding the squared distances to the assigned cluster centers */ - std::vector squared_distances(n_frames, 0); - - /* create the output objects */ - std::vector shape = {k, dim}; - np_array ret_init_centers(shape); - auto init_centers = ret_init_centers.mutable_unchecked(); - std::memset(init_centers.mutable_data(), 0, - static_cast(init_centers.size() * init_centers.itemsize())); - - const auto data = np_data.template unchecked<2>(); - /* initialize random device and pick first center randomly */ - std::default_random_engine generator(random_seed); - std::uniform_int_distribution uniform_dist(0, n_frames - 1); - auto first_center_index = uniform_dist(generator); - /* and mark it as assigned */ - taken_points[first_center_index] = 1; - /* write its coordinates into the init_centers array */ - for (std::size_t j = 0; j < dim; j++) { - init_centers(centers_found, j) = data(first_center_index, j); - } - /* increase number of found centers */ - centers_found++; - /* perform callback */ - if (!callback.is_none()) { - py::gil_scoped_acquire acquire; - callback(); - } -#ifdef USE_OPENMP - omp_set_num_threads(n_threads); -#endif - - /* iterate over all data points j, measuring the squared distance between j and the initial center i: */ - /* squared_distances[i] = distance(x_j, x_i)*distance(x_j, x_i) */ - dtype dist_sum = 0; - #pragma omp parallel for reduction(+:dist_sum) - for (std::size_t i = 0; i < n_frames; i++) { - if (i != first_center_index) { - auto value = parent_t::metric->compute(&data(i, 0), &data(first_center_index, 0)); - value = value * value; - squared_distances[i] = value; - /* build up dist_sum which keeps the sum of all squared distances */ - dist_sum += value; - } - } - - /* keep picking centers while we do not have enough of them... */ - while (centers_found < k) { - py::gil_scoped_release release; - - /* initialize the trials random values by the D^2-weighted distribution */ - for (std::size_t j = 0; j < n_trials; j++) { - next_center_candidates[j] = size_t_max; - auto point_index = uniform_dist(generator); - next_center_candidates_rand[j] = dist_sum * (static_cast(point_index) / - static_cast(uniform_dist.max())); - next_center_candidates_potential[j] = 0.0; - } - /* pick candidate data points corresponding to their random value */ - dtype sum = 0.0; - for (std::size_t i = 0; i < n_frames; i++) { - if (taken_points[i] == 0) { - sum += squared_distances[i]; - bool some_not_done{false}; - for (std::size_t j = 0; j < n_trials; j++) { - if (next_center_candidates[j] == size_t_max) { - if (sum >= next_center_candidates_rand[j]) { - assert(i < std::numeric_limits::max()); - next_center_candidates[j] = i; - } else { - some_not_done = true; - } - } - } - if (!some_not_done) break; - } - } - - /* now find the maximum squared distance for each trial... */ - std::atomic_bool terminate(false); - #pragma omp parallel for - for (std::size_t i = 0; i < n_frames; i++) { - if (terminate.load()) { - continue; - } - if (taken_points.at(i) == 0) { - for (std::size_t j = 0; j < n_trials; ++j) { - if (next_center_candidates.at(j) == size_t_max) { - terminate.store(true); - break; - } - #pragma omp critical - { - if (next_center_candidates.at(j) != i) { - auto value = parent_t::metric->compute(&data(i, 0), - &data(next_center_candidates.at(j), 0)); - auto d = value * value; - if (d < squared_distances.at(i)) { - next_center_candidates_potential[j] += d; - } else { - next_center_candidates_potential[j] += squared_distances[i]; - } - } - } - } - } - } - - /* ... and select the best candidate by the minimum value of the maximum squared distances */ - long best_candidate = -1; - auto best_potential = std::numeric_limits::max(); - for (std::size_t j = 0; j < n_trials; j++) { - if (next_center_candidates[j] != size_t_max && next_center_candidates_potential[j] < best_potential) { - best_potential = next_center_candidates_potential[j]; - best_candidate = next_center_candidates[j]; - } - } - - /* if for some reason we did not find a best candidate, just take the next available point */ - if (best_candidate == -1) { - for (std::size_t i = 0; i < n_frames; i++) { - if (taken_points[i] == 0) { - best_candidate = i; - break; - } - } - } - - /* check if best_candidate was set, otherwise break to avoid an infinite loop should things go wrong */ - if (best_candidate >= 0) { - /* write the best_candidate's components into the init_centers array */ - for (std::size_t j = 0; j < dim; j++) { - init_centers(centers_found, j) = data(best_candidate, j); - } - /* increase centers_found */ - centers_found++; - /* perform the callback */ - if (!callback.is_none()) { - py::gil_scoped_acquire acquire; - callback(); - } - /* mark the data point as assigned center */ - taken_points[best_candidate] = 1; - /* update the sum of squared distances by removing the assigned center */ - dist_sum -= squared_distances[best_candidate]; - - /* if we still have centers to assign, the squared distances array has to be updated */ - if (centers_found < k) { - /* Check for each data point if its squared distance to the freshly added center is smaller than */ - /* the squared distance to the previously picked centers. If so, update the squared_distances */ - /* array by the new value and also update the dist_sum value by removing the old value and adding */ - /* the new one. */ -#pragma omp parallel for - for (std::size_t i = 0; i < n_frames; ++i) { - if (taken_points[i] == 0) { - auto value = parent_t::metric->compute(&data(i, 0), &data(best_candidate, 0)); - auto d = value * value; -#pragma omp critical - { - if (d < squared_distances[i]) { - dist_sum += d - squared_distances[i]; - squared_distances[i] = d; - } - } - } - } - } - } else { - break; - } - } - if (centers_found != k) { throw std::runtime_error("kmeans++ failed to initialize all desired centers"); } - return ret_init_centers; -} - -#endif //PYEMMA_KMEANS_BITS_H_H diff --git a/pyemma/coordinates/clustering/include/bits/metric_base_bits.h b/pyemma/coordinates/clustering/include/bits/metric_base_bits.h deleted file mode 100644 index 4824c87dd..000000000 --- a/pyemma/coordinates/clustering/include/bits/metric_base_bits.h +++ /dev/null @@ -1,151 +0,0 @@ -// -// Created by marscher on 7/21/17. -// - -#ifndef PYEMMA_METRIC_BASE_BITS_H -#define PYEMMA_METRIC_BASE_BITS_H - -#include "../metric_base.h" - -#include -#include - -#ifdef USE_OPENMP -#include -#endif - -/** - * assign a given chunk to given centers using encapsuled metric. - * @tparam dtype - * @param chunk - * @param centers - * @param n_threads - * @return - */ -template -inline py::array_t metric_base::assign_chunk_to_centers(const np_array& chunk, - const np_array& centers, - unsigned int n_threads) { - if (chunk.ndim() != 2) { - throw std::invalid_argument("provided chunk does not have two dimensions."); - } - - if (centers.ndim() != 2) { - throw std::invalid_argument("provided centers does not have two dimensions."); - } - size_t N_centers = static_cast(centers.shape(0)); - size_t N_frames = static_cast(chunk.shape(0)); - size_t input_dim = static_cast(chunk.shape(1)); - - if ((input_dim != dim) || (input_dim != centers.shape(1))) { - throw std::invalid_argument("input dimension mismatch"); - } - std::vector dists(N_centers); - std::vector shape = {N_frames}; - py::array_t dtraj(shape); - - auto dtraj_buff = dtraj.template mutable_unchecked<1>(); - auto chunk_buff = chunk.template unchecked<2>(); - auto centers_buff = centers.template unchecked<2>(); - -#ifdef USE_OPENMP - /* Create a parallel thread block. */ - omp_set_num_threads(n_threads); - assert(omp_get_num_threads() == n_threads); -#endif - #pragma omp parallel - { - for(size_t i = 0; i < N_frames; ++i) { - /* Parallelize distance calculations to cluster centers to avoid cache misses */ - #pragma omp for - for(size_t j = 0; j < N_centers; ++j) { - dists[j] = compute(&chunk_buff(i, 0), ¢ers_buff(j, 0)); - } - #pragma omp flush(dists) - - /* Only one thread can make actual assignment */ - #pragma omp single - { - dtype mindist = std::numeric_limits::max(); - int argmin = -1; - for (size_t j = 0; j < N_centers; ++j) { - if (dists[j] < mindist) { - mindist = dists[j]; - argmin = static_cast(j); - } - } - dtraj_buff(i) = argmin; - } - /* Have all threads synchronize in progress through cluster assignments */ - #pragma omp barrier - } - } - return dtraj; -} - -/** - * euclidean distance method - * @tparam dtype - * @param a - * @param b - * @return - */ -template -inline dtype euclidean_metric::compute(const dtype *const a, const dtype *const b) { - double sum = 0.0; - //#pragma omp simd reduction(+:sum) - for (size_t i = 0; i < metric_base::dim; ++i) { - assert(std::isfinite(a[i])); - assert(std::isfinite(b[i])); - - auto d = a[i] - b[i]; - sum += d * d; - } - assert(std::isfinite(sum)); - return static_cast(std::sqrt(sum)); -} - -/** - * minRMSD distance function - * a: centers - * b: frames - * n: dimension of one frame - * buffer_a: pre-allocated buffer to store a copy of centers - * buffer_b: pre-allocated buffer to store a copy of frames - * trace_a_precalc: pre-calculated trace to centers (pointer to one value) - */ -template -inline dtype min_rmsd_metric::compute(const dtype *a, const dtype *b) { - float trace_a, trace_b; - auto dim3 = static_cast(parent_t::dim / 3); - std::vector buffer_b (b, b + parent_t::dim); - - if (!has_trace_a_been_precalculated) { - std::vector buffer_a (a, a + parent_t::dim); - inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3); - inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3); - - } else { - inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3); - trace_a = *trace_centers.data(); - } - - float msd = msd_atom_major(dim3, dim3, a, buffer_b.data(), trace_a, trace_b, 0, nullptr); - return std::sqrt(msd); -} - -template -inline void min_rmsd_metric::precenter_centers(float *centers, std::size_t N_centers) { - trace_centers.resize(N_centers); - float *trace_centers_p = trace_centers.data(); - - /* Parallelize centering of cluster generators */ - /* Note that this is already OpenMP-enabled */ - for (std::size_t j = 0; j < N_centers; ++j) { - inplace_center_and_trace_atom_major(¢ers[j * parent_t::dim], - &trace_centers_p[j], 1, parent_t::dim / 3); - } -} - - -#endif //PYEMMA_METRIC_BASE_BITS_H diff --git a/pyemma/coordinates/clustering/include/bits/threading_utils.h b/pyemma/coordinates/clustering/include/bits/threading_utils.h deleted file mode 100644 index 5e6b1e455..000000000 --- a/pyemma/coordinates/clustering/include/bits/threading_utils.h +++ /dev/null @@ -1,68 +0,0 @@ -// -// Created by clonkscher on 8/8/17. -// - -#ifndef PYEMMA_THREADING_UTILS_H -#define PYEMMA_THREADING_UTILS_H - - -#include -#include -#include - -/** - * scoped_thread implementation - */ -class scoped_thread { - std::thread t; -public: - /** - * Creates a new scoped_thread based on a thread object - * @param _t the reference thread - */ - template - explicit scoped_thread(Function &&fun, Args &&... args) - : t(std::forward(fun), std::forward(args)...) { - if (!t.joinable()) throw std::logic_error("No thread!"); - } - - /** - * joins the contained thread if its joinable - */ - ~scoped_thread() { - if (t.joinable()) { - t.join(); - } - } - - /** - * moves another scoped_thread into this - * @param rhs the other scoped_thread - */ - scoped_thread(scoped_thread &&rhs) { - t = std::move(rhs.t); - } - - /** - * moves another scoped_thread into this - * @param rhs the other scoped_thread - * @return myself - */ - scoped_thread &operator=(scoped_thread &&rhs) { - t = std::move(rhs.t); - return *this; - } - - /** - * copying is not allowed - */ - scoped_thread(const scoped_thread &) = delete; - - /** - * copying is not allowed - */ - scoped_thread &operator=(const scoped_thread &) = delete; -}; - - -#endif //PYEMMA_THREADING_UTILS_H diff --git a/pyemma/coordinates/clustering/include/kmeans.h b/pyemma/coordinates/clustering/include/kmeans.h deleted file mode 100644 index ca8be5126..000000000 --- a/pyemma/coordinates/clustering/include/kmeans.h +++ /dev/null @@ -1,68 +0,0 @@ -// -// Created by marscher on 4/3/17. -// - -#ifndef PYEMMA_KMEANS_H -#define PYEMMA_KMEANS_H - -#include - -#include "Clustering.h" - -namespace py = pybind11; - - -template -class KMeans : public ClusteringBase { -public: - using parent_t = ClusteringBase; - using np_array = py::array_t; - /** - * array with new cluster centers, return code (0 == converged), number of iterations taken. - */ - using cluster_res = std::tuple; - - KMeans(unsigned int k, - const std::string &metric, - size_t input_dimension) : ClusteringBase(metric, input_dimension), k(k) {} - - /** - * performs kmeans clustering on the given data chunk, provided a list of centers. - * @param np_chunk - * @param np_centers - * @param n_threads - * @return updated centers. - */ - np_array cluster(const np_array & /*np_chunk*/, const np_array & /*np_centers*/, int /*n_threads*/) const; - - /** - * - */ - cluster_res cluster_loop(const np_array & /*np_chunk*/, np_array & /*np_centers*/, - int /*n_threads*/, int /*max_iter*/, float /*tolerance*/, - py::object& /*callback*/) const; - - /** - * evaluate the quality of the centers - * - * @return - */ - dtype costFunction(const np_array & /*np_data*/, const np_array & /*np_centers*/, int /*n_threads*/) const; - - /** - * kmeans++ initialisation - * @param np_data - * @param random_seed - * @param n_threads - * @return init centers. - */ - np_array initCentersKMpp(const np_array& /*np_data*/, unsigned int /*random_seed*/, int /*n_threads*/, - py::object& /*callback*/) const; - -protected: - unsigned int k; -}; - -#include "bits/kmeans_bits.h" - -#endif //PYEMMA_KMEANS_H diff --git a/pyemma/coordinates/clustering/include/metric_base.h b/pyemma/coordinates/clustering/include/metric_base.h deleted file mode 100644 index 9111b99df..000000000 --- a/pyemma/coordinates/clustering/include/metric_base.h +++ /dev/null @@ -1,96 +0,0 @@ -// -// Created by marscher on 3/31/17. -// - -#ifndef PYEMMA_METRIC_H -#define PYEMMA_METRIC_H - -#include -#include -#include -#include -#include - -#include -#include - -namespace py = pybind11; - -/** - * Base type for all metrics. - * @tparam dtype eg. float, double - */ -template -class metric_base { - -public: - using np_array = py::array_t; - - explicit metric_base(std::size_t dim) : dim(dim) {} - virtual ~metric_base() = default; - metric_base(const metric_base&) = delete; - metric_base&operator=(const metric_base&) = delete; - metric_base(metric_base&&) = default; - metric_base&operator=(metric_base&&) = default; - - virtual dtype compute(const dtype *, const dtype *) = 0; - - py::array_t assign_chunk_to_centers(const np_array& chunk, - const np_array& centers, - unsigned int n_threads); - size_t dim; -}; - -template -class euclidean_metric : public metric_base { -public: - explicit euclidean_metric(size_t dim) : metric_base(dim) {} - ~euclidean_metric() = default; - euclidean_metric(const euclidean_metric&) = delete; - euclidean_metric&operator=(const euclidean_metric&) = delete; - euclidean_metric(euclidean_metric&&) = default; - euclidean_metric&operator=(euclidean_metric&&) = default; - - dtype compute(const dtype *, const dtype *); - -}; - -template -class min_rmsd_metric : public metric_base { - - static_assert(std::is_same::value, "only implemented for floats"); - -public: - using parent_t = metric_base; - min_rmsd_metric(std::size_t dim, float *precalc_trace_centers = nullptr) - : metric_base(dim) { - if (dim % 3 != 0) { - throw std::range_error("min_rmsd_metric is only implemented for input data with a dimension dividable by 3."); - } - has_trace_a_been_precalculated = precalc_trace_centers != nullptr; - } - ~min_rmsd_metric() = default; - min_rmsd_metric(const min_rmsd_metric&) = delete; - min_rmsd_metric&operator=(const min_rmsd_metric&) = delete; - min_rmsd_metric(min_rmsd_metric&&) = default; - min_rmsd_metric&operator=(min_rmsd_metric&&) = default; - dtype compute(const dtype *a, const dtype *b); - /** - * pre-center in place - * @param original_centers - * @param N_centers - */ - void precenter_centers(float *original_centers, std::size_t N_centers); - -private: - /** - * only used during cluster assignment. Avoids precentering the centers in every step. - */ - std::vector trace_centers; - bool has_trace_a_been_precalculated; - float trace_a_precentered; -}; - -#include "bits/metric_base_bits.h" - -#endif //PYEMMA_METRIC_H diff --git a/pyemma/coordinates/clustering/include/regspace.h b/pyemma/coordinates/clustering/include/regspace.h deleted file mode 100644 index 47244f349..000000000 --- a/pyemma/coordinates/clustering/include/regspace.h +++ /dev/null @@ -1,91 +0,0 @@ -// -// Created by marscher on 7/17/17. -// - -#ifndef PYEMMA_REGSPACE_H -#define PYEMMA_REGSPACE_H - -#if defined(USE_OPENMP) -#include -#endif - -#include - - -class MaxCentersReachedException : public std::exception { -public: - explicit MaxCentersReachedException(const char * m) : message{m} {} - virtual const char * what() const noexcept override {return message.c_str();} -private: - std::string message = ""; -}; - -template -class RegularSpaceClustering : public ClusteringBase { - using parent_t = ClusteringBase; -public: - /** - * - * @param dmin - * @param max_clusters - * @param metric - * @param input_dimension - */ - RegularSpaceClustering(dtype dmin, std::size_t max_clusters, - const std::string &metric, - size_t input_dimension) : - ClusteringBase(metric, input_dimension), - dmin(dmin), - max_clusters(max_clusters) {} - - /** - * loops over all points in chunk and checks for each center if the distance is smaller than dmin, - * if so, the point is appended to py_centers. This is done until max_centers is reached or all points have been - * added to the list. - * @param chunk array shape(n, d) - * @param py_centers python list containing found centers. - */ - void cluster(const py::array_t &chunk, py::list& py_centers, unsigned int n_threads) { - // this checks for ndim == 2 - const auto& data = chunk.template unchecked< 2 >(); - - std::size_t N_frames = chunk.shape(0); - std::size_t dim = chunk.shape(1); - std::size_t N_centers = py_centers.size(); - #if defined(USE_OPENMP) - omp_set_num_threads(n_threads); - #endif - // do the clustering - for (std::size_t i = 0; i < N_frames; ++i) { - auto mindist = std::numeric_limits::max(); - #pragma omp parallel for reduction(min:mindist) - for (std::size_t j = 0; j < N_centers; ++j) { - // TODO avoid the cast in inner loop? - auto point = py_centers[j].cast < py::array_t < dtype >> (); - auto d = parent_t::metric.get()->compute(&data(i, 0), point.data()); - if (d < mindist) mindist = d; - } - if (mindist > dmin) { - if (N_centers + 1 > max_clusters) { - throw MaxCentersReachedException( - "Maximum number of cluster centers reached. Consider increasing max_clusters " - "or choose a larger minimum distance, dmin."); - } - // add newly found center - std::vector shape = {1, dim}; - py::array_t new_center(shape, nullptr); - std::memcpy(new_center.mutable_data(), &data(i, 0), sizeof(dtype)*dim); - - py_centers.append(new_center); - N_centers++; - } - } - } - -protected: - dtype dmin; - std::size_t max_clusters; - -}; - -#endif //PYEMMA_REGSPACE_H diff --git a/pyemma/coordinates/clustering/interface.py b/pyemma/coordinates/clustering/interface.py index 3adc4db15..10b52124d 100644 --- a/pyemma/coordinates/clustering/interface.py +++ b/pyemma/coordinates/clustering/interface.py @@ -26,6 +26,8 @@ import os import numpy as np +from deeptime.clustering import ClusterModel, metrics + from pyemma._base.serialization.serialization import SerializableMixIn from pyemma._base.model import Model @@ -56,6 +58,8 @@ class AbstractClustering(StreamingEstimationTransformer, Model, ClusterMixin, NJ def __init__(self, metric='euclidean', n_jobs=None): super(AbstractClustering, self).__init__() + from ._ext import rmsd + metrics.register("minRMSD", rmsd) self.metric = metric self.clustercenters = None self._previous_stride = -1 @@ -153,18 +157,12 @@ def sample_indexes_by_cluster(self, clusters, nsample, replace=True): def _transform_array(self, X): """get closest index of point in :attr:`clustercenters` to x.""" X = np.require(X, dtype=np.float32, requirements='C') - if not hasattr(self, '_inst'): - self.logger.debug("new cluster inst") - from ._ext import ClusteringBase_f - self._inst = ClusteringBase_f(self.metric, X.shape[1]) - # for performance reasons we pre-center the cluster centers for minRMSD. if self.metric == 'minRMSD' and not self._precentered: - # self.logger.debug("precentering cluster centers for minRMSD.") - # self._inst.precenter_centers(self.clustercenters) self._precentered = True - dtraj = self._inst.assign(X, self.clustercenters, self.n_jobs) + model = ClusterModel(cluster_centers=self.clustercenters, metric=self.metric) + dtraj = model.transform(X) res = dtraj[:, None] # always return a column vector in this function return res diff --git a/pyemma/coordinates/clustering/kmeans.py b/pyemma/coordinates/clustering/kmeans.py index c96f01159..b4144ae22 100644 --- a/pyemma/coordinates/clustering/kmeans.py +++ b/pyemma/coordinates/clustering/kmeans.py @@ -35,6 +35,9 @@ from pyemma.util.units import bytes_to_string from pyemma.util.contexts import random_seed, nullcontext + +from deeptime.clustering import metrics, KMeans, MiniBatchKMeans + import numpy as np @@ -105,6 +108,9 @@ def __init__(self, n_clusters, max_iter=5, metric='euclidean', """ super(KmeansClustering, self).__init__(metric=metric, n_jobs=n_jobs) + from ._ext import rmsd + metrics.register("minRMSD", rmsd) + if clustercenters is None: clustercenters = [] @@ -199,6 +205,10 @@ def _check_resume_iteration(self): def _estimate(self, iterable, **kw): self._init_estimate() + estimator = KMeans(self.n_clusters, self.max_iter, self.metric, tolerance=self.tolerance, + init_strategy=self.init_strategy, fixed_seed=self.fixed_seed, n_jobs=self.n_jobs) + if len(self.clustercenters) > 0: + estimator.initial_centers = np.array(self.clustercenters) ctx = nullcontext() if 'data' not in self._progress_registered_stages else self._progress_context(stage='data') # collect the data only if, we have not done this previously (eg. keep_data=True) # or the centers are not initialized. @@ -212,7 +222,7 @@ def _estimate(self, iterable, **kw): # collect data self._collect_data(X, first_chunk, it.last_chunk) if not resume_centers: - # initialize cluster centers + # initialize cluster centers in case of uniform self._initialize_centers(X, itraj, it.pos, it.last_chunk) first_chunk = False self.initial_centers_ = self.clustercenters[:] @@ -226,22 +236,30 @@ def _estimate(self, iterable, **kw): format(len(self.clustercenters), self.n_clusters)) if self.show_progress: + if self.init_strategy == 'kmeans++': + self._progress_register(self.n_clusters, + description="initialize kmeans++ centers", stage=0) + self._progress_register(self.max_iter, description="kmeans iterations", stage=1) + + callback_init = lambda: self._progress_update(1, stage=0) callback = lambda: self._progress_update(1, stage=1) else: + callback_init = None callback = None - # run k-means with all the data - with self._progress_context(stage=1), self._finish_estimate(): - self.clustercenters, code, iterations = self._inst.cluster_loop(self._in_memory_chunks, self.clustercenters, - self.n_jobs, self.max_iter, self.tolerance, - callback) - if code == 0: - self._converged = True - self.logger.debug("Cluster centers converged after %i steps.", iterations + 1) - else: - self.logger.warning("Algorithm did not reach convergence criterion" - " of %g in %i iterations. Consider increasing max_iter.", - self.tolerance, self.max_iter) + with self._progress_context(stage=(0, 1)), self._finish_estimate(): + estimator.fit(self._in_memory_chunks, callback_init_centers=callback_init, callback_loop=callback, + n_jobs=self.n_jobs) + model = estimator.fetch_model() + self.clustercenters = model.cluster_centers + self._converged = model.converged + + if self.converged: + self.logger.debug("Cluster centers converged after %i steps.", len(model.inertias)) + else: + self.logger.warning("Algorithm did not reach convergence criterion" + " of %g in %i iterations. Consider increasing max_iter.", + self.tolerance, self.max_iter) return self @@ -278,10 +296,6 @@ def _init_estimate(self): n_chunks = self.data_producer.n_chunks(chunksize=self.chunksize, skip=self.skip, stride=self.stride) self._progress_register(n_chunks, description="creating data array", stage='data') - if self.init_strategy == 'kmeans++': - self._progress_register(self.n_clusters, - description="initialize kmeans++ centers", stage=0) - self._progress_register(self.max_iter, description="kmeans iterations", stage=1) self._init_in_memory_chunks(total_length) if self.init_strategy == 'uniform': @@ -292,9 +306,6 @@ def _init_estimate(self): self._init_centers_indices[idx] = random.sample(list(range(0, traj_len)), int( math.ceil((traj_len / float(total_length)) * self.n_clusters))) - from ._ext import kmeans as kmeans_mod - self._inst = kmeans_mod.Kmeans_f(self.n_clusters, self.metric, self.data_producer.ndim) - return stride def _initialize_centers(self, X, itraj, t, last_chunk): @@ -308,16 +319,6 @@ def _initialize_centers(self, X, itraj, t, last_chunk): if len(self.clustercenters) < self.n_clusters and t + l in self._init_centers_indices[itraj]: new = np.vstack((self.clustercenters, X[l])) self.clustercenters = new - elif last_chunk and self.init_strategy == 'kmeans++': - if self.init_strategy == 'kmeans++' and self.show_progress: - callback = lambda: self._progress_update(1, stage=0) - context = self._progress_context(stage=0) - else: - callback = None - context = nullcontext() - with context: - self.clustercenters = self._inst.init_centers_KMpp(self._in_memory_chunks, self.fixed_seed, self.n_jobs, - callback) def _collect_data(self, X, first_chunk, last_chunk): # beginning - compute @@ -342,6 +343,8 @@ class MiniBatchKmeansClustering(KmeansClustering): def __init__(self, n_clusters, max_iter=5, metric='euclidean', tolerance=1e-5, init_strategy='kmeans++', batch_size=0.2, oom_strategy='memmap', fixed_seed=False, stride=None, n_jobs=None, skip=0, clustercenters=None, keep_data=False): + from ._ext import rmsd + metrics.register("minRMSD", rmsd) if stride is not None: raise ValueError("stride is a dummy value in MiniBatch Kmeans") @@ -393,6 +396,12 @@ def _init_estimate(self): def _estimate(self, iterable, **kw): # mini-batch kmeans does not use stride. Enforce it. + + estimator = MiniBatchKMeans(n_clusters=self.n_clusters, metric=self.metric, n_jobs=self.n_jobs, + init_strategy=self.init_strategy) + if len(self.clustercenters) > 0: + estimator.initial_centers = np.array(self.clustercenters) + self.stride = None self._init_estimate() @@ -411,14 +420,11 @@ def _estimate(self, iterable, **kw): for X in iter(iterator): # collect data self._collect_data(X, first_chunk, iterator.last_chunk) - # initialize cluster centers - if i_pass == 0 and not self._check_resume_iteration(): - self._initialize_centers(X, iterator.current_trajindex, iterator.pos, iterator.last_chunk) first_chunk = False - # one pass over data completed - self.clustercenters = self._inst.cluster(self._in_memory_chunks, self.clustercenters, self.n_jobs) - cost = self._inst.cost_function(self._in_memory_chunks, self.clustercenters, self.n_jobs) + current_model = estimator.partial_fit(self._in_memory_chunks).fetch_model() + self.clustercenters = current_model.cluster_centers + cost = current_model.inertia rel_change = np.abs(cost - prev_cost) / cost if cost != 0.0 else 0.0 prev_cost = cost diff --git a/pyemma/coordinates/clustering/regspace.py b/pyemma/coordinates/clustering/regspace.py index 339c2e348..5e319eae7 100644 --- a/pyemma/coordinates/clustering/regspace.py +++ b/pyemma/coordinates/clustering/regspace.py @@ -31,7 +31,7 @@ from pyemma.util.exceptions import NotConvergedWarning import numpy as np -import deeptime as dt +from deeptime.clustering import RegularSpace, metrics, ClusterModel __all__ = ['RegularSpaceClustering'] @@ -80,8 +80,8 @@ def __init__(self, dmin, max_centers=1000, metric='euclidean', stride=1, n_jobs= """ super(RegularSpaceClustering, self).__init__(metric=metric, n_jobs=n_jobs) - from ._ext import RMSDMetric - dt.clustering.metrics.register("minRMSD", RMSDMetric) + from ._ext import rmsd + metrics.register("minRMSD", rmsd) self._converged = False self.set_params(dmin=dmin, metric=metric, @@ -135,12 +135,8 @@ def _estimate(self, iterable, **kwargs): # 3. add new centroid, if min(distance to all other clustercenters) >= dmin ######## # temporary list to store cluster centers - clustercenters = [] used_frames = 0 - regspace = dt.clustering.RegularSpace(dmin=self.dmin, max_centers=self.max_centers, - metric=self.metric, n_jobs=self.n_jobs) - - # from ._ext import regspace + regspace = RegularSpace(dmin=self.dmin, max_centers=self.max_centers, metric=self.metric, n_jobs=self.n_jobs) it = iterable.iterator(return_trajindex=False, stride=self.stride, chunk=self.chunksize, skip=self.skip) try: @@ -149,21 +145,26 @@ def _estimate(self, iterable, **kwargs): regspace.partial_fit(X.astype(np.float32, order='C', copy=False), n_jobs=self.n_jobs) used_frames += len(X) self._converged = True - except regspace.MaxCentersReachedException: - self._converged = False - msg = 'Maximum number of cluster centers reached.' \ - ' Consider increasing max_centers or choose' \ - ' a larger minimum distance, dmin.' - self.logger.warning(msg) - warnings.warn(msg) - # pass amount of processed data - used_data = used_frames / float(it.n_frames_total()) * 100.0 - raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data) + except Exception as e: + if 'MaxCentersReachedException' in e.__class__.__name__: + self._converged = False + msg = 'Maximum number of cluster centers reached.' \ + ' Consider increasing max_centers or choose' \ + ' a larger minimum distance, dmin.' + self.logger.warning(msg) + warnings.warn(msg) + # pass amount of processed data + used_data = used_frames / float(it.n_frames_total()) * 100.0 + raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data) + else: + # todo ugly workaround until maxcentersreached is placed not within metric subpackage but globally + # somewhere + raise finally: # even if not converged, we store the found centers. model = regspace.fetch_model() clustercenters = model.cluster_centers.squeeze().reshape(-1, iterable.ndim) - self._inst = dt.clustering.ClusterModel(clustercenters, metric=self.metric) + self._inst = ClusterModel(clustercenters, metric=self.metric) from types import MethodType def _assign(self, data, _, n_jobs): diff --git a/pyemma/coordinates/clustering/src/clustering_module.cpp b/pyemma/coordinates/clustering/src/clustering_module.cpp index 2c253020d..b8181a57a 100644 --- a/pyemma/coordinates/clustering/src/clustering_module.cpp +++ b/pyemma/coordinates/clustering/src/clustering_module.cpp @@ -2,47 +2,13 @@ // Created by marscher on 7/17/17. // -#include "metric.h" -#include "regspace.h" -#include "kmeans.h" +#include +#include +#include "register_clustering.h" -class MaximumMetric : public Metric { -public: - - double compute_squared_d(const double* xs, const double* ys, std::size_t dim) const override { - return _compute(xs, ys, dim); - } - float compute_squared_f(const float* xs, const float* ys, std::size_t dim) const override { - return _compute(xs, ys, dim); - } -private: - template - T _compute(const T* xs, const T* ys, std::size_t dim) const { - T result = 0.0; - for (size_t i = 0; i < dim; ++i) { - auto d = std::abs(xs[i] - ys[i]); - if (d > result) { - result = d; - } - } - return result*result; - } -}; - -class RMSDMetric : public Metric { -public: - - double compute_squared_d(const double* xs, const double* ys, std::size_t dim) const override { - std::vector xsCast (xs, xs+dim), ysCast (ys, ys+dim); - return _compute(xsCast.data(), ysCast.data(), dim); - } - float compute_squared_f(const float* xs, const float* ys, std::size_t dim) const override { - return _compute(xs, ys, dim); - } - -private: +struct RMSDMetric { template - T _compute(const T* xs, const T* ys, std::size_t dim) const { + static T compute(const T* xs, const T* ys, std::size_t dim) { if (dim % 3 != 0) { throw std::range_error("RMSDMetric is only implemented for input data with a dimension dividable by 3."); } @@ -53,45 +19,22 @@ class RMSDMetric : public Metric { inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3); inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3); + if constexpr(std::is_same::value) { + std::vector cast (xs, xs + dim); + return msd_atom_major(dim3, dim3, cast.data(), buffer_b.data(), trace_a, trace_b, 0, nullptr); + } else { + return msd_atom_major(dim3, dim3, xs, buffer_b.data(), trace_a, trace_b, 0, nullptr); + } + } - float msd = msd_atom_major(dim3, dim3, xs, - buffer_b.data(), trace_a, trace_b, 0, nullptr); - return msd; + template + static dtype compute_squared(const dtype* xs, const dtype* ys, std::size_t dim) { + auto d = compute(xs, ys, dim); + return d*d; } }; -using dtype = float; - PYBIND11_MODULE(_ext, m) { - m.doc() = "module containing clustering algorithms."; - - auto regspace_mod = m.def_submodule("regspace"); - auto kmeans_mod = m.def_submodule("kmeans"); - - typedef ClusteringBase cbase_f; - typedef RegularSpaceClustering regspace_f; - - // register base class first. - py::class_(m, "ClusteringBase_f") - .def(py::init()) - .def("assign", &cbase_f::assign_chunk_to_centers) - .def("precenter_centers", &cbase_f::precenter_centers); - // regular space clustering. - py::class_(regspace_mod, "Regspace_f") - .def(py::init()) - .def("cluster", ®space_f::cluster); - py::register_exception(regspace_mod, "MaxCentersReachedException"); - // kmeans - typedef KMeans kmeans_f; - py::class_(kmeans_mod, "Kmeans_f") - .def(py::init(), - py::arg("k"), py::arg("metric"), py::arg("dim")) - // py::arg("callback") = py::none() - .def("cluster", &kmeans_f::cluster) - .def("cluster_loop", &kmeans_f::cluster_loop) - .def("init_centers_KMpp", &kmeans_f::initCentersKMpp) - .def("cost_function", &kmeans_f::costFunction); - // py::class_(m, "Metric"); - py::object baseMetric = (py::object) py::module_::import("deeptime.clustering._clustering_bindings").attr("Metric"); - py::class_(m, "RMSDMetric", baseMetric).def(py::init<>()); + auto rmsdModule = m.def_submodule("rmsd"); + deeptime::clustering::registerClusteringImplementation(rmsdModule); } diff --git a/pyemma/coordinates/clustering/tests/test_kmeans.py b/pyemma/coordinates/clustering/tests/test_kmeans.py index 5c22d5934..5f9dbe79c 100644 --- a/pyemma/coordinates/clustering/tests/test_kmeans.py +++ b/pyemma/coordinates/clustering/tests/test_kmeans.py @@ -92,7 +92,6 @@ def test_3gaussian_1d_singletraj(self): while not km2.converged: km2.estimate(X=X, clustercenters=km2.clustercenters, keep_data=True) - assert np.linalg.norm(km1.clustercenters - km1.initial_centers_) > 0 np.testing.assert_allclose(km1.clustercenters, km2.clustercenters, err_msg="should yield same centers with fixed seed=%s for strategy %s, Initial centers=%s" % (fixed_seed, init_strategy, km2.initial_centers_), atol=1e-6) diff --git a/pyemma/coordinates/data/util/reader_utils.py b/pyemma/coordinates/data/util/reader_utils.py index a2a83a15c..14dbf0752 100644 --- a/pyemma/coordinates/data/util/reader_utils.py +++ b/pyemma/coordinates/data/util/reader_utils.py @@ -15,7 +15,7 @@ # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . - +from pathlib import Path from numpy import vstack import mdtraj as md @@ -127,6 +127,8 @@ def single_traj_from_n_files(file_list, top): """ traj = None for ff in file_list: + if isinstance(ff, Path): + ff = str(ff.resolve()) if traj is None: traj = md.load(ff, top=top) else: diff --git a/pyemma/coordinates/tests/test_save_trajs.py b/pyemma/coordinates/tests/test_save_trajs.py index c41d41079..398592134 100644 --- a/pyemma/coordinates/tests/test_save_trajs.py +++ b/pyemma/coordinates/tests/test_save_trajs.py @@ -29,6 +29,7 @@ import os import shutil import tempfile +from pathlib import Path import numpy as np import pyemma.coordinates as coor @@ -61,16 +62,19 @@ def setUp(self): self.sets = [set_1, set_2] - self.subdir = tempfile.mkdtemp(suffix='save_trajs_test/') + self.subdir = Path(tempfile.mkdtemp(suffix='save_trajs_test/')) # Instantiate the reader self.reader = coor.source(self.trajfiles, top=self.pdbfile) self.reader.chunksize = 30 - self.n_pass_files = [self.subdir + 'n_pass.set_%06u.xtc' % ii for ii in range(len(self.sets))] - self.one_pass_files = [self.subdir + '1_pass.set_%06u.xtc' % ii for ii in range(len(self.sets))] + self.frag_reader = coor.source([self.trajfiles], top=self.pdbfile) + self.n_pass_files = [str(self.subdir / ('n_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))] + self.frag_pass_files = [str(self.subdir / ('frag_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))] + self.one_pass_files = [str(self.subdir / ('1_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))] self.traj_ref = save_traj_w_md_load_frame(self.reader, self.sets) self.strides = [2, 3, 5] + self.subdir = str(self.subdir) def tearDown(self): shutil.rmtree(self.subdir, ignore_errors=True) @@ -89,12 +93,21 @@ def test_save_SaveTrajs_multipass(self): __ = save_trajs(self.reader, self.sets, outfiles=self.n_pass_files) + sets_frag = [] + for i in range(len(self.sets)): + sets_frag.append([]) + for traj_ix, frame_ix in self.sets[i]: + sets_frag[-1].append([0, traj_ix * 33 + frame_ix]) + + save_trajs(self.frag_reader, [np.array(sets_frag[0]), np.array(sets_frag[1])], outfiles=self.frag_pass_files) # Reload the object to memory traj_n_pass = single_traj_from_n_files(self.n_pass_files, top=self.pdbfile) + traj_frag = single_traj_from_n_files(self.frag_pass_files, top=self.pdbfile) # Check for diffs (found_diff, errmsg) = compare_coords_md_trajectory_objects(traj_n_pass, self.traj_ref, atom=0) - + self.assertFalse(found_diff, errmsg) + (found_diff, errmsg) = compare_coords_md_trajectory_objects(traj_frag, self.traj_ref, atom=0) self.assertFalse(found_diff, errmsg) def test_save_SaveTrajs_onepass(self): diff --git a/setup.py b/setup.py index 38ac054e7..6b119dcc3 100755 --- a/setup.py +++ b/setup.py @@ -93,12 +93,11 @@ def extensions(): include_dirs=[ mdtraj_inc, pybind_inc, - 'pyemma/coordinates/clustering/include', ] + deeptime_inc, language='c++', libraries=[lib_prefix+'theobald'], library_dirs=[mdtraj_lib], - extra_compile_args=common_cflags) + extra_compile_args=common_cflags + ['']) covar_module = \ Extension('pyemma._ext.variational.estimators.covar_c._covartools', @@ -239,6 +238,8 @@ def build_extensions(self): if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') elif ct == 'msvc': + opts.append('/std:c++17') + opts.append('/bigobj') opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) # setup OpenMP support diff --git a/setup_util.py b/setup_util.py index 9ef2d0751..bf4e6d4fa 100644 --- a/setup_util.py +++ b/setup_util.py @@ -129,9 +129,11 @@ def has_flag(compiler, flagname): def cpp_flag(compiler): - """Return the -std=c++[11/14] compiler flag. - The c++14 is prefered over c++11 (when it is available). + """Return the -std=c++[11/14/17] compiler flag. + The newer the better. """ + if has_flag(compiler, '-std=c++17'): + return '-std=c++17' if has_flag(compiler, '-std=c++14'): return '-std=c++14' elif has_flag(compiler, '-std=c++11'):