diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3d954f579..2471aaf69 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -21,7 +21,7 @@ jobs:
- run: conda config --set quiet true
- run: conda install conda-build -c defaults
- run: mkdir workspace
- - run: conda build devtools/conda-recipe --python=3.7 --no-test --output-folder workspace
+ - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --no-test --output-folder workspace
- persist_to_workspace:
root: workspace
paths:
@@ -46,7 +46,7 @@ jobs:
- run: mkdir -p $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS
- attach_workspace:
at: workspace
- - run: conda build devtools/conda-recipe --python=3.7 --test --output-folder workspace
+ - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --test --output-folder workspace
- run: bash <(curl -s https://codecov.io/bash) -f $HOME/coverage.xml
- store_test_results:
path: /tmp/circleci-test-results
diff --git a/devtools/azure-pipelines-linux.yml b/devtools/azure-pipelines-linux.yml
index 1ccb6f3df..1fbf0c870 100644
--- a/devtools/azure-pipelines-linux.yml
+++ b/devtools/azure-pipelines-linux.yml
@@ -6,15 +6,12 @@ jobs:
strategy:
matrix:
- Python36:
- CONDA_PY: '3.6'
- CONDA_NPY: '1.16'
Python37:
CONDA_PY: '3.7'
CONDA_NPY: '1.18'
Python38:
CONDA_PY: '3.8'
- CONDA_NPY: '1.19'
+ CONDA_NPY: '1.18'
Python39:
CONDA_PY: '3.9'
CONDA_NPY: '1.19'
diff --git a/devtools/azure-pipelines-osx.yml b/devtools/azure-pipelines-osx.yml
index cf4d3fe01..67a0e4b97 100644
--- a/devtools/azure-pipelines-osx.yml
+++ b/devtools/azure-pipelines-osx.yml
@@ -5,12 +5,12 @@ jobs:
vmImage: 'macOS-10.14'
strategy:
matrix:
- Python36:
- CONDA_PY: '3.6'
- CONDA_NPY: '1.17'
Python37:
CONDA_PY: '3.7'
- CONDA_NPY: '1.19'
+ CONDA_NPY: '1.18'
+ Python38:
+ CONDA_PY: '3.8'
+ CONDA_NPY: '1.18'
Python39:
CONDA_PY: '3.9'
CONDA_NPY: '1.19'
diff --git a/devtools/azure-pipelines-win.yml b/devtools/azure-pipelines-win.yml
index 2a590b423..d64d53d6e 100644
--- a/devtools/azure-pipelines-win.yml
+++ b/devtools/azure-pipelines-win.yml
@@ -7,10 +7,10 @@ jobs:
matrix:
Python37:
CONDA_PY: '3.7'
- CONDA_NPY: '1.19'
+ CONDA_NPY: '1.18'
Python38:
CONDA_PY: '3.8'
- CONDA_NPY: '1.19'
+ CONDA_NPY: '1.18'
Python39:
CONDA_PY: '3.9'
CONDA_NPY: '1.19'
diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml
index de097b739..85af0c989 100644
--- a/devtools/conda-recipe/meta.yaml
+++ b/devtools/conda-recipe/meta.yaml
@@ -35,7 +35,8 @@ requirements:
- setuptools
- pip
- pybind11
- - deeptime
+ - pybind11-abi
+ - deeptime >=0.3.0
run:
- bhmm >=0.6.3
diff --git a/pyemma/_base/serialization/tests/test_cli.py b/pyemma/_base/serialization/tests/test_cli.py
index aaf220e5e..2d8bf111f 100644
--- a/pyemma/_base/serialization/tests/test_cli.py
+++ b/pyemma/_base/serialization/tests/test_cli.py
@@ -15,20 +15,21 @@
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see .
-
+import shutil
import tempfile
-import unittest
+import pytest
+from numpy.testing import assert_
from pyemma._base.serialization.cli import main
from pyemma.coordinates import source, tica, cluster_kmeans
-class TestListModelCLI(unittest.TestCase):
- @classmethod
- def setUpClass(cls):
+@pytest.fixture
+def model_file():
+ file = None
+ try:
from pyemma.datasets import get_bpti_test_data
-
d = get_bpti_test_data()
trajs, top = d['trajs'], d['top']
s = source(trajs, top=top)
@@ -36,25 +37,22 @@ def setUpClass(cls):
t = tica(s, lag=1)
c = cluster_kmeans(t)
- cls.model_file = tempfile.mktemp()
- c.save(cls.model_file, save_streaming_chain=True)
-
- @classmethod
- def tearDownClass(cls):
- import os
- os.unlink(cls.model_file)
-
- def test_recursive(self):
- """ check the whole chain has been printed"""
- from pyemma.util.contexts import Capturing
- with Capturing() as out:
- main(['--recursive', self.model_file])
- assert out
- all_out = '\n'.join(out)
- self.assertIn('FeatureReader', all_out)
- self.assertIn('TICA', all_out)
- self.assertIn('Kmeans', all_out)
-
-
-if __name__ == '__main__':
- unittest.main()
+ file = tempfile.mktemp()
+ c.save(file, save_streaming_chain=True)
+
+ yield file
+ finally:
+ if file is not None:
+ shutil.rmtree(file, ignore_errors=True)
+
+
+def test_recursive(model_file):
+ """ check the whole chain has been printed"""
+ from pyemma.util.contexts import Capturing
+ with Capturing() as out:
+ main(['--recursive', model_file])
+ assert out
+ all_out = '\n'.join(out)
+ assert_('FeatureReader' in all_out)
+ assert_('TICA' in all_out)
+ assert_('Kmeans' in all_out)
diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py
index 36f243171..32862d053 100644
--- a/pyemma/coordinates/api.py
+++ b/pyemma/coordinates/api.py
@@ -21,6 +21,8 @@
.. currentmodule:: pyemma.coordinates.api
"""
+from pathlib import Path
+
import numpy as _np
import logging as _logging
@@ -632,7 +634,7 @@ def discretizer(reader,
return disc
-def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None, image_molecules=False, verbose=True):
+def save_traj(traj_inp, indexes, outfile, top=None, stride=1, chunksize=None, image_molecules=False, verbose=True):
r""" Saves a sequence of frames as a single trajectory.
Extracts the specified sequence of time/trajectory indexes from traj_inp
@@ -753,6 +755,8 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None,
return traj
# or to disk as a molecular trajectory file
else:
+ if isinstance(outfile, Path):
+ outfile = str(outfile.resolve())
traj.save(outfile)
if verbose:
_logger.info("Created file %s" % outfile)
@@ -839,9 +843,12 @@ def save_trajs(traj_inp, indexes, prefix='set_', fmt=None, outfiles=None,
# Determine output format of the molecular trajectory file
if fmt is None:
+ fname = traj_inp.filenames[0]
+ while hasattr(fname, '__getitem__') and not isinstance(fname, (str, bytes)):
+ fname = fname[0]
import os
- _, fmt = os.path.splitext(traj_inp.filenames[0])
+ _, fmt = os.path.splitext(fname)
else:
fmt = '.' + fmt
diff --git a/pyemma/coordinates/clustering/include/Clustering.h b/pyemma/coordinates/clustering/include/Clustering.h
deleted file mode 100644
index 1c0aaca48..000000000
--- a/pyemma/coordinates/clustering/include/Clustering.h
+++ /dev/null
@@ -1,78 +0,0 @@
-//
-// Created by marscher on 4/3/17.
-//
-
-#ifndef PYEMMA_CLUSTERING_H
-#define PYEMMA_CLUSTERING_H
-
-#include
-#include
-#include
-
-#include "metric_base.h"
-
-namespace py = pybind11;
-
-template
-class ClusteringBase {
-
-public:
- enum MetricType {
- EUCLIDEAN, MINRMSD
- };
-
- using np_array = py::array_t;
-
- ClusteringBase(const std::string& metric_s, std::size_t input_dimension) : input_dimension(input_dimension) {
- if (metric_s == "euclidean") {
- typedef euclidean_metric eucl;
- metric = std::unique_ptr(new eucl(input_dimension));
- _metric_type = MetricType::EUCLIDEAN;
- } else if(metric_s == "minRMSD") {
- typedef min_rmsd_metric min_rmsd_t;
- metric = std::unique_ptr(new min_rmsd_t(input_dimension));
- _metric_type = MetricType::MINRMSD;
- } else {
- throw std::invalid_argument("metric is not of {'euclidean', 'minRMSD'}");
- }
- }
-
- virtual ~ClusteringBase()= default;
- ClusteringBase(const ClusteringBase&) = delete;
- ClusteringBase&operator=(const ClusteringBase&) = delete;
- ClusteringBase(ClusteringBase&&) noexcept = default;
- ClusteringBase&operator=(ClusteringBase&&) noexcept = default;
-
- std::unique_ptr> metric;
- std::size_t input_dimension;
-
- py::array_t assign_chunk_to_centers(const py::array_t& chunk,
- const py::array_t& centers,
- unsigned int n_threads) const {
- return metric->assign_chunk_to_centers(chunk, centers, n_threads);
- }
-
- /**
- * pre-center given centers in place
- * @param centers
- */
- void precenter_centers(np_array& centers) const {
- switch (_metric_type) {
- case MetricType::MINRMSD: {
- auto ptr = dynamic_cast*>(metric.get());
- ptr->precenter_centers(centers.mutable_data(0), centers.shape(0));
- break;
- }
- default: {
- throw std::runtime_error("precentering is only available for minRMSD metric.");
- };
- }
- }
-
-private:
- MetricType _metric_type;
-};
-
-
-
-#endif //PYEMMA_CLUSTERING_H
diff --git a/pyemma/coordinates/clustering/include/bits/kmeans_bits.h b/pyemma/coordinates/clustering/include/bits/kmeans_bits.h
deleted file mode 100644
index b94cb3ac9..000000000
--- a/pyemma/coordinates/clustering/include/bits/kmeans_bits.h
+++ /dev/null
@@ -1,407 +0,0 @@
-//
-// Created by marscher on 7/24/17.
-//
-
-
-#ifndef PYEMMA_KMEANS_BITS_H_H
-#define PYEMMA_KMEANS_BITS_H_H
-
-#include "kmeans.h"
-#include "threading_utils.h"
-
-#include
-#include
-
-#include
-#include
-
-
-template
-typename KMeans::np_array
-KMeans::cluster(const np_array &np_chunk, const np_array &np_centers, int n_threads) const {
-
- if (np_chunk.ndim() != 2) {
- throw std::runtime_error(R"(Number of dimensions of "chunk" ain't 2.)");
- }
- if (np_centers.ndim() != 2) {
- throw std::runtime_error(R"(Number of dimensions of "centers" ain't 2.)");
- }
-
- auto n_frames = static_cast(np_chunk.shape(0));
- auto dim = static_cast(np_chunk.shape(1));
-
- if (dim == 0) {
- throw std::invalid_argument("chunk dimension must be larger than zero.");
- }
-
- auto chunk = np_chunk.template unchecked<2>();
- auto n_centers = static_cast(np_centers.shape(0));
- auto centers = np_centers.template unchecked<2>();
-
- /* initialize centers_counter and new_centers with zeros */
- std::vector shape = {n_centers, dim};
- py::array_t return_new_centers(shape);
- auto new_centers = return_new_centers.mutable_unchecked();
- std::fill(return_new_centers.mutable_data(), return_new_centers.mutable_data() + return_new_centers.size(), 0.0);
- std::vector centers_counter(n_centers, 0);
-
- /* do the clustering */
- if (n_threads == 0) {
- std::size_t closest_center_index = 0;
- for (std::size_t i = 0; i < n_frames; ++i) {
- auto mindist = std::numeric_limits::max();
- for(std::size_t j = 0; j < n_centers; ++j) {
- auto d = parent_t::metric->compute(&chunk(i, 0), ¢ers(j, 0));
- if(d < mindist) {
- mindist = d;
- closest_center_index = j;
- }
- }
- centers_counter[closest_center_index]++;
- for (std::size_t j = 0; j < dim; j++) {
- new_centers(closest_center_index, j) += chunk(i, j);
- }
- }
- } else {
-#if defined(USE_OPENMP)
- omp_set_num_threads(n_threads);
-
-#pragma omp parallel for schedule(static, 1)
- for (std::size_t i = 0; i < n_frames; ++i) {
- std::vector dists(n_centers);
- for (std::size_t j = 0; j < n_centers; ++j) {
- dists[j] = parent_t::metric->compute(&chunk(i, 0), ¢ers(j, 0));
- }
-#pragma omp flush(dists)
-
-#pragma omp critical(centers_counter)
- {
- auto closest_center_index = std::distance(dists.begin(), std::min_element(dists.begin(), dists.end()));
- {
- centers_counter.at(static_cast(closest_center_index))++;
- for (std::size_t j = 0; j < dim; j++) {
- new_centers(closest_center_index, j) += chunk(i, j);
- }
- }
- }
- }
-#else
- {
- std::mutex mutex;
-
- std::vector threads;
- threads.reserve(static_cast(n_threads));
-
- std::size_t grainSize = n_frames / n_threads;
-
- auto worker = [&](std::size_t tid, std::size_t begin, std::size_t end, std::mutex& m) {
- for (auto i = begin; i < end; ++i) {
- std::size_t argMinDist = 0;
- {
- dtype minDist = parent_t::metric->compute(&chunk(i, 0), ¢ers(0, 0));
- for (std::size_t j = 1; j < n_centers; ++j) {
- auto dist = parent_t::metric->compute(&chunk(i, 0), ¢ers(j, 0));
- if(dist < minDist) {
- minDist = dist;
- argMinDist = j;
- }
- }
- }
-
- {
- std::unique_lock lock(m);
-
- centers_counter.at(argMinDist)++;
- for (std::size_t j = 0; j < dim; j++) {
- new_centers(argMinDist, j) += chunk(i, j);
- }
- }
- }
- };
-
- for(std::uint8_t i = 0; i < n_threads - 1; ++i) {
- threads.emplace_back(worker, i, i*grainSize, (i+1)*grainSize, std::ref(mutex));
- }
- threads.emplace_back(worker, n_threads, (n_threads - 1) * grainSize, n_frames, std::ref(mutex));
- }
-#endif
- }
-
- auto centers_counter_it = centers_counter.begin();
- for (std::size_t i = 0; i < n_centers; ++i, ++centers_counter_it) {
- if (*centers_counter_it == 0) {
- for (std::size_t j = 0; j < dim; ++j) {
- new_centers(i, j) = centers(i, j);
- }
- } else {
- for (std::size_t j = 0; j < dim; ++j) {
- new_centers(i, j) /= static_cast(*centers_counter_it);
- }
- }
- }
- return return_new_centers;
-}
-
-
-template
-typename KMeans::cluster_res KMeans::cluster_loop(const np_array& np_chunk, np_array& np_centers,
- int n_threads, int max_iter, float tolerance,
- py::object& callback) const {
- int it = 0;
- bool converged = false;
- dtype rel_change = std::numeric_limits::max();
- dtype prev_cost = 0;
- do {
- np_centers = cluster(np_chunk, np_centers, n_threads);
- auto cost = costFunction(np_chunk, np_centers, n_threads);
- rel_change = (cost != 0.0) ? std::abs(cost - prev_cost) / cost : 0;
- prev_cost = cost;
- if(rel_change <= tolerance) {
- converged = true;
- } else {
- if(! callback.is_none()) {
- /* Acquire GIL before calling Python code */
- py::gil_scoped_acquire acquire;
- callback();
- }
- }
-
- it += 1;
- } while(it < max_iter && ! converged);
- int res = converged ? 0 : 1;
- return std::make_tuple(std::move(np_centers), res, it);
-}
-
-template
-dtype KMeans::costFunction(const np_array &np_data, const np_array &np_centers, int n_threads) const {
- auto data = np_data.template unchecked<2>();
- auto centers = np_centers.template unchecked<2>();
-
- dtype value = 0.0;
- auto n_frames = static_cast(np_data.shape(0));
-#ifdef USE_OPENMP
- omp_set_num_threads(n_threads);
-#endif
-
-#pragma omp parallel for reduction(+:value)
- for (std::size_t i = 0; i < n_frames; i++) {
- for (std::size_t r = 0; r < np_centers.shape(0); r++) {
- auto l = parent_t::metric->compute(&data(i, 0), ¢ers(r, 0));
- {
- value += l;
- }
- }
- }
- return value;
-}
-
-template
-typename KMeans::np_array KMeans::
-initCentersKMpp(const np_array &np_data, unsigned int random_seed, int n_threads, py::object& callback) const {
- if (np_data.shape(0) < k) {
- std::stringstream ss;
- ss << "not enough data to initialize desired number of centers.";
- ss << "Provided frames (" << np_data.shape(0) << ") < n_centers (" << k << ").";
- throw std::invalid_argument(ss.str());
- }
-
- constexpr auto size_t_max = std::numeric_limits::max();
-
- std::size_t centers_found = 0;
- std::size_t dim = parent_t::metric->dim;
-
- if (np_data.ndim() != 2) {
- throw std::invalid_argument("input data does not have two dimensions.");
- }
-
- if (np_data.shape(1) != dim) {
- throw std::invalid_argument("input dimension of data does not match the requested metric ones.");
- }
-
- auto n_frames = static_cast(np_data.shape(0));
-
- /* number of trials before choosing the data point with the best potential */
- size_t n_trials = 2 + (size_t) log(k);
-
- /* allocate space for the index giving away which point has already been used as a cluster center */
- std::vector taken_points(n_frames, 0);
- /* candidates allocations */
- std::vector next_center_candidates(n_trials, size_t_max);
- std::vector next_center_candidates_rand(n_trials, 0);
- std::vector next_center_candidates_potential(n_trials, 0);
- /* allocate space for the array holding the squared distances to the assigned cluster centers */
- std::vector squared_distances(n_frames, 0);
-
- /* create the output objects */
- std::vector shape = {k, dim};
- np_array ret_init_centers(shape);
- auto init_centers = ret_init_centers.mutable_unchecked();
- std::memset(init_centers.mutable_data(), 0,
- static_cast(init_centers.size() * init_centers.itemsize()));
-
- const auto data = np_data.template unchecked<2>();
- /* initialize random device and pick first center randomly */
- std::default_random_engine generator(random_seed);
- std::uniform_int_distribution uniform_dist(0, n_frames - 1);
- auto first_center_index = uniform_dist(generator);
- /* and mark it as assigned */
- taken_points[first_center_index] = 1;
- /* write its coordinates into the init_centers array */
- for (std::size_t j = 0; j < dim; j++) {
- init_centers(centers_found, j) = data(first_center_index, j);
- }
- /* increase number of found centers */
- centers_found++;
- /* perform callback */
- if (!callback.is_none()) {
- py::gil_scoped_acquire acquire;
- callback();
- }
-#ifdef USE_OPENMP
- omp_set_num_threads(n_threads);
-#endif
-
- /* iterate over all data points j, measuring the squared distance between j and the initial center i: */
- /* squared_distances[i] = distance(x_j, x_i)*distance(x_j, x_i) */
- dtype dist_sum = 0;
- #pragma omp parallel for reduction(+:dist_sum)
- for (std::size_t i = 0; i < n_frames; i++) {
- if (i != first_center_index) {
- auto value = parent_t::metric->compute(&data(i, 0), &data(first_center_index, 0));
- value = value * value;
- squared_distances[i] = value;
- /* build up dist_sum which keeps the sum of all squared distances */
- dist_sum += value;
- }
- }
-
- /* keep picking centers while we do not have enough of them... */
- while (centers_found < k) {
- py::gil_scoped_release release;
-
- /* initialize the trials random values by the D^2-weighted distribution */
- for (std::size_t j = 0; j < n_trials; j++) {
- next_center_candidates[j] = size_t_max;
- auto point_index = uniform_dist(generator);
- next_center_candidates_rand[j] = dist_sum * (static_cast(point_index) /
- static_cast(uniform_dist.max()));
- next_center_candidates_potential[j] = 0.0;
- }
- /* pick candidate data points corresponding to their random value */
- dtype sum = 0.0;
- for (std::size_t i = 0; i < n_frames; i++) {
- if (taken_points[i] == 0) {
- sum += squared_distances[i];
- bool some_not_done{false};
- for (std::size_t j = 0; j < n_trials; j++) {
- if (next_center_candidates[j] == size_t_max) {
- if (sum >= next_center_candidates_rand[j]) {
- assert(i < std::numeric_limits::max());
- next_center_candidates[j] = i;
- } else {
- some_not_done = true;
- }
- }
- }
- if (!some_not_done) break;
- }
- }
-
- /* now find the maximum squared distance for each trial... */
- std::atomic_bool terminate(false);
- #pragma omp parallel for
- for (std::size_t i = 0; i < n_frames; i++) {
- if (terminate.load()) {
- continue;
- }
- if (taken_points.at(i) == 0) {
- for (std::size_t j = 0; j < n_trials; ++j) {
- if (next_center_candidates.at(j) == size_t_max) {
- terminate.store(true);
- break;
- }
- #pragma omp critical
- {
- if (next_center_candidates.at(j) != i) {
- auto value = parent_t::metric->compute(&data(i, 0),
- &data(next_center_candidates.at(j), 0));
- auto d = value * value;
- if (d < squared_distances.at(i)) {
- next_center_candidates_potential[j] += d;
- } else {
- next_center_candidates_potential[j] += squared_distances[i];
- }
- }
- }
- }
- }
- }
-
- /* ... and select the best candidate by the minimum value of the maximum squared distances */
- long best_candidate = -1;
- auto best_potential = std::numeric_limits::max();
- for (std::size_t j = 0; j < n_trials; j++) {
- if (next_center_candidates[j] != size_t_max && next_center_candidates_potential[j] < best_potential) {
- best_potential = next_center_candidates_potential[j];
- best_candidate = next_center_candidates[j];
- }
- }
-
- /* if for some reason we did not find a best candidate, just take the next available point */
- if (best_candidate == -1) {
- for (std::size_t i = 0; i < n_frames; i++) {
- if (taken_points[i] == 0) {
- best_candidate = i;
- break;
- }
- }
- }
-
- /* check if best_candidate was set, otherwise break to avoid an infinite loop should things go wrong */
- if (best_candidate >= 0) {
- /* write the best_candidate's components into the init_centers array */
- for (std::size_t j = 0; j < dim; j++) {
- init_centers(centers_found, j) = data(best_candidate, j);
- }
- /* increase centers_found */
- centers_found++;
- /* perform the callback */
- if (!callback.is_none()) {
- py::gil_scoped_acquire acquire;
- callback();
- }
- /* mark the data point as assigned center */
- taken_points[best_candidate] = 1;
- /* update the sum of squared distances by removing the assigned center */
- dist_sum -= squared_distances[best_candidate];
-
- /* if we still have centers to assign, the squared distances array has to be updated */
- if (centers_found < k) {
- /* Check for each data point if its squared distance to the freshly added center is smaller than */
- /* the squared distance to the previously picked centers. If so, update the squared_distances */
- /* array by the new value and also update the dist_sum value by removing the old value and adding */
- /* the new one. */
-#pragma omp parallel for
- for (std::size_t i = 0; i < n_frames; ++i) {
- if (taken_points[i] == 0) {
- auto value = parent_t::metric->compute(&data(i, 0), &data(best_candidate, 0));
- auto d = value * value;
-#pragma omp critical
- {
- if (d < squared_distances[i]) {
- dist_sum += d - squared_distances[i];
- squared_distances[i] = d;
- }
- }
- }
- }
- }
- } else {
- break;
- }
- }
- if (centers_found != k) { throw std::runtime_error("kmeans++ failed to initialize all desired centers"); }
- return ret_init_centers;
-}
-
-#endif //PYEMMA_KMEANS_BITS_H_H
diff --git a/pyemma/coordinates/clustering/include/bits/metric_base_bits.h b/pyemma/coordinates/clustering/include/bits/metric_base_bits.h
deleted file mode 100644
index 4824c87dd..000000000
--- a/pyemma/coordinates/clustering/include/bits/metric_base_bits.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//
-// Created by marscher on 7/21/17.
-//
-
-#ifndef PYEMMA_METRIC_BASE_BITS_H
-#define PYEMMA_METRIC_BASE_BITS_H
-
-#include "../metric_base.h"
-
-#include
-#include
-
-#ifdef USE_OPENMP
-#include
-#endif
-
-/**
- * assign a given chunk to given centers using encapsuled metric.
- * @tparam dtype
- * @param chunk
- * @param centers
- * @param n_threads
- * @return
- */
-template
-inline py::array_t metric_base::assign_chunk_to_centers(const np_array& chunk,
- const np_array& centers,
- unsigned int n_threads) {
- if (chunk.ndim() != 2) {
- throw std::invalid_argument("provided chunk does not have two dimensions.");
- }
-
- if (centers.ndim() != 2) {
- throw std::invalid_argument("provided centers does not have two dimensions.");
- }
- size_t N_centers = static_cast(centers.shape(0));
- size_t N_frames = static_cast(chunk.shape(0));
- size_t input_dim = static_cast(chunk.shape(1));
-
- if ((input_dim != dim) || (input_dim != centers.shape(1))) {
- throw std::invalid_argument("input dimension mismatch");
- }
- std::vector dists(N_centers);
- std::vector shape = {N_frames};
- py::array_t dtraj(shape);
-
- auto dtraj_buff = dtraj.template mutable_unchecked<1>();
- auto chunk_buff = chunk.template unchecked<2>();
- auto centers_buff = centers.template unchecked<2>();
-
-#ifdef USE_OPENMP
- /* Create a parallel thread block. */
- omp_set_num_threads(n_threads);
- assert(omp_get_num_threads() == n_threads);
-#endif
- #pragma omp parallel
- {
- for(size_t i = 0; i < N_frames; ++i) {
- /* Parallelize distance calculations to cluster centers to avoid cache misses */
- #pragma omp for
- for(size_t j = 0; j < N_centers; ++j) {
- dists[j] = compute(&chunk_buff(i, 0), ¢ers_buff(j, 0));
- }
- #pragma omp flush(dists)
-
- /* Only one thread can make actual assignment */
- #pragma omp single
- {
- dtype mindist = std::numeric_limits::max();
- int argmin = -1;
- for (size_t j = 0; j < N_centers; ++j) {
- if (dists[j] < mindist) {
- mindist = dists[j];
- argmin = static_cast(j);
- }
- }
- dtraj_buff(i) = argmin;
- }
- /* Have all threads synchronize in progress through cluster assignments */
- #pragma omp barrier
- }
- }
- return dtraj;
-}
-
-/**
- * euclidean distance method
- * @tparam dtype
- * @param a
- * @param b
- * @return
- */
-template
-inline dtype euclidean_metric::compute(const dtype *const a, const dtype *const b) {
- double sum = 0.0;
- //#pragma omp simd reduction(+:sum)
- for (size_t i = 0; i < metric_base::dim; ++i) {
- assert(std::isfinite(a[i]));
- assert(std::isfinite(b[i]));
-
- auto d = a[i] - b[i];
- sum += d * d;
- }
- assert(std::isfinite(sum));
- return static_cast(std::sqrt(sum));
-}
-
-/**
- * minRMSD distance function
- * a: centers
- * b: frames
- * n: dimension of one frame
- * buffer_a: pre-allocated buffer to store a copy of centers
- * buffer_b: pre-allocated buffer to store a copy of frames
- * trace_a_precalc: pre-calculated trace to centers (pointer to one value)
- */
-template
-inline dtype min_rmsd_metric::compute(const dtype *a, const dtype *b) {
- float trace_a, trace_b;
- auto dim3 = static_cast(parent_t::dim / 3);
- std::vector buffer_b (b, b + parent_t::dim);
-
- if (!has_trace_a_been_precalculated) {
- std::vector buffer_a (a, a + parent_t::dim);
- inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3);
- inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
-
- } else {
- inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
- trace_a = *trace_centers.data();
- }
-
- float msd = msd_atom_major(dim3, dim3, a, buffer_b.data(), trace_a, trace_b, 0, nullptr);
- return std::sqrt(msd);
-}
-
-template
-inline void min_rmsd_metric::precenter_centers(float *centers, std::size_t N_centers) {
- trace_centers.resize(N_centers);
- float *trace_centers_p = trace_centers.data();
-
- /* Parallelize centering of cluster generators */
- /* Note that this is already OpenMP-enabled */
- for (std::size_t j = 0; j < N_centers; ++j) {
- inplace_center_and_trace_atom_major(¢ers[j * parent_t::dim],
- &trace_centers_p[j], 1, parent_t::dim / 3);
- }
-}
-
-
-#endif //PYEMMA_METRIC_BASE_BITS_H
diff --git a/pyemma/coordinates/clustering/include/bits/threading_utils.h b/pyemma/coordinates/clustering/include/bits/threading_utils.h
deleted file mode 100644
index 5e6b1e455..000000000
--- a/pyemma/coordinates/clustering/include/bits/threading_utils.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//
-// Created by clonkscher on 8/8/17.
-//
-
-#ifndef PYEMMA_THREADING_UTILS_H
-#define PYEMMA_THREADING_UTILS_H
-
-
-#include
-#include
-#include
-
-/**
- * scoped_thread implementation
- */
-class scoped_thread {
- std::thread t;
-public:
- /**
- * Creates a new scoped_thread based on a thread object
- * @param _t the reference thread
- */
- template
- explicit scoped_thread(Function &&fun, Args &&... args)
- : t(std::forward(fun), std::forward(args)...) {
- if (!t.joinable()) throw std::logic_error("No thread!");
- }
-
- /**
- * joins the contained thread if its joinable
- */
- ~scoped_thread() {
- if (t.joinable()) {
- t.join();
- }
- }
-
- /**
- * moves another scoped_thread into this
- * @param rhs the other scoped_thread
- */
- scoped_thread(scoped_thread &&rhs) {
- t = std::move(rhs.t);
- }
-
- /**
- * moves another scoped_thread into this
- * @param rhs the other scoped_thread
- * @return myself
- */
- scoped_thread &operator=(scoped_thread &&rhs) {
- t = std::move(rhs.t);
- return *this;
- }
-
- /**
- * copying is not allowed
- */
- scoped_thread(const scoped_thread &) = delete;
-
- /**
- * copying is not allowed
- */
- scoped_thread &operator=(const scoped_thread &) = delete;
-};
-
-
-#endif //PYEMMA_THREADING_UTILS_H
diff --git a/pyemma/coordinates/clustering/include/kmeans.h b/pyemma/coordinates/clustering/include/kmeans.h
deleted file mode 100644
index ca8be5126..000000000
--- a/pyemma/coordinates/clustering/include/kmeans.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//
-// Created by marscher on 4/3/17.
-//
-
-#ifndef PYEMMA_KMEANS_H
-#define PYEMMA_KMEANS_H
-
-#include
-
-#include "Clustering.h"
-
-namespace py = pybind11;
-
-
-template
-class KMeans : public ClusteringBase {
-public:
- using parent_t = ClusteringBase;
- using np_array = py::array_t;
- /**
- * array with new cluster centers, return code (0 == converged), number of iterations taken.
- */
- using cluster_res = std::tuple;
-
- KMeans(unsigned int k,
- const std::string &metric,
- size_t input_dimension) : ClusteringBase(metric, input_dimension), k(k) {}
-
- /**
- * performs kmeans clustering on the given data chunk, provided a list of centers.
- * @param np_chunk
- * @param np_centers
- * @param n_threads
- * @return updated centers.
- */
- np_array cluster(const np_array & /*np_chunk*/, const np_array & /*np_centers*/, int /*n_threads*/) const;
-
- /**
- *
- */
- cluster_res cluster_loop(const np_array & /*np_chunk*/, np_array & /*np_centers*/,
- int /*n_threads*/, int /*max_iter*/, float /*tolerance*/,
- py::object& /*callback*/) const;
-
- /**
- * evaluate the quality of the centers
- *
- * @return
- */
- dtype costFunction(const np_array & /*np_data*/, const np_array & /*np_centers*/, int /*n_threads*/) const;
-
- /**
- * kmeans++ initialisation
- * @param np_data
- * @param random_seed
- * @param n_threads
- * @return init centers.
- */
- np_array initCentersKMpp(const np_array& /*np_data*/, unsigned int /*random_seed*/, int /*n_threads*/,
- py::object& /*callback*/) const;
-
-protected:
- unsigned int k;
-};
-
-#include "bits/kmeans_bits.h"
-
-#endif //PYEMMA_KMEANS_H
diff --git a/pyemma/coordinates/clustering/include/metric_base.h b/pyemma/coordinates/clustering/include/metric_base.h
deleted file mode 100644
index 9111b99df..000000000
--- a/pyemma/coordinates/clustering/include/metric_base.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//
-// Created by marscher on 3/31/17.
-//
-
-#ifndef PYEMMA_METRIC_H
-#define PYEMMA_METRIC_H
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-namespace py = pybind11;
-
-/**
- * Base type for all metrics.
- * @tparam dtype eg. float, double
- */
-template
-class metric_base {
-
-public:
- using np_array = py::array_t;
-
- explicit metric_base(std::size_t dim) : dim(dim) {}
- virtual ~metric_base() = default;
- metric_base(const metric_base&) = delete;
- metric_base&operator=(const metric_base&) = delete;
- metric_base(metric_base&&) = default;
- metric_base&operator=(metric_base&&) = default;
-
- virtual dtype compute(const dtype *, const dtype *) = 0;
-
- py::array_t assign_chunk_to_centers(const np_array& chunk,
- const np_array& centers,
- unsigned int n_threads);
- size_t dim;
-};
-
-template
-class euclidean_metric : public metric_base {
-public:
- explicit euclidean_metric(size_t dim) : metric_base(dim) {}
- ~euclidean_metric() = default;
- euclidean_metric(const euclidean_metric&) = delete;
- euclidean_metric&operator=(const euclidean_metric&) = delete;
- euclidean_metric(euclidean_metric&&) = default;
- euclidean_metric&operator=(euclidean_metric&&) = default;
-
- dtype compute(const dtype *, const dtype *);
-
-};
-
-template
-class min_rmsd_metric : public metric_base {
-
- static_assert(std::is_same::value, "only implemented for floats");
-
-public:
- using parent_t = metric_base;
- min_rmsd_metric(std::size_t dim, float *precalc_trace_centers = nullptr)
- : metric_base(dim) {
- if (dim % 3 != 0) {
- throw std::range_error("min_rmsd_metric is only implemented for input data with a dimension dividable by 3.");
- }
- has_trace_a_been_precalculated = precalc_trace_centers != nullptr;
- }
- ~min_rmsd_metric() = default;
- min_rmsd_metric(const min_rmsd_metric&) = delete;
- min_rmsd_metric&operator=(const min_rmsd_metric&) = delete;
- min_rmsd_metric(min_rmsd_metric&&) = default;
- min_rmsd_metric&operator=(min_rmsd_metric&&) = default;
- dtype compute(const dtype *a, const dtype *b);
- /**
- * pre-center in place
- * @param original_centers
- * @param N_centers
- */
- void precenter_centers(float *original_centers, std::size_t N_centers);
-
-private:
- /**
- * only used during cluster assignment. Avoids precentering the centers in every step.
- */
- std::vector trace_centers;
- bool has_trace_a_been_precalculated;
- float trace_a_precentered;
-};
-
-#include "bits/metric_base_bits.h"
-
-#endif //PYEMMA_METRIC_H
diff --git a/pyemma/coordinates/clustering/include/regspace.h b/pyemma/coordinates/clustering/include/regspace.h
deleted file mode 100644
index 47244f349..000000000
--- a/pyemma/coordinates/clustering/include/regspace.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//
-// Created by marscher on 7/17/17.
-//
-
-#ifndef PYEMMA_REGSPACE_H
-#define PYEMMA_REGSPACE_H
-
-#if defined(USE_OPENMP)
-#include
-#endif
-
-#include
-
-
-class MaxCentersReachedException : public std::exception {
-public:
- explicit MaxCentersReachedException(const char * m) : message{m} {}
- virtual const char * what() const noexcept override {return message.c_str();}
-private:
- std::string message = "";
-};
-
-template
-class RegularSpaceClustering : public ClusteringBase {
- using parent_t = ClusteringBase;
-public:
- /**
- *
- * @param dmin
- * @param max_clusters
- * @param metric
- * @param input_dimension
- */
- RegularSpaceClustering(dtype dmin, std::size_t max_clusters,
- const std::string &metric,
- size_t input_dimension) :
- ClusteringBase(metric, input_dimension),
- dmin(dmin),
- max_clusters(max_clusters) {}
-
- /**
- * loops over all points in chunk and checks for each center if the distance is smaller than dmin,
- * if so, the point is appended to py_centers. This is done until max_centers is reached or all points have been
- * added to the list.
- * @param chunk array shape(n, d)
- * @param py_centers python list containing found centers.
- */
- void cluster(const py::array_t &chunk, py::list& py_centers, unsigned int n_threads) {
- // this checks for ndim == 2
- const auto& data = chunk.template unchecked< 2 >();
-
- std::size_t N_frames = chunk.shape(0);
- std::size_t dim = chunk.shape(1);
- std::size_t N_centers = py_centers.size();
- #if defined(USE_OPENMP)
- omp_set_num_threads(n_threads);
- #endif
- // do the clustering
- for (std::size_t i = 0; i < N_frames; ++i) {
- auto mindist = std::numeric_limits::max();
- #pragma omp parallel for reduction(min:mindist)
- for (std::size_t j = 0; j < N_centers; ++j) {
- // TODO avoid the cast in inner loop?
- auto point = py_centers[j].cast < py::array_t < dtype >> ();
- auto d = parent_t::metric.get()->compute(&data(i, 0), point.data());
- if (d < mindist) mindist = d;
- }
- if (mindist > dmin) {
- if (N_centers + 1 > max_clusters) {
- throw MaxCentersReachedException(
- "Maximum number of cluster centers reached. Consider increasing max_clusters "
- "or choose a larger minimum distance, dmin.");
- }
- // add newly found center
- std::vector shape = {1, dim};
- py::array_t new_center(shape, nullptr);
- std::memcpy(new_center.mutable_data(), &data(i, 0), sizeof(dtype)*dim);
-
- py_centers.append(new_center);
- N_centers++;
- }
- }
- }
-
-protected:
- dtype dmin;
- std::size_t max_clusters;
-
-};
-
-#endif //PYEMMA_REGSPACE_H
diff --git a/pyemma/coordinates/clustering/interface.py b/pyemma/coordinates/clustering/interface.py
index 3adc4db15..10b52124d 100644
--- a/pyemma/coordinates/clustering/interface.py
+++ b/pyemma/coordinates/clustering/interface.py
@@ -26,6 +26,8 @@
import os
import numpy as np
+from deeptime.clustering import ClusterModel, metrics
+
from pyemma._base.serialization.serialization import SerializableMixIn
from pyemma._base.model import Model
@@ -56,6 +58,8 @@ class AbstractClustering(StreamingEstimationTransformer, Model, ClusterMixin, NJ
def __init__(self, metric='euclidean', n_jobs=None):
super(AbstractClustering, self).__init__()
+ from ._ext import rmsd
+ metrics.register("minRMSD", rmsd)
self.metric = metric
self.clustercenters = None
self._previous_stride = -1
@@ -153,18 +157,12 @@ def sample_indexes_by_cluster(self, clusters, nsample, replace=True):
def _transform_array(self, X):
"""get closest index of point in :attr:`clustercenters` to x."""
X = np.require(X, dtype=np.float32, requirements='C')
- if not hasattr(self, '_inst'):
- self.logger.debug("new cluster inst")
- from ._ext import ClusteringBase_f
- self._inst = ClusteringBase_f(self.metric, X.shape[1])
-
# for performance reasons we pre-center the cluster centers for minRMSD.
if self.metric == 'minRMSD' and not self._precentered:
- # self.logger.debug("precentering cluster centers for minRMSD.")
- # self._inst.precenter_centers(self.clustercenters)
self._precentered = True
- dtraj = self._inst.assign(X, self.clustercenters, self.n_jobs)
+ model = ClusterModel(cluster_centers=self.clustercenters, metric=self.metric)
+ dtraj = model.transform(X)
res = dtraj[:, None] # always return a column vector in this function
return res
diff --git a/pyemma/coordinates/clustering/kmeans.py b/pyemma/coordinates/clustering/kmeans.py
index c96f01159..b4144ae22 100644
--- a/pyemma/coordinates/clustering/kmeans.py
+++ b/pyemma/coordinates/clustering/kmeans.py
@@ -35,6 +35,9 @@
from pyemma.util.units import bytes_to_string
from pyemma.util.contexts import random_seed, nullcontext
+
+from deeptime.clustering import metrics, KMeans, MiniBatchKMeans
+
import numpy as np
@@ -105,6 +108,9 @@ def __init__(self, n_clusters, max_iter=5, metric='euclidean',
"""
super(KmeansClustering, self).__init__(metric=metric, n_jobs=n_jobs)
+ from ._ext import rmsd
+ metrics.register("minRMSD", rmsd)
+
if clustercenters is None:
clustercenters = []
@@ -199,6 +205,10 @@ def _check_resume_iteration(self):
def _estimate(self, iterable, **kw):
self._init_estimate()
+ estimator = KMeans(self.n_clusters, self.max_iter, self.metric, tolerance=self.tolerance,
+ init_strategy=self.init_strategy, fixed_seed=self.fixed_seed, n_jobs=self.n_jobs)
+ if len(self.clustercenters) > 0:
+ estimator.initial_centers = np.array(self.clustercenters)
ctx = nullcontext() if 'data' not in self._progress_registered_stages else self._progress_context(stage='data')
# collect the data only if, we have not done this previously (eg. keep_data=True)
# or the centers are not initialized.
@@ -212,7 +222,7 @@ def _estimate(self, iterable, **kw):
# collect data
self._collect_data(X, first_chunk, it.last_chunk)
if not resume_centers:
- # initialize cluster centers
+ # initialize cluster centers in case of uniform
self._initialize_centers(X, itraj, it.pos, it.last_chunk)
first_chunk = False
self.initial_centers_ = self.clustercenters[:]
@@ -226,22 +236,30 @@ def _estimate(self, iterable, **kw):
format(len(self.clustercenters), self.n_clusters))
if self.show_progress:
+ if self.init_strategy == 'kmeans++':
+ self._progress_register(self.n_clusters,
+ description="initialize kmeans++ centers", stage=0)
+ self._progress_register(self.max_iter, description="kmeans iterations", stage=1)
+
+ callback_init = lambda: self._progress_update(1, stage=0)
callback = lambda: self._progress_update(1, stage=1)
else:
+ callback_init = None
callback = None
- # run k-means with all the data
- with self._progress_context(stage=1), self._finish_estimate():
- self.clustercenters, code, iterations = self._inst.cluster_loop(self._in_memory_chunks, self.clustercenters,
- self.n_jobs, self.max_iter, self.tolerance,
- callback)
- if code == 0:
- self._converged = True
- self.logger.debug("Cluster centers converged after %i steps.", iterations + 1)
- else:
- self.logger.warning("Algorithm did not reach convergence criterion"
- " of %g in %i iterations. Consider increasing max_iter.",
- self.tolerance, self.max_iter)
+ with self._progress_context(stage=(0, 1)), self._finish_estimate():
+ estimator.fit(self._in_memory_chunks, callback_init_centers=callback_init, callback_loop=callback,
+ n_jobs=self.n_jobs)
+ model = estimator.fetch_model()
+ self.clustercenters = model.cluster_centers
+ self._converged = model.converged
+
+ if self.converged:
+ self.logger.debug("Cluster centers converged after %i steps.", len(model.inertias))
+ else:
+ self.logger.warning("Algorithm did not reach convergence criterion"
+ " of %g in %i iterations. Consider increasing max_iter.",
+ self.tolerance, self.max_iter)
return self
@@ -278,10 +296,6 @@ def _init_estimate(self):
n_chunks = self.data_producer.n_chunks(chunksize=self.chunksize, skip=self.skip, stride=self.stride)
self._progress_register(n_chunks, description="creating data array", stage='data')
- if self.init_strategy == 'kmeans++':
- self._progress_register(self.n_clusters,
- description="initialize kmeans++ centers", stage=0)
- self._progress_register(self.max_iter, description="kmeans iterations", stage=1)
self._init_in_memory_chunks(total_length)
if self.init_strategy == 'uniform':
@@ -292,9 +306,6 @@ def _init_estimate(self):
self._init_centers_indices[idx] = random.sample(list(range(0, traj_len)), int(
math.ceil((traj_len / float(total_length)) * self.n_clusters)))
- from ._ext import kmeans as kmeans_mod
- self._inst = kmeans_mod.Kmeans_f(self.n_clusters, self.metric, self.data_producer.ndim)
-
return stride
def _initialize_centers(self, X, itraj, t, last_chunk):
@@ -308,16 +319,6 @@ def _initialize_centers(self, X, itraj, t, last_chunk):
if len(self.clustercenters) < self.n_clusters and t + l in self._init_centers_indices[itraj]:
new = np.vstack((self.clustercenters, X[l]))
self.clustercenters = new
- elif last_chunk and self.init_strategy == 'kmeans++':
- if self.init_strategy == 'kmeans++' and self.show_progress:
- callback = lambda: self._progress_update(1, stage=0)
- context = self._progress_context(stage=0)
- else:
- callback = None
- context = nullcontext()
- with context:
- self.clustercenters = self._inst.init_centers_KMpp(self._in_memory_chunks, self.fixed_seed, self.n_jobs,
- callback)
def _collect_data(self, X, first_chunk, last_chunk):
# beginning - compute
@@ -342,6 +343,8 @@ class MiniBatchKmeansClustering(KmeansClustering):
def __init__(self, n_clusters, max_iter=5, metric='euclidean', tolerance=1e-5, init_strategy='kmeans++',
batch_size=0.2, oom_strategy='memmap', fixed_seed=False, stride=None, n_jobs=None, skip=0,
clustercenters=None, keep_data=False):
+ from ._ext import rmsd
+ metrics.register("minRMSD", rmsd)
if stride is not None:
raise ValueError("stride is a dummy value in MiniBatch Kmeans")
@@ -393,6 +396,12 @@ def _init_estimate(self):
def _estimate(self, iterable, **kw):
# mini-batch kmeans does not use stride. Enforce it.
+
+ estimator = MiniBatchKMeans(n_clusters=self.n_clusters, metric=self.metric, n_jobs=self.n_jobs,
+ init_strategy=self.init_strategy)
+ if len(self.clustercenters) > 0:
+ estimator.initial_centers = np.array(self.clustercenters)
+
self.stride = None
self._init_estimate()
@@ -411,14 +420,11 @@ def _estimate(self, iterable, **kw):
for X in iter(iterator):
# collect data
self._collect_data(X, first_chunk, iterator.last_chunk)
- # initialize cluster centers
- if i_pass == 0 and not self._check_resume_iteration():
- self._initialize_centers(X, iterator.current_trajindex, iterator.pos, iterator.last_chunk)
first_chunk = False
- # one pass over data completed
- self.clustercenters = self._inst.cluster(self._in_memory_chunks, self.clustercenters, self.n_jobs)
- cost = self._inst.cost_function(self._in_memory_chunks, self.clustercenters, self.n_jobs)
+ current_model = estimator.partial_fit(self._in_memory_chunks).fetch_model()
+ self.clustercenters = current_model.cluster_centers
+ cost = current_model.inertia
rel_change = np.abs(cost - prev_cost) / cost if cost != 0.0 else 0.0
prev_cost = cost
diff --git a/pyemma/coordinates/clustering/regspace.py b/pyemma/coordinates/clustering/regspace.py
index 339c2e348..5e319eae7 100644
--- a/pyemma/coordinates/clustering/regspace.py
+++ b/pyemma/coordinates/clustering/regspace.py
@@ -31,7 +31,7 @@
from pyemma.util.exceptions import NotConvergedWarning
import numpy as np
-import deeptime as dt
+from deeptime.clustering import RegularSpace, metrics, ClusterModel
__all__ = ['RegularSpaceClustering']
@@ -80,8 +80,8 @@ def __init__(self, dmin, max_centers=1000, metric='euclidean', stride=1, n_jobs=
"""
super(RegularSpaceClustering, self).__init__(metric=metric, n_jobs=n_jobs)
- from ._ext import RMSDMetric
- dt.clustering.metrics.register("minRMSD", RMSDMetric)
+ from ._ext import rmsd
+ metrics.register("minRMSD", rmsd)
self._converged = False
self.set_params(dmin=dmin, metric=metric,
@@ -135,12 +135,8 @@ def _estimate(self, iterable, **kwargs):
# 3. add new centroid, if min(distance to all other clustercenters) >= dmin
########
# temporary list to store cluster centers
- clustercenters = []
used_frames = 0
- regspace = dt.clustering.RegularSpace(dmin=self.dmin, max_centers=self.max_centers,
- metric=self.metric, n_jobs=self.n_jobs)
-
- # from ._ext import regspace
+ regspace = RegularSpace(dmin=self.dmin, max_centers=self.max_centers, metric=self.metric, n_jobs=self.n_jobs)
it = iterable.iterator(return_trajindex=False, stride=self.stride,
chunk=self.chunksize, skip=self.skip)
try:
@@ -149,21 +145,26 @@ def _estimate(self, iterable, **kwargs):
regspace.partial_fit(X.astype(np.float32, order='C', copy=False), n_jobs=self.n_jobs)
used_frames += len(X)
self._converged = True
- except regspace.MaxCentersReachedException:
- self._converged = False
- msg = 'Maximum number of cluster centers reached.' \
- ' Consider increasing max_centers or choose' \
- ' a larger minimum distance, dmin.'
- self.logger.warning(msg)
- warnings.warn(msg)
- # pass amount of processed data
- used_data = used_frames / float(it.n_frames_total()) * 100.0
- raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data)
+ except Exception as e:
+ if 'MaxCentersReachedException' in e.__class__.__name__:
+ self._converged = False
+ msg = 'Maximum number of cluster centers reached.' \
+ ' Consider increasing max_centers or choose' \
+ ' a larger minimum distance, dmin.'
+ self.logger.warning(msg)
+ warnings.warn(msg)
+ # pass amount of processed data
+ used_data = used_frames / float(it.n_frames_total()) * 100.0
+ raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data)
+ else:
+ # todo ugly workaround until maxcentersreached is placed not within metric subpackage but globally
+ # somewhere
+ raise
finally:
# even if not converged, we store the found centers.
model = regspace.fetch_model()
clustercenters = model.cluster_centers.squeeze().reshape(-1, iterable.ndim)
- self._inst = dt.clustering.ClusterModel(clustercenters, metric=self.metric)
+ self._inst = ClusterModel(clustercenters, metric=self.metric)
from types import MethodType
def _assign(self, data, _, n_jobs):
diff --git a/pyemma/coordinates/clustering/src/clustering_module.cpp b/pyemma/coordinates/clustering/src/clustering_module.cpp
index 2c253020d..b8181a57a 100644
--- a/pyemma/coordinates/clustering/src/clustering_module.cpp
+++ b/pyemma/coordinates/clustering/src/clustering_module.cpp
@@ -2,47 +2,13 @@
// Created by marscher on 7/17/17.
//
-#include "metric.h"
-#include "regspace.h"
-#include "kmeans.h"
+#include
+#include
+#include "register_clustering.h"
-class MaximumMetric : public Metric {
-public:
-
- double compute_squared_d(const double* xs, const double* ys, std::size_t dim) const override {
- return _compute(xs, ys, dim);
- }
- float compute_squared_f(const float* xs, const float* ys, std::size_t dim) const override {
- return _compute(xs, ys, dim);
- }
-private:
- template
- T _compute(const T* xs, const T* ys, std::size_t dim) const {
- T result = 0.0;
- for (size_t i = 0; i < dim; ++i) {
- auto d = std::abs(xs[i] - ys[i]);
- if (d > result) {
- result = d;
- }
- }
- return result*result;
- }
-};
-
-class RMSDMetric : public Metric {
-public:
-
- double compute_squared_d(const double* xs, const double* ys, std::size_t dim) const override {
- std::vector xsCast (xs, xs+dim), ysCast (ys, ys+dim);
- return _compute(xsCast.data(), ysCast.data(), dim);
- }
- float compute_squared_f(const float* xs, const float* ys, std::size_t dim) const override {
- return _compute(xs, ys, dim);
- }
-
-private:
+struct RMSDMetric {
template
- T _compute(const T* xs, const T* ys, std::size_t dim) const {
+ static T compute(const T* xs, const T* ys, std::size_t dim) {
if (dim % 3 != 0) {
throw std::range_error("RMSDMetric is only implemented for input data with a dimension dividable by 3.");
}
@@ -53,45 +19,22 @@ class RMSDMetric : public Metric {
inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3);
inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
+ if constexpr(std::is_same::value) {
+ std::vector cast (xs, xs + dim);
+ return msd_atom_major(dim3, dim3, cast.data(), buffer_b.data(), trace_a, trace_b, 0, nullptr);
+ } else {
+ return msd_atom_major(dim3, dim3, xs, buffer_b.data(), trace_a, trace_b, 0, nullptr);
+ }
+ }
- float msd = msd_atom_major(dim3, dim3, xs,
- buffer_b.data(), trace_a, trace_b, 0, nullptr);
- return msd;
+ template
+ static dtype compute_squared(const dtype* xs, const dtype* ys, std::size_t dim) {
+ auto d = compute(xs, ys, dim);
+ return d*d;
}
};
-using dtype = float;
-
PYBIND11_MODULE(_ext, m) {
- m.doc() = "module containing clustering algorithms.";
-
- auto regspace_mod = m.def_submodule("regspace");
- auto kmeans_mod = m.def_submodule("kmeans");
-
- typedef ClusteringBase cbase_f;
- typedef RegularSpaceClustering regspace_f;
-
- // register base class first.
- py::class_(m, "ClusteringBase_f")
- .def(py::init())
- .def("assign", &cbase_f::assign_chunk_to_centers)
- .def("precenter_centers", &cbase_f::precenter_centers);
- // regular space clustering.
- py::class_(regspace_mod, "Regspace_f")
- .def(py::init())
- .def("cluster", ®space_f::cluster);
- py::register_exception(regspace_mod, "MaxCentersReachedException");
- // kmeans
- typedef KMeans kmeans_f;
- py::class_(kmeans_mod, "Kmeans_f")
- .def(py::init(),
- py::arg("k"), py::arg("metric"), py::arg("dim"))
- // py::arg("callback") = py::none()
- .def("cluster", &kmeans_f::cluster)
- .def("cluster_loop", &kmeans_f::cluster_loop)
- .def("init_centers_KMpp", &kmeans_f::initCentersKMpp)
- .def("cost_function", &kmeans_f::costFunction);
- // py::class_(m, "Metric");
- py::object baseMetric = (py::object) py::module_::import("deeptime.clustering._clustering_bindings").attr("Metric");
- py::class_(m, "RMSDMetric", baseMetric).def(py::init<>());
+ auto rmsdModule = m.def_submodule("rmsd");
+ deeptime::clustering::registerClusteringImplementation(rmsdModule);
}
diff --git a/pyemma/coordinates/clustering/tests/test_kmeans.py b/pyemma/coordinates/clustering/tests/test_kmeans.py
index 5c22d5934..5f9dbe79c 100644
--- a/pyemma/coordinates/clustering/tests/test_kmeans.py
+++ b/pyemma/coordinates/clustering/tests/test_kmeans.py
@@ -92,7 +92,6 @@ def test_3gaussian_1d_singletraj(self):
while not km2.converged:
km2.estimate(X=X, clustercenters=km2.clustercenters, keep_data=True)
- assert np.linalg.norm(km1.clustercenters - km1.initial_centers_) > 0
np.testing.assert_allclose(km1.clustercenters, km2.clustercenters,
err_msg="should yield same centers with fixed seed=%s for strategy %s, Initial centers=%s"
% (fixed_seed, init_strategy, km2.initial_centers_), atol=1e-6)
diff --git a/pyemma/coordinates/data/util/reader_utils.py b/pyemma/coordinates/data/util/reader_utils.py
index a2a83a15c..14dbf0752 100644
--- a/pyemma/coordinates/data/util/reader_utils.py
+++ b/pyemma/coordinates/data/util/reader_utils.py
@@ -15,7 +15,7 @@
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see .
-
+from pathlib import Path
from numpy import vstack
import mdtraj as md
@@ -127,6 +127,8 @@ def single_traj_from_n_files(file_list, top):
"""
traj = None
for ff in file_list:
+ if isinstance(ff, Path):
+ ff = str(ff.resolve())
if traj is None:
traj = md.load(ff, top=top)
else:
diff --git a/pyemma/coordinates/tests/test_save_trajs.py b/pyemma/coordinates/tests/test_save_trajs.py
index c41d41079..398592134 100644
--- a/pyemma/coordinates/tests/test_save_trajs.py
+++ b/pyemma/coordinates/tests/test_save_trajs.py
@@ -29,6 +29,7 @@
import os
import shutil
import tempfile
+from pathlib import Path
import numpy as np
import pyemma.coordinates as coor
@@ -61,16 +62,19 @@ def setUp(self):
self.sets = [set_1, set_2]
- self.subdir = tempfile.mkdtemp(suffix='save_trajs_test/')
+ self.subdir = Path(tempfile.mkdtemp(suffix='save_trajs_test/'))
# Instantiate the reader
self.reader = coor.source(self.trajfiles, top=self.pdbfile)
self.reader.chunksize = 30
- self.n_pass_files = [self.subdir + 'n_pass.set_%06u.xtc' % ii for ii in range(len(self.sets))]
- self.one_pass_files = [self.subdir + '1_pass.set_%06u.xtc' % ii for ii in range(len(self.sets))]
+ self.frag_reader = coor.source([self.trajfiles], top=self.pdbfile)
+ self.n_pass_files = [str(self.subdir / ('n_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))]
+ self.frag_pass_files = [str(self.subdir / ('frag_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))]
+ self.one_pass_files = [str(self.subdir / ('1_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))]
self.traj_ref = save_traj_w_md_load_frame(self.reader, self.sets)
self.strides = [2, 3, 5]
+ self.subdir = str(self.subdir)
def tearDown(self):
shutil.rmtree(self.subdir, ignore_errors=True)
@@ -89,12 +93,21 @@ def test_save_SaveTrajs_multipass(self):
__ = save_trajs(self.reader, self.sets,
outfiles=self.n_pass_files)
+ sets_frag = []
+ for i in range(len(self.sets)):
+ sets_frag.append([])
+ for traj_ix, frame_ix in self.sets[i]:
+ sets_frag[-1].append([0, traj_ix * 33 + frame_ix])
+
+ save_trajs(self.frag_reader, [np.array(sets_frag[0]), np.array(sets_frag[1])], outfiles=self.frag_pass_files)
# Reload the object to memory
traj_n_pass = single_traj_from_n_files(self.n_pass_files, top=self.pdbfile)
+ traj_frag = single_traj_from_n_files(self.frag_pass_files, top=self.pdbfile)
# Check for diffs
(found_diff, errmsg) = compare_coords_md_trajectory_objects(traj_n_pass, self.traj_ref, atom=0)
-
+ self.assertFalse(found_diff, errmsg)
+ (found_diff, errmsg) = compare_coords_md_trajectory_objects(traj_frag, self.traj_ref, atom=0)
self.assertFalse(found_diff, errmsg)
def test_save_SaveTrajs_onepass(self):
diff --git a/setup.py b/setup.py
index 38ac054e7..6b119dcc3 100755
--- a/setup.py
+++ b/setup.py
@@ -93,12 +93,11 @@ def extensions():
include_dirs=[
mdtraj_inc,
pybind_inc,
- 'pyemma/coordinates/clustering/include',
] + deeptime_inc,
language='c++',
libraries=[lib_prefix+'theobald'],
library_dirs=[mdtraj_lib],
- extra_compile_args=common_cflags)
+ extra_compile_args=common_cflags + [''])
covar_module = \
Extension('pyemma._ext.variational.estimators.covar_c._covartools',
@@ -239,6 +238,8 @@ def build_extensions(self):
if has_flag(self.compiler, '-fvisibility=hidden'):
opts.append('-fvisibility=hidden')
elif ct == 'msvc':
+ opts.append('/std:c++17')
+ opts.append('/bigobj')
opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
# setup OpenMP support
diff --git a/setup_util.py b/setup_util.py
index 9ef2d0751..bf4e6d4fa 100644
--- a/setup_util.py
+++ b/setup_util.py
@@ -129,9 +129,11 @@ def has_flag(compiler, flagname):
def cpp_flag(compiler):
- """Return the -std=c++[11/14] compiler flag.
- The c++14 is prefered over c++11 (when it is available).
+ """Return the -std=c++[11/14/17] compiler flag.
+ The newer the better.
"""
+ if has_flag(compiler, '-std=c++17'):
+ return '-std=c++17'
if has_flag(compiler, '-std=c++14'):
return '-std=c++14'
elif has_flag(compiler, '-std=c++11'):