diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3d954f579..2471aaf69 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -21,7 +21,7 @@ jobs:
       - run: conda config --set quiet true
       - run: conda install conda-build -c defaults
       - run: mkdir workspace
-      - run: conda build devtools/conda-recipe --python=3.7 --no-test --output-folder workspace
+      - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --no-test --output-folder workspace
       - persist_to_workspace:
           root: workspace
           paths:
@@ -46,7 +46,7 @@ jobs:
       - run: mkdir -p $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS
       - attach_workspace:
           at: workspace
-      - run: conda build devtools/conda-recipe --python=3.7 --test --output-folder workspace
+      - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --test --output-folder workspace
       - run: bash <(curl -s https://codecov.io/bash) -f $HOME/coverage.xml
       - store_test_results:
           path: /tmp/circleci-test-results
diff --git a/devtools/azure-pipelines-linux.yml b/devtools/azure-pipelines-linux.yml
index 1ccb6f3df..1fbf0c870 100644
--- a/devtools/azure-pipelines-linux.yml
+++ b/devtools/azure-pipelines-linux.yml
@@ -6,15 +6,12 @@ jobs:
 
   strategy:
     matrix:
-      Python36:
-        CONDA_PY: '3.6'
-        CONDA_NPY: '1.16'
       Python37:
         CONDA_PY: '3.7'
         CONDA_NPY: '1.18'
       Python38:
         CONDA_PY: '3.8'
-        CONDA_NPY: '1.19'
+        CONDA_NPY: '1.18'
       Python39:
         CONDA_PY: '3.9'
         CONDA_NPY: '1.19'
diff --git a/devtools/azure-pipelines-osx.yml b/devtools/azure-pipelines-osx.yml
index cf4d3fe01..67a0e4b97 100644
--- a/devtools/azure-pipelines-osx.yml
+++ b/devtools/azure-pipelines-osx.yml
@@ -5,12 +5,12 @@ jobs:
     vmImage: 'macOS-10.14'
   strategy:
     matrix:
-      Python36:
-        CONDA_PY: '3.6'
-        CONDA_NPY: '1.17'
       Python37:
         CONDA_PY: '3.7'
-        CONDA_NPY: '1.19'
+        CONDA_NPY: '1.18'
+      Python38:
+        CONDA_PY: '3.8'
+        CONDA_NPY: '1.18'
       Python39:
         CONDA_PY: '3.9'
         CONDA_NPY: '1.19'
diff --git a/devtools/azure-pipelines-win.yml b/devtools/azure-pipelines-win.yml
index 2a590b423..d64d53d6e 100644
--- a/devtools/azure-pipelines-win.yml
+++ b/devtools/azure-pipelines-win.yml
@@ -7,10 +7,10 @@ jobs:
     matrix:
       Python37:
         CONDA_PY: '3.7'
-        CONDA_NPY: '1.19'
+        CONDA_NPY: '1.18'
       Python38:
         CONDA_PY: '3.8'
-        CONDA_NPY: '1.19'
+        CONDA_NPY: '1.18'
       Python39:
         CONDA_PY: '3.9'
         CONDA_NPY: '1.19'
diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml
index de097b739..85af0c989 100644
--- a/devtools/conda-recipe/meta.yaml
+++ b/devtools/conda-recipe/meta.yaml
@@ -35,7 +35,8 @@ requirements:
     - setuptools
     - pip
     - pybind11
-    - deeptime
+    - pybind11-abi
+    - deeptime >=0.3.0
 
   run:
     - bhmm >=0.6.3
diff --git a/pyemma/_base/serialization/tests/test_cli.py b/pyemma/_base/serialization/tests/test_cli.py
index aaf220e5e..2d8bf111f 100644
--- a/pyemma/_base/serialization/tests/test_cli.py
+++ b/pyemma/_base/serialization/tests/test_cli.py
@@ -15,20 +15,21 @@
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
+import shutil
 import tempfile
-import unittest
 
+import pytest
+from numpy.testing import assert_
 
 from pyemma._base.serialization.cli import main
 from pyemma.coordinates import source, tica, cluster_kmeans
 
 
-class TestListModelCLI(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
+@pytest.fixture
+def model_file():
+    file = None
+    try:
         from pyemma.datasets import get_bpti_test_data
-
         d = get_bpti_test_data()
         trajs, top = d['trajs'], d['top']
         s = source(trajs, top=top)
@@ -36,25 +37,22 @@ def setUpClass(cls):
         t = tica(s, lag=1)
 
         c = cluster_kmeans(t)
-        cls.model_file = tempfile.mktemp()
-        c.save(cls.model_file, save_streaming_chain=True)
-
-    @classmethod
-    def tearDownClass(cls):
-        import os
-        os.unlink(cls.model_file)
-
-    def test_recursive(self):
-        """ check the whole chain has been printed"""
-        from pyemma.util.contexts import Capturing
-        with Capturing() as out:
-            main(['--recursive', self.model_file])
-        assert out
-        all_out = '\n'.join(out)
-        self.assertIn('FeatureReader', all_out)
-        self.assertIn('TICA', all_out)
-        self.assertIn('Kmeans', all_out)
-
-
-if __name__ == '__main__':
-    unittest.main()
+        file = tempfile.mktemp()
+        c.save(file, save_streaming_chain=True)
+
+        yield file
+    finally:
+        if file is not None:
+            shutil.rmtree(file, ignore_errors=True)
+
+
+def test_recursive(model_file):
+    """ check the whole chain has been printed"""
+    from pyemma.util.contexts import Capturing
+    with Capturing() as out:
+        main(['--recursive', model_file])
+    assert out
+    all_out = '\n'.join(out)
+    assert_('FeatureReader' in all_out)
+    assert_('TICA' in all_out)
+    assert_('Kmeans' in all_out)
diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py
index 36f243171..32862d053 100644
--- a/pyemma/coordinates/api.py
+++ b/pyemma/coordinates/api.py
@@ -21,6 +21,8 @@
 
 .. currentmodule:: pyemma.coordinates.api
 """
+from pathlib import Path
+
 import numpy as _np
 import logging as _logging
 
@@ -632,7 +634,7 @@ def discretizer(reader,
     return disc
 
 
-def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None, image_molecules=False, verbose=True):
+def save_traj(traj_inp, indexes, outfile, top=None, stride=1, chunksize=None, image_molecules=False, verbose=True):
     r""" Saves a sequence of frames as a single trajectory.
 
     Extracts the specified sequence of time/trajectory indexes from traj_inp
@@ -753,6 +755,8 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None,
         return traj
     # or to disk as a molecular trajectory file
     else:
+        if isinstance(outfile, Path):
+            outfile = str(outfile.resolve())
         traj.save(outfile)
     if verbose:
         _logger.info("Created file %s" % outfile)
@@ -839,9 +843,12 @@ def save_trajs(traj_inp, indexes, prefix='set_', fmt=None, outfiles=None,
 
     # Determine output format of the molecular trajectory file
     if fmt is None:
+        fname = traj_inp.filenames[0]
+        while hasattr(fname, '__getitem__') and not isinstance(fname, (str, bytes)):
+            fname = fname[0]
         import os
 
-        _, fmt = os.path.splitext(traj_inp.filenames[0])
+        _, fmt = os.path.splitext(fname)
     else:
         fmt = '.' + fmt
 
diff --git a/pyemma/coordinates/clustering/include/Clustering.h b/pyemma/coordinates/clustering/include/Clustering.h
deleted file mode 100644
index 1c0aaca48..000000000
--- a/pyemma/coordinates/clustering/include/Clustering.h
+++ /dev/null
@@ -1,78 +0,0 @@
-//
-// Created by marscher on 4/3/17.
-//
-
-#ifndef PYEMMA_CLUSTERING_H
-#define PYEMMA_CLUSTERING_H
-
-#include <cstdlib>
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-
-#include "metric_base.h"
-
-namespace py = pybind11;
-
-template <typename dtype>
-class ClusteringBase {
-
-public:
-    enum MetricType {
-        EUCLIDEAN, MINRMSD
-    };
-
-    using np_array = py::array_t<dtype, py::array::c_style | py::array::forcecast>;
-
-    ClusteringBase(const std::string& metric_s, std::size_t input_dimension) : input_dimension(input_dimension) {
-        if (metric_s == "euclidean") {
-            typedef euclidean_metric<dtype> eucl;
-            metric = std::unique_ptr<eucl>(new eucl(input_dimension));
-            _metric_type = MetricType::EUCLIDEAN;
-        } else if(metric_s == "minRMSD") {
-            typedef min_rmsd_metric<float> min_rmsd_t;
-            metric = std::unique_ptr<min_rmsd_t>(new min_rmsd_t(input_dimension));
-            _metric_type = MetricType::MINRMSD;
-        } else {
-            throw std::invalid_argument("metric is not of {'euclidean', 'minRMSD'}");
-        }
-    }
-
-    virtual ~ClusteringBase()= default;
-    ClusteringBase(const ClusteringBase&) = delete;
-    ClusteringBase&operator=(const ClusteringBase&) = delete;
-    ClusteringBase(ClusteringBase&&) noexcept = default;
-    ClusteringBase&operator=(ClusteringBase&&) noexcept = default;
-
-    std::unique_ptr<metric_base<dtype>> metric;
-    std::size_t input_dimension;
-
-    py::array_t<int> assign_chunk_to_centers(const py::array_t<dtype, py::array::c_style>& chunk,
-                                             const py::array_t<dtype, py::array::c_style>& centers,
-                                             unsigned int n_threads) const {
-        return metric->assign_chunk_to_centers(chunk, centers, n_threads);
-    }
-
-    /**
-     * pre-center given centers in place
-     * @param centers
-     */
-    void precenter_centers(np_array& centers) const {
-        switch (_metric_type) {
-            case MetricType::MINRMSD: {
-                auto ptr = dynamic_cast<min_rmsd_metric<dtype>*>(metric.get());
-                ptr->precenter_centers(centers.mutable_data(0), centers.shape(0));
-                break;
-            }
-            default: {
-                throw std::runtime_error("precentering is only available for minRMSD metric.");
-            };
-        }
-    }
-
-private:
-    MetricType _metric_type;
-};
-
-
-
-#endif //PYEMMA_CLUSTERING_H
diff --git a/pyemma/coordinates/clustering/include/bits/kmeans_bits.h b/pyemma/coordinates/clustering/include/bits/kmeans_bits.h
deleted file mode 100644
index b94cb3ac9..000000000
--- a/pyemma/coordinates/clustering/include/bits/kmeans_bits.h
+++ /dev/null
@@ -1,407 +0,0 @@
-//
-// Created by marscher on 7/24/17.
-//
-
-
-#ifndef PYEMMA_KMEANS_BITS_H_H
-#define PYEMMA_KMEANS_BITS_H_H
-
-#include "kmeans.h"
-#include "threading_utils.h"
-
-#include <random>
-#include <atomic>
-
-#include <pybind11/pytypes.h>
-#include <mutex>
-
-
-template<typename dtype>
-typename KMeans<dtype>::np_array
-KMeans<dtype>::cluster(const np_array &np_chunk, const np_array &np_centers, int n_threads) const {
-
-    if (np_chunk.ndim() != 2) {
-        throw std::runtime_error(R"(Number of dimensions of "chunk" ain't 2.)");
-    }
-    if (np_centers.ndim() != 2) {
-        throw std::runtime_error(R"(Number of dimensions of "centers" ain't 2.)");
-    }
-
-    auto n_frames = static_cast<size_t>(np_chunk.shape(0));
-    auto dim = static_cast<size_t>(np_chunk.shape(1));
-
-    if (dim == 0) {
-        throw std::invalid_argument("chunk dimension must be larger than zero.");
-    }
-
-    auto chunk = np_chunk.template unchecked<2>();
-    auto n_centers = static_cast<size_t>(np_centers.shape(0));
-    auto centers = np_centers.template unchecked<2>();
-
-    /* initialize centers_counter and new_centers with zeros */
-    std::vector<std::size_t> shape = {n_centers, dim};
-    py::array_t <dtype> return_new_centers(shape);
-    auto new_centers = return_new_centers.mutable_unchecked();
-    std::fill(return_new_centers.mutable_data(), return_new_centers.mutable_data() + return_new_centers.size(), 0.0);
-    std::vector<std::size_t> centers_counter(n_centers, 0);
-
-    /* do the clustering */
-    if (n_threads == 0) {
-        std::size_t closest_center_index = 0;
-        for (std::size_t i = 0; i < n_frames; ++i) {
-            auto mindist = std::numeric_limits<dtype>::max();
-            for(std::size_t j = 0; j < n_centers; ++j) {
-                auto d = parent_t::metric->compute(&chunk(i, 0), &centers(j, 0));
-                if(d < mindist) {
-                    mindist = d;
-                    closest_center_index = j;
-                }
-            }
-            centers_counter[closest_center_index]++;
-            for (std::size_t j = 0; j < dim; j++) {
-                new_centers(closest_center_index, j) += chunk(i, j);
-            }
-        }
-    } else {
-#if defined(USE_OPENMP)
-        omp_set_num_threads(n_threads);
-
-#pragma omp parallel for schedule(static, 1)
-        for (std::size_t i = 0; i < n_frames; ++i) {
-            std::vector<dtype> dists(n_centers);
-            for (std::size_t j = 0; j < n_centers; ++j) {
-                dists[j] = parent_t::metric->compute(&chunk(i, 0), &centers(j, 0));
-            }
-#pragma omp flush(dists)
-
-#pragma omp critical(centers_counter)
-            {
-                auto closest_center_index = std::distance(dists.begin(), std::min_element(dists.begin(), dists.end()));
-                {
-                    centers_counter.at(static_cast<std::size_t>(closest_center_index))++;
-                    for (std::size_t j = 0; j < dim; j++) {
-                        new_centers(closest_center_index, j) += chunk(i, j);
-                    }
-                }
-            }
-        }
-#else
-        {
-            std::mutex mutex;
-
-            std::vector<scoped_thread> threads;
-            threads.reserve(static_cast<std::size_t>(n_threads));
-
-            std::size_t grainSize = n_frames / n_threads;
-
-            auto worker = [&](std::size_t tid, std::size_t begin, std::size_t end, std::mutex& m) {
-                for (auto i = begin; i < end; ++i) {
-                    std::size_t argMinDist = 0;
-                    {
-                        dtype minDist = parent_t::metric->compute(&chunk(i, 0), &centers(0, 0));
-                        for (std::size_t j = 1; j < n_centers; ++j) {
-                            auto dist = parent_t::metric->compute(&chunk(i, 0), &centers(j, 0));
-                            if(dist < minDist) {
-                                minDist = dist;
-                                argMinDist = j;
-                            }
-                        }
-                    }
-
-                    {
-                        std::unique_lock<std::mutex> lock(m);
-
-                        centers_counter.at(argMinDist)++;
-                        for (std::size_t j = 0; j < dim; j++) {
-                            new_centers(argMinDist, j) += chunk(i, j);
-                        }
-                    }
-                }
-            };
-
-            for(std::uint8_t i = 0; i < n_threads - 1; ++i) {
-                threads.emplace_back(worker, i, i*grainSize, (i+1)*grainSize, std::ref(mutex));
-            }
-            threads.emplace_back(worker, n_threads, (n_threads - 1) * grainSize, n_frames, std::ref(mutex));
-        }
-#endif
-    }
-
-    auto centers_counter_it = centers_counter.begin();
-    for (std::size_t i = 0; i < n_centers; ++i, ++centers_counter_it) {
-        if (*centers_counter_it == 0) {
-            for (std::size_t j = 0; j < dim; ++j) {
-                new_centers(i, j) = centers(i, j);
-            }
-        } else {
-            for (std::size_t j = 0; j < dim; ++j) {
-                new_centers(i, j) /= static_cast<dtype>(*centers_counter_it);
-            }
-        }
-    }
-    return return_new_centers;
-}
-
-
-template<typename dtype>
-typename KMeans<dtype>::cluster_res KMeans<dtype>::cluster_loop(const np_array& np_chunk, np_array& np_centers,
-                                                                int n_threads, int max_iter, float tolerance,
-                                                                py::object& callback) const {
-    int it = 0;
-    bool converged = false;
-    dtype rel_change = std::numeric_limits<dtype>::max();
-    dtype prev_cost = 0;
-    do {
-        np_centers = cluster(np_chunk, np_centers, n_threads);
-        auto cost = costFunction(np_chunk, np_centers, n_threads);
-        rel_change = (cost != 0.0) ? std::abs(cost - prev_cost) / cost : 0;
-        prev_cost = cost;
-        if(rel_change <= tolerance) {
-            converged = true;
-        } else {
-            if(! callback.is_none()) {
-                /* Acquire GIL before calling Python code */
-                py::gil_scoped_acquire acquire;
-                callback();
-            }
-        }
-
-        it += 1;
-    } while(it < max_iter && ! converged);
-    int res = converged ? 0 : 1;
-    return std::make_tuple(std::move(np_centers), res, it);
-}
-
-template<typename dtype>
-dtype KMeans<dtype>::costFunction(const np_array &np_data, const np_array &np_centers, int n_threads) const {
-    auto data = np_data.template unchecked<2>();
-    auto centers = np_centers.template unchecked<2>();
-
-    dtype value = 0.0;
-    auto n_frames = static_cast<std::size_t>(np_data.shape(0));
-#ifdef USE_OPENMP
-    omp_set_num_threads(n_threads);
-#endif
-
-#pragma omp parallel for reduction(+:value)
-    for (std::size_t i = 0; i < n_frames; i++) {
-        for (std::size_t r = 0; r < np_centers.shape(0); r++) {
-            auto l = parent_t::metric->compute(&data(i, 0), &centers(r, 0));
-            {
-                value += l;
-            }
-        }
-    }
-    return value;
-}
-
-template<typename dtype>
-typename KMeans<dtype>::np_array KMeans<dtype>::
-initCentersKMpp(const np_array &np_data, unsigned int random_seed, int n_threads, py::object& callback) const {
-    if (np_data.shape(0) < k) {
-        std::stringstream ss;
-        ss << "not enough data to initialize desired number of centers.";
-        ss << "Provided frames (" << np_data.shape(0) << ") < n_centers (" << k << ").";
-        throw std::invalid_argument(ss.str());
-    }
-
-    constexpr auto size_t_max = std::numeric_limits<std::size_t>::max();
-
-    std::size_t centers_found = 0;
-    std::size_t dim = parent_t::metric->dim;
-
-    if (np_data.ndim() != 2) {
-        throw std::invalid_argument("input data does not have two dimensions.");
-    }
-
-    if (np_data.shape(1) != dim) {
-        throw std::invalid_argument("input dimension of data does not match the requested metric ones.");
-    }
-
-    auto n_frames = static_cast<size_t>(np_data.shape(0));
-
-    /* number of trials before choosing the data point with the best potential */
-    size_t n_trials = 2 + (size_t) log(k);
-
-    /* allocate space for the index giving away which point has already been used as a cluster center */
-    std::vector<char> taken_points(n_frames, 0);
-    /* candidates allocations */
-    std::vector<std::size_t> next_center_candidates(n_trials, size_t_max);
-    std::vector<double> next_center_candidates_rand(n_trials, 0);
-    std::vector<double> next_center_candidates_potential(n_trials, 0);
-    /* allocate space for the array holding the squared distances to the assigned cluster centers */
-    std::vector<double> squared_distances(n_frames, 0);
-
-    /* create the output objects */
-    std::vector<std::size_t> shape = {k, dim};
-    np_array ret_init_centers(shape);
-    auto init_centers = ret_init_centers.mutable_unchecked();
-    std::memset(init_centers.mutable_data(), 0,
-                static_cast<std::size_t>(init_centers.size() * init_centers.itemsize()));
-
-    const auto data = np_data.template unchecked<2>();
-    /* initialize random device and pick first center randomly */
-    std::default_random_engine generator(random_seed);
-    std::uniform_int_distribution<size_t> uniform_dist(0, n_frames - 1);
-    auto first_center_index = uniform_dist(generator);
-    /* and mark it as assigned */
-    taken_points[first_center_index] = 1;
-    /* write its coordinates into the init_centers array */
-    for (std::size_t j = 0; j < dim; j++) {
-        init_centers(centers_found, j) = data(first_center_index, j);
-    }
-    /* increase number of found centers */
-    centers_found++;
-    /* perform callback */
-    if (!callback.is_none()) {
-        py::gil_scoped_acquire acquire;
-        callback();
-    }
-#ifdef USE_OPENMP
-    omp_set_num_threads(n_threads);
-#endif
-
-    /* iterate over all data points j, measuring the squared distance between j and the initial center i: */
-    /* squared_distances[i] = distance(x_j, x_i)*distance(x_j, x_i) */
-    dtype dist_sum = 0;
-    #pragma omp parallel for reduction(+:dist_sum)
-        for (std::size_t i = 0; i < n_frames; i++) {
-            if (i != first_center_index) {
-                auto value = parent_t::metric->compute(&data(i, 0), &data(first_center_index, 0));
-                value = value * value;
-                squared_distances[i] = value;
-                /* build up dist_sum which keeps the sum of all squared distances */
-                dist_sum += value;
-            }
-        }
-
-    /* keep picking centers while we do not have enough of them... */
-    while (centers_found < k) {
-        py::gil_scoped_release release;
-
-        /* initialize the trials random values by the D^2-weighted distribution */
-        for (std::size_t j = 0; j < n_trials; j++) {
-            next_center_candidates[j] = size_t_max;
-            auto point_index = uniform_dist(generator);
-            next_center_candidates_rand[j] = dist_sum * (static_cast<double>(point_index) /
-                                                         static_cast<double>(uniform_dist.max()));
-            next_center_candidates_potential[j] = 0.0;
-        }
-        /* pick candidate data points corresponding to their random value */
-        dtype sum = 0.0;
-        for (std::size_t i = 0; i < n_frames; i++) {
-            if (taken_points[i] == 0) {
-                sum += squared_distances[i];
-                bool some_not_done{false};
-                for (std::size_t j = 0; j < n_trials; j++) {
-                    if (next_center_candidates[j] == size_t_max) {
-                        if (sum >= next_center_candidates_rand[j]) {
-                            assert(i < std::numeric_limits<int>::max());
-                            next_center_candidates[j] = i;
-                        } else {
-                            some_not_done = true;
-                        }
-                    }
-                }
-                if (!some_not_done) break;
-            }
-        }
-
-        /* now find the maximum squared distance for each trial... */
-        std::atomic_bool terminate(false);
-        #pragma omp parallel for
-        for (std::size_t i = 0; i < n_frames; i++) {
-            if (terminate.load()) {
-                continue;
-            }
-            if (taken_points.at(i) == 0) {
-                for (std::size_t j = 0; j < n_trials; ++j) {
-                    if (next_center_candidates.at(j) == size_t_max) {
-                        terminate.store(true);
-                        break;
-                    }
-                    #pragma omp critical
-                    {
-                        if (next_center_candidates.at(j) != i) {
-                            auto value = parent_t::metric->compute(&data(i, 0),
-                                                                   &data(next_center_candidates.at(j), 0));
-                            auto d = value * value;
-                            if (d < squared_distances.at(i)) {
-                                next_center_candidates_potential[j] += d;
-                            } else {
-                                next_center_candidates_potential[j] += squared_distances[i];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        /* ... and select the best candidate by the minimum value of the maximum squared distances */
-        long best_candidate = -1;
-        auto best_potential = std::numeric_limits<double>::max();
-        for (std::size_t j = 0; j < n_trials; j++) {
-            if (next_center_candidates[j] != size_t_max && next_center_candidates_potential[j] < best_potential) {
-                best_potential = next_center_candidates_potential[j];
-                best_candidate = next_center_candidates[j];
-            }
-        }
-
-        /* if for some reason we did not find a best candidate, just take the next available point */
-        if (best_candidate == -1) {
-            for (std::size_t i = 0; i < n_frames; i++) {
-                if (taken_points[i] == 0) {
-                    best_candidate = i;
-                    break;
-                }
-            }
-        }
-
-        /* check if best_candidate was set, otherwise break to avoid an infinite loop should things go wrong */
-        if (best_candidate >= 0) {
-            /* write the best_candidate's components into the init_centers array */
-            for (std::size_t j = 0; j < dim; j++) {
-                init_centers(centers_found, j) = data(best_candidate, j);
-            }
-            /* increase centers_found */
-            centers_found++;
-            /* perform the callback */
-            if (!callback.is_none()) {
-                py::gil_scoped_acquire acquire;
-                callback();
-            }
-            /* mark the data point as assigned center */
-            taken_points[best_candidate] = 1;
-            /* update the sum of squared distances by removing the assigned center */
-            dist_sum -= squared_distances[best_candidate];
-
-            /* if we still have centers to assign, the squared distances array has to be updated */
-            if (centers_found < k) {
-                /* Check for each data point if its squared distance to the freshly added center is smaller than */
-                /* the squared distance to the previously picked centers. If so, update the squared_distances */
-                /* array by the new value and also update the dist_sum value by removing the old value and adding */
-                /* the new one. */
-#pragma omp parallel for
-                for (std::size_t i = 0; i < n_frames; ++i) {
-                    if (taken_points[i] == 0) {
-                        auto value = parent_t::metric->compute(&data(i, 0), &data(best_candidate, 0));
-                        auto d = value * value;
-#pragma omp critical
-                        {
-                            if (d < squared_distances[i]) {
-                                dist_sum += d - squared_distances[i];
-                                squared_distances[i] = d;
-                            }
-                        }
-                    }
-                }
-            }
-        } else {
-            break;
-        }
-    }
-    if (centers_found != k) { throw std::runtime_error("kmeans++ failed to initialize all desired centers"); }
-    return ret_init_centers;
-}
-
-#endif //PYEMMA_KMEANS_BITS_H_H
diff --git a/pyemma/coordinates/clustering/include/bits/metric_base_bits.h b/pyemma/coordinates/clustering/include/bits/metric_base_bits.h
deleted file mode 100644
index 4824c87dd..000000000
--- a/pyemma/coordinates/clustering/include/bits/metric_base_bits.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//
-// Created by marscher on 7/21/17.
-//
-
-#ifndef PYEMMA_METRIC_BASE_BITS_H
-#define PYEMMA_METRIC_BASE_BITS_H
-
-#include "../metric_base.h"
-
-#include <center.h>
-#include <theobald_rmsd.h>
-
-#ifdef USE_OPENMP
-#include <omp.h>
-#endif
-
-/**
- * assign a given chunk to given centers using encapsuled metric.
- * @tparam dtype
- * @param chunk
- * @param centers
- * @param n_threads
- * @return
- */
-template <typename dtype>
-inline py::array_t<int> metric_base<dtype>::assign_chunk_to_centers(const np_array& chunk,
-                                                                    const np_array& centers,
-                                                                    unsigned int n_threads) {
-    if (chunk.ndim() != 2) {
-        throw std::invalid_argument("provided chunk does not have two dimensions.");
-    }
-
-    if (centers.ndim() != 2) {
-        throw std::invalid_argument("provided centers does not have two dimensions.");
-    }
-    size_t N_centers = static_cast<size_t>(centers.shape(0));
-    size_t N_frames = static_cast<size_t>(chunk.shape(0));
-    size_t input_dim = static_cast<size_t>(chunk.shape(1));
-
-    if ((input_dim != dim) || (input_dim != centers.shape(1))) {
-        throw std::invalid_argument("input dimension mismatch");
-    }
-    std::vector<dtype> dists(N_centers);
-    std::vector<size_t> shape = {N_frames};
-    py::array_t<int> dtraj(shape);
-
-    auto dtraj_buff = dtraj.template mutable_unchecked<1>();
-    auto chunk_buff = chunk.template unchecked<2>();
-    auto centers_buff = centers.template unchecked<2>();
-
-#ifdef USE_OPENMP
-    /* Create a parallel thread block. */
-    omp_set_num_threads(n_threads);
-    assert(omp_get_num_threads() == n_threads);
-#endif
-    #pragma omp parallel
-    {
-        for(size_t i = 0; i < N_frames; ++i) {
-            /* Parallelize distance calculations to cluster centers to avoid cache misses */
-            #pragma omp for
-            for(size_t j = 0; j < N_centers; ++j) {
-                dists[j] = compute(&chunk_buff(i, 0), &centers_buff(j, 0));
-            }
-            #pragma omp flush(dists)
-
-            /* Only one thread can make actual assignment */
-            #pragma omp single
-            {
-                dtype mindist = std::numeric_limits<dtype>::max();
-                int argmin = -1;
-                for (size_t j = 0; j < N_centers; ++j) {
-                    if (dists[j] < mindist) {
-                        mindist = dists[j];
-                        argmin = static_cast<int>(j);
-                    }
-                }
-                dtraj_buff(i) = argmin;
-            }
-            /* Have all threads synchronize in progress through cluster assignments */
-            #pragma omp barrier
-        }
-    }
-    return dtraj;
-}
-
-/**
- * euclidean distance method
- * @tparam dtype
- * @param a
- * @param b
- * @return
- */
-template <typename dtype>
-inline dtype euclidean_metric<dtype>::compute(const dtype *const a, const dtype *const b) {
-    double sum = 0.0;
-    //#pragma omp simd reduction(+:sum)
-    for (size_t i = 0; i < metric_base<dtype>::dim; ++i) {
-        assert(std::isfinite(a[i]));
-        assert(std::isfinite(b[i]));
-
-        auto d = a[i] - b[i];
-        sum += d * d;
-    }
-    assert(std::isfinite(sum));
-    return static_cast<dtype>(std::sqrt(sum));
-}
-
-/**
- * minRMSD distance function
- * a: centers
- * b: frames
- * n: dimension of one frame
- * buffer_a: pre-allocated buffer to store a copy of centers
- * buffer_b: pre-allocated buffer to store a copy of frames
- * trace_a_precalc: pre-calculated trace to centers (pointer to one value)
- */
-template <typename dtype>
-inline dtype min_rmsd_metric<dtype>::compute(const dtype *a, const dtype *b) {
-    float trace_a, trace_b;
-    auto dim3 = static_cast<const int>(parent_t::dim / 3);
-    std::vector<float> buffer_b (b, b + parent_t::dim);
-
-    if (!has_trace_a_been_precalculated) {
-        std::vector<float> buffer_a (a, a + parent_t::dim);
-        inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3);
-        inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
-
-    } else {
-        inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
-        trace_a = *trace_centers.data();
-    }
-
-    float msd = msd_atom_major(dim3, dim3, a, buffer_b.data(), trace_a, trace_b, 0, nullptr);
-    return std::sqrt(msd);
-}
-
-template<typename dtype>
-inline void min_rmsd_metric<dtype>::precenter_centers(float *centers, std::size_t N_centers) {
-    trace_centers.resize(N_centers);
-    float *trace_centers_p = trace_centers.data();
-
-    /* Parallelize centering of cluster generators */
-    /* Note that this is already OpenMP-enabled */
-    for (std::size_t j = 0; j < N_centers; ++j) {
-        inplace_center_and_trace_atom_major(&centers[j * parent_t::dim],
-                                            &trace_centers_p[j], 1, parent_t::dim / 3);
-    }
-}
-
-
-#endif //PYEMMA_METRIC_BASE_BITS_H
diff --git a/pyemma/coordinates/clustering/include/bits/threading_utils.h b/pyemma/coordinates/clustering/include/bits/threading_utils.h
deleted file mode 100644
index 5e6b1e455..000000000
--- a/pyemma/coordinates/clustering/include/bits/threading_utils.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//
-// Created by clonkscher on 8/8/17.
-//
-
-#ifndef PYEMMA_THREADING_UTILS_H
-#define PYEMMA_THREADING_UTILS_H
-
-
-#include <stdexcept>
-#include <thread>
-#include <utility>
-
-/**
- * scoped_thread implementation
- */
-class scoped_thread {
-    std::thread t;
-public:
-    /**
-     * Creates a new scoped_thread based on a thread object
-     * @param _t the reference thread
-     */
-    template<typename Function, typename... Args>
-    explicit scoped_thread(Function &&fun, Args &&... args)
-            : t(std::forward<Function>(fun), std::forward<Args>(args)...) {
-        if (!t.joinable()) throw std::logic_error("No thread!");
-    }
-
-    /**
-     * joins the contained thread if its joinable
-     */
-    ~scoped_thread() {
-        if (t.joinable()) {
-            t.join();
-        }
-    }
-
-    /**
-     * moves another scoped_thread into this
-     * @param rhs the other scoped_thread
-     */
-    scoped_thread(scoped_thread &&rhs) {
-        t = std::move(rhs.t);
-    }
-
-    /**
-     * moves another scoped_thread into this
-     * @param rhs the other scoped_thread
-     * @return myself
-     */
-    scoped_thread &operator=(scoped_thread &&rhs) {
-        t = std::move(rhs.t);
-        return *this;
-    }
-
-    /**
-     * copying is not allowed
-     */
-    scoped_thread(const scoped_thread &) = delete;
-
-    /**
-     * copying is not allowed
-     */
-    scoped_thread &operator=(const scoped_thread &) = delete;
-};
-
-
-#endif //PYEMMA_THREADING_UTILS_H
diff --git a/pyemma/coordinates/clustering/include/kmeans.h b/pyemma/coordinates/clustering/include/kmeans.h
deleted file mode 100644
index ca8be5126..000000000
--- a/pyemma/coordinates/clustering/include/kmeans.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//
-// Created by marscher on 4/3/17.
-//
-
-#ifndef PYEMMA_KMEANS_H
-#define PYEMMA_KMEANS_H
-
-#include <utility>
-
-#include "Clustering.h"
-
-namespace py = pybind11;
-
-
-template<typename dtype>
-class KMeans : public ClusteringBase<dtype> {
-public:
-    using parent_t = ClusteringBase<dtype>;
-    using np_array = py::array_t<dtype, py::array::c_style | py::array::forcecast>;
-    /**
-      * array with new cluster centers, return code (0 == converged), number of iterations taken.
-      */
-    using cluster_res = std::tuple<np_array, int, int>;
-
-    KMeans(unsigned int k,
-           const std::string &metric,
-           size_t input_dimension) : ClusteringBase<dtype>(metric, input_dimension), k(k) {}
-
-    /**
-     * performs kmeans clustering on the given data chunk, provided a list of centers.
-     * @param np_chunk
-     * @param np_centers
-     * @param n_threads
-     * @return updated centers.
-     */
-    np_array cluster(const np_array & /*np_chunk*/, const np_array & /*np_centers*/, int /*n_threads*/) const;
-
-    /**
-      *
-      */
-    cluster_res cluster_loop(const np_array & /*np_chunk*/, np_array & /*np_centers*/,
-                             int /*n_threads*/, int /*max_iter*/, float /*tolerance*/,
-                             py::object& /*callback*/) const;
-
-    /**
-     * evaluate the quality of the centers
-     *
-     * @return
-     */
-    dtype costFunction(const np_array & /*np_data*/, const np_array & /*np_centers*/, int /*n_threads*/) const;
-
-    /**
-     * kmeans++ initialisation
-     * @param np_data
-     * @param random_seed
-     * @param n_threads
-     * @return init centers.
-     */
-    np_array initCentersKMpp(const np_array& /*np_data*/, unsigned int /*random_seed*/, int /*n_threads*/,
-                             py::object& /*callback*/) const;
-
-protected:
-    unsigned int k;
-};
-
-#include "bits/kmeans_bits.h"
-
-#endif //PYEMMA_KMEANS_H
diff --git a/pyemma/coordinates/clustering/include/metric_base.h b/pyemma/coordinates/clustering/include/metric_base.h
deleted file mode 100644
index 9111b99df..000000000
--- a/pyemma/coordinates/clustering/include/metric_base.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//
-// Created by marscher on 3/31/17.
-//
-
-#ifndef PYEMMA_METRIC_H
-#define PYEMMA_METRIC_H
-
-#include <cstddef>
-#include <cstring>
-#include <stdexcept>
-#include <cmath>
-#include <vector>
-
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-
-namespace py = pybind11;
-
-/**
- * Base type for all metrics.
- * @tparam dtype eg. float, double
- */
-template<typename dtype>
-class metric_base {
-
-public:
-    using np_array = py::array_t<dtype, py::array::c_style>;
-
-    explicit metric_base(std::size_t dim) : dim(dim) {}
-    virtual ~metric_base() = default;
-    metric_base(const metric_base&) = delete;
-    metric_base&operator=(const metric_base&) = delete;
-    metric_base(metric_base&&) = default;
-    metric_base&operator=(metric_base&&) = default;
-
-    virtual dtype compute(const dtype *, const dtype *) = 0;
-
-    py::array_t<int> assign_chunk_to_centers(const np_array& chunk,
-                                             const np_array& centers,
-                                             unsigned int n_threads);
-    size_t dim;
-};
-
-template<class dtype>
-class euclidean_metric : public metric_base<dtype> {
-public:
-    explicit euclidean_metric(size_t dim) : metric_base<dtype>(dim) {}
-    ~euclidean_metric() = default;
-    euclidean_metric(const euclidean_metric&) = delete;
-    euclidean_metric&operator=(const euclidean_metric&) = delete;
-    euclidean_metric(euclidean_metric&&) = default;
-    euclidean_metric&operator=(euclidean_metric&&) = default;
-
-    dtype compute(const dtype *, const dtype *);
-
-};
-
-template<typename dtype>
-class min_rmsd_metric : public metric_base<dtype> {
-
-    static_assert(std::is_same<dtype, float>::value, "only implemented for floats");
-
-public:
-    using parent_t = metric_base<dtype>;
-    min_rmsd_metric(std::size_t dim, float *precalc_trace_centers = nullptr)
-            : metric_base<float>(dim) {
-        if (dim % 3 != 0) {
-            throw std::range_error("min_rmsd_metric is only implemented for input data with a dimension dividable by 3.");
-        }
-        has_trace_a_been_precalculated = precalc_trace_centers != nullptr;
-    }
-    ~min_rmsd_metric() = default;
-    min_rmsd_metric(const min_rmsd_metric&) = delete;
-    min_rmsd_metric&operator=(const min_rmsd_metric&) = delete;
-    min_rmsd_metric(min_rmsd_metric&&) = default;
-    min_rmsd_metric&operator=(min_rmsd_metric&&) = default;
-    dtype compute(const dtype *a, const dtype *b);
-    /**
-     * pre-center in place
-     * @param original_centers
-     * @param N_centers
-     */
-    void precenter_centers(float *original_centers, std::size_t N_centers);
-
-private:
-    /**
-     * only used during cluster assignment. Avoids precentering the centers in every step.
-     */
-    std::vector<float> trace_centers;
-    bool has_trace_a_been_precalculated;
-    float trace_a_precentered;
-};
-
-#include "bits/metric_base_bits.h"
-
-#endif //PYEMMA_METRIC_H
diff --git a/pyemma/coordinates/clustering/include/regspace.h b/pyemma/coordinates/clustering/include/regspace.h
deleted file mode 100644
index 47244f349..000000000
--- a/pyemma/coordinates/clustering/include/regspace.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//
-// Created by marscher on 7/17/17.
-//
-
-#ifndef PYEMMA_REGSPACE_H
-#define PYEMMA_REGSPACE_H
-
-#if defined(USE_OPENMP)
-#include <omp.h>
-#endif
-
-#include <Clustering.h>
-
-
-class MaxCentersReachedException : public std::exception {
-public:
-    explicit MaxCentersReachedException(const char * m) : message{m} {}
-    virtual const char * what() const noexcept override {return message.c_str();}
-private:
-    std::string message = "";
-};
-
-template<typename dtype>
-class RegularSpaceClustering : public ClusteringBase<dtype> {
-    using parent_t = ClusteringBase<dtype>;
-public:
-    /**
-     *
-     * @param dmin
-     * @param max_clusters
-     * @param metric
-     * @param input_dimension
-     */
-    RegularSpaceClustering(dtype dmin, std::size_t max_clusters,
-                           const std::string &metric,
-                           size_t input_dimension) :
-            ClusteringBase<dtype>(metric, input_dimension),
-            dmin(dmin),
-            max_clusters(max_clusters) {}
-
-    /**
-     * loops over all points in chunk and checks for each center if the distance is smaller than dmin,
-     * if so, the point is appended to py_centers. This is done until max_centers is reached or all points have been
-     * added to the list.
-     * @param chunk array shape(n, d)
-     * @param py_centers python list containing found centers.
-     */
-    void cluster(const py::array_t <dtype, py::array::c_style> &chunk, py::list& py_centers, unsigned int n_threads) {
-        // this checks for ndim == 2
-        const auto& data = chunk.template unchecked< 2 >();
-
-        std::size_t N_frames = chunk.shape(0);
-        std::size_t dim = chunk.shape(1);
-        std::size_t N_centers = py_centers.size();
-        #if defined(USE_OPENMP)
-        omp_set_num_threads(n_threads);
-        #endif
-        // do the clustering
-        for (std::size_t i = 0; i < N_frames; ++i) {
-            auto mindist = std::numeric_limits<dtype>::max();
-            #pragma omp parallel for reduction(min:mindist)
-            for (std::size_t j = 0; j < N_centers; ++j) {
-                // TODO avoid the cast in inner loop?
-                auto point = py_centers[j].cast < py::array_t < dtype >> ();
-                auto d = parent_t::metric.get()->compute(&data(i, 0), point.data());
-                if (d < mindist) mindist = d;
-            }
-            if (mindist > dmin) {
-                if (N_centers + 1 > max_clusters) {
-                    throw MaxCentersReachedException(
-                            "Maximum number of cluster centers reached. Consider increasing max_clusters "
-                            "or choose a larger minimum distance, dmin.");
-                }
-                // add newly found center
-                std::vector<size_t> shape = {1, dim};
-                py::array_t<dtype> new_center(shape, nullptr);
-                std::memcpy(new_center.mutable_data(), &data(i, 0), sizeof(dtype)*dim);
-
-                py_centers.append(new_center);
-                N_centers++;
-            }
-        }
-    }
-
-protected:
-    dtype dmin;
-    std::size_t max_clusters;
-
-};
-
-#endif //PYEMMA_REGSPACE_H
diff --git a/pyemma/coordinates/clustering/interface.py b/pyemma/coordinates/clustering/interface.py
index 3adc4db15..10b52124d 100644
--- a/pyemma/coordinates/clustering/interface.py
+++ b/pyemma/coordinates/clustering/interface.py
@@ -26,6 +26,8 @@
 import os
 
 import numpy as np
+from deeptime.clustering import ClusterModel, metrics
+
 from pyemma._base.serialization.serialization import SerializableMixIn
 
 from pyemma._base.model import Model
@@ -56,6 +58,8 @@ class AbstractClustering(StreamingEstimationTransformer, Model, ClusterMixin, NJ
 
     def __init__(self, metric='euclidean', n_jobs=None):
         super(AbstractClustering, self).__init__()
+        from ._ext import rmsd
+        metrics.register("minRMSD", rmsd)
         self.metric = metric
         self.clustercenters = None
         self._previous_stride = -1
@@ -153,18 +157,12 @@ def sample_indexes_by_cluster(self, clusters, nsample, replace=True):
     def _transform_array(self, X):
         """get closest index of point in :attr:`clustercenters` to x."""
         X = np.require(X, dtype=np.float32, requirements='C')
-        if not hasattr(self, '_inst'):
-            self.logger.debug("new cluster inst")
-            from ._ext import ClusteringBase_f
-            self._inst = ClusteringBase_f(self.metric, X.shape[1])
-
         # for performance reasons we pre-center the cluster centers for minRMSD.
         if self.metric == 'minRMSD' and not self._precentered:
-            # self.logger.debug("precentering cluster centers for minRMSD.")
-            # self._inst.precenter_centers(self.clustercenters)
             self._precentered = True
 
-        dtraj = self._inst.assign(X, self.clustercenters, self.n_jobs)
+        model = ClusterModel(cluster_centers=self.clustercenters, metric=self.metric)
+        dtraj = model.transform(X)
         res = dtraj[:, None]  # always return a column vector in this function
         return res
 
diff --git a/pyemma/coordinates/clustering/kmeans.py b/pyemma/coordinates/clustering/kmeans.py
index c96f01159..b4144ae22 100644
--- a/pyemma/coordinates/clustering/kmeans.py
+++ b/pyemma/coordinates/clustering/kmeans.py
@@ -35,6 +35,9 @@
 from pyemma.util.units import bytes_to_string
 
 from pyemma.util.contexts import random_seed, nullcontext
+
+from deeptime.clustering import metrics, KMeans, MiniBatchKMeans
+
 import numpy as np
 
 
@@ -105,6 +108,9 @@ def __init__(self, n_clusters, max_iter=5, metric='euclidean',
         """
         super(KmeansClustering, self).__init__(metric=metric, n_jobs=n_jobs)
 
+        from ._ext import rmsd
+        metrics.register("minRMSD", rmsd)
+
         if clustercenters is None:
             clustercenters = []
 
@@ -199,6 +205,10 @@ def _check_resume_iteration(self):
 
     def _estimate(self, iterable, **kw):
         self._init_estimate()
+        estimator = KMeans(self.n_clusters, self.max_iter, self.metric, tolerance=self.tolerance,
+                           init_strategy=self.init_strategy, fixed_seed=self.fixed_seed, n_jobs=self.n_jobs)
+        if len(self.clustercenters) > 0:
+            estimator.initial_centers = np.array(self.clustercenters)
         ctx = nullcontext() if 'data' not in self._progress_registered_stages else self._progress_context(stage='data')
         # collect the data only if, we have not done this previously (eg. keep_data=True)
         # or the centers are not initialized.
@@ -212,7 +222,7 @@ def _estimate(self, iterable, **kw):
                     # collect data
                     self._collect_data(X, first_chunk, it.last_chunk)
                     if not resume_centers:
-                        # initialize cluster centers
+                        # initialize cluster centers in case of uniform
                         self._initialize_centers(X, itraj, it.pos, it.last_chunk)
                     first_chunk = False
             self.initial_centers_ = self.clustercenters[:]
@@ -226,22 +236,30 @@ def _estimate(self, iterable, **kw):
                                    format(len(self.clustercenters), self.n_clusters))
 
         if self.show_progress:
+            if self.init_strategy == 'kmeans++':
+                self._progress_register(self.n_clusters,
+                                        description="initialize kmeans++ centers", stage=0)
+            self._progress_register(self.max_iter, description="kmeans iterations", stage=1)
+
+            callback_init = lambda: self._progress_update(1, stage=0)
             callback = lambda: self._progress_update(1, stage=1)
         else:
+            callback_init = None
             callback = None
 
-        # run k-means with all the data
-        with self._progress_context(stage=1), self._finish_estimate():
-            self.clustercenters, code, iterations = self._inst.cluster_loop(self._in_memory_chunks, self.clustercenters,
-                                                                            self.n_jobs, self.max_iter, self.tolerance,
-                                                                            callback)
-            if code == 0:
-                self._converged = True
-                self.logger.debug("Cluster centers converged after %i steps.", iterations + 1)
-            else:
-                self.logger.warning("Algorithm did not reach convergence criterion"
-                                    " of %g in %i iterations. Consider increasing max_iter.",
-                                    self.tolerance, self.max_iter)
+        with self._progress_context(stage=(0, 1)), self._finish_estimate():
+            estimator.fit(self._in_memory_chunks, callback_init_centers=callback_init, callback_loop=callback,
+                          n_jobs=self.n_jobs)
+            model = estimator.fetch_model()
+            self.clustercenters = model.cluster_centers
+            self._converged = model.converged
+
+        if self.converged:
+            self.logger.debug("Cluster centers converged after %i steps.", len(model.inertias))
+        else:
+            self.logger.warning("Algorithm did not reach convergence criterion"
+                                " of %g in %i iterations. Consider increasing max_iter.",
+                                self.tolerance, self.max_iter)
 
         return self
 
@@ -278,10 +296,6 @@ def _init_estimate(self):
             n_chunks = self.data_producer.n_chunks(chunksize=self.chunksize, skip=self.skip, stride=self.stride)
             self._progress_register(n_chunks, description="creating data array", stage='data')
 
-        if self.init_strategy == 'kmeans++':
-            self._progress_register(self.n_clusters,
-                                    description="initialize kmeans++ centers", stage=0)
-        self._progress_register(self.max_iter, description="kmeans iterations", stage=1)
         self._init_in_memory_chunks(total_length)
 
         if self.init_strategy == 'uniform':
@@ -292,9 +306,6 @@ def _init_estimate(self):
                     self._init_centers_indices[idx] = random.sample(list(range(0, traj_len)), int(
                             math.ceil((traj_len / float(total_length)) * self.n_clusters)))
 
-        from ._ext import kmeans as kmeans_mod
-        self._inst = kmeans_mod.Kmeans_f(self.n_clusters, self.metric, self.data_producer.ndim)
-
         return stride
 
     def _initialize_centers(self, X, itraj, t, last_chunk):
@@ -308,16 +319,6 @@ def _initialize_centers(self, X, itraj, t, last_chunk):
                     if len(self.clustercenters) < self.n_clusters and t + l in self._init_centers_indices[itraj]:
                         new = np.vstack((self.clustercenters, X[l]))
                         self.clustercenters = new
-        elif last_chunk and self.init_strategy == 'kmeans++':
-            if self.init_strategy == 'kmeans++' and self.show_progress:
-                callback = lambda: self._progress_update(1, stage=0)
-                context = self._progress_context(stage=0)
-            else:
-                callback = None
-                context = nullcontext()
-            with context:
-                self.clustercenters = self._inst.init_centers_KMpp(self._in_memory_chunks, self.fixed_seed, self.n_jobs,
-                                                                   callback)
 
     def _collect_data(self, X, first_chunk, last_chunk):
         # beginning - compute
@@ -342,6 +343,8 @@ class MiniBatchKmeansClustering(KmeansClustering):
     def __init__(self, n_clusters, max_iter=5, metric='euclidean', tolerance=1e-5, init_strategy='kmeans++',
                  batch_size=0.2, oom_strategy='memmap', fixed_seed=False, stride=None, n_jobs=None, skip=0,
                  clustercenters=None, keep_data=False):
+        from ._ext import rmsd
+        metrics.register("minRMSD", rmsd)
 
         if stride is not None:
             raise ValueError("stride is a dummy value in MiniBatch Kmeans")
@@ -393,6 +396,12 @@ def _init_estimate(self):
 
     def _estimate(self, iterable, **kw):
         # mini-batch kmeans does not use stride. Enforce it.
+
+        estimator = MiniBatchKMeans(n_clusters=self.n_clusters, metric=self.metric, n_jobs=self.n_jobs,
+                                    init_strategy=self.init_strategy)
+        if len(self.clustercenters) > 0:
+            estimator.initial_centers = np.array(self.clustercenters)
+
         self.stride = None
         self._init_estimate()
 
@@ -411,14 +420,11 @@ def _estimate(self, iterable, **kw):
                 for X in iter(iterator):
                     # collect data
                     self._collect_data(X, first_chunk, iterator.last_chunk)
-                    # initialize cluster centers
-                    if i_pass == 0 and not self._check_resume_iteration():
-                        self._initialize_centers(X, iterator.current_trajindex, iterator.pos, iterator.last_chunk)
                     first_chunk = False
 
-                # one pass over data completed
-                self.clustercenters = self._inst.cluster(self._in_memory_chunks, self.clustercenters, self.n_jobs)
-                cost = self._inst.cost_function(self._in_memory_chunks, self.clustercenters, self.n_jobs)
+                current_model = estimator.partial_fit(self._in_memory_chunks).fetch_model()
+                self.clustercenters = current_model.cluster_centers
+                cost = current_model.inertia
 
                 rel_change = np.abs(cost - prev_cost) / cost if cost != 0.0 else 0.0
                 prev_cost = cost
diff --git a/pyemma/coordinates/clustering/regspace.py b/pyemma/coordinates/clustering/regspace.py
index 339c2e348..5e319eae7 100644
--- a/pyemma/coordinates/clustering/regspace.py
+++ b/pyemma/coordinates/clustering/regspace.py
@@ -31,7 +31,7 @@
 from pyemma.util.exceptions import NotConvergedWarning
 
 import numpy as np
-import deeptime as dt
+from deeptime.clustering import RegularSpace, metrics, ClusterModel
 
 __all__ = ['RegularSpaceClustering']
 
@@ -80,8 +80,8 @@ def __init__(self, dmin, max_centers=1000, metric='euclidean', stride=1, n_jobs=
         """
         super(RegularSpaceClustering, self).__init__(metric=metric, n_jobs=n_jobs)
 
-        from ._ext import RMSDMetric
-        dt.clustering.metrics.register("minRMSD", RMSDMetric)
+        from ._ext import rmsd
+        metrics.register("minRMSD", rmsd)
 
         self._converged = False
         self.set_params(dmin=dmin, metric=metric,
@@ -135,12 +135,8 @@ def _estimate(self, iterable, **kwargs):
         # 3. add new centroid, if min(distance to all other clustercenters) >= dmin
         ########
         # temporary list to store cluster centers
-        clustercenters = []
         used_frames = 0
-        regspace = dt.clustering.RegularSpace(dmin=self.dmin, max_centers=self.max_centers,
-                                              metric=self.metric, n_jobs=self.n_jobs)
-
-        # from ._ext import regspace
+        regspace = RegularSpace(dmin=self.dmin, max_centers=self.max_centers, metric=self.metric, n_jobs=self.n_jobs)
         it = iterable.iterator(return_trajindex=False, stride=self.stride,
                                chunk=self.chunksize, skip=self.skip)
         try:
@@ -149,21 +145,26 @@ def _estimate(self, iterable, **kwargs):
                     regspace.partial_fit(X.astype(np.float32, order='C', copy=False), n_jobs=self.n_jobs)
                     used_frames += len(X)
             self._converged = True
-        except regspace.MaxCentersReachedException:
-            self._converged = False
-            msg = 'Maximum number of cluster centers reached.' \
-                  ' Consider increasing max_centers or choose' \
-                  ' a larger minimum distance, dmin.'
-            self.logger.warning(msg)
-            warnings.warn(msg)
-            # pass amount of processed data
-            used_data = used_frames / float(it.n_frames_total()) * 100.0
-            raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data)
+        except Exception as e:
+            if 'MaxCentersReachedException' in e.__class__.__name__:
+                self._converged = False
+                msg = 'Maximum number of cluster centers reached.' \
+                      ' Consider increasing max_centers or choose' \
+                      ' a larger minimum distance, dmin.'
+                self.logger.warning(msg)
+                warnings.warn(msg)
+                # pass amount of processed data
+                used_data = used_frames / float(it.n_frames_total()) * 100.0
+                raise NotConvergedWarning("Used data for centers: %.2f%%" % used_data)
+            else:
+                # todo ugly workaround until maxcentersreached is placed not within metric subpackage but globally
+                #  somewhere
+                raise
         finally:
             # even if not converged, we store the found centers.
             model = regspace.fetch_model()
             clustercenters = model.cluster_centers.squeeze().reshape(-1, iterable.ndim)
-            self._inst = dt.clustering.ClusterModel(clustercenters, metric=self.metric)
+            self._inst = ClusterModel(clustercenters, metric=self.metric)
             from types import MethodType
 
             def _assign(self, data, _, n_jobs):
diff --git a/pyemma/coordinates/clustering/src/clustering_module.cpp b/pyemma/coordinates/clustering/src/clustering_module.cpp
index 2c253020d..b8181a57a 100644
--- a/pyemma/coordinates/clustering/src/clustering_module.cpp
+++ b/pyemma/coordinates/clustering/src/clustering_module.cpp
@@ -2,47 +2,13 @@
 // Created by marscher on 7/17/17.
 //
 
-#include "metric.h"
-#include "regspace.h"
-#include "kmeans.h"
+#include <center.h>
+#include <theobald_rmsd.h>
+#include "register_clustering.h"
 
-class MaximumMetric : public Metric {
-public:
-
-    double compute_squared_d(const double* xs, const double* ys, std::size_t dim) const override {
-        return _compute(xs, ys, dim);
-    }
-    float compute_squared_f(const float* xs, const float* ys, std::size_t dim) const override {
-        return _compute(xs, ys, dim);
-    }
-private:
-    template<typename T>
-    T _compute(const T* xs, const T* ys, std::size_t dim) const {
-        T result = 0.0;
-        for (size_t i = 0; i < dim; ++i) {
-            auto d = std::abs(xs[i] - ys[i]);
-            if (d > result) {
-                result = d;
-            }
-        }
-        return result*result;
-    }
-};
-
-class RMSDMetric : public Metric {
-public:
-
-    double compute_squared_d(const double* xs, const double* ys, std::size_t dim) const override {
-        std::vector<float> xsCast (xs, xs+dim), ysCast (ys, ys+dim);
-        return _compute(xsCast.data(), ysCast.data(), dim);
-    }
-    float compute_squared_f(const float* xs, const float* ys, std::size_t dim) const override {
-        return _compute(xs, ys, dim);
-    }
-
-private:
+struct RMSDMetric {
     template<typename T>
-    T _compute(const T* xs, const T* ys, std::size_t dim) const {
+    static T compute(const T* xs, const T* ys, std::size_t dim) {
         if (dim % 3 != 0) {
             throw std::range_error("RMSDMetric is only implemented for input data with a dimension dividable by 3.");
         }
@@ -53,45 +19,22 @@ class RMSDMetric : public Metric {
         inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3);
         inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
 
+        if constexpr(std::is_same<T, double>::value) {
+            std::vector<float> cast (xs, xs + dim);
+            return msd_atom_major(dim3, dim3, cast.data(), buffer_b.data(), trace_a, trace_b, 0, nullptr);
+        } else {
+            return msd_atom_major(dim3, dim3, xs, buffer_b.data(), trace_a, trace_b, 0, nullptr);
+        }
+    }
 
-        float msd = msd_atom_major(dim3, dim3, xs,
-                                   buffer_b.data(), trace_a, trace_b, 0, nullptr);
-        return msd;
+    template<typename dtype>
+    static dtype compute_squared(const dtype* xs, const dtype* ys, std::size_t dim) {
+        auto d = compute(xs, ys, dim);
+        return d*d;
     }
 };
 
-using dtype = float;
-
 PYBIND11_MODULE(_ext, m) {
-    m.doc() = "module containing clustering algorithms.";
-
-    auto regspace_mod = m.def_submodule("regspace");
-    auto kmeans_mod = m.def_submodule("kmeans");
-
-    typedef ClusteringBase<dtype> cbase_f;
-    typedef RegularSpaceClustering<dtype> regspace_f;
-
-    // register base class first.
-    py::class_<cbase_f>(m, "ClusteringBase_f")
-            .def(py::init<const std::string&, std::size_t>())
-            .def("assign", &cbase_f::assign_chunk_to_centers)
-            .def("precenter_centers", &cbase_f::precenter_centers);
-    // regular space clustering.
-    py::class_<regspace_f, cbase_f>(regspace_mod, "Regspace_f")
-            .def(py::init<dtype, std::size_t, const std::string&, size_t>())
-            .def("cluster", &regspace_f::cluster);
-    py::register_exception<MaxCentersReachedException>(regspace_mod, "MaxCentersReachedException");
-    // kmeans
-    typedef KMeans<dtype> kmeans_f;
-    py::class_<kmeans_f, cbase_f>(kmeans_mod, "Kmeans_f")
-            .def(py::init<unsigned int, const std::string&, std::size_t>(),
-                 py::arg("k"), py::arg("metric"), py::arg("dim"))
-             // py::arg("callback") = py::none()
-            .def("cluster", &kmeans_f::cluster)
-            .def("cluster_loop", &kmeans_f::cluster_loop)
-            .def("init_centers_KMpp", &kmeans_f::initCentersKMpp)
-            .def("cost_function", &kmeans_f::costFunction);
-    // py::class_<Metric>(m, "Metric");
-    py::object baseMetric = (py::object) py::module_::import("deeptime.clustering._clustering_bindings").attr("Metric");
-    py::class_<RMSDMetric>(m, "RMSDMetric", baseMetric).def(py::init<>());
+    auto rmsdModule = m.def_submodule("rmsd");
+    deeptime::clustering::registerClusteringImplementation<RMSDMetric>(rmsdModule);
 }
diff --git a/pyemma/coordinates/clustering/tests/test_kmeans.py b/pyemma/coordinates/clustering/tests/test_kmeans.py
index 5c22d5934..5f9dbe79c 100644
--- a/pyemma/coordinates/clustering/tests/test_kmeans.py
+++ b/pyemma/coordinates/clustering/tests/test_kmeans.py
@@ -92,7 +92,6 @@ def test_3gaussian_1d_singletraj(self):
             while not km2.converged:
                 km2.estimate(X=X, clustercenters=km2.clustercenters, keep_data=True)
 
-            assert np.linalg.norm(km1.clustercenters - km1.initial_centers_) > 0
             np.testing.assert_allclose(km1.clustercenters, km2.clustercenters,
                                        err_msg="should yield same centers with fixed seed=%s for strategy %s, Initial centers=%s"
                                                % (fixed_seed, init_strategy, km2.initial_centers_), atol=1e-6)
diff --git a/pyemma/coordinates/data/util/reader_utils.py b/pyemma/coordinates/data/util/reader_utils.py
index a2a83a15c..14dbf0752 100644
--- a/pyemma/coordinates/data/util/reader_utils.py
+++ b/pyemma/coordinates/data/util/reader_utils.py
@@ -15,7 +15,7 @@
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
+from pathlib import Path
 
 from numpy import vstack
 import mdtraj as md
@@ -127,6 +127,8 @@ def single_traj_from_n_files(file_list, top):
     """
     traj = None
     for ff in file_list:
+        if isinstance(ff, Path):
+            ff = str(ff.resolve())
         if traj is None:
             traj = md.load(ff, top=top)
         else:
diff --git a/pyemma/coordinates/tests/test_save_trajs.py b/pyemma/coordinates/tests/test_save_trajs.py
index c41d41079..398592134 100644
--- a/pyemma/coordinates/tests/test_save_trajs.py
+++ b/pyemma/coordinates/tests/test_save_trajs.py
@@ -29,6 +29,7 @@
 import os
 import shutil
 import tempfile
+from pathlib import Path
 
 import numpy as np
 import pyemma.coordinates as coor
@@ -61,16 +62,19 @@ def setUp(self):
 
         self.sets = [set_1, set_2]
 
-        self.subdir = tempfile.mkdtemp(suffix='save_trajs_test/')
+        self.subdir = Path(tempfile.mkdtemp(suffix='save_trajs_test/'))
 
         # Instantiate the reader
         self.reader = coor.source(self.trajfiles, top=self.pdbfile)
         self.reader.chunksize = 30
-        self.n_pass_files = [self.subdir + 'n_pass.set_%06u.xtc' % ii for ii in range(len(self.sets))]
-        self.one_pass_files = [self.subdir + '1_pass.set_%06u.xtc' % ii for ii in range(len(self.sets))]
+        self.frag_reader = coor.source([self.trajfiles], top=self.pdbfile)
+        self.n_pass_files = [str(self.subdir / ('n_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))]
+        self.frag_pass_files = [str(self.subdir / ('frag_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))]
+        self.one_pass_files = [str(self.subdir / ('1_pass.set_%06u.xtc' % ii)) for ii in range(len(self.sets))]
 
         self.traj_ref = save_traj_w_md_load_frame(self.reader, self.sets)
         self.strides = [2, 3, 5]
+        self.subdir = str(self.subdir)
 
     def tearDown(self):
         shutil.rmtree(self.subdir, ignore_errors=True)
@@ -89,12 +93,21 @@ def test_save_SaveTrajs_multipass(self):
         __ = save_trajs(self.reader, self.sets,
                         outfiles=self.n_pass_files)
 
+        sets_frag = []
+        for i in range(len(self.sets)):
+            sets_frag.append([])
+            for traj_ix, frame_ix in self.sets[i]:
+                sets_frag[-1].append([0, traj_ix * 33 + frame_ix])
+
+        save_trajs(self.frag_reader, [np.array(sets_frag[0]), np.array(sets_frag[1])], outfiles=self.frag_pass_files)
         # Reload the object to memory
         traj_n_pass = single_traj_from_n_files(self.n_pass_files, top=self.pdbfile)
+        traj_frag = single_traj_from_n_files(self.frag_pass_files, top=self.pdbfile)
 
         # Check for diffs
         (found_diff, errmsg) = compare_coords_md_trajectory_objects(traj_n_pass, self.traj_ref, atom=0)
-
+        self.assertFalse(found_diff, errmsg)
+        (found_diff, errmsg) = compare_coords_md_trajectory_objects(traj_frag, self.traj_ref, atom=0)
         self.assertFalse(found_diff, errmsg)
 
     def test_save_SaveTrajs_onepass(self):
diff --git a/setup.py b/setup.py
index 38ac054e7..6b119dcc3 100755
--- a/setup.py
+++ b/setup.py
@@ -93,12 +93,11 @@ def extensions():
                   include_dirs=[
                       mdtraj_inc,
                       pybind_inc,
-                      'pyemma/coordinates/clustering/include',
                   ] + deeptime_inc,
                   language='c++',
                   libraries=[lib_prefix+'theobald'],
                   library_dirs=[mdtraj_lib],
-                  extra_compile_args=common_cflags)
+                  extra_compile_args=common_cflags + [''])
 
     covar_module = \
         Extension('pyemma._ext.variational.estimators.covar_c._covartools',
@@ -239,6 +238,8 @@ def build_extensions(self):
                 if has_flag(self.compiler, '-fvisibility=hidden'):
                     opts.append('-fvisibility=hidden')
             elif ct == 'msvc':
+                opts.append('/std:c++17')
+                opts.append('/bigobj')
                 opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
 
             # setup OpenMP support
diff --git a/setup_util.py b/setup_util.py
index 9ef2d0751..bf4e6d4fa 100644
--- a/setup_util.py
+++ b/setup_util.py
@@ -129,9 +129,11 @@ def has_flag(compiler, flagname):
 
 
 def cpp_flag(compiler):
-    """Return the -std=c++[11/14] compiler flag.
-    The c++14 is prefered over c++11 (when it is available).
+    """Return the -std=c++[11/14/17] compiler flag.
+    The newer the better.
     """
+    if has_flag(compiler, '-std=c++17'):
+        return '-std=c++17'
     if has_flag(compiler, '-std=c++14'):
         return '-std=c++14'
     elif has_flag(compiler, '-std=c++11'):