[WIP] Progress Logger #8105

Closed · wants to merge 2 commits

116 changes: 92 additions & 24 deletions sklearn/cluster/k_means_.py
@@ -33,6 +33,8 @@
from ..externals.joblib import Parallel
from ..externals.joblib import delayed
from ..externals.six import string_types
from ..externals.six import next
from ..externals.progiter import ProgIter

from . import _k_means
from ._k_means_elkan import k_means_elkan
@@ -42,7 +44,8 @@
# Initialization heuristic


def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None,
verbose=False):
"""Init n_clusters seeds according to k-means++

Parameters
@@ -66,6 +69,9 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
Set to None to make the number of trials depend logarithmically
on the number of seeds (2+log(k)); this is the default.

verbose : boolean, optional
Verbosity mode.

Notes
-----
Selects initial cluster centers for k-means clustering in a smart way
@@ -89,6 +95,10 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
# that it helped.
n_local_trials = 2 + int(np.log(n_clusters))

prog = ProgIter(label='kmeans++', verbose=verbose)
_iter = prog(range(n_clusters))
c = next(_iter)

# Pick first center randomly
center_id = random_state.randint(n_samples)
if sp.issparse(X):
@@ -103,7 +113,7 @@
current_pot = closest_dist_sq.sum()

# Pick the remaining n_clusters-1 points
for c in range(1, n_clusters):
for c in _iter:
# Choose center candidates by sampling with probability proportional
# to the squared distance to the closest existing center
rand_vals = random_state.random_sample(n_local_trials) * current_pot
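Side note on the wrap-and-advance pattern introduced above: ProgIter wraps range(n_clusters), next() pulls off index 0 so the "pick first center randomly" step stays outside the loop, and the for loop then consumes the remaining indices from the same wrapped iterator. A minimal, self-contained sketch of that pattern (using a throwaway stub in place of the vendored ProgIter, whose label/verbose arguments are taken from this diff):

    class _ProgStub(object):
        # stand-in for the vendored ProgIter; only the behaviour used above
        def __init__(self, label='', verbose=False):
            self.label = label
            self.verbose = verbose

        def __call__(self, iterable):
            for idx, item in enumerate(iterable):
                if self.verbose:
                    print('%s %d' % (self.label, idx))
                yield item

    prog = _ProgStub(label='kmeans++', verbose=True)
    _iter = prog(range(4))
    c = next(_iter)       # index 0: the first center is chosen randomly outside the loop
    for c in _iter:       # indices 1..3: the k-means++ sampling loop
        pass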
@@ -393,7 +403,7 @@ def _kmeans_single_elkan(X, n_clusters, max_iter=300, init='k-means++',
x_squared_norms = row_norms(X, squared=True)
# init
centers = _init_centroids(X, n_clusters, init, random_state=random_state,
x_squared_norms=x_squared_norms)
x_squared_norms=x_squared_norms, verbose=verbose)
centers = np.ascontiguousarray(centers)
if verbose:
print('Initialization complete')
@@ -475,7 +485,7 @@ def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++',
best_labels, best_inertia, best_centers = None, None, None
# init
centers = _init_centroids(X, n_clusters, init, random_state=random_state,
x_squared_norms=x_squared_norms)
x_squared_norms=x_squared_norms, verbose=verbose)
if verbose:
print("Initialization complete")

@@ -624,7 +634,7 @@ def _labels_inertia(X, x_squared_norms, centers,


def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
init_size=None):
init_size=None, verbose=False):
"""Compute the initial centroids

Parameters
@@ -653,6 +663,9 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
only algorithm is initialized by running a batch KMeans on a
random subset of the data. This needs to be larger than k.

verbose : boolean, optional
Verbosity mode.

Returns
-------
centers : array, shape(k, n_features)
@@ -679,16 +692,24 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
"n_samples=%d should be larger than k=%d" % (n_samples, k))

if isinstance(init, string_types) and init == 'k-means++':
if verbose:
print('Init cluster centers with k-means++')
centers = _k_init(X, k, random_state=random_state,
x_squared_norms=x_squared_norms)
x_squared_norms=x_squared_norms, verbose=verbose)
elif isinstance(init, string_types) and init == 'random':
if verbose:
print('Init cluster centers randomly')
seeds = random_state.permutation(n_samples)[:k]
centers = X[seeds]
elif hasattr(init, '__array__'):
if verbose:
print('Init cluster centers with predefined array')
# ensure that the centers have the same dtype as X
# this is a requirement of fused types of cython
centers = np.array(init, dtype=X.dtype)
elif callable(init):
if verbose:
print('Init cluster centers with custom callable')
centers = init(X, k, random_state=random_state)
centers = np.asarray(centers, dtype=X.dtype)
else:
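For context, the init paths that now emit a verbose message can all be reached through the public estimator; a quick hedged illustration (toy data, parameter values chosen only for speed, not taken from this PR):

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    X = np.random.RandomState(0).rand(100, 2)

    # string init: k-means++ seeding, with the new verbose messages
    MiniBatchKMeans(n_clusters=3, init='k-means++', verbose=1, random_state=0).fit(X)

    # string init: random seeding
    MiniBatchKMeans(n_clusters=3, init='random', n_init=1, verbose=1,
                    random_state=0).fit(X)

    # array init: the given centers are copied and cast to X.dtype
    MiniBatchKMeans(n_clusters=3, init=X[:3].copy(), n_init=1, verbose=1,
                    random_state=0).fit(X)

    # callable init: a user function returning a (k, n_features) array
    MiniBatchKMeans(n_clusters=3, init=lambda X, k, random_state: X[:k],
                    n_init=1, verbose=1, random_state=0).fit(X)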
@@ -984,7 +1005,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
old_center_buffer, compute_squared_diff,
distances, random_reassign=False,
random_state=None, reassignment_ratio=.01,
verbose=False):
verbose=False, prog=None):
"""Incremental update of the centers for the Minibatch K-Means algorithm.

Parameters
@@ -1060,6 +1081,8 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
new_centers = choice(X.shape[0], replace=False, size=n_reassigns,
random_state=random_state)
if verbose:
if prog:
prog.ensure_newline()
print("[MiniBatchKMeans] Reassigning %i cluster centers."
% n_reassigns)
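The ensure_newline() call added above exists so that an occasional print() does not land on top of a progress line that is being rewritten in place. A rough sketch of the idea with a throwaway stand-in (the real behaviour is assumed to live in the vendored ProgIter):

    import sys

    class _ProgressLine(object):
        # toy stand-in: rewrites one status line with '\r', like a progress bar
        def __init__(self):
            self._dirty = False

        def update(self, msg):
            sys.stdout.write('\r' + msg)
            sys.stdout.flush()
            self._dirty = True

        def ensure_newline(self):
            # move to a fresh line before any regular print() output
            if self._dirty:
                sys.stdout.write('\n')
                self._dirty = False

    prog = _ProgressLine()
    prog.update('minibatch 10/100')
    prog.ensure_newline()
    print('[MiniBatchKMeans] Reassigning 3 cluster centers.')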

@@ -1118,7 +1141,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,

def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
n_samples, centers_squared_diff, batch_inertia,
context, verbose=0):
context, verbose=0, prog=None):
"""Helper function to encapsulate the early stopping logic"""
# Normalize inertia to be able to compare values when
# batch_size changes
@@ -1143,16 +1166,16 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
# Log progress to be able to monitor convergence
if verbose:
progress_msg = (
'Minibatch iteration %d/%d:'
' mean batch inertia: %f, ewa inertia: %f ' % (
iteration_idx + 1, n_iter, batch_inertia,
ewa_inertia))
print(progress_msg)
'inertias: batch=%f, ewa=%f ' % (
batch_inertia, ewa_inertia))
prog.set_extra(progress_msg)

# Early stopping based on absolute tolerance on squared change of
# centers position (using EWA smoothing)
if tol > 0.0 and ewa_diff <= tol:
if verbose:
if prog:
prog.end()
print('Converged (small centers change) at iteration %d/%d'
% (iteration_idx + 1, n_iter))
return True
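For reference, the ewa_inertia and ewa_diff values compared above come from the exponentially weighted average smoothing already used by _mini_batch_convergence; a small sketch of that update step (the alpha heuristic mirrors the existing code and is shown only for clarity):

    def ewa_update(previous, value, batch_size, n_samples):
        # each minibatch is weighted by roughly twice its share of the dataset,
        # capped at 1.0; the first call simply seeds the average
        alpha = min(1.0, float(batch_size) * 2.0 / (n_samples + 1))
        if previous is None:
            return value
        return previous * (1.0 - alpha) + value * alpha

    ewa = None
    for batch_inertia in [120.0, 110.0, 104.0]:
        ewa = ewa_update(ewa, batch_inertia, batch_size=100, n_samples=10000)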
@@ -1169,6 +1192,8 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
if (model.max_no_improvement is not None
and no_improvement >= model.max_no_improvement):
if verbose:
if prog:
prog.end()
print('Converged (lack of improvement in inertia)'
' at iteration %d/%d'
% (iteration_idx + 1, n_iter))
@@ -1177,6 +1202,7 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
# update the convergence context to maintain state across successive calls:
context['ewa_diff'] = ewa_diff
context['ewa_inertia'] = ewa_inertia
context['batch_inertia'] = batch_inertia
context['ewa_inertia_min'] = ewa_inertia_min
context['no_improvement'] = no_improvement
return False
@@ -1277,6 +1303,21 @@ class MiniBatchKMeans(KMeans):
defined as the sum of square distances of samples to their nearest
neighbor.

CommandLine:
python -m sklearn.cluster.k_means_ MiniBatchKMeans

Example:
>>> from sklearn.cluster import MiniBatchKMeans
>>> from sklearn.datasets.samples_generator import make_blobs
>>> import numpy as np
>>> n_clusters = 4000
>>> X, true_labels = make_blobs(n_samples=int(1E5), centers=n_clusters,
... cluster_std=1., random_state=42)
>>> mbkm = MiniBatchKMeans(n_clusters=n_clusters,
... init_size=3 * n_clusters, n_init=2,
... random_state=0, verbose=1).fit(X)
>>> print('mbkm.labels_ = %r' % (mbkm.labels_,))

See also
--------

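The docstring example above is intentionally large so that the progress output is visible; a downsized variant (sizes here are illustrative only) finishes quickly and still exercises the verbose path:

    from sklearn.cluster import MiniBatchKMeans
    from sklearn.datasets.samples_generator import make_blobs

    X, _ = make_blobs(n_samples=2000, centers=20, cluster_std=1.,
                      random_state=42)
    mbkm = MiniBatchKMeans(n_clusters=20, init_size=60, n_init=2,
                           random_state=0, verbose=1).fit(X)
    print(mbkm.labels_.shape)   # (2000,)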
@@ -1366,9 +1407,6 @@ def fit(self, X, y=None):
# perform several inits with random sub-sets
best_inertia = None
for init_idx in range(n_init):
if self.verbose:
print("Init %d/%d with method: %s"
% (init_idx + 1, n_init, self.init))
counts = np.zeros(self.n_clusters, dtype=np.int32)

# TODO: once the `k_means` function works with sparse input we
@@ -1380,9 +1418,14 @@ def fit(self, X, y=None):
X, self.n_clusters, self.init,
random_state=random_state,
x_squared_norms=x_squared_norms,
init_size=init_size)
init_size=init_size,
verbose=self.verbose)

# Compute the label assignment on the init dataset
if hasattr(self.init, '__array__'):
if self.verbose:
print('Taking one step using initialization dataset')

batch_inertia, centers_squared_diff = _mini_batch_step(
X_valid, x_squared_norms[validation_indices],
cluster_centers, counts, old_center_buffer, False,
@@ -1405,7 +1448,10 @@ def fit(self, X, y=None):

# Perform the iterative optimization until the final convergence
# criterion
for iteration_idx in range(n_iter):
if self.verbose:
print('Beginning mini-batch iterations')
prog = ProgIter(label='minibatch', verbose=self.verbose)
for iteration_idx in prog(range(n_iter)):
# Sample a minibatch from the full dataset
minibatch_indices = random_state.randint(
0, n_samples, self.batch_size)
@@ -1424,13 +1470,13 @@ def fit(self, X, y=None):
% (10 + self.counts_.min()) == 0),
random_state=random_state,
reassignment_ratio=self.reassignment_ratio,
verbose=self.verbose)
verbose=self.verbose, prog=prog)

# Monitor convergence and do early stopping if necessary
if _mini_batch_convergence(
self, iteration_idx, n_iter, tol, n_samples,
centers_squared_diff, batch_inertia, convergence_context,
verbose=self.verbose):
verbose=self.verbose, prog=prog):
break

self.n_iter_ = iteration_idx + 1
@@ -1461,10 +1507,19 @@ def _labels_inertia_minibatch(self, X):
"""
if self.verbose:
print('Computing label assignment and total inertia')
n_samples = X.shape[0]
batch_size = self.batch_size
if batch_size > n_samples:
batch_size = n_samples
x_squared_norms = row_norms(X, squared=True)
slices = gen_batches(X.shape[0], self.batch_size)
results = [_labels_inertia(X[s], x_squared_norms[s],
self.cluster_centers_) for s in slices]
slices = gen_batches(n_samples, batch_size)
total_batches = int(np.ceil(n_samples / float(batch_size)))
prog = ProgIter(label='labels inertia', length=total_batches,
verbose=self.verbose)
results = [
_labels_inertia(X[s], x_squared_norms[s], self.cluster_centers_)
for s in prog(slices)
]
labels, inertia = zip(*results)
return np.hstack(labels), np.sum(inertia)
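As a reminder of what drives the progress length above, gen_batches yields successive slice objects that cover all n_samples, including a final partial batch; a tiny illustration (values are arbitrary):

    import numpy as np
    from sklearn.utils import gen_batches

    n_samples, batch_size = 10, 4
    list(gen_batches(n_samples, batch_size))
    # -> [slice(0, 4, None), slice(4, 8, None), slice(8, 10, None)]
    total_batches = int(np.ceil(n_samples / float(batch_size)))   # 3 batches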

@@ -1495,7 +1550,8 @@ def partial_fit(self, X, y=None):
self.cluster_centers_ = _init_centroids(
X, self.n_clusters, self.init,
random_state=self.random_state_,
x_squared_norms=x_squared_norms, init_size=self.init_size)
x_squared_norms=x_squared_norms, init_size=self.init_size,
verbose=self.verbose)

self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
random_reassign = False
@@ -1542,3 +1598,15 @@ def predict(self, X):

X = self._check_test_data(X)
return self._labels_inertia_minibatch(X)[0]


if __name__ == '__main__':
r"""
CommandLine:
python -m sklearn.cluster.k_means_
python -m sklearn.cluster.k_means_ --allexamples
"""
import multiprocessing
multiprocessing.freeze_support() # for win32
import utool as ut # NOQA
ut.doctest_funcs()
4 changes: 4 additions & 0 deletions sklearn/cluster/tests/test_k_means.py
@@ -313,6 +313,10 @@ def test_mb_k_means_plus_plus_init_dense_array():


def test_mb_kmeans_verbose():
"""
nosetests -s sklearn/cluster/tests/test_k_means.py:test_mb_kmeans_verbose
nosetests -s sklearn/cluster/tests/test_k_means.py
"""
mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
random_state=42, verbose=1)
old_stdout = sys.stdout
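The rest of the test body is collapsed in this view. For reference, the usual pattern for asserting on verbose output by capturing stdout looks roughly like this (a hedged sketch, not the actual test body; the stdlib StringIO is used here for brevity):

    import sys
    from io import StringIO

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        mb_k_means.fit(X)    # X is the module-level test data in test_k_means.py
    finally:
        sys.stdout = old_stdout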