diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index f33b3f65b714e..4b720f2804cdb 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -33,6 +33,8 @@
 from ..externals.joblib import Parallel
 from ..externals.joblib import delayed
 from ..externals.six import string_types
+from ..externals.six import next
+from ..externals.progiter import ProgIter
 from . import _k_means
 from ._k_means_elkan import k_means_elkan
@@ -42,7 +44,8 @@
 # Initialization heuristic
-def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
+def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None,
+            verbose=False):
     """Init n_clusters seeds according to k-means++

     Parameters
@@ -66,6 +69,9 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
         Set to None to make the number of trials depend logarithmically
         on the number of seeds (2+log(k)); this is the default.

+    verbose : boolean, optional
+        Verbosity mode.
+
     Notes
     -----
     Selects initial cluster centers for k-means clustering in a smart way
@@ -89,6 +95,10 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
         # that it helped.
         n_local_trials = 2 + int(np.log(n_clusters))

+    prog = ProgIter(label='kmeans++', verbose=verbose)
+    _iter = prog(range(n_clusters))
+    c = next(_iter)
+
     # Pick first center randomly
     center_id = random_state.randint(n_samples)
     if sp.issparse(X):
@@ -103,7 +113,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
     current_pot = closest_dist_sq.sum()

     # Pick the remaining n_clusters-1 points
-    for c in range(1, n_clusters):
+    for c in _iter:
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
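
For context, the hunk above drives the k-means++ seeding loop and the progress display from the same iterator: the first index is consumed with next() because the first center is sampled uniformly at random, and only the remaining n_clusters - 1 seeds go through the D^2-weighted sampling loop. A minimal standalone sketch of that pattern, assuming only the vendored module added by this patch:

    from sklearn.externals.progiter import ProgIter

    prog = ProgIter(label='kmeans++', verbose=1)
    _iter = prog(range(5))
    first = next(_iter)  # index 0 is handled separately (random first center)
    for c in _iter:      # indices 1..4 take the weighted-sampling path
        pass             # one progress tick per chosen seed
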
@@ -393,7 +403,7 @@ def _kmeans_single_elkan(X, n_clusters, max_iter=300, init='k-means++',
     x_squared_norms = row_norms(X, squared=True)
     # init
     centers = _init_centroids(X, n_clusters, init, random_state=random_state,
-                              x_squared_norms=x_squared_norms)
+                              x_squared_norms=x_squared_norms, verbose=verbose)
     centers = np.ascontiguousarray(centers)
     if verbose:
         print('Initialization complete')
@@ -475,7 +485,7 @@ def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means++',
     best_labels, best_inertia, best_centers = None, None, None
     # init
     centers = _init_centroids(X, n_clusters, init, random_state=random_state,
-                              x_squared_norms=x_squared_norms)
+                              x_squared_norms=x_squared_norms, verbose=verbose)
     if verbose:
         print("Initialization complete")
@@ -624,7 +634,7 @@ def _labels_inertia(X, x_squared_norms, centers,

 def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
-                    init_size=None):
+                    init_size=None, verbose=False):
     """Compute the initial centroids

     Parameters
@@ -653,6 +663,9 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
         only algorithm is initialized by running a batch KMeans on a
         random subset of the data. This needs to be larger than k.

+    verbose : boolean, optional
+        Verbosity mode.
+
     Returns
     -------
     centers : array, shape(k, n_features)
@@ -679,16 +692,24 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
             "n_samples=%d should be larger than k=%d" % (n_samples, k))

     if isinstance(init, string_types) and init == 'k-means++':
+        if verbose:
+            print('Init cluster centers with k-means++')
         centers = _k_init(X, k, random_state=random_state,
-                          x_squared_norms=x_squared_norms)
+                          x_squared_norms=x_squared_norms, verbose=verbose)
     elif isinstance(init, string_types) and init == 'random':
+        if verbose:
+            print('Init cluster centers randomly')
         seeds = random_state.permutation(n_samples)[:k]
         centers = X[seeds]
     elif hasattr(init, '__array__'):
+        if verbose:
+            print('Init cluster centers with predefined array')
         # ensure that the centers have the same dtype as X
         # this is a requirement of fused types of cython
         centers = np.array(init, dtype=X.dtype)
     elif callable(init):
+        if verbose:
+            print('Init cluster centers with custom callable')
         centers = init(X, k, random_state=random_state)
         centers = np.asarray(centers, dtype=X.dtype)
     else:
@@ -984,7 +1005,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
                      old_center_buffer, compute_squared_diff,
                      distances, random_reassign=False,
                      random_state=None, reassignment_ratio=.01,
-                     verbose=False):
+                     verbose=False, prog=None):
     """Incremental update of the centers for the Minibatch K-Means algorithm.

     Parameters
@@ -1060,6 +1081,8 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,
         new_centers = choice(X.shape[0], replace=False, size=n_reassigns,
                              random_state=random_state)
         if verbose:
+            if prog:
+                prog.ensure_newline()
             print("[MiniBatchKMeans] Reassigning %i cluster centers."
                   % n_reassigns)

@@ -1118,7 +1141,7 @@ def _mini_batch_step(X, x_squared_norms, centers, counts,

 def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
                             n_samples, centers_squared_diff, batch_inertia,
-                            context, verbose=0):
+                            context, verbose=0, prog=None):
     """Helper function to encapsulate the early stopping logic"""
     # Normalize inertia to be able to compare values when
     # batch_size changes
@@ -1143,16 +1166,16 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
     # Log progress to be able to monitor convergence
     if verbose:
         progress_msg = (
-            'Minibatch iteration %d/%d:'
-            ' mean batch inertia: %f, ewa inertia: %f ' % (
-                iteration_idx + 1, n_iter, batch_inertia,
-                ewa_inertia))
-        print(progress_msg)
+            'inertias: batch=%f, ewa=%f ' % (
+                batch_inertia, ewa_inertia))
+        if prog:
+            prog.set_extra(progress_msg)

     # Early stopping based on absolute tolerance on squared change of
     # centers position (using EWA smoothing)
     if tol > 0.0 and ewa_diff <= tol:
         if verbose:
+            if prog:
+                prog.end()
             print('Converged (small centers change) at iteration %d/%d'
                   % (iteration_idx + 1, n_iter))
         return True
@@ -1169,6 +1192,8 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
     if (model.max_no_improvement is not None
             and no_improvement >= model.max_no_improvement):
         if verbose:
+            if prog:
+                prog.end()
             print('Converged (lack of improvement in inertia)'
                   ' at iteration %d/%d'
                   % (iteration_idx + 1, n_iter))
@@ -1177,6 +1202,7 @@ def _mini_batch_convergence(model, iteration_idx, n_iter, tol,
     # update the convergence context to maintain state across successive calls:
     context['ewa_diff'] = ewa_diff
     context['ewa_inertia'] = ewa_inertia
+    context['batch_inertia'] = batch_inertia
     context['ewa_inertia_min'] = ewa_inertia_min
     context['no_improvement'] = no_improvement
     return False
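
The "ewa" values used by the stopping tests above are exponentially weighted averages of the per-batch statistics; they smooth out minibatch noise before the comparisons against tol and max_no_improvement. A rough sketch of the update rule, for illustration only (not the exact sklearn code):

    def ewa_update(prev, new, alpha):
        # alpha in (0, 1] weights the newest minibatch; a batch that is large
        # relative to n_samples implies a larger alpha
        return new if prev is None else prev * (1.0 - alpha) + new * alpha
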
@@ -1277,6 +1303,21 @@ class MiniBatchKMeans(KMeans):
         defined as the sum of square distances of samples to their
         nearest neighbor.

+    CommandLine:
+        python -m sklearn.cluster.k_means_ MiniBatchKMeans
+
+    Example:
+        >>> from sklearn.cluster import MiniBatchKMeans
+        >>> from sklearn.datasets.samples_generator import make_blobs
+        >>> import numpy as np
+        >>> n_clusters = 4000
+        >>> X, true_labels = make_blobs(n_samples=int(1E5), centers=n_clusters,
+        ...                             cluster_std=1., random_state=42)
+        >>> mbkm = MiniBatchKMeans(n_clusters=n_clusters,
+        ...                        init_size=3 * n_clusters, n_init=2,
+        ...                        random_state=0, verbose=1).fit(X)
+        >>> print('mbkm.labels_ = %r' % (mbkm.labels_,))
+
     See also
     --------
@@ -1366,9 +1407,6 @@ def fit(self, X, y=None):
         # perform several inits with random sub-sets
         best_inertia = None
         for init_idx in range(n_init):
-            if self.verbose:
-                print("Init %d/%d with method: %s"
-                      % (init_idx + 1, n_init, self.init))
             counts = np.zeros(self.n_clusters, dtype=np.int32)

             # TODO: once the `k_means` function works with sparse input we
@@ -1380,9 +1418,14 @@
                 X, self.n_clusters, self.init,
                 random_state=random_state,
                 x_squared_norms=x_squared_norms,
-                init_size=init_size)
+                init_size=init_size,
+                verbose=self.verbose)

             # Compute the label assignment on the init dataset
+            if hasattr(self.init, '__array__'):
+                if self.verbose:
+                    print('Taking one step using initialization dataset')
+
             batch_inertia, centers_squared_diff = _mini_batch_step(
                 X_valid, x_squared_norms[validation_indices],
                 cluster_centers, counts, old_center_buffer, False,
@@ -1405,7 +1448,10 @@ def fit(self, X, y=None):

         # Perform the iterative optimization until the final convergence
         # criterion
-        for iteration_idx in range(n_iter):
+        if self.verbose:
+            print('Beginning mini-batch iterations')
+        prog = ProgIter(label='minibatch', verbose=self.verbose)
+        for iteration_idx in prog(range(n_iter)):
             # Sample a minibatch from the full dataset
             minibatch_indices = random_state.randint(
                 0, n_samples, self.batch_size)
@@ -1424,13 +1470,13 @@ def fit(self, X, y=None):
                 % (10 + self.counts_.min()) == 0),
                 random_state=random_state,
                 reassignment_ratio=self.reassignment_ratio,
-                verbose=self.verbose)
+                verbose=self.verbose, prog=prog)

             # Monitor convergence and do early stopping if necessary
             if _mini_batch_convergence(
                     self, iteration_idx, n_iter, tol, n_samples,
                     centers_squared_diff, batch_inertia, convergence_context,
-                    verbose=self.verbose):
+                    verbose=self.verbose, prog=prog):
                 break

         self.n_iter_ = iteration_idx + 1
@@ -1461,10 +1507,19 @@ def _labels_inertia_minibatch(self, X):
         """
         if self.verbose:
             print('Computing label assignment and total inertia')
+        n_samples = X.shape[0]
+        batch_size = self.batch_size
+        if batch_size > n_samples:
+            batch_size = n_samples
         x_squared_norms = row_norms(X, squared=True)
-        slices = gen_batches(X.shape[0], self.batch_size)
-        results = [_labels_inertia(X[s], x_squared_norms[s],
-                                   self.cluster_centers_) for s in slices]
+        slices = gen_batches(n_samples, batch_size)
+        total_batches = int(np.ceil(n_samples / float(batch_size)))
+        prog = ProgIter(label='labels inertia', length=total_batches,
+                        verbose=self.verbose)
+        results = [
+            _labels_inertia(X[s], x_squared_norms[s], self.cluster_centers_)
+            for s in prog(slices)
+        ]
         labels, inertia = zip(*results)
         return np.hstack(labels), np.sum(inertia)
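
gen_batches yields slices that cover the whole array, including a final short batch, which is why the progress length above is the ceiling of n_samples / batch_size rather than the floor. A quick illustration with sklearn's public helper:

    from sklearn.utils import gen_batches

    # 7 samples in batches of 3 -> three slices: 0:3, 3:6, and the short 6:7
    for s in gen_batches(7, 3):
        print(s)
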
@@ -1495,7 +1550,8 @@ def partial_fit(self, X, y=None):
             self.cluster_centers_ = _init_centroids(
                 X, self.n_clusters, self.init,
                 random_state=self.random_state_,
-                x_squared_norms=x_squared_norms, init_size=self.init_size)
+                x_squared_norms=x_squared_norms, init_size=self.init_size,
+                verbose=self.verbose)
             self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
             random_reassign = False
@@ -1542,3 +1598,15 @@ def predict(self, X):

         X = self._check_test_data(X)
         return self._labels_inertia_minibatch(X)[0]
+
+
+if __name__ == '__main__':
+    r"""
+    CommandLine:
+        python -m sklearn.cluster.k_means_
+        python -m sklearn.cluster.k_means_ --allexamples
+    """
+    import multiprocessing
+    multiprocessing.freeze_support()  # for win32
+    import utool as ut  # NOQA
+    ut.doctest_funcs()
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 31307e55801a5..4f7dad4ff773d 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -313,6 +313,10 @@ def test_mb_k_means_plus_plus_init_dense_array():

 def test_mb_kmeans_verbose():
+    """
+    nosetests -s sklearn/cluster/tests/test_k_means.py:test_mb_kmeans_verbose
+    nosetests -s sklearn/cluster/tests/test_k_means.py
+    """
     mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                  random_state=42, verbose=1)
     old_stdout = sys.stdout
diff --git a/sklearn/externals/progiter.py b/sklearn/externals/progiter.py
new file mode 100644
index 0000000000000..7bd92473c45ce
--- /dev/null
+++ b/sklearn/externals/progiter.py
@@ -0,0 +1,608 @@
+# -*- coding: utf-8 -*-
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+import sys
+import time
+import datetime
+import collections
+from math import log10, floor
+from sklearn.externals import six
+
+# VT100 ANSI definitions
+# https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_codes
+CLEARLINE_EL0 = '\33[0K'  # clear line to the right of the cursor
+CLEARLINE_EL1 = '\33[1K'  # clear line to the left of the cursor
+CLEARLINE_EL2 = '\33[2K'  # clear the entire line
+DECTCEM_HIDE = '\033[?25l'  # hide cursor
+DECTCEM_SHOW = '\033[?25h'  # show cursor
+
+WIN32 = sys.platform.startswith('win32')
+WITH_ANSI = not WIN32
+
+if WIN32:
+    # Use time.clock on win32
+    default_timer = time.clock
+else:
+    default_timer = time.time
+
+if WITH_ANSI:
+    CLEAR_BEFORE = '\r'
+    AT_END = '\n'
+    CLEAR_AFTER = ''
+else:
+    CLEAR_BEFORE = '\r' + CLEARLINE_EL2 + DECTCEM_HIDE
+    CLEAR_AFTER = CLEARLINE_EL0
+    AT_END = DECTCEM_SHOW + '\n'
+
+
+def _infer_length(iterable):
+    # use the PEP 424 length hint if available
+    # adapted from the click implementation
+    try:
+        return len(iterable)
+    except (AttributeError, TypeError):
+        try:
+            get_hint = type(iterable).__length_hint__
+        except AttributeError:
+            return None
+        try:
+            hint = get_hint(iterable)
+        except TypeError:
+            return None
+        if (hint is NotImplemented or
+                not isinstance(hint, int) or
+                hint < 0):
+            return None
+        return hint
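
For reference, Python 3's operator.length_hint implements the same PEP 424 fallback chain that _infer_length hand-rolls above for six-era compatibility:

    import operator

    operator.length_hint([1, 2, 3])       # 3, from len()
    operator.length_hint(iter(range(5)))  # 5, from __length_hint__
    operator.length_hint(x for x in [])   # 0, generators provide no hint
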
+
+
+class ProgIter(object):
+    """
+    Attributes
+    ----------
+    iterable : sequence
+        A python iterable
+
+    length : int
+        Maximum length of the process
+        (estimated from iterable if not specified)
+
+    label : str
+        Message to print
+
+    freq : int
+        How many iterations to wait between messages.
+
+    adjust : bool
+        if True `freq` is adjusted based on time_thresh
+
+    eta_window : int
+        number of previous measurements to use in eta calculation
+
+    clearline : bool
+        if True messages are printed on the same line
+
+    time_thresh : float
+        desired amount of time to wait between messages if adjust is True
+        otherwise does nothing
+
+    stream : file
+        defaults to sys.stdout
+
+    enabled : bool
+        if False nothing happens.
+
+    verbose : int
+        verbosity mode
+        0 - no verbosity
+        1 - verbosity with clearline=True and adjust=True
+        2 - verbosity with clearline=False and adjust=True
+        3 - verbosity with clearline=False and adjust=False
+
+    Examples
+    --------
+    >>> from sklearn.externals.progiter import ProgIter
+    >>> def is_prime(n):
+    ...     return n >= 2 and not any(n % i == 0 for i in range(2, n))
+    >>> for n in ProgIter(range(10000), verbose=2):
+    ...     # do some work
+    ...     is_prime(n)
+    10000/10000... rate=13294.94 Hz, eta=0:00:00, total=0:00:00, wall=13:34 EST
+
+    Notes
+    -----
+    Either use ProgIter in a with statement or call prog.end() at the end of
+    the computation if there is a possibility that the entire iterable may
+    not be exhausted.
+    """
+    def __init__(self, iterable=None, label=None, length=None, freq=1,
+                 eta_window=64, clearline=True, adjust=True, time_thresh=2.0,
+                 enabled=True, verbose=None, stream=None):
+        if label is None:
+            label = ''
+        if verbose is not None:
+            if verbose <= 0:
+                enabled = False
+            elif verbose == 1:
+                enabled, clearline, adjust = 1, 1, 1
+            elif verbose == 2:
+                enabled, clearline, adjust = 1, 0, 1
+            elif verbose >= 3:
+                enabled, clearline, adjust = 1, 0, 0
+        if stream is None:
+            stream = sys.stdout
+
+        self.stream = stream
+        self.iterable = iterable
+        self.label = label
+        self.length = length
+        self.freq = freq
+        self.enabled = enabled
+        self.adjust = adjust
+        self.eta_window = eta_window
+        self.time_thresh = time_thresh
+        self.clearline = clearline
+        self.extra = ''
+        self.started = False
+        self.finished = False
+
+    def __call__(self, iterable):
+        self.iterable = iterable
+        return iter(self)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type_, value, trace):
+        if trace is not None:
+            return False
+        else:
+            self.end()
+
+    def __iter__(self):
+        if not self.enabled:
+            return iter(self.iterable)
+        else:
+            return self.iter_rate()
+
+    def set_extra(self, extra):
+        """
+        specify a custom note appended to the end of the next message
+        TODO: come up with a better name and rename
+        """
+        self.extra = extra
+
+    def iter_rate(self):
+        self.begin()
+        # Wrap the input iterable in a generator
+        for self._iter_idx, item in enumerate(self.iterable, start=1):
+            yield item
+            if (self._iter_idx) % self.freq == 0:
+                # update progress information every so often
+                self.update_measurements()
+                self.update_estimates()
+                self.display_message()
+        self.end()
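
set_extra is the hook that _mini_batch_convergence uses to append the current inertia values to the progress line: it stores a note that is rendered at the end of the next displayed message. A standalone sketch, assuming only the vendored module path:

    from sklearn.externals.progiter import ProgIter

    prog = ProgIter(label='minibatch', verbose=1)
    for idx in prog(range(100)):
        # the extra text shows up at the end of subsequent progress lines
        prog.set_extra('inertias: batch=%.3f' % (1.0 / (idx + 1),))
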
+
+    def mark(self):
+        self.update_measurements()
+        self.update_estimates()
+        self.display_message()
+
+    def reset_internals(self):
+        # Prepare for iteration
+        if self.length is None:
+            self.length = _infer_length(self.iterable)
+        self._est_seconds_left = None
+        self._total_seconds = 0
+        self._between_time = 0
+        self._iter_idx = 0
+        self._last_idx = -1
+        # "now" refers to the most recent measurement;
+        # "last" refers to the measurement before that
+        self._now_idx = 0
+        self._now_time = 0
+        self._between_count = -1
+        self._max_between_time = -1.0
+        self._max_between_count = -1.0
+        self._iters_per_second = 0.0
+
+    def begin(self):
+        """
+        Initializes information used to measure progress
+        """
+        # Prepare for iteration
+        if self.length is None:
+            self.length = _infer_length(self.iterable)
+
+        self.reset_internals()
+        self._msg_fmtstr = self.build_message_template()
+
+        self.tryflush()
+        self.display_message()
+
+        # Time progress was initialized
+        self._start_time = default_timer()
+        # Last time measures were updated
+        self._last_time = self._start_time
+        self._now_idx = self._iter_idx
+        self._now_time = self._start_time
+
+        # use the last few measurements to compute a more stable average rate
+        if self.eta_window is not None:
+            self._measured_times = collections.deque(
+                [], maxlen=self.eta_window)
+            self._measured_times.append((self._iter_idx, self._start_time))
+
+        self._cursor_at_newline = True
+        self.started = True
+        self.finished = False
+
+    def end(self):
+        if not self.enabled or self.finished:
+            return
+        # Write the final progress line if it was not written in the loop
+        if self._iter_idx != self._now_idx:
+            self.update_measurements()
+            self.update_estimates()
+            self._est_seconds_left = 0
+            self.display_message()
+        self.ensure_newline()
+        self._cursor_at_newline = True
+        self.finished = True
+
+    def adjust_frequency(self):
+        # Adjust the frequency so the next print will not happen until
+        # approximately `time_thresh` seconds have passed, as estimated by
+        # iter_idx.
+        eps = 1E-9
+        self._max_between_time = max(self._max_between_time,
+                                     self._between_time)
+        self._max_between_time = max(self._max_between_time, eps)
+        self._max_between_count = max(self._max_between_count,
+                                      self._between_count)
+
+        # If progress was uniform and all time estimates were perfect,
+        # this would be the new freq needed to achieve self.time_thresh
+        new_freq = int(self.time_thresh * self._max_between_count /
+                       self._max_between_time)
+        new_freq = max(new_freq, 1)
+        # But things are not perfect. So, don't make drastic changes
+        factor = 1.5
+        max_freq_change_up = max(256, int(self.freq * factor))
+        max_freq_change_down = int(self.freq // factor)
+        if (new_freq - self.freq) > max_freq_change_up:
+            self.freq += max_freq_change_up
+        elif (self.freq - new_freq) > max_freq_change_down:
+            self.freq -= max_freq_change_down
+        else:
+            self.freq = new_freq
+
+    def update_measurements(self):
+        """
+        update current measurements and estimates of time and progress
+        """
+        self._last_idx = self._now_idx
+        self._last_time = self._now_time
+
+        self._now_idx = self._iter_idx
+        self._now_time = default_timer()
+
+        self._between_time = self._now_time - self._last_time
+        self._between_count = self._now_idx - self._last_idx
+        self._total_seconds = self._now_time - self._start_time
+
+    def update_estimates(self):
+        # Estimate the rate of progress
+        if self.eta_window is None:
+            self._iters_per_second = self._now_idx / self._total_seconds
+        else:
+            # Smooth out the rate with a window of measurements
+            self._measured_times.append((self._now_idx, self._now_time))
+            prev_idx, prev_time = self._measured_times[0]
+            self._iters_per_second = ((self._now_idx - prev_idx) /
+                                      (self._now_time - prev_time))
+
+        if self.length is not None:
+            # Estimate the time remaining if the length is given
+            iters_left = self.length - self._now_idx
+            est_eta = iters_left / self._iters_per_second
+            self._est_seconds_left = est_eta
+
+        # Adjust the print frequency if messages come too quickly or slowly,
+        # so progress reporting doesn't slow down the actual computation
+        if self.adjust and (self._between_time < self.time_thresh or
+                            self._between_time > self.time_thresh * 2.0):
+            self.adjust_frequency()
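
A quick worked example of the adjustment rule: if the last 500 iterations arrived over 0.25 seconds, the ideal spacing for time_thresh=2.0 would be new_freq = int(2.0 * 500 / 0.25) = 4000; starting from freq=1, the clamp max(256, int(1 * 1.5)) = 256 only allows a step to freq = 257, and subsequent updates walk the rest of the way toward 4000.
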
+
+    def build_message_template(self):
+        """ Defines the template for the progress line """
+        tzname = time.tzname[0]
+        if self.length is None or self.length <= 0:
+            n_chrs = 4
+        else:
+            n_chrs = int(floor(log10(float(self.length))) + 1)
+        msg_body = [
+            (self.label),
+            (' {iter_idx:' + str(n_chrs) + 'd}/'),
+            ('?' if self.length is None or self.length <= 0
+             else six.text_type(self.length)),
+            ('... '),
+            ('rate={rate:4.2f} Hz,'),
+            ('' if self.length == 0 else ' eta={eta},'),
+            (' total={total},'),
+            (' wall={wall} ' + tzname),
+            (' {extra}'),
+        ]
+        if self.clearline:
+            msg_body = [CLEAR_BEFORE] + msg_body + [CLEAR_AFTER]
+        else:
+            msg_body = msg_body + [AT_END]
+        msg_fmtstr_time = ''.join(msg_body)
+        return msg_fmtstr_time
+
+    def format_message(self):
+        """ formats the progress template with current values """
+        if self._est_seconds_left is None:
+            eta = '?'
+        else:
+            eta = six.text_type(datetime.timedelta(
+                seconds=int(self._est_seconds_left)))
+        total = six.text_type(datetime.timedelta(
+            seconds=int(self._total_seconds)))
+        msg = self._msg_fmtstr.format(
+            iter_idx=self._now_idx,
+            rate=self._iters_per_second,
+            eta=eta, total=total,
+            wall=time.strftime('%H:%M'),
+            extra=self.extra,
+        )
+        return msg
+
+    def ensure_newline(self):
+        """
+        use before any custom printing when using the progress iter to ensure
+        your print statement starts on a new line instead of at the end of a
+        progress line
+        """
+        if not self._cursor_at_newline:
+            self.write(AT_END)
+            self._cursor_at_newline = True
+
+    def display_message(self):
+        """ Writes current progress to the output stream """
+        msg = self.format_message()
+        self.write(msg)
+        self.tryflush()
+        self._cursor_at_newline = not self.clearline
+
+    def tryflush(self):
+        try:
+            # flush sometimes causes issues in IPython notebooks
+            self.stream.flush()
+        except IOError:
+            pass
+
+    def write(self, msg):
+        self.stream.write(msg)
+
+
+class Timer(object):
+    """
+    Timer with-statement context object.
+    """
+    def __init__(self, msg='', verbose=True, newline=True):
+        self.msg = msg
+        self.verbose = verbose
+        self.newline = newline
+        self.tstart = -1
+        self.elapsed = -1
+
+    def tic(self):
+        if self.verbose:
+            sys.stdout.flush()
+            sys.stdout.write('\ntic(%r)' % self.msg)
+            if self.newline:
+                sys.stdout.write('\n')
+            sys.stdout.flush()
+        self.tstart = default_timer()
+
+    def toc(self):
+        elapsed = (default_timer() - self.tstart)
+        if self.verbose:
+            sys.stdout.write('...toc(%r)=%.4fs\n' % (self.msg, elapsed))
+            sys.stdout.flush()
+        return elapsed
+
+    start = tic
+    stop = toc
+
+    def __enter__(self):
+        self.tic()
+        return self
+
+    def __exit__(self, type_, value, trace):
+        self.elapsed = self.toc()
+        if trace is not None:
+            return False
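
A standalone sketch of how the Timer above is meant to be used in the demos that follow (the printed timing will vary):

    from sklearn.externals.progiter import Timer

    with Timer('sum of squares') as t:
        total = sum(i * i for i in range(100000))
    # stdout shows tic('sum of squares'), then ...toc('sum of squares')=<secs>s
    # t.elapsed now holds the measured duration in seconds
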
+
+
+def test_progiter():
+    from six.moves import cStringIO
+    from sklearn.externals.progiter import ProgIter
+
+    # Define a function that takes some time
+    def is_prime(n):
+        return n >= 2 and not any(n % i == 0 for i in range(2, n))
+    N = 5000
+
+    # Capture progress output in a stream instead of polluting stdout
+    stream = cStringIO()
+    prog = ProgIter(range(N), clearline=False, stream=stream, freq=500,
+                    adjust=False)
+    for n in prog:
+        was_prime = is_prime(n)
+        prog.set_extra('n=%r, was_prime=%r' % (n, was_prime,))
+        if (n + 1) % 128 == 0 and was_prime:
+            prog.set_extra('n=%r, was_prime=%r EXTRA' % (n, was_prime,))
+    stream.seek(0)
+    print(stream.read())
+
+    length = 1000
+    N = 50000
+    N0 = N - length
+    print('N = %r' % (N,))
+    print('N0 = %r' % (N0,))
+
+    print('\n-----')
+    print('Demo #0: progress can be disabled and incur essentially 0 overhead')
+    print('However, the overhead of enabled progress is minimal and typically '
+          'insignificant')
+    print('this is verbosity mode verbose=0')
+    iterable = (is_prime(n) for n in range(N0, N))
+    with Timer('demo0'):
+        piterable = ProgIter(iterable, length=length, label='demo0',
+                             enabled=False)
+        list(piterable)
+
+    print('\n-----')
+    print('Demo #1: progress is shown by default in the same line')
+    print('this is verbosity mode verbose=1')
+    iterable = (is_prime(n) for n in range(N0, N))
+    with Timer('demo1'):
+        piterable = ProgIter(iterable, length=length, label='demo1')
+        list(piterable)
+
+    # The default behavior adjusts the frequency of progress reporting so
+    # the performance of the loop is minimally impacted
+    print('\n-----')
+    print('Demo #2: clearline=False prints multiple lines.')
+    print('Progress is only printed as needed')
+    print('Notice the adjustment behavior of the print frequency')
+    print('this is verbosity mode verbose=2')
+    with Timer('demo2'):
+        iterable = (is_prime(n) for n in range(N0, N))
+        piterable = ProgIter(iterable, length=length, clearline=False,
+                             label='demo2')
+        list(piterable)
+
+    print('\n-----')
+    print('Demo #3: Adjustments can be turned off to give constant feedback')
+    print('this is verbosity mode verbose=3')
+    iterable = (is_prime(n) for n in range(N0, N))
+    with Timer('demo3'):
+        piterable = ProgIter(iterable, length=length, adjust=False,
+                             clearline=False, freq=100, label='demo3')
+        list(piterable)
+
+
+def time_progiter_overhead():
+    # Time the overhead of this function
+    import timeit
+    import textwrap
+    setup = textwrap.dedent(
+        '''
+        from sklearn.externals.progiter import ProgIter
+        import numpy as np
+        import time
+        from six.moves import cStringIO, range
+        import utool as ut
+        N = 500
+        stream = cStringIO()
+        rng = np.random.RandomState(42)
+        ndims = 2
+        vec1 = rng.rand(113, ndims)
+        vec2 = rng.rand(71, ndims)
+
+        def minimal_wrapper1(iterable):
+            for item in iterable:
+                yield item
+
+        def minimal_wrapper2(iterable):
+            for count, item in enumerate(iterable, start=1):
+                yield item
+
+        def minimal_wrapper3(iterable):
+            count = 0
+            for item in iterable:
+                yield item
+                count += 1
+
+        def minwrap4(iterable):
+            for count, item in enumerate(iterable, start=1):
+                yield item
+                if count % 100:
+                    pass
+
+        def minwrap5(iterable):
+            for count, item in enumerate(iterable, start=1):
+                yield item
+                if time.time() < 100:
+                    pass
+        '''
+    )
+    statements = {
+        'baseline'       : '[{work} for n in range(N)]',
+        'creation'       : 'ProgIter(range(N))',
+        'minwrap1'       : '[{work} for n in minimal_wrapper1(range(N))]',
+        'minwrap2'       : '[{work} for n in minimal_wrapper2(range(N))]',
+        'minwrap3'       : '[{work} for n in minimal_wrapper3(range(N))]',
+        'minwrap4'       : '[{work} for n in minwrap4(range(N))]',
+        'minwrap5'       : '[{work} for n in minwrap5(range(N))]',
+        '(sk-disabled)'  : '[{work} for n in ProgIter(range(N), enabled=False, stream=stream)]',  # NOQA
+        '(sk-plain)'     : '[{work} for n in ProgIter(range(N), stream=stream)]',  # NOQA
+        '(sk-freq)'      : '[{work} for n in ProgIter(range(N), stream=stream, freq=100)]',  # NOQA
+        '(sk-no-adjust)' : '[{work} for n in ProgIter(range(N), stream=stream, adjust=False, freq=200)]',  # NOQA
+        '(sk-high-freq)' : '[{work} for n in ProgIter(range(N), stream=stream, adjust=False, freq=200)]',  # NOQA
+
+        # '(ut-disabled)'  : '[{work} for n in ut.ProgIter(range(N), enabled=False, stream=stream)]',  # NOQA
+        # '(ut-plain)'     : '[{work} for n in ut.ProgIter(range(N), stream=stream)]',  # NOQA
+        # '(ut-freq)'      : '[{work} for n in ut.ProgIter(range(N), freq=100, stream=stream)]',  # NOQA
+        # '(ut-no-adjust)' : '[{work} for n in ut.ProgIter(range(N), freq=200, adjust=False, stream=stream)]',  # NOQA
+        # '(ut-high-freq)' : '[{work} for n in ut.ProgIter(range(N), stream=stream, adjust=False, freq=200)]',  # NOQA
+    }
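
Each entry above is measured roughly as follows: timeit runs the statement `number` times against the shared setup string and returns total seconds, so dividing by `number` gives the per-run cost. A minimal sketch of a single measurement:

    import timeit

    secs = timeit.timeit('[None for n in range(500)]', number=1000)
    per_run = secs / 1000
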
+    # statements = {
+    #     'calc_baseline': '[vec1.dot(vec2.T) for n in range(M)]',  # NOQA
+    #     'calc_plain': '[vec1.dot(vec2.T) for n in ProgIter(range(M), stream=stream)]',  # NOQA
+    #     'calc_plain_ut': '[vec1.dot(vec2.T) for n in ut.ProgIter(range(M), stream=stream)]',  # NOQA
+    # }
+    timings = {}
+
+    work_strs = [
+        'None',
+        'vec1.dot(vec2.T)',
+        'n % 10 == 0',
+    ]
+    work = work_strs[0]
+    # work = work_strs[1]
+
+    number = 10000
+    prog = ProgIter(label='timing', adjust=True)
+    for key, stmt in prog(statements.items()):
+        prog.set_extra(key)
+        secs = timeit.timeit(stmt.format(work=work), setup, number=number)
+        timings[key] = secs / number
+
+    import utool as ut
+    print(ut.align(ut.repr4(timings, precision=8), ':'))
+
+
+if __name__ == '__main__':
+    r"""
+    CommandLine:
+        python -m sklearn.externals.progiter
+        python -m sklearn.externals.progiter --allexamples
+    """
+    test_progiter()
+    # time_progiter_overhead()
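
Taken together, the patch makes verbose MiniBatch K-Means render a single self-updating progress line, with the per-batch inertia appended via set_extra, instead of one print per iteration. A small end-to-end sketch of the user-visible behavior, with hypothetical problem sizes chosen only for illustration:

    from sklearn.cluster import MiniBatchKMeans
    from sklearn.datasets.samples_generator import make_blobs

    X, _ = make_blobs(n_samples=10000, centers=50, random_state=0)
    # verbose=1 now shows 'minibatch <i>/<n> ... rate=... eta=...' lines
    model = MiniBatchKMeans(n_clusters=50, verbose=1, random_state=0).fit(X)
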