diff --git a/README.md b/README.md index f787443..a919c20 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ The prototype generator is built on SHA-256. ## Installation from binaries ``` -$ pip install git+git://github.com/statlab/cryptorandom.git +$ pip install cryptorandom ``` ## Installation from source diff --git a/cryptorandom/sample.py b/cryptorandom/sample.py index ae22568..e542354 100644 --- a/cryptorandom/sample.py +++ b/cryptorandom/sample.py @@ -41,17 +41,19 @@ def random_sample(a, size, replace=False, p=None, method="sample_by_index", prng If weights are provided, len(weights) must equal N. Sampling methods available are: - Fisher-Yates: sampling without weights, without replacement - PIKK: sampling without weights, without replacement - recursive: samping without weights, without replacement - Waterman_R: sampling without weights, without replacement - Vitter_Z: sampling without weights, without replacement - sample_by_index: sampling without weights, without replacement - - Exponential: sampling with weights, without replacement - Elimination: sampling with weights, without replacement - ... - + * Fisher-Yates: sampling without weights, without replacement + * PIKK: sampling without weights, without replacement + * recursive: sampling without weights, without replacement + * Waterman_R: sampling without weights, without replacement + * Vitter_Z: sampling without weights, without replacement + * sample_by_index: sampling without weights, without replacement + * Exponential: sampling with weights, without replacement + * Elimination: sampling with weights, without replacement + + Fisher-Yates, PIKK, sample_by_index, Exponential, and Elimination return ordered samples, + i.e. they are equally likely to return [1, 2] as they are to return [2, 1]. Waterman_R, + Vitter_Z, and recursive aren't guaranteed to randomize the order of items in the sample. 
+ Parameters ---------- a : 1-D array-like or int @@ -126,9 +128,9 @@ def random_permutation(a, method="Fisher-Yates", prng=None): Construct a random permutation (re-ordering) of a population `a`. The algorithms available are: - Fisher-Yates: a shuffling algorithm - random_sort: generate random floats and sort - permute_by_index: sample integer indices without replacement + * Fisher-Yates: a shuffling algorithm + * random_sort: generate random floats and sort + * permute_by_index: sample integer indices without replacement Parameters ---------- @@ -158,9 +160,9 @@ def random_permutation(a, method="Fisher-Yates", prng=None): raise ValueError("a must be an integer or array-like") methods = { - "Fisher-Yates" : lambda N: fykd_shuffle(N, prng=prng), - "random_sort" : lambda N: pikk_shuffle(N, prng=prng), - "permute_by_index" : lambda N: permute_by_index(N, prng=prng), + "Fisher-Yates" : lambda N: fykd_sample(N, N, prng=prng), + "random_sort" : lambda N: pikk(N, N, prng=prng), + "permute_by_index" : lambda N: sample_by_index(N, N, prng=prng), } try: @@ -263,7 +265,7 @@ def recursive_sample(n, k, prng=None): def waterman_r(n, k, prng=None): ''' - Waterman's Algorithm R for resevoir SRSs + Waterman's Algorithm R for reservoir SRSs Draw a sample of to sample k out of 1, ..., n without replacement Parameters @@ -291,7 +293,7 @@ def waterman_r(n, k, prng=None): def vitter_z(n, k, prng=None): ''' - Vitter's Algorithm Z for resevoir SRSs (Vitter 1985). + Vitter's Algorithm Z for reservoir SRSs (Vitter 1985). 
Draw a sample of to sample k out of 1, ..., n without replacement Parameters @@ -313,10 +315,15 @@ def vitter_z(n, k, prng=None): def Algorithm_X(n, t): V = prng.random() s = 0 - frac = 2 + numer = math.factorial(t+s+1-n)/math.factorial(t-n) + denom = math.factorial(t+s+1)/math.factorial(t) + frac = numer/denom + while frac > V: s += 1 - frac = ((t+1-n)/(t+1))**(s+1) + numer = (t+s+1-n)*numer + denom = (t+s+1)*denom + frac = numer/denom return s def f(x, t): @@ -338,7 +345,7 @@ def c(t): sam = np.array(range(1, k+1)) # fill the reservoir t = k - while t <= n: + while t < n: # Determine how many unseen records, nu, to skip if t <= 22*k: # the choice of 22 is taken from Vitter's 1985 ACM paper nu = Algorithm_X(k, t) @@ -353,10 +360,10 @@ def c(t): break var = f(np.floor(X), t)/(c(t)*g(X, t)) nu = np.floor(X) - if t+nu <= n: + if t+nu < n: # Make the next record a candidate, replacing one at random i = prng.randint(0, k) - sam[i] = int(t+nu) + sam[i] = int(t+nu+1) t = t+nu+1 return sam @@ -496,67 +503,7 @@ def exponential_sample(k, p, prng=None): elif k == n: return np.array(range(k)) else: - sam = prng.random(size=n) + sam = np.array(prng.random(size=n), dtype=float) sam = -np.log(sam)/weights sample = sam.argsort()[0:k] return sample+1 - -######################## Permutation functions ################################# - -def fykd_shuffle(n, prng=None): - ''' - Use Fisher-Yates-Knuth-Durstenfeld algorithm to permute 1, ..., n - - Parameters - ---------- - n : int - Population size - prng : {None, int, object} - If prng is None, return a randomly seeded instance of SHA256. - If prng is an int, return a new SHA256 instance seeded with seed. - If prng is already a PRNG instance, return it. - Returns - ------- - permuted list of {1, ..., n} - ''' - return fykd_sample(n, n, prng=prng) - - -def pikk_shuffle(n, prng=None): - ''' - Assign random values between 0 and 1 to the numbers 1, ..., n and sort them - according to these random values. 
- - Parameters - ---------- - n : int - Population size - prng : {None, int, object} - If prng is None, return a randomly seeded instance of SHA256. - If prng is an int, return a new SHA256 instance seeded with seed. - If prng is already a PRNG instance, return it. - Returns - ------- - list of items sampled - ''' - prng = get_prng(prng) - return np.argsort(prng.random(n)) + 1 - - -def permute_by_index(n, prng=None): - ''' - Select indices uniformly at random, without replacement, to permute 1, ..., n - - Parameters - ---------- - n : int - Population size - prng : {None, int, object} - If prng is None, return a randomly seeded instance of SHA256. - If prng is an int, return a new SHA256 instance seeded with seed. - If prng is already a PRNG instance, return it. - Returns - ------- - list of items sampled - ''' - return sample_by_index(n, n, prng=prng) diff --git a/cryptorandom/tests/test_sample.py b/cryptorandom/tests/test_sample.py index 3198a4e..37171f0 100644 --- a/cryptorandom/tests/test_sample.py +++ b/cryptorandom/tests/test_sample.py @@ -207,19 +207,19 @@ def test_vitter_z(): """ ff = fake_generator() sam = vitter_z(5, 2, prng=ff) - assert (sam == [4, 2]).all() + assert (sam == [5, 2]).all() ff = fake_generator() sam = random_sample(5, 2, method="Vitter_Z", prng=ff) - assert (sam+1 == [4, 2]).all() # shift to 1-index + assert (sam+1 == [5, 2]).all() # shift to 1-index ff = fake_generator() sam = vitter_z(500, 2, prng=ff) - assert (sam == [420, 265]).all() + assert (sam == [472, 422]).all() ff = fake_generator() sam = random_sample(500, 2, method="Vitter_Z", prng=ff) - assert (sam+1 == [420, 265]).all() # shift to 1-index + assert (sam+1 == [472, 422]).all() # shift to 1-index def test_elimination_sample(): @@ -261,7 +261,7 @@ def test_fykd_shuffle(): Test Fisher-Yates shuffle for random permutations, fykd_shuffle """ ff = fake_generator() - sam = fykd_shuffle(5, prng=ff) + sam = fykd_sample(5, 5, prng=ff) assert (sam == [1, 2, 3, 4, 5]).all() ff = 
fake_generator() @@ -261,7 +261,7 @@ def test_fykd_shuffle(): Test Fisher-Yates shuffle for random permutations, fykd_shuffle """ ff = fake_generator() - sam = fykd_shuffle(5, prng=ff) + sam = fykd_sample(5, 5, prng=ff) assert (sam == [1, 2, 3, 4, 5]).all() ff = fake_generator() @@ -279,7 +279,7 @@ def test_pikk_shuffle(): Test PIKK shuffling """ ff = fake_generator() - sam = pikk_shuffle(5, prng=ff) + sam = pikk(5, 5, prng=ff) assert (sam == [1, 2, 3, 4, 5]).all() ff = fake_generator() @@ -292,7 +292,7 @@ def test_permute_by_index(): Test permuting by index shuffling """ ff = fake_generator() - sam = permute_by_index(5, prng=ff) + sam = sample_by_index(5, 5, prng=ff) assert (sam == [2, 3, 1, 4, 5]).all() ff = fake_generator() diff --git a/docs/examples/sample.rst b/docs/examples/sample.rst index a6fa984..64b90f3 100644 --- a/docs/examples/sample.rst +++ b/docs/examples/sample.rst @@ -55,4 +55,69 @@ Elimination yes without replacement 10000 loops, best of 3: 22 µs per loop >>> %timeit random_sample(fruit, 2, method="sample_by_index", prng=s) 100000 loops, best of 3: 15 µs per loop + + +Some sampling methods (Fisher-Yates, PIKK, sample_by_index, Exponential, and Elimination) return ordered samples, i.e. they are equally likely to return [1, 2] as they are to return [2, 1]. + +.. code:: + + >>> s = SHA256(1234567890) + >>> counts = {} + >>> for i in range(10000): + >>> sam = pikk(5, 2, prng=s) + >>> if str(sam) in counts.keys(): + >>> counts[str(sam)]+=1 + >>> else: + >>> counts[str(sam)]=1 + >>> counts + {'[1 2]': 550, + '[1 3]': 529, + '[1 4]': 513, + '[1 5]': 503, + '[2 1]': 516, + '[2 3]': 486, + '[2 4]': 488, + '[2 5]': 483, + '[3 1]': 485, + '[3 2]': 483, + '[3 4]': 467, + '[3 5]': 526, + '[4 1]': 469, + '[4 2]': 513, + '[4 3]': 491, + '[4 5]': 491, + '[5 1]': 548, + '[5 2]': 461, + '[5 3]': 508, + '[5 4]': 490} + +The reservoir algorithms (Waterman_R and Vitter_Z) and the recursive method aren't guaranteed to randomize the order of sampled items. + +.. 
code:: + + >>> s = SHA256(1234567890) + >>> counts = {} + >>> for i in range(10000): + >>> sam = recursive_sample(5, 2, prng=s) + >>> if str(sam) in counts.keys(): + >>> counts[str(sam)]+=1 + >>> else: + >>> counts[str(sam)]=1 + >>> counts + {'[1 2]': 493, + '[1 3]': 500, + '[1 4]': 504, + '[1 5]': 1017, + '[2 1]': 463, + '[2 3]': 488, + '[2 4]': 526, + '[2 5]': 986, + '[3 1]': 482, + '[3 2]': 486, + '[3 4]': 508, + '[3 5]': 985, + '[4 1]': 525, + '[4 2]': 476, + '[4 3]': 517, + '[4 5]': 1044} diff --git a/docs/index.rst b/docs/index.rst index 52d0016..638f5ea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,7 +4,11 @@ Welcome to cryptorandom's documentation! `cryptorandom` is a package random sampling and random number generation using cryptographically secure pseudorandom number generators. -`Download the package here!`__ +`Download the package on GitHub`__ or install it from PyPI: + +.. code:: + + pip install cryptorandom .. __: https://github.com/statlab/cryptorandom diff --git a/docs/release/release_0.2.txt b/docs/release/release_0.2.txt new file mode 100644 index 0000000..0c214a1 --- /dev/null +++ b/docs/release/release_0.2.txt @@ -0,0 +1,28 @@ +Announcement: cryptorandom 0.2 +============================== + +We're happy to announce the release of cryptorandom v0.2! + +cryptorandom is a cryptographically secure PRNG and sampling module for Python. + +For more information, examples, and documentation, please visit our website: + +http://statlab.github.io/cryptorandom/ + + +New Features +------------ +* Functionality to generate random permutations of a list + + + +Improvements +------------ +* Bug fix in Vitter_Z +* Standardized output of sampling functions. 
Now, all of them return np.arrays +* Improved examples and documentation + + +Contributors to this release +---------------------------- +Kellie Ottoboni diff --git a/setup.py b/setup.py index b4262f3..5a0306a 100755 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ URL = 'http://www.github.com/statlab/cryptorandom' LICENSE = 'BSD License' DOWNLOAD_URL = 'http://www.github.com/statlab/cryptorandom' -VERSION = '0.1' +VERSION = '0.2' PYTHON_VERSION = (2, 7) INSTALL_REQUIRES = [