-
Notifications
You must be signed in to change notification settings - Fork 11
/
datasets.py
156 lines (129 loc) · 5.73 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from __future__ import (division, print_function, )
from collections import OrderedDict
from scipy.stats import multivariate_normal
import numpy as np
import numpy.random as npr
from fuel import config
from fuel.datasets import H5PYDataset, IndexableDataset
from fuel.transformers.defaults import uint8_pixels_to_floatX
from fuel.utils import find_in_data_path
import numpy
import theano
def as_array(obj, dtype=theano.config.floatX):
    """Convert ``obj`` to a numpy ndarray of ``dtype``.

    The dtype defaults to Theano's configured ``floatX`` (evaluated once,
    when this module is imported).
    """
    converted = numpy.asarray(obj, dtype=dtype)
    return converted
class TinyILSVRC2012(H5PYDataset):
    """The Tiny ILSVRC2012 Dataset.

    Thin wrapper around :class:`H5PYDataset` that points at the bundled
    ``ilsvrc2012_tiny.hdf5`` file (located via Fuel's data path) and
    converts uint8 pixel features to floatX by default.

    Parameters
    ----------
    which_sets : tuple of str
        Which split to load. Valid values are 'train' (1,281,167 examples)
        'valid' (50,000 examples), and 'test' (100,000 examples).
    """
    filename = 'ilsvrc2012_tiny.hdf5'
    default_transformers = uint8_pixels_to_floatX(('features',))

    def __init__(self, which_sets, **kwargs):
        # Default to reading from disk rather than loading everything in RAM.
        if 'load_in_memory' not in kwargs:
            kwargs['load_in_memory'] = False
        path = find_in_data_path(self.filename)
        super(TinyILSVRC2012, self).__init__(file_or_path=path,
                                             which_sets=which_sets,
                                             **kwargs)
class GaussianMixture(IndexableDataset):
    """Toy dataset of points sampled from a Gaussian mixture.

    The dataset exposes three sources:

    * 'features' : the sampled points
    * 'label'    : the index of the mixture component each point came from
    * 'density'  : the mixture pdf evaluated at each point
    """
    def __init__(self, num_examples, means=None, variances=None, priors=None,
                 **kwargs):
        # Accept either an explicit RNG or a seed (falling back to Fuel's
        # default seed) from which a RandomState is built.
        rng = kwargs.pop('rng', None)
        if rng is None:
            rng = np.random.RandomState(
                kwargs.pop('seed', config.default_seed))

        mixture = GaussianMixtureDistribution(means=means,
                                              variances=variances,
                                              priors=priors,
                                              rng=rng)
        # Expose the (possibly defaulted) mixture parameters on the dataset.
        self.means = mixture.means
        self.variances = mixture.variances
        self.priors = mixture.priors

        features, labels = mixture.sample(nsamples=num_examples)
        data = OrderedDict([('features', features),
                            ('label', labels),
                            ('density', mixture.pdf(x=features))])
        super(GaussianMixture, self).__init__(data, **kwargs)
class GaussianMixtureDistribution(object):
    """ Gaussian Mixture Distribution

    Parameters
    ----------
    means : tuple of ndarray.
        Specifies the means for the gaussian components.
        Defaults to five 2-D points scaled by 10.
    variances : tuple of ndarray.
        Specifies the variances for the gaussian components.
        Defaults to the identity matrix for each component.
    priors : tuple of ndarray
        Specifies the prior distribution of the components.
        Defaults to a uniform distribution over components.
    rng : numpy.random.RandomState, optional
        Random number generator; built from ``seed`` when omitted.
    seed : int, optional
        Seed used only when ``rng`` is not given.
    """
    def __init__(self, means=None, variances=None, priors=None, rng=None, seed=None):
        if means is None:
            # Use a list comprehension instead of map(): in Python 3, map()
            # returns a lazy iterator, which breaks len() and indexing below.
            means = [10.0 * as_array(x) for x in [[0, 0],
                                                  [1, 1],
                                                  [-1, -1],
                                                  [1, -1],
                                                  [-1, 1]]]
        # Number of components
        self.ncomponents = len(means)
        self.dim = means[0].shape[0]
        self.means = means
        # If prior is not specified let prior be flat.
        if priors is None:
            priors = [1.0 / self.ncomponents for _ in range(self.ncomponents)]
        self.priors = priors
        # If variances are not specified let variances be identity
        if variances is None:
            variances = [np.eye(self.dim) for _ in range(self.ncomponents)]
        self.variances = variances
        assert len(means) == len(variances), "Mean variances mismatch"
        assert len(variances) == len(priors), "prior mismatch"
        if rng is None:
            rng = npr.RandomState(seed=seed)
        self.rng = rng

    def _sample_prior(self, nsamples):
        """Draw ``nsamples`` component indices according to the priors."""
        return self.rng.choice(a=self.ncomponents,
                               size=(nsamples, ),
                               replace=True,
                               p=self.priors)

    def sample(self, nsamples):
        """Draw ``nsamples`` points.

        Returns a pair ``(samples, labels)`` where ``labels[i]`` is the
        index of the component that generated ``samples[i]``.
        """
        # Sample a component index per point, then one gaussian draw each.
        samples = []
        fathers = self._sample_prior(nsamples=nsamples).tolist()
        for father in fathers:
            samples.append(self._sample_gaussian(self.means[father],
                                                 self.variances[father]))
        return as_array(samples), as_array(fathers)

    def _sample_gaussian(self, mean, variance):
        """Draw one sample from N(mean, variance) via the reparameterization
        mean + L @ eps with L the Cholesky factor of the covariance."""
        # sampling unit gaussians
        epsilons = self.rng.normal(size=(self.dim, ))
        return mean + np.linalg.cholesky(variance).dot(epsilons)

    def _gaussian_pdf(self, x, mean, variance):
        """Density of a single gaussian component at ``x``."""
        return multivariate_normal.pdf(x, mean=mean, cov=variance)

    def pdf(self, x):
        """Evaluates the probability density function at the given point x."""
        # sum() over a generator replaces the Python-2-only builtin reduce(),
        # which raises NameError under Python 3.
        return sum(prior * self._gaussian_pdf(x, mean, variance)
                   for mean, variance, prior in zip(self.means,
                                                    self.variances,
                                                    self.priors))
if __name__ == '__main__':
    # Smoke test: build a 5-component 2-D mixture and a 1000-point dataset.
    # Materialize the means as a list: in Python 3, map() returns a lazy
    # iterator, so the two `for _ in means` loops below would exhaust it
    # and len(means) would raise TypeError.
    means = [as_array(x) for x in [[0, 0],
                                   [1, 1],
                                   [-1, -1],
                                   [1, -1],
                                   [-1, 1]]]
    std = 0.01
    variances = [np.eye(2) * std for _ in means]
    priors = [1.0 / len(means) for _ in means]
    gaussian_mixture = GaussianMixtureDistribution(means=means,
                                                   variances=variances,
                                                   priors=priors)
    gmdset = GaussianMixture(1000, means, variances, priors,
                             sources=('features', ))