-
Notifications
You must be signed in to change notification settings - Fork 1
/
em_utilities_LORE3.py
122 lines (99 loc) · 4.75 KB
/
em_utilities_LORE3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#from scipy.sparse import csr_matrix
#from scipy.sparse import spdiags
#from scipy.stats import multivariate_normal
#import graphlab
import numpy as np
import sys
import time
from copy import deepcopy
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
def assign_clusters(data, centroids):
# Compute distances between each data point and the set of centroids:
# Fill in the blank (RHS only)
distances_from_centroids = pairwise_distances(data, centroids, metric='cosine') #metric='euclidean')
# Compute cluster assignments for each data point:
# Fill in the blank (RHS only)
cluster_assignment = np.argmin(distances_from_centroids,axis=1)
return cluster_assignment
def log_sum_exp(x, axis):
'''Compute the log of a sum of exponentials'''
x_max = np.max(x, axis=axis)
if axis == 1:
return x_max + np.log( np.sum(np.exp(x-x_max[:,np.newaxis]), axis=1) )
else:
return x_max + np.log( np.sum(np.exp(x-x_max), axis=0) )
def logpdf_diagonal_gaussian(x, mean, cov):
'''
Compute logpdf of a multivariate Gaussian distribution with diagonal covariance at a given point x.
A multivariate Gaussian distribution with a diagonal covariance is equivalent
to a collection of independent Gaussian random variables.
x should be a sparse matrix. The logpdf will be computed for each row of x.
mean and cov should be given as 1D numpy arrays
mean[i] : mean of i-th variable
cov[i] : variance of i-th variable'''
n = x.shape[0]
dim = x.shape[1]
#print(n, dim, len(mean), len(cov))
assert(dim == len(mean) and dim == len(cov))
# multiply each i-th column of x by (1/(2*sigma_i)), where sigma_i is sqrt of variance of i-th variable.
scaled_x = x.dot(np.diag(1./(2*np.sqrt(cov))))
#scaled_x = x.dot(diag(1./(2*np.sqrt(cov)))) FOR SPARCE
# multiply each i-th entry of mean by (1/(2*sigma_i))
scaled_mean = mean/(2*np.sqrt(cov))
#print(scaled_x,scaled_mean)
# sum of pairwise squared Eulidean distances gives SUM[(x_i - mean_i)^2/(2*sigma_i^2)]
try:
distance=pairwise_distances(scaled_x, [scaled_mean], 'euclidean').flatten()**2
except:
distance=0
return -np.sum(np.log(np.sqrt(2*np.pi*cov))) - distance
def EM_for_high_dimension(data, means, covs, weights, cov_smoothing=1e-5, maxiter=int(1e3), thresh=1e-4, verbose=False):
# cov_smoothing: specifies the default variance assigned to absent features in a cluster.
# If we were to assign zero variances to absent features, we would be overconfient,
# as we hastily conclude that those featurese would NEVER appear in the cluster.
# We'd like to leave a little bit of possibility for absent features to show up later.
n = data.shape[0]
dim = data.shape[1]
mu = deepcopy(means)
Sigma = deepcopy(covs)
K = len(mu)
weights = np.array(weights)
ll = None
ll_trace = []
print(n,dim,K)
print(1000, 300, 7)
for i in range(maxiter):
#print('Iteration: '+str(i))
# E-step: compute responsibilities
logresp = np.zeros((n,K)) #num_dataPoints * num_Clusters fill with zeros
for k in range(K):
logresp[:,k] = np.log(weights[k]) + logpdf_diagonal_gaussian(data, mu[k], Sigma[k])
ll_new = np.sum(log_sum_exp(logresp, axis=1))
if verbose:
print(ll_new)
sys.stdout.flush()
logresp -= np.vstack(log_sum_exp(logresp, axis=1))
resp = np.exp(logresp)
counts = np.sum(resp, axis=0)
# M-step: update weights, means, covariances
weights = counts / np.sum(counts)
for k in range(K):
#print('Iteration: '+str(i)+' '+'ClusterNum: '+str(k))
mu[k] = (np.diag(resp[:,k]).dot(data)).sum(axis=0)/counts[k]
#print(mu[k], np.ravel(mu[k]))
#mu[k] = np.ravel(mu[k]) mu[k] = mu[k].A1 not needed, same in Sigma
Sigma[k] = np.diag(resp[:,k]).dot((data*data)-2*data.dot(np.diag(mu[k])) ).sum(axis=0) \
+ (mu[k]**2)*counts[k]
Sigma[k] = Sigma[k] / counts[k] + cov_smoothing*np.ones(dim)
#print(Sigma[k], np.ravel(Sigma[k]))
#Sigma[k] = Sigma[k].A1 / counts[k] + cov_smoothing*np.ones(dim)
# check for convergence in log-likelihood
ll_trace.append(ll_new)
if ll is not None and (ll_new-ll) < thresh and ll_new > -np.inf:
ll = ll_new
break
else:
ll = ll_new
out = {'weights':weights,'means':mu,'covs':Sigma,'loglik':ll_trace,'resp':resp}
return out