/
bop.py
246 lines (196 loc) · 7.13 KB
/
bop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/python
import math
from normal_cutoff import normal_cutoffs
#
# This is an implementation of the bag-of-patterns representation for
# time series as described in the paper "Finding Structural Similarity
# in Time Series Data Using Bag-of-Patterns Representation" by Jessica
# Lin and Yuan Li. At its core is the SAX discretization method described
# in numerous papers by Jessica Lin and Eamonn Keogh. The canonical
# reference is "Experiencing SAX: a Novel Symbolic Representation of Time
# Series", by Lin, Keogh, Wei, and Lonardi.
#
# All time series are represented as lists of numbers.
#
# SAX words are represented as lists of symbols so that they can be compared
# using the MinDist metric described in the Li, Keogh, Wei, and Lombardi
# paper. Those words are represented as strings when they are stored in bags
# of patterns because similarity between bags is based on word frequencies
# rather than the words themselves.
#
#
# Return a list of all subsequences of a time series obtained by a sliding
# window.
#
# series - The time series (list of values)
# n - The size of the sliding window
# step - If step is greater than or equal to 1, the window is advanced
# by that many observations as it slides along the series. If
# step is less than 1, it is interpreted as the fraction of n by
# which the window should be advanced.
#
# The return value is a list of lists with each sublist containing a window of
# data. The sublists appear in the order in which they were extracted by
# sliding the window left-to-right.
#
def window_time_series(series, n, step = 1):
# print "in window_time_series",series
if step < 1.0:
step = max(int(step * n), 1)
return [series[i:i+n] for i in range(0, len(series) - n + 1, step)]
#
# Return a copy of a time series in which the values have been standardized
# to have mean 0 and standard deviation 1
#
# series - The time series
#
def standardize(series):
# Compute mean and standard deviation
n = float(len(series))
mean = sum(series) / n
variance = sum([(value - mean)**2 for value in series]) / (n - 1)
stddev = math.sqrt(variance)
# Just in case the data have no variation
if stddev == 0:
stddev = 1.0
# Normalize via z transform
return [(value - mean)/stddev for value in series]
#
# Create a piecewise aggregate approximation of a time series by computing
# the means of the values in adjacent, non-overlapping windows.
#
# series - The time series to discretize
# now - Number of windows (see below)
# opw - Observations per window (see below)
#
# Eaxctly one of now or opw must be None, with the other being an integer.
# If now is specified, then the original time series will be converted into
# a sequence of now values. If opw is specified, then the original time
# series will be converted into a sequence of values such that each value is
# the mean of opw observations in the time series.
#
def paa(series, now, opw):
if now == None:
now = len(series) / opw
if opw == None:
opw = len(series) / now
return [sum(series[i * opw : (i + 1) * opw]) / float(opw) for i in range(now)]
#
# normal_cutoffs[i] is a list of i+1 cut points for a standard normal
# distribution such that cutting the distribution at those points yields
# i+2 sections that are mutually exclusive, exhaustive, and equiprobable.
#
# Discretize a time series using the SAX method. The input series is
# converted into a list of symbols chosen from an alphabet such that the
# number of occurrences of each symbol is roughly equiprobable.
#
# series - The time series to discretize
# nos - Number of symbols (see below)
# ops - Observations per symbol (see below)
# a - The alphabet size
# nostd - If true then do not standardize the data
#
# Eaxctly one of nos or ops must be None, with the other being an integer.
# If nos is specified, then the original time series will be converted into
# a sequence of nos symbols. If ops is specified, then the original time
# series will be converted into a sequence of symbols such that each symbol
# spans ops observations in the time series.
#
# The nostd flag makes it possible to run SAX on a subsequence of a larger
# time series that has already been standardized. Do not run SAX on time
# series that are not standardized with nostd = True.
#
# The return value is a SAX word represented as a list of symbols.
#
def sax(series, nos, ops, a, nostd = False):
if not nostd:
series = standardize(series)
# print "in sax: series",series
# print "SERIES"
# print series
means = paa(series, nos, ops)
# print "PAA"
# print means
# Convert numeric values to integers between 0 and a-1 (i.e., the symbols
# of the alphabet)
cutoffs = normal_cutoffs[a - 2]
retval = []
for mean in means:
i = 0
for cutoff in cutoffs:
if mean > cutoff:
i = i + 1
retval.append(i)
# print "in sax: sax",retval
# print "SAX"
# print retval
return retval
#
# Compute the MinDist distance metric between two SAX words represented as
# lists of symbols.
#
# sax_word_1 - The first SAX word
# sax_word_2 - The second SAX word
# a - The size of the SAX alphabet used when creating the words
#
def min_dist(sax_word_1, sax_word_2, a):
dist = 0.0
for i in range(len(sax_word_1)):
r = sax_word_1[i]
c = sax_word_2[i]
if abs(r - c) > 1:
dist = dist + normal_cutoffs[a - 2][max(r, c) - 1]
dist = dist - normal_cutoffs[a - 2][min(r, c)]
return dist
#
# Turn the values in a list into a string where each value is delimited
# by some other string
#
# l - The list
# delim - The delimiter
#
def list_to_delimted_string(l, delim):
s = ''
for item in l:
s = s + str(item) + delim
return s[:-len(delim)]
#
# Convert a time series into a list of SAX words by sliding a window over
# the series and running SAX on each window, yielding one SAX word per window
#
# series - The series to convert
# n - The length of the sliding window
# step - See window_time_series for an explanation of this parameter
# nos, ops, a, nostd - Parameters to SAX
#
def sax_words(series, n, step, nos, ops, a, nostd = False):
# print "in sax_words", series
windows = window_time_series(series, n, step)
windows = [sax(window, nos, ops, a, nostd) for window in windows]
return [list_to_delimted_string(window, '_') for window in windows]
#
# Given a list of SAX words, reduce runs of the same word to a single
# occurrence
#
# words - The list of SAX words
#
def numerosity_reduction(words):
reduced = [words[0]]
for word in words:
if word <> reduced[-1]:
reduced.append(word)
return reduced
#
# Create a bag-of-patterns from a list of SAX words, which is a dictionary
# keyed by word with values equal to the number of times each word occurs
#
# sax_words - List of SAX words
#
def bop(sax_words):
dict = {}
# Count occurrences of words
for word in sax_words:
if not word in dict:
dict[word] = 0
dict[word] = dict[word] + 1
return dict