/
identification.py
206 lines (166 loc) · 11.6 KB
/
identification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import copy
import numpy as np
from scipy.ndimage.measurements import label
from scipy.ndimage.morphology import generate_binary_structure
from skimage import morphology
from skimage.segmentation import relabel_sequential
def identify(data: np.ndarray, morph_structure: np.ndarray) -> np.ndarray:
"""Divides the precipitation field at each time step into individual storms using morphological operations
and connected-component labeling.
:param data: the precipitation data, given as an array of dimensions Time x Rows x Cols. To identify
storms in a single time slice, reshape the array to dimensions (1, Rows, Cols).
:param morph_structure: the structural set used to perform morphological operations, given as an array. See
scipy.morphology for more information.
:return: an array of time slice maps with individual storms labeled at each time slice, with the dimensions of
data.
"""
# 1) Run connected-components algorithm on all time slices:
# find the dimensions of the input
shape = data.shape
num_time_slices, row, col = shape
# initialize a new empty array of these dimensions to hold the result of running the connected-components algorithm
# on all time slices
label_array = np.empty((num_time_slices, row, col)).astype(int)
# use 8-connectivity for determining connectedness below
connectivity = generate_binary_structure(2, 2)
# run the connected-components algorithm on the data and store it in our new array
# since we will repeat this process often, we use a higher order function to compute the desired result
perform_connected_components(data, label_array, num_time_slices, connectivity)
# 2) Run erosion on labeled maps:
# initialize an array of the same dimensions to store morphological transformations in
morphology_array = np.zeros((num_time_slices, row, col)).astype(int)
# perform the necessary erosion operation using a similar higher order function
# note: here we make use of our defined structural set given as input
perform_morph_op(morphology.erosion, label_array, morphology_array, num_time_slices, morph_structure)
# 3) Classify storms with 1 or more remaining grid cells after erosion as large:
# initialize a list of sets, where the list's indices correspond to time slices of the dataset
large_storms = [set() for _ in range(num_time_slices)]
# for each time slice, if the slice originally had any labeled storms
for time_index in range(num_time_slices):
num_labels = np.max(label_array[time_index])
# for each of these labels, check if the label still appears after the erosion
for storm_segment in range(1, num_labels + 1):
if storm_segment in morphology_array[time_index]:
# if so, add it to the set of large storms for the current time slice
large_storms[time_index].add(storm_segment)
# 4) For the set of large storms, find smoothed regions using an opening operation:
# initialize an array of the same dimensions as the others
large_storm_array = np.zeros((num_time_slices, row, col)).astype(int)
# write in the large storm labels at their appropriate locations
for time_index in range(num_time_slices):
for col_index in range(col):
for row_index in range(row):
if label_array[time_index][row_index][col_index] in large_storms[time_index]:
large_storm_array[time_index][row_index][col_index] = \
label_array[time_index][row_index][col_index]
# define the array of small storms as the array holding the initial connected-components without those found to be
# large storms
small_storm_array = np.where(large_storm_array != 0, 0, label_array)
# reuse our morphology array for another morphological operation, this time 'opening', to produce our desired
# smoothed storm regions
perform_morph_op(morphology.erosion, large_storm_array, morphology_array, num_time_slices, morph_structure)
perform_morph_op(morphology.dilation, morphology_array, morphology_array, num_time_slices, morph_structure)
# 5) Perform almost-connected-component labeling on the large storms:
# dilate newly-smoothed regions
perform_morph_op(morphology.dilation, morphology_array, morphology_array, num_time_slices, morph_structure)
# run fully-connected-components algorithm on them
perform_connected_components(morphology_array, morphology_array, num_time_slices, connectivity)
# map clustering results onto array of large storms, but only where the storms exist in the original data
for time_index in range(num_time_slices):
for storm in np.unique(morphology_array[time_index]):
if storm:
overlap = np.where((large_storm_array[time_index] != 0) & (morphology_array[time_index] == storm),
1, 0).astype(int)
if np.max(overlap) > 0:
labels = np.unique(np.where(overlap == 1, large_storm_array[time_index], 0))
large_storm_array[time_index] = np.where(np.isin(large_storm_array[time_index], labels) &
(large_storm_array[time_index] != 0),
storm + np.max(label_array), large_storm_array[time_index])
# 6) Add small areas to existing nearby rainstorm events:
# copy the large storm array to compute overlaps against, ensuring overlaps are calculated on the same large storm
# array data each time
large_copy = copy.deepcopy(large_storm_array)
# perform almost connected-component labeling on the small storm array
# small_dilated = np.empty((num_time_slices, row, col)).astype(int)
perform_morph_op(morphology.dilation, small_storm_array, morphology_array, num_time_slices, morph_structure)
perform_connected_components(morphology_array, morphology_array, num_time_slices, connectivity)
for time_index in range(num_time_slices):
small_storm_array[time_index] = np.where(small_storm_array[time_index] != 0, morphology_array[time_index],
small_storm_array[time_index])
# find unique labels at each time slice for both large and small storm arrays
for time_index in range(num_time_slices):
# find the unique labels at each time slice for both large and small storm arrays
unique_elements_large = np.unique(large_storm_array[time_index])
unique_elements_small = np.unique(small_storm_array[time_index])
# for each small storm cluster
for small_index, small_storm in enumerate(unique_elements_small):
max_cells_overlap = 0 # max overlap between a small storm and all large storms at that time slice
overlap_amount = 0 # count for any particular combination (maximal or not)
if small_storm == 0:
continue
# for each large storm
for large_index, large_storm in enumerate(unique_elements_large):
if large_storm == 0:
continue
# find where the two storms overlap
overlap_amount = np.sum(np.where((large_copy[time_index] == large_storm) &
(morphology_array[time_index] == small_storm), 1, 0))
# if we've found a larger overlap
if overlap_amount > max_cells_overlap:
# save the label of the overlapping large storm and update the maximum overlap
largest_overlap_label = large_storm
max_cells_overlap = overlap_amount
# if we've found any overlaps
if max_cells_overlap and largest_overlap_label:
# write the small storm into the (original) large storm array using the label of the cluster with
# maximum overlap, but only at the locations where it appears in the un-dilated small storm array
large_storm_array[time_index] = np.where(small_storm_array[time_index] == small_storm,
largest_overlap_label, large_storm_array[time_index])
# remove this small storm from the small storm array
small_storm_array[time_index] = np.where(small_storm_array[time_index] == small_storm, 0,
small_storm_array[time_index])
# 7) Find rainstorm events consisting only of small areas:
# perform almost-connected-component labeling on those small storms that are left
perform_morph_op(morphology.dilation, small_storm_array, morphology_array, num_time_slices, morph_structure)
perform_connected_components(morphology_array, morphology_array, num_time_slices, connectivity)
# store the max label used in each time slice of the large storm array
largest_storm_label = np.empty((num_time_slices, 1, 1)).astype(int)
# take the results of our almost-connected-component labeling on the small storms and add the
# largest label in that time slice to it in order to produce a new label to add to the large storm array in the same
# location there
for time_index in range(num_time_slices):
largest_storm_label[time_index] = np.max(large_storm_array[time_index])
large_storm_array = np.where((small_storm_array != 0), morphology_array +
largest_storm_label, large_storm_array)
result = np.empty_like(large_storm_array)
# 8) Relabel the labels in each time slice sequentially from 1:
for time_index in range(num_time_slices):
result[time_index] = relabel_sequential(large_storm_array[time_index])[0]
return result
def perform_connected_components(to_be_connected: np.ndarray, result: np.ndarray, lifetime: int,
connectivity_type: np.ndarray) -> None:
"""Higher order function used to label connected-components on all time slices of a dataset.
:param to_be_connected: the data to perform the operation on, given as an array of dimensions Time x Rows x Cols.
:param result: where the result of the operation will be stored, with the same dimensions as to_be_connected.
:param lifetime: the number of time slices in the data, given as an integer.
:param connectivity_type: an array representing the type of connectivity to be used by the labeling algorithm. See
scipy.ndimage.measurements.label for more information.
:return: (None - the operation is performed on the result in place.)
"""
for index in range(lifetime):
cc_output, _ = label(to_be_connected[index], connectivity_type) # label also returns # of labels found
result[index] = cc_output
def perform_morph_op(morph_function: object, to_be_morphed: np.ndarray,
result: np.ndarray, lifetime: int, structure: np.ndarray) -> None:
"""Higher order function used to perform a morphological operation on all time slices of a dataset.
:param morph_function: the morphological operation to perform, given as an object (function).
:param to_be_morphed: the data to perform the operation on, given as an array of dimensions Time x Rows x Cols.
:param result: where the result of the operation will be store, with the same dimensions as to_be_morphed.
:param lifetime: the number of time slices in the data, given as an integer.
:param structure: the structural set used to perform the operation, given as an array. See scipy.morphology for more
information.
:return: (None - the operation is performed on the result in place.)
"""
for index in range(lifetime):
operation = morph_function(to_be_morphed[index], structure)
result[index] = operation