/
clevr_util.py
237 lines (186 loc) · 6.73 KB
/
clevr_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import tensorflow as tf
from data_util import BigDataset
import numpy as np
from typing import List
from pathlib import Path
import json
from config import global_config
class Clevr(object):
    """Access to CLEVR scene descriptions and images below a root directory.

    Construction reads the train and validation scene files from
    ``<clevr_root>/scenes`` and passes both scene collections through
    ``_add_bounding_boxes_to_file``.  Images are converted once with
    ``_image_transformation`` and stored as PNG files under ``.cache``
    (relative to the current working directory); datasets are built from
    those cached files.
    """

    # Shape names; presumably the complete set of CLEVR shapes.
    OBJECTS = ['cylinder', 'sphere', 'cube']

    def __init__(self, clevr_root: Path):
        """Load both scene files and set the global image configuration.

        Args:
            clevr_root: Directory holding the ``scenes`` and ``images``
                sub-directories.

        Raises:
            AssertionError: If ``clevr_root`` does not exist.

        Side effects:
            Sets ``global_config.img_height`` and ``img_width`` to 128 and
            ``global_config.img_channels`` to 1.
        """
        assert clevr_root.exists(), f'CLEVR root not found: {clevr_root}'
        self.clevr_root = clevr_root
        self.train = _add_bounding_boxes_to_file(json.loads(
            Path(clevr_root / 'scenes' / 'CLEVR_train_scenes.json').read_text()
        )['scenes'])
        self.test = _add_bounding_boxes_to_file(json.loads(
            Path(clevr_root / 'scenes' / 'CLEVR_val_scenes.json').read_text()
        )['scenes'])
        global_config.img_height = 128
        global_config.img_width = 128
        global_config.img_channels = 1
        print('Loaded clevr dataset.')

    def ensure_converted_image_exists(self, split, image_filename):
        """Write the transformed image to the cache unless it is already there.

        The source image ``<clevr_root>/images/<split>/<image_filename>`` is
        decoded as float32 with ``global_config.img_channels`` channels, run
        through ``_image_transformation``, converted to uint8 and saved as a
        PNG at ``.cache/<split>/<image_filename>``.

        Raises:
            AssertionError: If the source image does not exist.
        """
        cache_path = Path('.cache') / split / image_filename
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        if cache_path.exists():
            return
        img_path = self.clevr_root / 'images' / split / image_filename
        assert img_path.exists(), f'CLEVR image not found: {img_path}'
        img = _image_transformation(
            tf.io.decode_image(
                tf.io.read_file(str(img_path)),
                channels=global_config.img_channels,
                dtype=tf.dtypes.float32,
            )
        )
        tf.io.write_file(
            str(cache_path),
            tf.image.encode_png(
                tf.image.convert_image_dtype(img, tf.dtypes.uint8)
            )
        )

    def read_image(self, split, image_filename):
        """Decode the cached image ``.cache/<split>/<image_filename>``.

        Returns:
            The decoded image as a float32 tensor with
            ``global_config.img_channels`` channels.
        """
        cache_path = tf.strings.join(
            ['.cache', split, image_filename],
            separator='/')
        return tf.io.decode_image(
            tf.io.read_file(cache_path),
            channels=global_config.img_channels,
            dtype=tf.dtypes.float32,
        )

    @staticmethod
    def _only_has_shapes(scene, filter_objs):
        """Return True if every object in ``scene`` has a shape in ``filter_objs``."""
        return all(obj['shape'] in filter_objs for obj in scene['objects'])

    @staticmethod
    def _bbox_masks(scene, filter_objs, height, width):
        """Build one float32 box mask per entry of ``filter_objs``.

        Returns:
            Array of shape ``(len(filter_objs), height, width, 1)`` that is
            1.0 inside the bounding box of every object of the matching
            shape and 0.0 elsewhere.
        """
        masks = np.zeros((len(filter_objs), height, width, 1), np.float32)
        for mask, shape in zip(masks, filter_objs):
            for obj in scene['objects']:
                if obj['shape'] != shape:
                    continue
                bb = obj['bounding_box']
                # Clamp at 0: a negative slice bound would be counted from
                # the far edge of the array and mark the wrong region (or
                # nothing).  Bounds past the far edge are clipped by NumPy.
                # NOTE(review): the first axis has length ``height`` but is
                # indexed with x, and the boxes are in the coordinates of the
                # original image rather than the cropped/resized one -- kept
                # as in the original, please confirm this is intended.
                mask[
                    max(bb['xmin'], 0): max(bb['xmax'], 0),
                    max(bb['ymin'], 0): max(bb['ymax'], 0)] = 1.0
        return masks

    def _build_dataset(self, scenes, filter_objs, is_test):
        """Turn the matching scenes into a shuffled ``tf.data.Dataset``."""
        # from_tensor_slices wants one entry per column, so the columns are
        # collected directly: the image, plus the box masks for test data.
        images = []
        masks = []
        for scene in scenes:
            if not self._only_has_shapes(scene, filter_objs):
                continue
            images.append(
                self.read_image(scene['split'], scene['image_filename']))
            if is_test:
                masks.append(self._bbox_masks(
                    scene, filter_objs,
                    global_config.img_height, global_config.img_width))
        if not images:
            raise ValueError(
                f'No scene consists only of the shapes {filter_objs}; '
                f'is_test: {is_test}')
        columns = {'img': images}
        if is_test:
            columns['bbox'] = masks
        dataset = tf.data.Dataset.from_tensor_slices(columns)
        dataset = dataset.shuffle(len(images))
        if not is_test:
            # NOTE(review): the source lost its indentation; ``take`` is
            # assumed to belong with ``repeat`` (fixed-length training epoch).
            dataset = dataset.repeat()
            dataset = dataset.take(
                global_config.batch_size * global_config.epoch_length)
        print(f'Raw dataset has {len(images)} examples; is_test: {is_test}')
        return dataset

    def convert_all_images_for_objects(self, scenes, filter_objs: List[str]):
        """Cache the converted image of every scene made only of ``filter_objs``.

        Args:
            scenes: Iterable of scene dicts with ``objects``, ``split`` and
                ``image_filename`` entries.
            filter_objs: Allowed shape names; a scene containing any other
                shape is skipped.
        """
        for scene in scenes:
            if self._only_has_shapes(scene, filter_objs):
                self.ensure_converted_image_exists(
                    scene['split'], scene['image_filename'])

    def filter_for_objects(self, filter_objs: List[str]):
        """Build train and test datasets from scenes made only of ``filter_objs``.

        Both splits are converted to cached images first.  Training elements
        are ``{'img': ...}``; test elements additionally carry ``'bbox'``,
        one mask per entry of ``filter_objs``.

        Returns:
            ``BigDataset(train_dataset, test_dataset)``.

        Raises:
            ValueError: If a split has no scene made only of ``filter_objs``.
        """
        self.convert_all_images_for_objects(self.train, filter_objs)
        self.convert_all_images_for_objects(self.test, filter_objs)
        train_dataset = self._build_dataset(self.train, filter_objs, False)
        test_dataset = self._build_dataset(self.test, filter_objs, True)
        return BigDataset(train_dataset, test_dataset)
def _extract_bounding_boxes(scene):
objs = scene['objects']
rotation = scene['directions']['right']
for i, obj in enumerate(objs):
[x, y, z] = obj['pixel_coords']
[x1, y1, z1] = obj['3d_coords']
cos_theta, sin_theta, _ = rotation
x1 = x1 * cos_theta + y1 * sin_theta
y1 = x1 * -sin_theta + y1 * cos_theta
height_d = 6.9 * z1 * (15 - y1) / 2.0
height_u = height_d
width_l = height_d
width_r = height_d
if obj['shape'] == 'cylinder':
d = 9.4 + y1
h = 6.4
s = z1
height_u *= (s * (h / d + 1)) / (
(s * (h / d + 1)) - (s * (h - s) / d))
height_d = height_u * (h - s + d) / (h + s + d)
width_l *= 11 / (10 + y1)
width_r = width_l
if obj['shape'] == 'cube':
height_u *= 1.3 * 10 / (10 + y1)
height_d = height_u
width_l = height_u
width_r = height_u
obj['bounding_box'] = {
'ymin': int(y - height_d),
'ymax': int(y + height_u),
'xmin': int(x - width_l),
'xmax': int(x + width_r),
}
def _add_bounding_boxes_to_file(scenes: dict) -> dict:
for scene in scenes:
_extract_bounding_boxes(scene)
return scenes
def _image_transformation(img):
    """Crop a fixed window out of ``img`` and resize it to the configured size.

    The window starts at row 97 / column 64 and is 203 rows by 192 columns
    (i.e. it ends before row 300 / column 256).  The crop is then resized to
    ``(global_config.img_height, global_config.img_width)``.

    Args:
        img: Image tensor accepted by ``tf.image.crop_to_bounding_box``.

    Returns:
        The cropped and resized image tensor.
    """
    top, bottom = 97, 300
    left, right = 64, 256
    cropped = tf.image.crop_to_bounding_box(
        img,
        offset_height=top,
        offset_width=left,
        target_height=bottom - top,
        target_width=right - left,
    )
    target_size = (global_config.img_height, global_config.img_width)
    return tf.image.resize(cropped, target_size)