from typing import Union, List, Tuple, Optional

import torch
import numpy as np
import cv2 as cv
from torchvision.ops import batched_nms


def draw_bbox_and_label(x: torch.Tensor, label: str, img: np.ndarray) -> np.ndarray:
    """
    Draws a predicted bounding box and its class label on the original image.
    """
    # nothing to draw without a box; check before unpacking
    if x is None:
        return img
    x1, y1, x2, y2 = map(int, x)
    img = cv.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 1)
    t_size = cv.getTextSize(label, cv.FONT_HERSHEY_PLAIN, 1, 1)[0]
    img = cv.putText(img, label, (x1, y1 + t_size[1] + 4), cv.FONT_HERSHEY_PLAIN, 1, (0, 0, 255), 1)
    return img
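
# Illustrative usage (the file names are hypothetical; any BGR frame plus a
# box already scaled to that frame's coordinates would do):
#
#   frame = cv.imread("sample.jpg")
#   box = torch.tensor([48.0, 60.0, 220.0, 310.0])  # x1, y1, x2, y2
#   frame = draw_bbox_and_label(box, "dog", frame)
#   cv.imwrite("annotated.jpg", frame)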


def letterbox_image(
        image: np.ndarray,
        inp_dim: Tuple[int, int]) -> np.ndarray:
    """
    Resizes an image to the dimension expected by the network while keeping
    the aspect ratio of the original image. Any leftover space on the canvas
    is filled with gray (128).
    """
    img_w, img_h = image.shape[1], image.shape[0]  # original image dimension
    net_w, net_h = inp_dim  # the dimension expected by the network
    # calculate the new dimension with the same aspect ratio as the
    # original image
    scale_factor = min(net_w / img_w, net_h / img_h)
    new_w = int(round(img_w * scale_factor))
    new_h = int(round(img_h * scale_factor))
    resized_image = cv.resize(image, (new_w, new_h), interpolation=cv.INTER_CUBIC)
    # numpy image shapes are (height, width, channels)
    canvas = np.full((net_h, net_w, 3), 128, dtype=image.dtype)
    canvas[(net_h - new_h) // 2 : (net_h - new_h) // 2 + new_h,
           (net_w - new_w) // 2 : (net_w - new_w) // 2 + new_w, :] = resized_image
    return canvas
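
# Illustrative arithmetic for a hypothetical 576x768 frame letterboxed to
# 416x416: scale_factor = min(416/768, 416/576) = 0.5417, so the frame is
# resized to 416x312 and centered on the gray canvas with 52-pixel bands
# above and below:
#
#   boxed = letterbox_image(np.zeros((576, 768, 3), dtype=np.uint8), (416, 416))
#   assert boxed.shape == (416, 416, 3)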


def prepare_image(
        image: np.ndarray,
        inp_dim: Tuple[int, int]) -> torch.Tensor:
    """
    Prepares the input to match the expectation of the network: letterboxes,
    converts BGR to RGB, reorders HWC to CHW, scales pixel values to [0, 1]
    and adds a batch dimension.
    """
    img = letterbox_image(image, inp_dim)
    img = img[:, :, ::-1].transpose((2, 0, 1)).copy()  # BGR -> RGB, HWC -> CHW
    img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
    return img
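
# Illustrative round trip (zeros stand in for a real BGR frame; 416x416 is
# the canonical YOLOv3 input size):
#
#   inp = prepare_image(np.zeros((576, 768, 3), dtype=np.uint8), (416, 416))
#   assert inp.shape == (1, 3, 416, 416)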


def bbox_iou(bbox1: torch.Tensor, bbox2: torch.Tensor, device="cpu") -> torch.Tensor:
    """
    Returns the elementwise IoU of two sets of boxes given in
    (x1, y1, x2, y2) corner format.
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = bbox1[:, 0], bbox1[:, 1], bbox1[:, 2], bbox1[:, 3]
    b2_x1, b2_y1, b2_x2, b2_y2 = bbox2[:, 0], bbox2[:, 1], bbox2[:, 2], bbox2[:, 3]
    # intersection rectangle
    inter_rect_x1 = torch.max(b1_x1, b2_x1)
    inter_rect_y1 = torch.max(b1_y1, b2_y1)
    inter_rect_x2 = torch.min(b1_x2, b2_x2)
    inter_rect_y2 = torch.min(b1_y2, b2_y2)
    # the +1 treats coordinates as inclusive pixel indices; clamping at zero
    # handles boxes that do not overlap at all
    inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * \
                 torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0)
    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
    return inter_area / (b1_area + b2_area - inter_area)
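
# Worked example under the inclusive-pixel convention above: two 10x10 boxes
# offset by 5 pixels overlap in a 5x5 patch, so IoU = 25 / (100 + 100 - 25):
#
#   a = torch.tensor([[0.0, 0.0, 9.0, 9.0]])
#   b = torch.tensor([[5.0, 5.0, 14.0, 14.0]])
#   bbox_iou(a, b)  # tensor([0.1429])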


def transform_prediction(
        pred: torch.Tensor,
        inp_dim: int,
        anchors: Union[List[Tuple[int, int]], torch.Tensor],
        num_classes: int,
        device: str = "cpu"
        ) -> torch.Tensor:
    """
    Transforms the predictions of the convolutional layers from

        batch_size x (num_anchors * (5 + num_classes)) x grid_size x grid_size

    to

        batch_size x (grid_size * grid_size * num_anchors) x (5 + num_classes)

    This aids the concatenation of the predictions at the three detection
    layers and gives an easy representation of the predicted bounding boxes.
    It also transforms the bounding box predictions and the objectness score
    to match the description in the YOLOv3 paper:

        bx = sigmoid(tx) + cx
        by = sigmoid(ty) + cy
        bw = pw * exp(tw)
        bh = ph * exp(th)

    Parameters:
    -----------
    pred: prediction of the convolutional layer
    inp_dim: the dimension of images expected by the yolo neural network
    anchors: a list of (width, height) anchor pairs
    num_classes: the number of unique classes as specified by COCO.

    Returns:
    --------
    the transformed input.
    """
    batch_size = pred.shape[0]
    grid_size = pred.shape[2]
    stride = inp_dim // grid_size
    bbox_attrs = 5 + num_classes
    num_anchors = len(anchors)
    # transform input shape
    pred = pred.view(batch_size, bbox_attrs * num_anchors, grid_size * grid_size)
    pred = pred.transpose(1, 2).contiguous()
    pred = pred.view(batch_size, grid_size * grid_size * num_anchors, bbox_attrs)
    # since the dimensions of the anchors are in accordance with the original
    # dimension of the image, they have to be scaled down to match the
    # dimension of the output of the convolutional layer
    anchors = [(a[0] / stride, a[1] / stride) for a in anchors]
    # sigmoid the center_x, center_y and the objectness score
    pred[:, :, 0] = torch.sigmoid(pred[:, :, 0])
    pred[:, :, 1] = torch.sigmoid(pred[:, :, 1])
    pred[:, :, 4] = torch.sigmoid(pred[:, :, 4])
    # add the center offsets cx, cy of each grid cell
    grid = np.arange(grid_size)
    x_o, y_o = np.meshgrid(grid, grid)
    x_offset = torch.from_numpy(x_o).float().view(-1, 1).to(device)
    y_offset = torch.from_numpy(y_o).float().view(-1, 1).to(device)
    x_y_offset = torch.cat([x_offset, y_offset], dim=1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
    pred[:, :, :2] += x_y_offset
    # transform height and width: bw = pw * exp(tw), bh = ph * exp(th)
    anchors = torch.FloatTensor(anchors).to(device)
    anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)
    pred[:, :, 2:4] = torch.exp(pred[:, :, 2:4]) * anchors
    # apply sigmoid to the class scores
    pred[:, :, 5:5 + num_classes] = torch.sigmoid(pred[:, :, 5:5 + num_classes])
    # resize bounding box predictions to the original image dimension
    pred[:, :, :4] *= stride
    return pred
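
# Illustrative shapes for the 13x13 detection layer of YOLOv3 (80 COCO
# classes, 3 anchors, 416x416 input); the anchor sizes below are the
# standard ones for that scale:
#
#   raw = torch.randn(1, 3 * (5 + 80), 13, 13)   # 1 x 255 x 13 x 13
#   out = transform_prediction(raw, 416, [(116, 90), (156, 198), (373, 326)], 80)
#   assert out.shape == (1, 13 * 13 * 3, 85)     # 1 x 507 x 85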


def get_predictions(
        pred: torch.Tensor,
        num_classes: int,
        objectness_confidence: float = 0.5,
        nms_confidence_level: float = 0.4,
        device: str = "cpu") -> Optional[torch.Tensor]:
    """
    Filters the bounding boxes predicted by the network by first discarding
    bounding boxes that have a low objectness score, and then suppressing
    overlapping bounding boxes with the non-maximum suppression algorithm.

    Parameters:
    -----------
    pred: a tensor (predicted output) of shape
          'batch_size x num_bboxes x bbox_attrs'
    num_classes: the number of unique classes as provided by COCO.
    objectness_confidence: probability threshold for bounding boxes
          containing a valid object.
    nms_confidence_level: IoU threshold for overlapping bounding boxes.

    Returns:
    --------
    a tensor with one row per retained detection, laid out as
    'batch_idx, x1, y1, x2, y2, objectness_score, class_score, class_idx',
    or None if no box survives the filtering.
    """
    nB = pred.shape[0]  # batch size
    bbox_attr = pred.shape[2]  # center_x, center_y, width, height, objectness, class probabilities
    nBBOX = pred.shape[1]  # number of bounding boxes
    # zero out boxes whose objectness falls below the threshold
    conf_mask = (pred[:, :, 4] > objectness_confidence).float().unsqueeze(2)
    pred = pred * conf_mask
    # transform the predicted centers, height and width to top-left and
    # bottom-right corner coordinates to ease the IoU computation
    bbox_corner = pred.new(pred.shape)
    bbox_corner[:, :, 0] = pred[:, :, 0] - (pred[:, :, 2] / 2)  # top_left_x
    bbox_corner[:, :, 1] = pred[:, :, 1] - (pred[:, :, 3] / 2)  # top_left_y
    bbox_corner[:, :, 2] = pred[:, :, 0] + (pred[:, :, 2] / 2)  # bottom_right_x
    bbox_corner[:, :, 3] = pred[:, :, 1] + (pred[:, :, 3] / 2)  # bottom_right_y
    pred[:, :, :4] = bbox_corner[:, :, :4]
    n_pred = pred.view(-1, bbox_attr)
    idxs = torch.arange(nB).reshape(-1, 1).repeat(1, nBBOX).view(-1).to(device)  # image indices
    # actually discard the zeroed-out rows so they cannot survive NMS
    keep = n_pred[:, 4] > 0
    n_pred, idxs = n_pred[keep], idxs[keep]
    if n_pred.shape[0] == 0:
        return None
    max_conf, max_idx = torch.max(n_pred[:, 5:5 + num_classes], 1)  # maximum class score and its index
    max_conf = max_conf.float().unsqueeze(1).to(device)
    max_idx = max_idx.float().unsqueeze(1).to(device)
    # batch_idx, x1, y1, x2, y2, objectness_score, class_score, class_idx
    n_pred = torch.cat([idxs.float().unsqueeze(1), n_pred[:, :5], max_conf, max_idx], 1)
    # group NMS per image and per class so boxes from different images or of
    # different classes never suppress each other
    group_idx = n_pred[:, 0] * num_classes + n_pred[:, 7]
    valid_bbox_indices = batched_nms(n_pred[:, 1:5], n_pred[:, 5], group_idx, nms_confidence_level)
    if len(valid_bbox_indices):
        return n_pred[valid_bbox_indices, :]
    return None
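

if __name__ == "__main__":
    # Minimal smoke test of the post-processing path on random activations;
    # the anchors are the standard YOLOv3 13x13-scale sizes and 80 is the
    # number of COCO classes. Real usage would feed the raw output of a
    # detection layer instead of torch.randn.
    raw = torch.randn(1, 3 * (5 + 80), 13, 13)
    anchors = [(116, 90), (156, 198), (373, 326)]
    detections = get_predictions(transform_prediction(raw, 416, anchors, 80), 80)
    if detections is None:
        print("no box survived the filtering")
    else:
        print(f"{detections.shape[0]} boxes retained")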