import sys, os, os.path, glob
import pickle
from scipy.io import loadmat
import numpy
import h5py
import torch
from torch.autograd import Variable
import subprocess
import socket
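# Resolve the SLURM job running on this node so the node-local scratch
# directory (/local/slurm-<jobid>/local/audio) used below can be located.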
curr_node = socket.gethostname().split('.')[0]
batcmd=f"squeue -u billyli | grep {curr_node}"
curr_slurm = subprocess.check_output(batcmd, shell=True, text=True)
slurm_id = curr_slurm.strip().split()[0]  # first column of squeue output is the job id
# GAS_FEATURE_DIR = '/jet/home/billyli/data_folder/data/googleAudioSet/pylon5/ir3l68p/kaixinm/cmu-thesis/data/audioset'
DCASE_FEATURE_DIR = '/jet/home/billyli/data_folder/data/dcase'
N_CLASSES = 527
N_WORKERS = 6
# local = os.getenv('LOCAL')
# local = '/jet/home/billyli/data_folder/data/googleAudioSet'
local = f"/local/slurm-{slurm_id}/local/audio"
GAS_FEATURE_DIR = os.path.join(local, 'pylon5/ir3l68p/kaixinm/cmu-thesis/data/audioset')
hf_train_path = os.path.join(local,'data_train.h5')
hf_val_eval_path = os.path.join(local, 'data.h5')
with open(os.path.join(GAS_FEATURE_DIR, 'normalizer.pkl'), 'rb') as f:
    mu, sigma = pickle.load(f, encoding='bytes')
def batch_generator(batch_size, random_seed=15213, normalize_scale=1):
    rng = numpy.random.RandomState(random_seed)
    if batch_size != 100:
        # Pick a random end offset so that a batch_size-sized window fits
        # inside the 100-example chunks stored in the HDF5 file.
        rand_int = rng.randint(batch_size, 100)
    else:
        rand_int = 100
    all_epochs = list(range(1, 12, 1))
    all_iter = list(range(1, 2501, 1))
    while True:
        rng.shuffle(all_epochs)
        rng.shuffle(all_iter)
        for i in all_epochs:
            hf_train = h5py.File(hf_train_path, 'r')
            for j in all_iter:
                key = str(i) + '_' + str(j)
                # feat_a = hf_train[key]['audio'][rand_int-batch_size:rand_int]
                feat_a = hf_train[key]['audio'][rand_int-batch_size:rand_int] / normalize_scale  # pure hack to debug AST
                # print('feat_a shape', feat_a.shape)
                # print("feat_a_max:", numpy.max(feat_a))
                # feat_v = hf_train[key]['video'][:]
                feat_v = hf_train[key]['video'][rand_int-batch_size:rand_int]
                label = hf_train[key]['label'][rand_int-batch_size:rand_int]
                feat_a = feat_a.astype('float32')
                feat_v = feat_v.astype('float32')
                label = label.astype('float32')
                # yield tuple(Variable(torch.from_numpy(numpy.stack(x))).cuda() for x in zip(*batch))
                yield (Variable(torch.from_numpy(feat_a)).cuda(),
                       Variable(torch.from_numpy(feat_v)).cuda(),
                       Variable(torch.from_numpy(label)).cuda())
                del feat_a, feat_v, label
            hf_train.close()
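# Minimal usage sketch (assumes data_train.h5 is already staged on the
# node-local scratch path and a CUDA device is available):
#
#     train_gen = batch_generator(batch_size=100)
#     feat_a, feat_v, label = next(train_gen)  # float32 CUDA tensors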
def multi_bulk_load(prefix, normalize_scale=1):
    hf_val_eval = h5py.File(hf_val_eval_path, 'r')
    if prefix == 'GAS_valid':
        feat_a = hf_val_eval['valid']['data']['feat_a'][:] / normalize_scale
        feat_v = hf_val_eval['valid']['data']['feat_v'][:]
        labels = hf_val_eval['valid']['data']['labels'][:]
    elif prefix == 'GAS_eval':
        feat_a = hf_val_eval['eval']['data']['feat_a'][:] / normalize_scale
        feat_v = hf_val_eval['eval']['data']['feat_v'][:]
        labels = hf_val_eval['eval']['data']['labels'][:]
    else:
        # assert('error') was a no-op (a non-empty string is always truthy);
        # raise instead so an unknown prefix fails loudly.
        hf_val_eval.close()
        raise ValueError('unknown prefix: %s' % prefix)
    hf_val_eval.close()
    return feat_a.astype('float32'), feat_v.astype('float32'), labels.astype('float32'), None
def bulk_load(prefix, normalize_scale=1):
    feat = []; labels = []; hashes = []
    for filename in sorted(glob.glob(os.path.join(GAS_FEATURE_DIR, '%s_*.mat' % prefix)) +
                           glob.glob(os.path.join(DCASE_FEATURE_DIR, '%s_*.mat' % prefix))):
        data = loadmat(filename)
        feat.append(((data['feat'] - mu) / (sigma * normalize_scale)).astype('float32'))
        labels.append(data['labels'].astype('bool'))
        hashes.append(data['hashes'])
    return numpy.concatenate(feat), numpy.concatenate(labels), numpy.concatenate(hashes)
def unnorm_bulk_load(prefix):
    """
    Load features without mean/variance normalization. Values are on the order
    of 100 and performance is very poor (mAP around 0.012); do not use this
    function except for testing.
    """
    feat = []; labels = []; hashes = []
    for filename in sorted(glob.glob(os.path.join(GAS_FEATURE_DIR, '%s_*.mat' % prefix)) +
                           glob.glob(os.path.join(DCASE_FEATURE_DIR, '%s_*.mat' % prefix))):
        data = loadmat(filename)
        feat.append((data['feat']).astype('float32'))
        labels.append(data['labels'].astype('bool'))
        hashes.append(data['hashes'])
    return numpy.concatenate(feat), numpy.concatenate(labels), numpy.concatenate(hashes)
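# Minimal smoke test (a sketch, assuming data.h5 has been staged to the
# node-local scratch path computed above): load the validation split and
# print the array shapes.
if __name__ == '__main__':
    feat_a, feat_v, labels, _ = multi_bulk_load('GAS_valid')
    print('feat_a:', feat_a.shape, 'feat_v:', feat_v.shape, 'labels:', labels.shape)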