analogy_generation.py

# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


import torch
import json
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
from sklearn.cluster import KMeans
import argparse
import os
import h5py
import pickle
import torch_kmeans
from numpy.linalg import norm

class AnalogyRegressor(nn.Module):
    def __init__(self, featdim, innerdim=512):
        super(AnalogyRegressor,self).__init__()
        self.featdim = featdim
        self.innerdim = innerdim
        self.fc1 = nn.Linear(featdim*3, innerdim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(innerdim, innerdim)
        self.fc3 = nn.Linear(innerdim, featdim)

    def forward(self, a,c,d):
        x = torch.cat((a,c,d), dim=1)
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        return out


def init_clusters(k, dim):
    C = np.random.randn(k,dim)
    Cnorm = np.sqrt(np.sum(C**2, axis=1, keepdims=True))
    C = C/Cnorm
    return C

def cluster_feats(filehandle, base_classes, cachefile, n_clusters=100):
    if os.path.isfile(cachefile):
        with open(cachefile, 'rb') as f:
            centroids = pickle.load(f)
    else:
        centroids = []
        all_labels = filehandle['all_labels'][...]
        all_feats = filehandle['all_feats']

        count = filehandle['count'][0]
        for j, i in enumerate(base_classes):
            print('Clustering class {:d}:{:d}'.format(j,i))
            idx = np.where(all_labels==i)[0]
            idx = idx[idx<count]
            X = all_feats[idx,:]
            # use a reimplementation of torch kmeans for reproducible results
            # TODO: Figure out why this is important
            centroids_this = torch_kmeans.kmeans(X, n_clusters, 20)
            centroids.append(centroids_this)
        with open(cachefile, 'wb') as f:
            pickle.dump(centroids, f)
    return centroids


def get_difference_vectors(c_i):
    diff_i = c_i[:,np.newaxis,:] - c_i[np.newaxis,:,:]
    diff_i = diff_i.reshape((-1, diff_i.shape[2]))
    diff_i_norm = np.sqrt(np.sum(diff_i**2,axis=1, keepdims=True))
    diff_i = diff_i / (diff_i_norm + 0.00001)
    return diff_i

def mine_analogies(centroids):
    n_clusters = centroids[0].shape[0]

    analogies = np.zeros((n_clusters*n_clusters*len(centroids),4), dtype=int)
    analogy_scores = np.zeros(analogies.shape[0])
    start=0

    I, J = np.unravel_index(np.arange(n_clusters**2), (n_clusters, n_clusters))
    # for every class
    for i, c_i in enumerate(centroids):

        # get normalized difference vectors between cluster centers
        diff_i = get_difference_vectors(c_i)
        diff_i_t = torch.Tensor(diff_i).cuda()

        bestdots = np.zeros(diff_i.shape[0])
        bestdotidx = np.zeros((diff_i.shape[0],2),dtype=int)

        # for every other class
        for j, c_j in enumerate(centroids):
            if i==j:
                continue
            # get normalized difference vectors
            diff_j = get_difference_vectors(c_j)
            diff_j = torch.Tensor(diff_j).cuda()
            diff_j_sq = 0.5*torch.sum(diff_j**2, 1)
            dots = diff_i_t.mm(diff_j.transpose(0,1))
            dots = dots - diff_j_sq
            maxdots, argmaxdots = dots.max(1)
            maxdots = maxdots.cpu().numpy().reshape(-1)
            argmaxdots = argmaxdots.cpu().numpy().reshape(-1)
            #compute cosine distance and take the maximum
            # if maximum is better than best seen so far, update
            betteridx = maxdots>bestdots
            bestdots[betteridx] = maxdots[betteridx]
            bestdotidx[betteridx,0] = j*n_clusters + I[argmaxdots[betteridx]]
            bestdotidx[betteridx,1] = j*n_clusters + J[argmaxdots[betteridx]]
        # store discovered analogies
        stop = start+diff_i.shape[0]
        analogies[start : stop,0]=i*n_clusters + I
        analogies[start : stop,1]=i*n_clusters + J
        analogies[start : stop,2:] = bestdotidx
        analogy_scores[start : stop] = bestdots
        start = stop

    #prune away trivial analogies
    good_analogies = (analogy_scores>0) & (analogies[:,0]!=analogies[:,1]) & (analogies[:,2]!=analogies[:,3])
    return analogies[good_analogies,:], analogy_scores[good_analogies]


def train_analogy_regressor(analogies, centroids, base_classes, trained_classifier, lr=0.1, wt=10, niter=120000, step_after=40000, batchsize=128, momentum=0.9, wd=0.0001):
    # pre-permute analogies
    permuted_analogies = analogies[np.random.permutation(analogies.shape[0])]

    # create model and init
    featdim = centroids[0].shape[1]
    model = AnalogyRegressor(featdim)
    model = model.cuda()
    trained_classifier = trained_classifier.cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=wd, dampening=momentum)
    loss_1 = nn.CrossEntropyLoss().cuda()
    loss_2 = nn.MSELoss().cuda()


    num_clusters_per_class = centroids[0].shape[0]
    centroid_labels = (np.array(base_classes).reshape((-1,1)) * np.ones((1, num_clusters_per_class))).reshape(-1)
    concatenated_centroids = np.concatenate(centroids, axis=0)


    start=0
    avg_loss_1 = avg_loss_2 = count = 0.0
    for i in range(niter):
        # get current batch of analogies
        stop = min(start+batchsize, permuted_analogies.shape[0])
        to_train = permuted_analogies[start:stop,:]
        optimizer.zero_grad()

        # analogy is A:B :: C:D, goal is to predict B from A, C, D
        # Y is the class label of B (and A)
        A = concatenated_centroids[to_train[:,0]]
        B = concatenated_centroids[to_train[:,1]]
        C = concatenated_centroids[to_train[:,2]]
        D = concatenated_centroids[to_train[:,3]]
        Y = centroid_labels[to_train[:,1]]

        A = Variable(torch.Tensor(A)).cuda()
        B = Variable(torch.Tensor(B)).cuda()
        C = Variable(torch.Tensor(C)).cuda()
        D = Variable(torch.Tensor(D)).cuda()
        Y = Variable(torch.LongTensor(Y.astype(int))).cuda()

        Bhat = model(A,C,D)

        lossval_2 = loss_2(Bhat, B) # simple mean squared error loss

        # classification loss
        predicted_classprobs = trained_classifier(Bhat)
        lossval_1 = loss_1(predicted_classprobs, Y)
        loss = lossval_1 + wt * lossval_2

        loss.backward()
        optimizer.step()

        avg_loss_1 = avg_loss_1 + lossval_1.data[0]
        avg_loss_2 = avg_loss_2 + lossval_2.data[0]
        count = count+1.0


        if i % 100 == 0:
            print('{:d} : {:f}, {:f}, {:f}'.format(i, avg_loss_1/count, avg_loss_2/count, count))
            avg_loss_1 = avg_loss_2 = count = 0.0

        if (i+1) % step_after == 0:
            lr = lr / 10.0
            print(lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        start = stop
        if start==permuted_analogies.shape[0]:
            start=0

    return dict(model_state=model.state_dict(), concatenated_centroids=torch.Tensor(concatenated_centroids),
            num_base_classes=len(centroids), num_clusters_per_class=num_clusters_per_class)

def train_classifier(filehandle, base_classes, cachefile, networkfile, total_num_classes = 1000, lr=0.1, wd=0.0001, momentum=0.9, batchsize=1000, niter=10000):
    # either use pre-existing classifier or train one
    all_labels = filehandle['all_labels'][...]
    all_labels = all_labels.astype(int)
    all_feats = filehandle['all_feats']
    base_class_ids = np.where(np.in1d(all_labels, base_classes))[0]
    loss = nn.CrossEntropyLoss().cuda()
    model = nn.Linear(all_feats[0].size, total_num_classes).cuda()
    if os.path.isfile(cachefile):
        tmp = torch.load(cachefile)
        model.load_state_dict(tmp)
    elif os.path.isfile(networkfile):
        tmp = torch.load(networkfile)
        if 'module.classifier.bias' in tmp['state']:
            state_dict = {'weight':tmp['state']['module.classifier.weight'], 'bias':tmp['state']['module.classifier.bias']}
        else:
            model = nn.Linear(all_feats[0].size, total_num_classes, bias=False).cuda()
            state_dict = {'weight':tmp['state']['module.classifier.weight']}
        model.load_state_dict(state_dict)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=wd, dampening=0)
        for i in range(niter):
            optimizer.zero_grad()
            idx = np.sort(np.random.choice(base_class_ids, batchsize, replace=False))
            F = all_feats[idx,:]
            F = Variable(torch.Tensor(F)).cuda()
            L = Variable(torch.LongTensor(all_labels[idx])).cuda()
            S = model(F)
            loss_val = loss(S, L)
            loss_val.backward()
            optimizer.step()
            if i % 100 == 0:
                print('Classifier training {:d}: {:f}'.format(i, loss_val.data[0]))
        torch.save(model.state_dict(), cachefile)

    return model


def train_analogy_regressor_main(trainfile, base_classes, cachedir, networkfile, initlr=0.1):

    with h5py.File(trainfile, 'r') as f:
        classification_model = train_classifier(f, base_classes, os.path.join(cachedir, 'classifier.pkl'), networkfile)
        centroids = cluster_feats(f, base_classes, os.path.join(cachedir, 'cluster.pkl'))
    if not os.path.isfile(os.path.join(cachedir, 'analogies.npy')):
        analogies, analogy_scores = mine_analogies(centroids)
        np.save(os.path.join(cachedir, 'analogies.npy'), analogies.astype(int))
    else:
        analogies = np.load(os.path.join(cachedir, 'analogies.npy'))
    generator = train_analogy_regressor(analogies, centroids, base_classes, classification_model, lr=initlr)
    return generator


def do_generate(feats, labels, generator, max_per_label):
    # generate till there are at least max_per_label examples for each label
    unique_labels = np.unique(labels)
    generations_needed = []
    generator['concatenated_centroids'] = generator['concatenated_centroids'].numpy()
    for k, lab in enumerate(unique_labels):
        # for each label
        idx = np.where(labels==lab)[0]
        # generate this many examples:
        num_to_gen = max(max_per_label - idx.size,0)
        if num_to_gen>0:
            # choose a random seed
            seed = np.random.choice(idx, num_to_gen)
            # and a random base class
            base_class = np.random.choice(generator['num_base_classes'], num_to_gen)
            # and two random centroids from this base class
            c_c = np.random.choice(generator['num_clusters_per_class'], num_to_gen)
            c_d = np.random.choice(generator['num_clusters_per_class'], num_to_gen)

            centroid_ids_c = base_class*generator['num_clusters_per_class'] + c_c
            centroid_ids_d = base_class*generator['num_clusters_per_class'] + c_d
            # add to list of things to generate
            generations_needed.append( np.concatenate((seed.reshape((-1,1)), centroid_ids_c.reshape((-1,1)), centroid_ids_d.reshape((-1,1))),axis=1))

    if len(generations_needed)>0:
        generations_needed = np.concatenate(generations_needed, axis=0)
        gen_feats = np.zeros((generations_needed.shape[0],feats.shape[1]))
        gen_labels = np.zeros(generations_needed.shape[0])


        # batch up the generations
        batchsize=1000
        for start in range(0, generations_needed.shape[0], batchsize):
            stop = min(start + batchsize, generations_needed.shape[0])
            g_idx = generations_needed[start:stop,:]
            A = Variable(torch.Tensor(feats[g_idx[:,0],:])).cuda()
            C = Variable(torch.Tensor(generator['concatenated_centroids'][g_idx[:,1],:])).cuda()
            D = Variable(torch.Tensor(generator['concatenated_centroids'][g_idx[:,2],:])).cuda()
            F = generator['model'](A,C,D).cpu().data.numpy().copy()
            gen_feats[start:stop,:] = F
            print(np.linalg.norm(F-feats[g_idx[:,0],:]), np.linalg.norm(F), np.linalg.norm(feats[g_idx[:,0],:]))
            gen_labels[start:stop] = labels[g_idx[:,0]]

        return np.concatenate((feats, gen_feats), axis=0), np.concatenate((labels, gen_labels), axis=0)
    else:
        return feats, labels
# will consist of all base classes
'''
def find_dist(feat_A, feat_B, num_gen):
    # squeeze the single dimension out of feature b
    feat_B = np.squeeze(feat_B)
    # reshape feat A to be 1d vector
    feat_A = feat_A.reshape((1, -1))
    # subtract feature a from feature b for all the base classes
    feat_A_copy = np.squeeze(np.array([feat_A]*feat_B.shape[0]))
    # compute the norm of both the vectors
    feat_A_norm = np.sqrt(np.sum(feat_A_copy**2, axis=1))
    feat_B_norm = np.sqrt(np.sum(feat_B**2, axis=1))
    # dot product 
    feat_C = np.sum(feat_A_copy*feat_B, axis=1)
    # divide it by the norm
    feat_C = feat_C/(feat_A_norm*feat_B_norm)
    # compute the difference by 1
    feat_C = 1 - feat_C
    # see the top num_gen labels
    return np.argpartition(feat_C, num_gen)[:num_gen]
'''
def find_dist(feat_A, feat_B, num_gen):
    # squeeze the single dimension out of feature b
    feat_B = np.squeeze(feat_B)
    # reshape feat A to be 1d vector
    feat_A = feat_A.reshape((1, -1))
    # subtract feature a from feature b for all the base classes
    diff_vector = np.exp(np.sum(-np.power(feat_B - feat_A, 2), axis=1)) 
    # normalize this vector 
    diff_i_norm = np.sum(diff_vector, axis=0, keepdims=True)
    diff_vector = diff_vector/(diff_i_norm + 0.00001)
    # do argmax and find the top k 
    return np.argpartition(diff_vector, -num_gen)[-num_gen:]


def trial_generate(feats, labels, generator, centroid_file, max_per_label):
    unique_labels = np.unique(labels)
    generations_needed = []
    generator['concatenated_centroids'] = generator['concatenated_centroids'].numpy()
    # load the centroids
    with open(centroid_file, 'rb') as f:
        centroids = np.array(pickle.load(f))

    for k, lab in enumerate(unique_labels):
        # for each label
        idx = np.where(labels==lab)[0]
        # generate this many examples:
        num_to_gen = max(max_per_label - idx.size,0)
        if num_to_gen>0:
            # find the novel classes belonging to the particular label of the novel class
            novel_class_feats = feats[idx, :]
            # create a centroid for this novel class
            novel_class_centroid = novel_class_feats.mean(axis=0)

            # choose a seed as well
            seed = np.random.choice(idx, num_to_gen)
            # find the nearest base classes according to the distance criteria
            # for a particular novel class centroid 
            nearest_base = find_dist(novel_class_centroid, centroids, num_to_gen)
            c_c = np.random.choice(generator['num_clusters_per_class'], num_to_gen)
            c_d = np.random.choice(generator['num_clusters_per_class'], num_to_gen)
            centroid_ids_c = nearest_base*generator['num_clusters_per_class'] + c_c
            centroid_ids_d = nearest_base*generator['num_clusters_per_class'] + c_d
            # add to list of things to generate
            generations_needed.append( np.concatenate((seed.reshape((-1,1)), centroid_ids_c.reshape((-1,1)), centroid_ids_d.reshape((-1,1))),axis=1))


    if len(generations_needed)>0:
        generations_needed = np.concatenate(generations_needed, axis=0)
        gen_feats = np.zeros((generations_needed.shape[0],feats.shape[1]))
        gen_labels = np.zeros(generations_needed.shape[0])


        # batch up the generations
        batchsize=1000
        for start in range(0, generations_needed.shape[0], batchsize):
            stop = min(start + batchsize, generations_needed.shape[0])
            g_idx = generations_needed[start:stop,:]
            A = Variable(torch.Tensor(feats[g_idx[:,0],:])).cuda()
            C = Variable(torch.Tensor(generator['concatenated_centroids'][g_idx[:,1],:])).cuda()
            D = Variable(torch.Tensor(generator['concatenated_centroids'][g_idx[:,2],:])).cuda()
            F = generator['model'](A,C,D).cpu().data.numpy().copy()
            gen_feats[start:stop,:] = F
            print(np.linalg.norm(F-feats[g_idx[:,0],:]), np.linalg.norm(F), np.linalg.norm(feats[g_idx[:,0],:]))
            gen_labels[start:stop] = labels[g_idx[:,0]]

        return np.concatenate((feats, gen_feats), axis=0), np.concatenate((labels, gen_labels), axis=0)
    else:
        return feats, labels


def init_generator(generator_file):
    G = torch.load(generator_file)
    featdim = G['concatenated_centroids'].size(1)
    model = AnalogyRegressor(featdim)
    model.load_state_dict(G['model_state'])
    model = model.cuda()
    G['model'] =model
    return G