/
ml_utils.py
130 lines (99 loc) · 4.45 KB
/
ml_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# from transformers import BertTokenizer, BertModel
# import torch
# # Load pre-trained model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# # Encode text
# text = "Embeddings are useful for NLP tasks."
# encoded_input = tokenizer(text, return_tensors='pt')
# # Get embeddings
# with torch.no_grad():
# output = model(**encoded_input)
# # The last hidden state is the sequence of hidden states of the last layer of the model.
# embeddings = output.last_hidden_state
# # If you want the embedding for [CLS] token (often used for classification tasks)
# cls_embedding = embeddings[0][0]
# print(cls_embedding)
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import copy
# Image feature extractor: a pretrained ResNet50 with its last child module
# dropped, wrapped in a Sequential and switched to inference mode.
# Module.eval() returns the module itself, so it is chained onto the constructor.
resnetmodel = torch.nn.Sequential(
    *list(models.resnet50(pretrained=True).children())[:-1]
).eval()

# Preprocessing applied to every image before it is fed to resnetmodel:
# resize to 224x224, convert to a tensor, then normalize each channel.
resnettransformer = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)
def get_image_embedding(pil_img):
    """Return the ResNet embedding of a PIL image as a NumPy array.

    The image is converted to RGB when it is in any other mode, preprocessed
    with the module-level ``resnettransformer``, given a batch dimension and
    passed through the module-level ``resnetmodel`` with gradient tracking
    disabled.

    Args:
        pil_img: A PIL image. The caller's object is never modified:
            ``convert`` and the preprocessing transforms return new objects.

    Returns:
        The model output with every size-1 dimension squeezed out, converted
        to a NumPy array.
    """
    # The normalization constants are defined for three channels, so grayscale,
    # palette and RGBA images are converted first.
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    # No defensive copy of the image is made: nothing below mutates it, and
    # duplicating the pixel data on every call would only cost time and memory.
    batch = resnettransformer(pil_img).unsqueeze(0)  # add the batch dimension

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        embedding = resnetmodel(batch)

    return embedding.squeeze().numpy()
# def get_image_embedding(pil_img):
# # Open and preprocess the image
# imimim = copy.deepcopy(pil_img)
# imimim = resnettransformer(imimim).unsqueeze(0) # Transform the image and add batch dimension
# # Extract the embedding
# with torch.no_grad():
# embedding = resnetmodel(imimim)
# return embedding.squeeze().numpy() # Convert the tensor to numpy array
# Pre-trained BERT tokenizer and encoder, loaded once when the module is
# imported. Both come from the same checkpoint, so its name is spelled once.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)
# def get_embedding(text):
# encoded_input = tokenizer(text, return_tensors='pt')
# with torch.no_grad():
# output = model(**encoded_input)
# # Get the [CLS] token embedding and squeeze to make it 1D
# return output.last_hidden_state[:, 0, :].squeeze()
# def get_embedding(text):
# # Tokenize the text and truncate to fit within BERT's limit
# tokens = tokenizer.tokenize(text)
# tokens = tokens[:512 - 2] # Account for [CLS] and [SEP] tokens
# encoded_input = tokenizer.encode_plus(tokens, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
# with torch.no_grad():
# output = model(**encoded_input['input_ids'])
# return output.last_hidden_state[:, 0, :].squeeze()
def reduce_dimensions(embeddings, method='PCA', n_components=2):
    """Project ``embeddings`` down to ``n_components`` dimensions.

    Args:
        embeddings: The data handed unchanged to the reducer's
            ``fit_transform``.
        method: ``'PCA'`` or ``'TSNE'`` (exact, case-sensitive match).
        n_components: Number of output dimensions passed to the reducer.

    Returns:
        Whatever the selected reducer's ``fit_transform`` returns.

    Raises:
        ValueError: If ``method`` is neither ``'PCA'`` nor ``'TSNE'``.
    """
    # Reject unknown methods up front so the happy path stays flat.
    if method not in ('PCA', 'TSNE'):
        raise ValueError("Method can be either 'PCA' or 'TSNE'")

    reducer_cls = PCA if method == 'PCA' else TSNE
    return reducer_cls(n_components=n_components).fit_transform(embeddings)
def get_embedding(text):
    """Return the BERT embedding of ``text`` taken at the first token position.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and encoded with the module-level ``model`` with gradient
    tracking disabled.

    Args:
        text: The text to embed.

    Returns:
        torch.Tensor: ``last_hidden_state[:, 0, :]`` with size-1 dimensions
        squeezed out, i.e. a 1-D tensor for a single input sequence.
    """
    # Truncate long inputs, but do not pad: only one sequence is encoded per
    # call, so padding it to 512 tokens would make every forward pass run at
    # the maximum length while the padded positions are masked out anyway.
    encoded_input = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        # Pass the whole encoding (not just input_ids) so the attention mask
        # and token type ids reach the model as well.
        output = model(**encoded_input)

    return output.last_hidden_state[:, 0, :].squeeze()
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two 1-D tensors as a Python float.

    Args:
        embedding1: 1-D tensor (``torch.dot`` rejects any other rank).
        embedding2: 1-D tensor with the same length and dtype as
            ``embedding1``.

    Returns:
        float: ``dot(e1, e2) / (||e1|| * ||e2||)``. When the product of the
        norms is zero the similarity is mathematically undefined; ``0.0`` is
        returned instead of letting the division produce NaN.
    """
    dot_product = torch.dot(embedding1, embedding2)

    # Product of the L2 (Euclidean) norms.
    norm_product = torch.norm(embedding1) * torch.norm(embedding2)

    # Guard the 0/0 case so callers never receive NaN for an all-zero vector.
    if norm_product.item() == 0:
        return 0.0

    return (dot_product / norm_product).item()
def main():
    """Embed two sample sentences and print their cosine similarity."""
    sentences = ("The sky is blue.", "The ocean is vast.")

    # Embeddings are computed in order, one per sentence.
    first_vec, second_vec = (get_embedding(sentence) for sentence in sentences)

    print(f"Cosine Similarity: {cosine_similarity(first_vec, second_vec)}")