import torch
# Check if the Metal (MPS) backend is available.
if torch.backends.mps.is_available():
mps_device = torch.device("mps")
x = torch.ones(1, device=mps_device)
print(x)
else:
print("MPS device not found.")
# Example datasets of tensors.
print("Example datasets of tensors.")
print("Random integers from -100, to 100, with a shape of 6.")
randint = torch.randint(-100, 100, (6,))
print(randint)
print(" ")
print("------------")
print(" ")
print("Tensor")
tensor = torch.tensor([[0.1, 0.2], [2.2, 3.1], [4.9, 5.2]])
print(tensor)
print(" ")
print("------------")
print(" ")
print("Tensor with zeroes with a shape of 2x3.")
zeros = torch.zeros(2, 3)
print(zeros)
print(" ")
print("------------")
print(" ")
print("Tensor with ones with a shape of 3x4.")
ones = torch.ones(3, 4)
print(ones)
print(" ")
print("------------")
print(" ")
print("Tensor with zero values with a shape of 2x3.")
input = torch.empty(2, 3)
print(input)
print(" ")
print("------------")
print(" ")
print("Tensor with 5 integers from 0 to 4.")
arange = torch.arange(5)
print(arange)
print(" ")
print("------------")
print(" ")
# 5 evenly spaced values from 3 to 10.
print("5 evenly spaced values from 3 to 10.")
linspace = torch.linspace(3, 10, steps=5)
print(linspace)
# 5 logarithmically spaced values from 10^-10 to 10^10.
logspace = torch.logspace(-10, 10, steps=5)
print(logspace)
print(" ")
print("------------")
print(" ")
# 5x5 matrix with a diagonal of ones.
print("5x5 matrix with a diagonal of ones.")
eye = torch.eye(5)
print(eye)
# empty_like creates an uninitialized tensor with the same shape and dtype as a.
a = torch.empty((2, 3), dtype=torch.int64)
empty = torch.empty_like(a)
print(empty)
print(" ")
print("------------")
print(" ")
# Probability distributions.
print("Probability distributions.")
prob = torch.tensor([0.1, 0.9])
# Each index is drawn with the probability stored at that index:
# index 0 with 10% probability, index 1 with 90% probability.
# Draw 10 samples from the multinomial distribution.
samples = torch.multinomial(prob, num_samples=10, replacement=True)
print(samples)
print(" ")
print("------------")
print(" ")
# Concat tensors.
print("Concat tensors.")
# This is used for the output of the predictions.
print("This is used for the output of the predictions.")
a = torch.tensor([1, 2, 3, 4])
out = torch.cat((a, torch.tensor([5])), dim=0)
print(out)
print(" ")
print("------------")
print(" ")
# Tril = triangle lower
print("Triangle lower")
# As each row is processed, there is more history to look back on.
print("As each row is processed, there is more history to look back on.")
# You have to use the history you have seen so far to predict the next word.
print("You have to use the history you have seen so far to predict the next word.")
out = torch.tril(torch.ones(5, 5))
print(out)
print(" ")
print("------------")
print(" ")
# Triu = triangle upper
print("Triangle upper")
out = torch.triu(torch.ones(5, 5))
print(out)
print(" ")
print("------------")
print(" ")
# Masked fill
print("Masked fill")
# Exponentiating this masked matrix recovers tril: e^0 = 1 and e^-inf = 0.
print("Exponentiating this masked matrix recovers tril: e^0 = 1 and e^-inf = 0.")
out = torch.zeros(5, 5).masked_fill(torch.tril(torch.ones(5, 5)) == 0, float('-inf'))
print(out)
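# For illustration (not part of the original notes): exponentiating the masked
# matrix should give back the lower-triangular matrix of ones and zeros.
print(torch.exp(out))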
print(" ")
print("------------")
print(" ")
# Transposing
print("Transposing")
# Swap the dimensions of a tensor.
print("Swap the dimensions of a tensor.")
# 2x3x4 tensor.
print("2x3x4 tensor.")
input = torch.zeros(2, 3, 4)
print(input)
# 4x3x2 tensor.
print("Transposed version of the tensor (4x3x2).")
out = input.transpose(0, 2)
print(out)
print(" ")
print("------------")
print(" ")
# Stacks tensors along a new dimension.
print("Stack tensors along a new dimension.")
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])
tensor3 = torch.tensor([7, 8, 9])
# Use this to stack blocks to make a batch.
stacked_tensor = torch.stack([tensor1, tensor2, tensor3])
print(stacked_tensor)
print(" ")
print("------------")
print(" ")
# nn.Linear module
print("nn.Linear module")
import torch.nn as nn
sample = torch.tensor([10., 10., 10.])
print(sample)
# Linear transformation.
linear = nn.Linear(3, 3, bias=False)
print(linear(sample))
# Weights and biases registered under an nn.Module are learnable parameters:
# training updates them based on how close they bring the output to the
# desired target.
# Docs > torch.nn
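# For illustration (not part of the original notes): the weight created by
# nn.Linear is a learnable Parameter that tracks gradients.
print(linear.weight.shape, linear.weight.requires_grad)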
print(" ")
print("------------")
print(" ")
# Softmax function
# Convert a tensor of numbers into a tensor of probabilities.
print("Softmax function")
# [1.0, 2.0, 3.0] -> [x, y, z]
# Exponentiate each of those numbers.
# (1).exp() = e^1 = 2.718281828459045
# (2).exp() = e^2 = 7.38905609893065
# (3).exp() = e^3 = 20.085536923187668
# Add them up.
# 2.718281828459045 + 7.38905609893065 + 20.085536923187668 = 30.19287485057736
# Divide each number by the sum.
# 2.718281828459045 / 30.19287485057736 = 0.09003057317038046
# 7.38905609893065 / 30.19287485057736 = 0.24472847105479764
# 20.085536923187668 / 30.19287485057736 = 0.6652409557748219
# The sum of the probabilities should be 1.
# 0.09003057317038046 + 0.24472847105479764 + 0.6652409557748219 = 1.0
# The softmax function is used to convert a tensor of numbers into a tensor of
# probabilities.
import torch.nn.functional as F
# Create a tensor.
tensor1 = torch.tensor([1.0, 2.0, 3.0])
# Apply softmax function.
# dim=0: apply softmax along dimension 0, so the probabilities along that
# dimension sum to 1.
softmax_output = F.softmax(tensor1, dim=0)
print(softmax_output)
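# For illustration (not part of the original notes): the same result computed
# by hand, following the exponentiate-then-normalize steps described above.
manual_softmax = torch.exp(tensor1) / torch.exp(tensor1).sum()
print(manual_softmax)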
print(" ")
print("------------")
print(" ")
# Embedding vectors
# Stores some information about a word or *character*, like from a vocabulary.
print("Embedding vectors")
# A vector or numerical representation of a character.
# Initialize the embedding layer.
vocab_size = 1000
embedding_dim = 100
embedding = nn.Embedding(vocab_size, embedding_dim)
# Create some input indices.
input_indices = torch.LongTensor([1, 5, 3, 2])
# Apply the embedding layer.
embedding_output = embedding(input_indices)
# The output will be a tensor of shape (4, 100), where 4 is the number of
# inputs and 100 is the dimensionality of the embedding vectors.
print(embedding_output.shape)
print(" ")
print("------------")
print(" ")
# Matrix multiplication.
# Built up from the dot product of two vectors.
# Given two vectors:
# [1, 2, 3]
# [4, 5, 6]
# Multiply the corresponding elements and add them up to get the dot product.
# 1 * 4 + 2 * 5 + 3 * 6 = 32
# The two matrices need to have compatible shapes.
# A 3x2 and a 2x3 can be multiplied.
# A 3x4 and a 5x1 cannot be multiplied.
# The number of columns in the first matrix must match the number of rows in
# the second matrix.
# [
# 1, 2
# 3, 4
# 5, 6
# ]
# [
# 7, 8, 9
# 10, 11, 12
# ]
# (1x7)+(2x10) = 27
# (1x8)+(2x11) = 30
# (1x9)+(2x12) = 33
# (3x7)+(4x10) = 61
# (3x8)+(4x11) = 68
# (3x9)+(4x12) = 75
# (5x7)+(6x10) = 95
# (5x8)+(6x11) = 106
# (5x9)+(6x12) = 117
print("Matrix multiplication")
a = torch.tensor([[1, 2], [3, 4], [5, 6]])
b = torch.tensor([[7, 8, 9], [10, 11, 12]])
# The @ symbol is the matrix multiplication operator in PyTorch.
# You can also do print(torch.matmul(a, b))
print(a @ b)
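# For illustration (not part of the original notes): the dot product example
# above, [1, 2, 3] . [4, 5, 6] = 32, computed with torch.dot.
v1 = torch.tensor([1, 2, 3])
v2 = torch.tensor([4, 5, 6])
print(torch.dot(v1, v2))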
print(" ")
print("------------")
print(" ")
# In PyTorch, matrix multiplication will not mix integer and float tensors;
# the dtypes have to match.
print("In PyTorch, matmul will not mix integer and float tensors.")
int_64 = torch.randint(0, 10, (3, 2))
float_32 = torch.rand(2, 3)
# result = torch.matmul(int_64, float_32)
# Above will fail.
# You can cast the tensor to a matching dtype.
int_64 = torch.randint(0, 10, (3, 2)).float()
result = torch.matmul(int_64, float_32)
print(result)
print(" ")
print("------------")
print(" ")
# Gradient descent.
# The gradient is the slope of the loss function.
# The loss function measures the difference between the predicted value
# and the actual value, for example the mean squared error.
# You pass the parameters of your nn.Module to the optimizer class.
# Example: if your vocabulary has 80 characters and the model has just been
# initialized with random weights and no training,
# there is a 1 in 80 chance the next token is predicted correctly
# (not even a 2% chance).
# One way to measure the loss is the negative log likelihood: -ln(1/80).
# You want to minimize loss and increase prediction accuracy.
# Take the derivative of the loss at the current point and move the
# parameters in the direction that decreases the loss.
# Picture a curved slope (the loss function) with a tangent line touching it
# at one point; the slope of that tangent is the derivative at that point.
# Gradient descent is an optimizer.
# The point should be moved to where the derivative is heading down
# the hill.
# torch.optim is a package implementing various optimization algorithms.
# Learning rate
# Say you decide you need to take a big step in the direction of
# the derivative (Gradient descent).
# You might overshoot the minimum (bottom of the slope).
# You need to take smaller steps so the parameters don't change too
# much.
# AdamW optimizer
# AdamW is a variant of Adam (gradient descent with momentum and adaptive
# learning rates) that applies decoupled weight decay.
# The weights are shrunk slightly at each step so that certain weights
# don't become too large.
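# A minimal sketch (not from the original notes) of one AdamW step on a toy
# model; the model, data, and learning rate below are made up for illustration.
import math
# -ln(1/80): expected starting loss for an untrained model over an 80-token vocab.
print(-math.log(1 / 80))
toy_model = nn.Linear(3, 80)
optimizer = torch.optim.AdamW(toy_model.parameters(), lr=1e-3)
toy_input = torch.rand(4, 3)
toy_target = torch.randint(0, 80, (4,))
logits = toy_model(toy_input)
loss = F.cross_entropy(logits, toy_target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(loss.item())
print(" ")
print("------------")
print(" ")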
# logits.view
print ("logits.view")
a = torch.rand(2, 3, 5)
print(a.shape)
# Unpack the shape.
x, y, z = a.shape
# Reshape the tensor with view (here back to the same shape).
a = a.view(x, y, z)
print(x, y, z)
print(a.shape)
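# For illustration (not part of the original notes): a common reason to call
# logits.view during training is to flatten a (batch, time, vocab) tensor into
# (batch * time, vocab) before passing it to a loss function.
flattened = a.view(x * y, z)
print(flattened.shape)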
print(" ")
print("------------")
print(" ")
# Optimizers and loss functions:
# Mean Squared Error (MSE): A common loss function used in regression
# problems, where the goal is to predict a continuous output. It measures
# the average squared difference between the predicted and actual values,
# and is often used to train neural networks for regression tasks.
# Gradient Descent (GD): An optimization algorithm used to minimize
# the loss function of a machine learning model. The loss function
# measures how well the model is able to predict the target variable
# based on the input features. The idea of GD is to iteratively
# adjust the model parameters in the direction of the steepest descent
# of the loss function.
# Momentum: An extension of SGD that adds a "momentum" term to the
# parameter updates. This term helps smooth out the updates and allows
# the optimizer to continue moving in the right direction, even if the
# gradient changes direction or varies in magnitude. Useful for training
# deep neural networks.
# RMSprop: Uses a moving average of squared gradients to adapt the learning
# rate of each parameter. Helps avoid oscillations in the parameter updates
# and can improve the rate of convergence.
# Adam: Uses moving average of both the gradient and its squared value to
# adapt the learning rate of each parameter. Often used as default for
# deep learning models.
# Combines the advantages of RMSprop and momentum.
# AdamW: Modification of Adam that adds weight decay to the parameter
# updates. Helps to regularize the model and can improve generalization
# performance.
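# For illustration (not part of the original notes): how these map to
# torch.optim and torch.nn for some toy model `m` (an assumed example).
m = nn.Linear(4, 1)
mse_loss = nn.MSELoss()
sgd = torch.optim.SGD(m.parameters(), lr=0.01)
sgd_momentum = torch.optim.SGD(m.parameters(), lr=0.01, momentum=0.9)
rmsprop = torch.optim.RMSprop(m.parameters(), lr=0.01)
adam = torch.optim.Adam(m.parameters(), lr=0.01)
adamw = torch.optim.AdamW(m.parameters(), lr=0.01, weight_decay=0.01)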
# Activation functions:
# Activation functions in PyTorch, and in neural networks in general,
# play a crucial role in helping the network make complex decisions.
# Without activation functions, our neural network would just be doing
# a bunch of math without really being able to make decisions or
# understand complexities.
# They help the network learn from the data by adding non-linear
# properties to it. This means the network can understand more
# complicated things than if it were just doing straight-line
# calculations.
# ReLU: If a number is negative, it is replaced with 0. If it is
# positive, it is left unchanged.
# Adds non-linearity: without it, stacking 100 linear layers would still
# behave like a single linear model.
# This is like a filter that passes on positive values and blocks
# negative values.
# Sigmoid:
# This one squishes the values between 0 and 1, sort of like deciding
# how "sure" the network is about something.
# If the sigmoid outputs a value close to 0, it means the network is
# not very sure about the prediction.
# If the sigmoid outputs a value close to 1, it means the network is
# very sure about the prediction.
# Sigmoid is good for problems where you want to decide between two
# classes, like "yes" or "no".
# It is also good for problems where you want to predict the probability
# of something happening, like the probability of a cat being in a
# picture.
# Similar to softmax, but softmax prioritizes the largest values and
# squishes the smallest values. Softmax is like sigmoid on steroids.
# Tanh:
# Similar to sigmoid, but it squishes values between -1 and 1.
# This activation function is often used in the hidden layers of
# neural networks.
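# For illustration (not part of the original notes): the three activations
# applied to the same sample values.
act_input = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
print(F.relu(act_input))         # negatives become 0, positives unchanged
print(torch.sigmoid(act_input))  # squashed into (0, 1)
print(torch.tanh(act_input))     # squashed into (-1, 1)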