Initial commits for files
Browse files- A-main-notebook.ipynb +480 -0
- README.md +59 -0
- StarterCode.ipynb +446 -0
- VisualizationTools.ipynb +0 -0
- names.txt +0 -0
A-main-notebook.ipynb
ADDED
@@ -0,0 +1,480 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import torch\n",
|
10 |
+
"import torch.nn.functional as F\n",
|
11 |
+
"import matplotlib.pyplot as plt # for making figures\n",
|
12 |
+
"%matplotlib inline"
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"cell_type": "code",
|
17 |
+
"execution_count": 2,
|
18 |
+
"metadata": {},
|
19 |
+
"outputs": [
|
20 |
+
{
|
21 |
+
"data": {
|
22 |
+
"text/plain": [
|
23 |
+
"['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
"execution_count": 2,
|
27 |
+
"metadata": {},
|
28 |
+
"output_type": "execute_result"
|
29 |
+
}
|
30 |
+
],
|
31 |
+
"source": [
|
32 |
+
"# read in all the words\n",
|
33 |
+
"words = open('names.txt', 'r').read().splitlines()\n",
|
34 |
+
"words[:8]"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "code",
|
39 |
+
"execution_count": 3,
|
40 |
+
"metadata": {},
|
41 |
+
"outputs": [
|
42 |
+
{
|
43 |
+
"data": {
|
44 |
+
"text/plain": [
|
45 |
+
"32033"
|
46 |
+
]
|
47 |
+
},
|
48 |
+
"execution_count": 3,
|
49 |
+
"metadata": {},
|
50 |
+
"output_type": "execute_result"
|
51 |
+
}
|
52 |
+
],
|
53 |
+
"source": [
|
54 |
+
"len(words)"
|
55 |
+
]
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"cell_type": "code",
|
59 |
+
"execution_count": 4,
|
60 |
+
"metadata": {},
|
61 |
+
"outputs": [
|
62 |
+
{
|
63 |
+
"name": "stdout",
|
64 |
+
"output_type": "stream",
|
65 |
+
"text": [
|
66 |
+
"{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}\n",
|
67 |
+
"27\n"
|
68 |
+
]
|
69 |
+
}
|
70 |
+
],
|
71 |
+
"source": [
|
72 |
+
"# build the vocabulary of characters and mappings to/from integers\n",
|
73 |
+
"chars = sorted(list(set(''.join(words))))\n",
|
74 |
+
"stoi = {s:i+1 for i,s in enumerate(chars)}\n",
|
75 |
+
"stoi['.'] = 0\n",
|
76 |
+
"itos = {i:s for s,i in stoi.items()}\n",
|
77 |
+
"vocab_size = len(itos)\n",
|
78 |
+
"print(itos)\n",
|
79 |
+
"print(vocab_size)"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": 5,
|
85 |
+
"metadata": {},
|
86 |
+
"outputs": [
|
87 |
+
{
|
88 |
+
"name": "stdout",
|
89 |
+
"output_type": "stream",
|
90 |
+
"text": [
|
91 |
+
"torch.Size([182625, 3]) torch.Size([182625])\n",
|
92 |
+
"torch.Size([22655, 3]) torch.Size([22655])\n",
|
93 |
+
"torch.Size([22866, 3]) torch.Size([22866])\n"
|
94 |
+
]
|
95 |
+
}
|
96 |
+
],
|
97 |
+
"source": [
|
98 |
+
"# build the dataset\n",
|
99 |
+
"block_size = 3 # context length: how many characters do we take to predict the next one?\n",
|
100 |
+
"\n",
|
101 |
+
"def build_dataset(words): \n",
|
102 |
+
" X, Y = [], []\n",
|
103 |
+
" \n",
|
104 |
+
" for w in words:\n",
|
105 |
+
" context = [0] * block_size\n",
|
106 |
+
" for ch in w + '.':\n",
|
107 |
+
" ix = stoi[ch]\n",
|
108 |
+
" X.append(context)\n",
|
109 |
+
" Y.append(ix)\n",
|
110 |
+
" context = context[1:] + [ix] # crop and append\n",
|
111 |
+
"\n",
|
112 |
+
" X = torch.tensor(X)\n",
|
113 |
+
" Y = torch.tensor(Y)\n",
|
114 |
+
" print(X.shape, Y.shape)\n",
|
115 |
+
" return X, Y\n",
|
116 |
+
"\n",
|
117 |
+
"import random\n",
|
118 |
+
"random.seed(42)\n",
|
119 |
+
"random.shuffle(words)\n",
|
120 |
+
"n1 = int(0.8*len(words))\n",
|
121 |
+
"n2 = int(0.9*len(words))\n",
|
122 |
+
"\n",
|
123 |
+
"Xtr, Ytr = build_dataset(words[:n1]) # 80%\n",
|
124 |
+
"Xdev, Ydev = build_dataset(words[n1:n2]) # 10%\n",
|
125 |
+
"Xte, Yte = build_dataset(words[n2:]) # 10%"
|
126 |
+
]
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"cell_type": "code",
|
130 |
+
"execution_count": 7,
|
131 |
+
"metadata": {},
|
132 |
+
"outputs": [
|
133 |
+
{
|
134 |
+
"name": "stdout",
|
135 |
+
"output_type": "stream",
|
136 |
+
"text": [
|
137 |
+
"11897\n"
|
138 |
+
]
|
139 |
+
}
|
140 |
+
],
|
141 |
+
"source": [
|
142 |
+
"# MLP revisited\n",
|
143 |
+
"n_embd = 10 # the dimensionality of the character embedding vectors\n",
|
144 |
+
"n_hidden = 200 # the number of neurons in the hidden layer of the MLP\n",
|
145 |
+
"\n",
|
146 |
+
"g = torch.Generator().manual_seed(2147483647) # for reproducibility\n",
|
147 |
+
"C = torch.randn((vocab_size, n_embd), generator=g)\n",
|
148 |
+
"W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) #* 0.2\n",
|
149 |
+
"#b1 = torch.randn(n_hidden, generator=g) * 0.01\n",
|
150 |
+
"W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01\n",
|
151 |
+
"b2 = torch.randn(vocab_size, generator=g) * 0\n",
|
152 |
+
"\n",
|
153 |
+
"# BatchNorm parameters\n",
|
154 |
+
"bngain = torch.ones((1, n_hidden))\n",
|
155 |
+
"bnbias = torch.zeros((1, n_hidden))\n",
|
156 |
+
"bnmean_running = torch.zeros((1, n_hidden))\n",
|
157 |
+
"bnstd_running = torch.ones((1, n_hidden))\n",
|
158 |
+
"\n",
|
159 |
+
"parameters = [C, W1, W2, b2, bngain, bnbias]\n",
|
160 |
+
"print(sum(p.nelement() for p in parameters)) # number of parameters in total\n",
|
161 |
+
"for p in parameters:\n",
|
162 |
+
" p.requires_grad = True"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"cell_type": "code",
|
167 |
+
"execution_count": 10,
|
168 |
+
"metadata": {},
|
169 |
+
"outputs": [
|
170 |
+
{
|
171 |
+
"name": "stdout",
|
172 |
+
"output_type": "stream",
|
173 |
+
"text": [
|
174 |
+
" 0/ 200000: 3.2342\n",
|
175 |
+
" 10000/ 200000: 1.8947\n",
|
176 |
+
" 20000/ 200000: 1.8914\n",
|
177 |
+
" 30000/ 200000: 1.9489\n",
|
178 |
+
" 40000/ 200000: 2.1701\n",
|
179 |
+
" 50000/ 200000: 2.0639\n",
|
180 |
+
" 60000/ 200000: 2.0728\n",
|
181 |
+
" 70000/ 200000: 2.3965\n",
|
182 |
+
" 80000/ 200000: 2.4142\n",
|
183 |
+
" 90000/ 200000: 2.2257\n",
|
184 |
+
" 100000/ 200000: 2.2824\n",
|
185 |
+
" 110000/ 200000: 1.8584\n",
|
186 |
+
" 120000/ 200000: 2.1613\n",
|
187 |
+
" 130000/ 200000: 1.9009\n",
|
188 |
+
" 140000/ 200000: 1.8430\n",
|
189 |
+
" 150000/ 200000: 2.3324\n",
|
190 |
+
" 160000/ 200000: 2.2026\n",
|
191 |
+
" 170000/ 200000: 1.6905\n",
|
192 |
+
" 180000/ 200000: 1.9502\n",
|
193 |
+
" 190000/ 200000: 2.0909\n"
|
194 |
+
]
|
195 |
+
}
|
196 |
+
],
|
197 |
+
"source": [
|
198 |
+
"# same optimization as last time\n",
|
199 |
+
"max_steps = 200000\n",
|
200 |
+
"batch_size = 32\n",
|
201 |
+
"lossi = []\n",
|
202 |
+
"\n",
|
203 |
+
"for i in range(max_steps):\n",
|
204 |
+
" \n",
|
205 |
+
" # minibatch construct\n",
|
206 |
+
" ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)\n",
|
207 |
+
" Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y\n",
|
208 |
+
" \n",
|
209 |
+
" # forward pass\n",
|
210 |
+
" emb = C[Xb] # embed the characters into vectors\n",
|
211 |
+
" embcat = emb.view(emb.shape[0], -1) # concatenate the vectors\n",
|
212 |
+
" hpreact = embcat @ W1 #+ b1 # hidden layer pre-activation\n",
|
213 |
+
" \n",
|
214 |
+
" #hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True)) / (hpreact.std(0, keepdim=True)) + bnbias #batch normalisation layer\n",
|
215 |
+
" #----------------\n",
|
216 |
+
" # BatchNorm layer\n",
|
217 |
+
" #----------------\n",
|
218 |
+
" bnmeani = hpreact.mean(0, keepdim=True)\n",
|
219 |
+
" bnstdi = hpreact.std(0, keepdim=True)\n",
|
220 |
+
" \n",
|
221 |
+
" hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias\n",
|
222 |
+
" \n",
|
223 |
+
" with torch.no_grad():\n",
|
224 |
+
" bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani\n",
|
225 |
+
" bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi\n",
|
226 |
+
" #----------------\n",
|
227 |
+
"\n",
|
228 |
+
" h = torch.tanh(hpreact) # hidden layer\n",
|
229 |
+
" logits = h @ W2 + b2 # output layer\n",
|
230 |
+
" loss = F.cross_entropy(logits, Yb) # loss function\n",
|
231 |
+
" \n",
|
232 |
+
" # backward pass\n",
|
233 |
+
" for p in parameters:\n",
|
234 |
+
" p.grad = None\n",
|
235 |
+
" loss.backward()\n",
|
236 |
+
" \n",
|
237 |
+
" # update\n",
|
238 |
+
" lr = 0.1 if i < 100000 else 0.01 # step learning rate decay\n",
|
239 |
+
" for p in parameters:\n",
|
240 |
+
" p.data += -lr * p.grad\n",
|
241 |
+
"\n",
|
242 |
+
" # track stats\n",
|
243 |
+
" if i % 10000 == 0: # print every once in a while\n",
|
244 |
+
" print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')\n",
|
245 |
+
" lossi.append(loss.log10().item())\n",
|
246 |
+
"\n",
|
247 |
+
" #break #Add this while experimenting so you don't have to print all the steps"
|
248 |
+
]
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"cell_type": "markdown",
|
252 |
+
"metadata": {},
|
253 |
+
"source": [
|
254 |
+
"-----------------\n",
|
255 |
+
"\n",
|
256 |
+
"Start of the cells used for lecture section [00:12:59](https://www.youtube.com/watch?v=P6sfmUTpUmc&t=779s): fixing the saturated tanh"
|
257 |
+
]
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"cell_type": "code",
|
261 |
+
"execution_count": null,
|
262 |
+
"metadata": {},
|
263 |
+
"outputs": [],
|
264 |
+
"source": [
|
265 |
+
"#plt.hist(h.view(-1).tolist(), 50)"
|
266 |
+
]
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"cell_type": "code",
|
270 |
+
"execution_count": null,
|
271 |
+
"metadata": {},
|
272 |
+
"outputs": [],
|
273 |
+
"source": [
|
274 |
+
"#plt.hist(hpreact.view(-1).tolist(), 50)"
|
275 |
+
]
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"cell_type": "code",
|
279 |
+
"execution_count": null,
|
280 |
+
"metadata": {},
|
281 |
+
"outputs": [],
|
282 |
+
"source": [
|
283 |
+
"#plt.figure(figsize=(20,10))\n",
|
284 |
+
"#plt.imshow(h.abs() > 0.99, cmap='gray', interpolation='nearest')"
|
285 |
+
]
|
286 |
+
},
|
287 |
+
{
|
288 |
+
"cell_type": "markdown",
|
289 |
+
"metadata": {},
|
290 |
+
"source": [
|
291 |
+
"End of the cells used for lecture section [00:12:59](https://www.youtube.com/watch?v=P6sfmUTpUmc&t=779s): fixing the saturated tanh\n",
|
292 |
+
"\n",
|
293 |
+
"----------------"
|
294 |
+
]
|
295 |
+
},
|
296 |
+
{
|
297 |
+
"cell_type": "code",
|
298 |
+
"execution_count": 24,
|
299 |
+
"metadata": {},
|
300 |
+
"outputs": [
|
301 |
+
{
|
302 |
+
"data": {
|
303 |
+
"text/plain": [
|
304 |
+
"[<matplotlib.lines.Line2D at 0x21a782d9720>]"
|
305 |
+
]
|
306 |
+
},
|
307 |
+
"execution_count": 24,
|
308 |
+
"metadata": {},
|
309 |
+
"output_type": "execute_result"
|
310 |
+
},
|
311 |
+
{
|
312 |
+
"data": {
|
313 |
+
"image/png": "",
|
314 |
+
"text/plain": [
|
315 |
+
"<Figure size 640x480 with 1 Axes>"
|
316 |
+
]
|
317 |
+
},
|
318 |
+
"metadata": {},
|
319 |
+
"output_type": "display_data"
|
320 |
+
}
|
321 |
+
],
|
322 |
+
"source": [
|
323 |
+
"plt.plot(lossi)"
|
324 |
+
]
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"cell_type": "code",
|
328 |
+
"execution_count": null,
|
329 |
+
"metadata": {},
|
330 |
+
"outputs": [],
|
331 |
+
"source": [
|
332 |
+
"# # calibrate the batch norm at the end of training\n",
|
333 |
+
"\n",
|
334 |
+
"# with torch.no_grad():\n",
|
335 |
+
"# # pass the training set through\n",
|
336 |
+
"# emb = C[Xtr]\n",
|
337 |
+
"# embcat = emb.view(emb.shape[0], -1)\n",
|
338 |
+
"# hpreact = embcat @ W1 # + b1\n",
|
339 |
+
"# # measure the mean/std over the entire training set\n",
|
340 |
+
"# bnmean = hpreact.mean(0, keepdim=True)\n",
|
341 |
+
"# bnstd = hpreact.std(0, keepdim=True)"
|
342 |
+
]
|
343 |
+
},
|
344 |
+
{
|
345 |
+
"cell_type": "code",
|
346 |
+
"execution_count": 11,
|
347 |
+
"metadata": {},
|
348 |
+
"outputs": [
|
349 |
+
{
|
350 |
+
"name": "stdout",
|
351 |
+
"output_type": "stream",
|
352 |
+
"text": [
|
353 |
+
"train 2.037672996520996\n",
|
354 |
+
"val 2.107128620147705\n"
|
355 |
+
]
|
356 |
+
}
|
357 |
+
],
|
358 |
+
"source": [
|
359 |
+
"@torch.no_grad() # this decorator disables gradient tracking\n",
|
360 |
+
"def split_loss(split):\n",
|
361 |
+
" x,y = {\n",
|
362 |
+
" 'train': (Xtr, Ytr),\n",
|
363 |
+
" 'val': (Xdev, Ydev),\n",
|
364 |
+
" 'test': (Xte, Yte),\n",
|
365 |
+
" }[split]\n",
|
366 |
+
" emb = C[x] # (N, block_size, n_embd)\n",
|
367 |
+
" embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)\n",
|
368 |
+
" hpreact = embcat @ W1 #+ b1\n",
|
369 |
+
" #hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True)) / (hpreact.std(0, keepdim=True)) + bnbias #batch normalisation layer\n",
|
370 |
+
" hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias\n",
|
371 |
+
" h = torch.tanh(hpreact) # (N, n_hidden)\n",
|
372 |
+
" logits = h @ W2 + b2 # (N, vocab_size)\n",
|
373 |
+
" loss = F.cross_entropy(logits, y)\n",
|
374 |
+
" print(split, loss.item())\n",
|
375 |
+
"\n",
|
376 |
+
"split_loss('train')\n",
|
377 |
+
"split_loss('val')"
|
378 |
+
]
|
379 |
+
},
|
380 |
+
{
|
381 |
+
"cell_type": "code",
|
382 |
+
"execution_count": 11,
|
383 |
+
"metadata": {},
|
384 |
+
"outputs": [
|
385 |
+
{
|
386 |
+
"data": {
|
387 |
+
"text/plain": [
|
388 |
+
"tensor(3.2958)"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
"execution_count": 11,
|
392 |
+
"metadata": {},
|
393 |
+
"output_type": "execute_result"
|
394 |
+
}
|
395 |
+
],
|
396 |
+
"source": [
|
397 |
+
"#The initial loss value that we expect\n",
|
398 |
+
"-torch.tensor(1/27.0).log()"
|
399 |
+
]
|
400 |
+
},
|
401 |
+
{
|
402 |
+
"cell_type": "code",
|
403 |
+
"execution_count": 10,
|
404 |
+
"metadata": {},
|
405 |
+
"outputs": [
|
406 |
+
{
|
407 |
+
"name": "stdout",
|
408 |
+
"output_type": "stream",
|
409 |
+
"text": [
|
410 |
+
"mora.\n",
|
411 |
+
"mayah.\n",
|
412 |
+
"see.\n",
|
413 |
+
"mel.\n",
|
414 |
+
"rylee.\n",
|
415 |
+
"emmadiejd.\n",
|
416 |
+
"leg.\n",
|
417 |
+
"adelyn.\n",
|
418 |
+
"elin.\n",
|
419 |
+
"shi.\n",
|
420 |
+
"jen.\n",
|
421 |
+
"eden.\n",
|
422 |
+
"estanar.\n",
|
423 |
+
"kayziquetta.\n",
|
424 |
+
"noshir.\n",
|
425 |
+
"roshiriel.\n",
|
426 |
+
"kendreth.\n",
|
427 |
+
"konnie.\n",
|
428 |
+
"casube.\n",
|
429 |
+
"ged.\n"
|
430 |
+
]
|
431 |
+
}
|
432 |
+
],
|
433 |
+
"source": [
|
434 |
+
"# sample from the model\n",
|
435 |
+
"g = torch.Generator().manual_seed(2147483647 + 10)\n",
|
436 |
+
"\n",
|
437 |
+
"for _ in range(20):\n",
|
438 |
+
" \n",
|
439 |
+
" out = []\n",
|
440 |
+
" context = [0] * block_size # initialize with all ...\n",
|
441 |
+
" while True:\n",
|
442 |
+
" # forward pass the neural net\n",
|
443 |
+
" emb = C[torch.tensor([context])] # (1,block_size,d)\n",
|
444 |
+
" h = torch.tanh(bngain * (emb.view(1, -1) @ W1 - bnmean_running) / bnstd_running + bnbias) # b1 no longer exists in this notebook; apply BatchNorm with the running statistics, exactly as split_loss does\n",
|
445 |
+
" logits = h @ W2 + b2\n",
|
446 |
+
" probs = F.softmax(logits, dim=1)\n",
|
447 |
+
" # sample from the distribution\n",
|
448 |
+
" ix = torch.multinomial(probs, num_samples=1, generator=g).item()\n",
|
449 |
+
" context = context[1:] + [ix]\n",
|
450 |
+
" out.append(ix)\n",
|
451 |
+
" # if we sample the special '.' token, break\n",
|
452 |
+
" if ix == 0:\n",
|
453 |
+
" break\n",
|
454 |
+
" \n",
|
455 |
+
" print(''.join(itos[i] for i in out)) # decode and print the generated word"
|
456 |
+
]
|
457 |
+
}
|
458 |
+
],
|
459 |
+
"metadata": {
|
460 |
+
"kernelspec": {
|
461 |
+
"display_name": "venv",
|
462 |
+
"language": "python",
|
463 |
+
"name": "python3"
|
464 |
+
},
|
465 |
+
"language_info": {
|
466 |
+
"codemirror_mode": {
|
467 |
+
"name": "ipython",
|
468 |
+
"version": 3
|
469 |
+
},
|
470 |
+
"file_extension": ".py",
|
471 |
+
"mimetype": "text/x-python",
|
472 |
+
"name": "python",
|
473 |
+
"nbconvert_exporter": "python",
|
474 |
+
"pygments_lexer": "ipython3",
|
475 |
+
"version": "3.10.0"
|
476 |
+
}
|
477 |
+
},
|
478 |
+
"nbformat": 4,
|
479 |
+
"nbformat_minor": 2
|
480 |
+
}
|
README.md
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## SET 1 - MAKEMORE (PART 3) 🔗
|
2 |
+
|
3 |
+
[Documentation site](https://muzzammilshah.github.io/Road-to-GPT/Makemore-part3/)
|
4 |
+

|
5 |
+
[Commit history](https://github.com/MuzzammilShah/NeuralNetworks-LanguageModels-3/commits/main)
|
6 |
+

|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
### **Overview**
|
11 |
+
In this repository, I implemented **Batch Normalization** within a neural network framework to enhance training stability and performance, following Andrej Karpathy's approach in the **Makemore - Part 3** video.
|
12 |
+
|
13 |
+
This implementation focuses on **normalizing activations and gradients, addressing initialization issues, and utilizing Kaiming initialization to prevent saturation of activation functions**. Additionally, **visualization graphs** were created at the end to analyze the effects of these techniques on the training process and model performance.
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
### **🗂️Repository Structure**
|
18 |
+
|
19 |
+
```plaintext
|
20 |
+
├── .gitignore
|
21 |
+
├── A-main-notebook.ipynb
|
22 |
+
├── StarterCode.ipynb
|
23 |
+
├── VisualizationTools.ipynb
|
24 |
+
├── README.md
|
25 |
+
├── notes/
|
26 |
+
│ ├── A-main-makemore-part3.md
|
27 |
+
│ └── README.md
|
28 |
+
└── names.txt
|
29 |
+
```
|
30 |
+
|
31 |
+
- **Notes Directory**: Contains detailed notes corresponding to each notebook section.
|
32 |
+
- **Jupyter Notebooks**: Step-by-step implementation and exploration of the concepts.
|
33 |
+
- **README.md**: Overview and guide for this repository.
|
34 |
+
- **names.txt**: Supplementary data file used in training the model.
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
### **📄Instructions**
|
39 |
+
|
40 |
+
To get the best understanding:
|
41 |
+
|
42 |
+
1. Start by reading the notes in the `notes/` directory. Each section corresponds to a notebook for step-by-step explanations.
|
43 |
+
2. Open the corresponding Jupyter Notebook (e.g., `A-main-notebook.ipynb` for `A-main-makemore-part3.md`).
|
44 |
+
3. Follow the code and comments for a deeper dive into the implementation details.
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
### **⭐Documentation**
|
49 |
+
|
50 |
+
For a better reading experience and detailed notes, visit my **[Road to GPT Documentation Site](https://muzzammilshah.github.io/Road-to-GPT/)**.
|
51 |
+
|
52 |
+
> **💡Pro Tip**: This site provides an interactive and visually rich explanation of the notes and code. It is highly recommended you view this project from there.
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
### **✍🏻Acknowledgments**
|
57 |
+
Notes and implementations inspired by the **Makemore - Part 3** video by [Andrej Karpathy](https://karpathy.ai/).
|
58 |
+
|
59 |
+
For more of my projects, visit my [Portfolio Site](https://muhammedshah.com).
|
StarterCode.ipynb
ADDED
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"Importing the PyTorch and Matplotlib utilities as before"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": 1,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"import torch\n",
|
17 |
+
"import torch.nn.functional as F\n",
|
18 |
+
"import matplotlib.pyplot as plt # for making figures\n",
|
19 |
+
"%matplotlib inline"
|
20 |
+
]
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"cell_type": "markdown",
|
24 |
+
"metadata": {},
|
25 |
+
"source": [
|
26 |
+
"Reading all the words"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": 2,
|
32 |
+
"metadata": {},
|
33 |
+
"outputs": [
|
34 |
+
{
|
35 |
+
"data": {
|
36 |
+
"text/plain": [
|
37 |
+
"['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
"execution_count": 2,
|
41 |
+
"metadata": {},
|
42 |
+
"output_type": "execute_result"
|
43 |
+
}
|
44 |
+
],
|
45 |
+
"source": [
|
46 |
+
"# read in all the words\n",
|
47 |
+
"words = open('names.txt', 'r').read().splitlines()\n",
|
48 |
+
"words[:8]"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "code",
|
53 |
+
"execution_count": 3,
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [
|
56 |
+
{
|
57 |
+
"data": {
|
58 |
+
"text/plain": [
|
59 |
+
"32033"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
"execution_count": 3,
|
63 |
+
"metadata": {},
|
64 |
+
"output_type": "execute_result"
|
65 |
+
}
|
66 |
+
],
|
67 |
+
"source": [
|
68 |
+
"len(words)"
|
69 |
+
]
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"cell_type": "markdown",
|
73 |
+
"metadata": {},
|
74 |
+
"source": [
|
75 |
+
"Printing the vocabulary of all the lower case letters and the special dot token"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 4,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [
|
83 |
+
{
|
84 |
+
"name": "stdout",
|
85 |
+
"output_type": "stream",
|
86 |
+
"text": [
|
87 |
+
"{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}\n",
|
88 |
+
"27\n"
|
89 |
+
]
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"source": [
|
93 |
+
"# build the vocabulary of characters and mappings to/from integers\n",
|
94 |
+
"chars = sorted(list(set(''.join(words))))\n",
|
95 |
+
"stoi = {s:i+1 for i,s in enumerate(chars)}\n",
|
96 |
+
"stoi['.'] = 0\n",
|
97 |
+
"itos = {i:s for s,i in stoi.items()}\n",
|
98 |
+
"vocab_size = len(itos)\n",
|
99 |
+
"print(itos)\n",
|
100 |
+
"print(vocab_size)"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"cell_type": "markdown",
|
105 |
+
"metadata": {},
|
106 |
+
"source": [
|
107 |
+
"Here we read the dataset and process it. At the end of this cell, we also split the dataset into three parts: train, dev and test"
|
108 |
+
]
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"cell_type": "code",
|
112 |
+
"execution_count": 5,
|
113 |
+
"metadata": {},
|
114 |
+
"outputs": [
|
115 |
+
{
|
116 |
+
"name": "stdout",
|
117 |
+
"output_type": "stream",
|
118 |
+
"text": [
|
119 |
+
"torch.Size([182625, 3]) torch.Size([182625])\n",
|
120 |
+
"torch.Size([22655, 3]) torch.Size([22655])\n",
|
121 |
+
"torch.Size([22866, 3]) torch.Size([22866])\n"
|
122 |
+
]
|
123 |
+
}
|
124 |
+
],
|
125 |
+
"source": [
|
126 |
+
"# build the dataset\n",
|
127 |
+
"block_size = 3 # context length: how many characters do we take to predict the next one?\n",
|
128 |
+
"\n",
|
129 |
+
"def build_dataset(words): \n",
|
130 |
+
" X, Y = [], []\n",
|
131 |
+
" \n",
|
132 |
+
" for w in words:\n",
|
133 |
+
" context = [0] * block_size\n",
|
134 |
+
" for ch in w + '.':\n",
|
135 |
+
" ix = stoi[ch]\n",
|
136 |
+
" X.append(context)\n",
|
137 |
+
" Y.append(ix)\n",
|
138 |
+
" context = context[1:] + [ix] # crop and append\n",
|
139 |
+
"\n",
|
140 |
+
" X = torch.tensor(X)\n",
|
141 |
+
" Y = torch.tensor(Y)\n",
|
142 |
+
" print(X.shape, Y.shape)\n",
|
143 |
+
" return X, Y\n",
|
144 |
+
"\n",
|
145 |
+
"import random\n",
|
146 |
+
"random.seed(42)\n",
|
147 |
+
"random.shuffle(words)\n",
|
148 |
+
"n1 = int(0.8*len(words))\n",
|
149 |
+
"n2 = int(0.9*len(words))\n",
|
150 |
+
"\n",
|
151 |
+
"Xtr, Ytr = build_dataset(words[:n1]) # 80%\n",
|
152 |
+
"Xdev, Ydev = build_dataset(words[n1:n2]) # 10%\n",
|
153 |
+
"Xte, Yte = build_dataset(words[n2:]) # 10%"
|
154 |
+
]
|
155 |
+
},
|
156 |
+
{
|
157 |
+
"cell_type": "markdown",
|
158 |
+
"metadata": {},
|
159 |
+
"source": [
|
160 |
+
"Almost the same MLP, but cleaned up: the previously hard-coded values are now stored in variables, so they only need to be modified in one place"
|
161 |
+
]
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"cell_type": "code",
|
165 |
+
"execution_count": 6,
|
166 |
+
"metadata": {},
|
167 |
+
"outputs": [
|
168 |
+
{
|
169 |
+
"name": "stdout",
|
170 |
+
"output_type": "stream",
|
171 |
+
"text": [
|
172 |
+
"11897\n"
|
173 |
+
]
|
174 |
+
}
|
175 |
+
],
|
176 |
+
"source": [
|
177 |
+
"# MLP revisited\n",
|
178 |
+
"n_embd = 10 # the dimensionality of the character embedding vectors\n",
|
179 |
+
"n_hidden = 200 # the number of neurons in the hidden layer of the MLP\n",
|
180 |
+
"\n",
|
181 |
+
"g = torch.Generator().manual_seed(2147483647) # for reproducibility\n",
|
182 |
+
"C = torch.randn((vocab_size, n_embd), generator=g)\n",
|
183 |
+
"W1 = torch.randn((n_embd * block_size, n_hidden), generator=g)\n",
|
184 |
+
"b1 = torch.randn(n_hidden, generator=g)\n",
|
185 |
+
"W2 = torch.randn((n_hidden, vocab_size), generator=g)\n",
|
186 |
+
"b2 = torch.randn(vocab_size, generator=g)\n",
|
187 |
+
"\n",
|
188 |
+
"parameters = [C, W1, b1, W2, b2]\n",
|
189 |
+
"print(sum(p.nelement() for p in parameters)) # number of parameters in total\n",
|
190 |
+
"for p in parameters:\n",
|
191 |
+
" p.requires_grad = True"
|
192 |
+
]
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"cell_type": "markdown",
|
196 |
+
"metadata": {},
|
197 |
+
"source": [
|
198 |
+
"Here we optimize the neural network. Same as before, except that the hard-coded numbers (or magic numbers, as Andrej sensei calls them) have been replaced with named variables for better readability"
|
199 |
+
]
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"cell_type": "code",
|
203 |
+
"execution_count": 7,
|
204 |
+
"metadata": {},
|
205 |
+
"outputs": [
|
206 |
+
{
|
207 |
+
"name": "stdout",
|
208 |
+
"output_type": "stream",
|
209 |
+
"text": [
|
210 |
+
" 0/ 200000: 27.8817\n",
|
211 |
+
" 10000/ 200000: 2.8244\n",
|
212 |
+
" 20000/ 200000: 2.5473\n",
|
213 |
+
" 30000/ 200000: 2.8961\n",
|
214 |
+
" 40000/ 200000: 2.0967\n",
|
215 |
+
" 50000/ 200000: 2.5020\n",
|
216 |
+
" 60000/ 200000: 2.4999\n",
|
217 |
+
" 70000/ 200000: 2.0510\n",
|
218 |
+
" 80000/ 200000: 2.4076\n",
|
219 |
+
" 90000/ 200000: 2.3172\n",
|
220 |
+
" 100000/ 200000: 2.0199\n",
|
221 |
+
" 110000/ 200000: 2.3338\n",
|
222 |
+
" 120000/ 200000: 1.8767\n",
|
223 |
+
" 130000/ 200000: 2.3989\n",
|
224 |
+
" 140000/ 200000: 2.2102\n",
|
225 |
+
" 150000/ 200000: 2.1937\n",
|
226 |
+
" 160000/ 200000: 2.0843\n",
|
227 |
+
" 170000/ 200000: 1.8780\n",
|
228 |
+
" 180000/ 200000: 1.9727\n",
|
229 |
+
" 190000/ 200000: 1.8222\n"
|
230 |
+
]
|
231 |
+
}
|
232 |
+
],
|
233 |
+
"source": [
|
234 |
+
"# same optimization as last time\n",
|
235 |
+
"max_steps = 200000\n",
|
236 |
+
"batch_size = 32\n",
|
237 |
+
"lossi = []\n",
|
238 |
+
"\n",
|
239 |
+
"for i in range(max_steps):\n",
|
240 |
+
" \n",
|
241 |
+
" # minibatch construct\n",
|
242 |
+
" ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)\n",
|
243 |
+
" Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y\n",
|
244 |
+
" \n",
|
245 |
+
" # forward pass\n",
|
246 |
+
" emb = C[Xb] # embed the characters into vectors\n",
|
247 |
+
" embcat = emb.view(emb.shape[0], -1) # concatenate the vectors\n",
|
248 |
+
" hpreact = embcat @ W1 + b1 # hidden layer pre-activation\n",
|
249 |
+
" h = torch.tanh(hpreact) # hidden layer\n",
|
250 |
+
" logits = h @ W2 + b2 # output layer\n",
|
251 |
+
" loss = F.cross_entropy(logits, Yb) # loss function\n",
|
252 |
+
" \n",
|
253 |
+
" # backward pass\n",
|
254 |
+
" for p in parameters:\n",
|
255 |
+
" p.grad = None\n",
|
256 |
+
" loss.backward()\n",
|
257 |
+
" \n",
|
258 |
+
" # update\n",
|
259 |
+
" lr = 0.1 if i < 100000 else 0.01 # step learning rate decay\n",
|
260 |
+
" for p in parameters:\n",
|
261 |
+
" p.data += -lr * p.grad\n",
|
262 |
+
"\n",
|
263 |
+
" # track stats\n",
|
264 |
+
" if i % 10000 == 0: # print every once in a while\n",
|
265 |
+
" print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')\n",
|
266 |
+
" lossi.append(loss.log10().item())"
|
267 |
+
]
|
268 |
+
},
|
269 |
+
{
|
270 |
+
"cell_type": "markdown",
|
271 |
+
"metadata": {},
|
272 |
+
"source": [
|
273 |
+
"Here we plot the loss"
|
274 |
+
]
|
275 |
+
},
|
276 |
+
{
|
277 |
+
"cell_type": "code",
|
278 |
+
"execution_count": 8,
|
279 |
+
"metadata": {},
|
280 |
+
"outputs": [
|
281 |
+
{
|
282 |
+
"data": {
|
283 |
+
"text/plain": [
|
284 |
+
"[<matplotlib.lines.Line2D at 0x28412485fc0>]"
|
285 |
+
]
|
286 |
+
},
|
287 |
+
"execution_count": 8,
|
288 |
+
"metadata": {},
|
289 |
+
"output_type": "execute_result"
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"data": {
|
293 |
+
"image/png": "",
|
294 |
+
"text/plain": [
|
295 |
+
"<Figure size 640x480 with 1 Axes>"
|
296 |
+
]
|
297 |
+
},
|
298 |
+
"metadata": {},
|
299 |
+
"output_type": "display_data"
|
300 |
+
}
|
301 |
+
],
|
302 |
+
"source": [
|
303 |
+
"plt.plot(lossi)"
|
304 |
+
]
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"cell_type": "markdown",
|
308 |
+
"metadata": {},
|
309 |
+
"source": [
|
310 |
+
"Seeing the loss in train and val loss. There is a slight modification to this as to how the splitting is done."
|
311 |
+
]
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"cell_type": "markdown",
|
315 |
+
"metadata": {},
|
316 |
+
"source": [
|
317 |
+
"Here the decorator `@torch.no_grad()` basically tells PyTorch to not maintain the grad value, as it assumes/anticipated that the backpropagation will be calculated after this and we are saying No."
|
318 |
+
]
|
319 |
+
},
|
320 |
+
{
|
321 |
+
"cell_type": "code",
|
322 |
+
"execution_count": 9,
|
323 |
+
"metadata": {},
|
324 |
+
"outputs": [
|
325 |
+
{
|
326 |
+
"name": "stdout",
|
327 |
+
"output_type": "stream",
|
328 |
+
"text": [
|
329 |
+
"train 2.12243390083313\n",
|
330 |
+
"val 2.1646578311920166\n"
|
331 |
+
]
|
332 |
+
}
|
333 |
+
],
|
334 |
+
"source": [
|
335 |
+
"@torch.no_grad() # this decorator disables gradient tracking\n",
|
336 |
+
"def split_loss(split):\n",
|
337 |
+
" x,y = {\n",
|
338 |
+
" 'train': (Xtr, Ytr),\n",
|
339 |
+
" 'val': (Xdev, Ydev),\n",
|
340 |
+
" 'test': (Xte, Yte),\n",
|
341 |
+
" }[split]\n",
|
342 |
+
" emb = C[x] # (N, block_size, n_embd)\n",
|
343 |
+
" embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)\n",
|
344 |
+
" h = torch.tanh(embcat @ W1 + b1) # (N, n_hidden)\n",
|
345 |
+
" logits = h @ W2 + b2 # (N, vocab_size)\n",
|
346 |
+
" loss = F.cross_entropy(logits, y)\n",
|
347 |
+
" print(split, loss.item())\n",
|
348 |
+
"\n",
|
349 |
+
"split_loss('train')\n",
|
350 |
+
"split_loss('val')"
|
351 |
+
]
|
352 |
+
},
|
353 |
+
{
|
354 |
+
"cell_type": "markdown",
|
355 |
+
"metadata": {},
|
356 |
+
"source": [
|
357 |
+
"Sampling of the model: Forward pass -> Sampling from the distribution -> Continuing till we get the special token '.'"
|
358 |
+
]
|
359 |
+
},
|
360 |
+
{
|
361 |
+
"cell_type": "code",
|
362 |
+
"execution_count": 10,
|
363 |
+
"metadata": {},
|
364 |
+
"outputs": [
|
365 |
+
{
|
366 |
+
"name": "stdout",
|
367 |
+
"output_type": "stream",
|
368 |
+
"text": [
|
369 |
+
"mora.\n",
|
370 |
+
"mayah.\n",
|
371 |
+
"see.\n",
|
372 |
+
"mel.\n",
|
373 |
+
"rylee.\n",
|
374 |
+
"emmadiejd.\n",
|
375 |
+
"leg.\n",
|
376 |
+
"adelyn.\n",
|
377 |
+
"elin.\n",
|
378 |
+
"shi.\n",
|
379 |
+
"jen.\n",
|
380 |
+
"eden.\n",
|
381 |
+
"estanar.\n",
|
382 |
+
"kayziquetta.\n",
|
383 |
+
"noshir.\n",
|
384 |
+
"roshiriel.\n",
|
385 |
+
"kendreth.\n",
|
386 |
+
"konnie.\n",
|
387 |
+
"casube.\n",
|
388 |
+
"ged.\n"
|
389 |
+
]
|
390 |
+
}
|
391 |
+
],
|
392 |
+
"source": [
|
393 |
+
"# sample from the model\n",
|
394 |
+
"g = torch.Generator().manual_seed(2147483647 + 10)\n",
|
395 |
+
"\n",
|
396 |
+
"for _ in range(20):\n",
|
397 |
+
" \n",
|
398 |
+
" out = []\n",
|
399 |
+
" context = [0] * block_size # initialize with all ...\n",
|
400 |
+
" while True:\n",
|
401 |
+
" # forward pass the neural net\n",
|
402 |
+
" emb = C[torch.tensor([context])] # (1,block_size,d)\n",
|
403 |
+
" h = torch.tanh(emb.view(1, -1) @ W1 + b1)\n",
|
404 |
+
" logits = h @ W2 + b2\n",
|
405 |
+
" probs = F.softmax(logits, dim=1)\n",
|
406 |
+
" # sample from the distribution\n",
|
407 |
+
" ix = torch.multinomial(probs, num_samples=1, generator=g).item()\n",
|
408 |
+
" context = context[1:] + [ix]\n",
|
409 |
+
" out.append(ix)\n",
|
410 |
+
" # if we sample the special '.' token, break\n",
|
411 |
+
" if ix == 0:\n",
|
412 |
+
" break\n",
|
413 |
+
" \n",
|
414 |
+
" print(''.join(itos[i] for i in out)) # decode and print the generated word"
|
415 |
+
]
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"cell_type": "markdown",
|
419 |
+
"metadata": {},
|
420 |
+
"source": [
|
421 |
+
"So yeah, this will be our starting point. Also use this as a revision for the previous lecture."
|
422 |
+
]
|
423 |
+
}
|
424 |
+
],
|
425 |
+
"metadata": {
|
426 |
+
"kernelspec": {
|
427 |
+
"display_name": "venv",
|
428 |
+
"language": "python",
|
429 |
+
"name": "python3"
|
430 |
+
},
|
431 |
+
"language_info": {
|
432 |
+
"codemirror_mode": {
|
433 |
+
"name": "ipython",
|
434 |
+
"version": 3
|
435 |
+
},
|
436 |
+
"file_extension": ".py",
|
437 |
+
"mimetype": "text/x-python",
|
438 |
+
"name": "python",
|
439 |
+
"nbconvert_exporter": "python",
|
440 |
+
"pygments_lexer": "ipython3",
|
441 |
+
"version": "3.10.0"
|
442 |
+
}
|
443 |
+
},
|
444 |
+
"nbformat": 4,
|
445 |
+
"nbformat_minor": 2
|
446 |
+
}
|
VisualizationTools.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
names.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|