{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "import matplotlib.pyplot as plt # for making figures\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read in all the words\n", "words = open('names.txt', 'r').read().splitlines()\n", "words[:8]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "32033" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(words)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}\n" ] } ], "source": [ "# build the vocabulary of characters and mappings to/from integers\n", "chars = sorted(list(set(''.join(words))))\n", "stoi = {s:i+1 for i,s in enumerate(chars)}\n", "stoi['.'] = 0\n", "itos = {i:s for s,i in stoi.items()}\n", "print(itos)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "... ---> e\n", "..e ---> m\n", ".em ---> m\n", "emm ---> a\n", "mma ---> .\n", "... ---> o\n", "..o ---> l\n", ".ol ---> i\n", "oli ---> v\n", "liv ---> i\n", "ivi ---> a\n", "via ---> .\n", "... ---> a\n", "..a ---> v\n", ".av ---> a\n", "ava ---> .\n", "... ---> i\n", "..i ---> s\n", ".is ---> a\n", "isa ---> b\n", "sab ---> e\n", "abe ---> l\n", "bel ---> l\n", "ell ---> a\n", "lla ---> .\n", "... 
---> s\n", "..s ---> o\n", ".so ---> p\n", "sop ---> h\n", "oph ---> i\n", "phi ---> a\n", "hia ---> .\n" ] } ], "source": [ "# build the dataset\n", "\n", "block_size = 3 # context length: how many characters do we take to predict the next one?\n", "X, Y = [], []\n", "for w in words[:5]:\n", " \n", " #print(w)\n", " context = [0] * block_size\n", " for ch in w + '.':\n", " ix = stoi[ch]\n", " X.append(context)\n", " Y.append(ix)\n", " print(''.join(itos[i] for i in context), '--->', itos[ix])\n", " context = context[1:] + [ix] # crop and append\n", " \n", "X = torch.tensor(X)\n", "Y = torch.tensor(Y)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape, X.dtype, Y.shape, Y.dtype" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So our dataset looks like this^ \\\n", "\\\n", "So, for each of those above 5 words, \\\n", "`torch.Size([32, 3])` we have created a dataset of 32 examples and each input of the neural net is 3 integers => X \\\n", "`torch.Size([32])` and these are the labels (single row, 32 values) => Y" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 0, 0, 0],\n", " [ 0, 0, 5],\n", " [ 0, 5, 13],\n", " [ 5, 13, 13],\n", " [13, 13, 1],\n", " [ 0, 0, 0],\n", " [ 0, 0, 15],\n", " [ 0, 15, 12],\n", " [15, 12, 9],\n", " [12, 9, 22],\n", " [ 9, 22, 9],\n", " [22, 9, 1],\n", " [ 0, 0, 0],\n", " [ 0, 0, 1],\n", " [ 0, 1, 22],\n", " [ 1, 22, 1],\n", " [ 0, 0, 0],\n", " [ 0, 0, 9],\n", " [ 0, 9, 19],\n", " [ 9, 19, 1],\n", " [19, 1, 2],\n", " [ 1, 2, 5],\n", " [ 2, 5, 12],\n", " [ 5, 12, 12],\n", " [12, 12, 1],\n", " [ 0, 0, 0],\n", " [ 0, 0, 19],\n", " [ 0, 19, 15],\n", " [19, 15, 16],\n", " [15, 16, 8],\n", " [16, 8, 9],\n", " [ 8, 9, 1]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([ 5, 13, 13, 1, 0, 15, 12, 9, 22, 9, 1, 0, 1, 22, 1, 0, 9, 19,\n", " 1, 2, 5, 12, 12, 1, 0, 19, 15, 16, 8, 9, 1, 0])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Y" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "C = torch.rand((27, 2))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([32, 3, 2])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "emb = C[X]\n", "\n", "emb.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(PyTorch indexing is awesome) \\\n", "\\\n", "To index simultaneously all the elements of X, We simply do C[X]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "W1 = torch.randn((6, 100))\n", "b1 = torch.rand(100)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "h = torch.tanh(emb.view(-1, 6) @ W1 + b1)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 0.9910, 0.8405, 0.4715, ..., 0.9999, 0.8814, 0.9998],\n", " [ 0.9763, 0.9163, 0.3350, ..., 0.9991, 0.8249, 0.9992],\n", " [ 0.9791, 0.8450, -0.0272, ..., 0.9997, 0.9230, 0.9997],\n", " 
...,\n", " [ 0.8995, 0.6590, 0.4667, ..., 0.9995, -0.4144, 0.9988],\n", " [ 0.9777, 0.7397, 0.2623, ..., 0.9999, 0.9593, 0.9999],\n", " [ 0.9402, 0.7154, 0.2493, ..., 0.9980, -0.6247, 0.9979]])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([32, 100])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hidden layer is now made^" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "W2 = torch.randn((100, 27))\n", "b2 = torch.rand(27)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "logits = h @ W2 + b2" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([32, 27])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "logits.shape" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "counts = logits.exp()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "prob = counts / counts.sum(1, keepdims=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([32, 27])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prob.shape" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(13.4043)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loss = -prob[torch.arange(32), Y].log().mean()\n", "loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We've made the final output layer^ \\\n", "Found the loss function value, which we have to reduce" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### **Summarising what we've done so far to make this more respectable :)**" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([32, 3]), torch.Size([32]))" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Run the first 5 cells and then start from here\n", "X.shape, Y.shape #dataset" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "g = torch.Generator().manual_seed(2147483647) #For consistency ofcourse, to keep the same values as andrej\n", "C = torch.randn((27,2), generator=g)\n", "W1 = torch.rand((6, 100), generator=g)\n", "b1 = torch.rand(100, generator=g)\n", "W2 = torch.rand((100, 27), generator=g)\n", "b2 = torch.rand(27, generator=g)\n", "parameters = [C, W1, b1, W2, b2]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3481" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(p.nelement() for p in parameters) #to check number of parameters in total" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(6.4365)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"emb = C[X]\n", "h = torch.tanh(emb.view(-1,6) @ W1 + b1)\n", "logits = h @ W2 + b2\n", "counts = logits.exp()\n", "prob = counts / counts.sum(1, keepdims=True)\n", "loss = - prob[torch.arange(32), Y].log().mean()\n", "loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "--------------" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 2 }