Delete enigma

Browse files

Files changed (7) hide show

enigma/EnBERT.py +0 -206
enigma/TrainEnigma.ipynb +0 -919
enigma/config_enigma.json +0 -13
enigma/enigma.cpp +0 -364
enigma/generate.py +0 -126
enigma/model.py +0 -388
enigma/run.py +0 -100

enigma/EnBERT.py DELETED Viewed

@@ -1,206 +0,0 @@
-"""
-  simple BERT architecture model, paired with one more layer of
-  masked self-attention, to predict next token
-"""
-import torch
-import os
-current_directory = os.path.dirname(os.path.abspath(__file__))
-os.chdir(current_directory)
-import torch.nn as nn
-from torch.nn import functional as F
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-# hyperparams
-batch_size = 8  # not referenced anywhere in this file; presumably read by a training script (TODO confirm)
-block_size = 32  # context length: sizes the causal mask and the position table, and crops the prompt in generate()
-max_iters = 10  # not referenced anywhere in this file
-eval_interval = 10  # not referenced anywhere in this file
-learning_rate = 3e-4  # not referenced anywhere in this file
-eval_iters = 5  # not referenced anywhere in this file
-d_model = 256  # width of the token / position embeddings and of every Block
-n_layer = 16  # number of Block modules stacked in EnigmaBERT
-n_head = 12  # heads per attention module; head_size = d_model // n_head = 21, so heads concatenate to 252 before the projection
-dropout = 0.2  # dropout probability handed to every Block
-norm_eps = 1e-5  # epsilon handed to every LayerNorm built in this file
-class SWiGLU(nn.Module):
-  """ SWiGLU(x) = σ(x) ⊙ ReLU(x) + (1−σ(x)) ⊙ x """
-  def forward(self, x):
-    sigmoid_output = torch.sigmoid(x)
-    relu_output = F.relu(x)
-    out = sigmoid_output * relu_output + (1 - sigmoid_output) * x
-    return out
-class UnMaskedHead(nn.Module):
-  """ single head of self attention """
-  def __init__(self, d_model, head_size, dropout):
-    super().__init__()
-    self.key = nn.Linear(d_model, head_size, bias=True)
-    self.query = nn.Linear(d_model, head_size, bias=True)
-    self.value = nn.Linear(d_model, head_size, bias=True)
-    self.dropout = nn.Dropout(dropout)
-  def forward(self, x):
-    B, T, C = x.shape
-    key = self.key(x)
-    query = self.query(x)
-    weights = query @ key.transpose(-2, -1) * key.shape[-1]**-0.5
-    weights = F.softmax(weights, dim=-1)
-    weights = self.dropout(weights)
-    value = self.value(x)
-    out = weights @ value
-    return out
-class MaskedHead(nn.Module):
-  """ one head of self-attention """
-  def __init__(self, head_size, dropout, d_model):
-    super().__init__()
-    self.key = nn.Linear(d_model, head_size, bias=True)
-    self.query = nn.Linear(d_model, head_size, bias=True)
-    self.value = nn.Linear(d_model, head_size, bias=True)
-    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-    self.dropout = nn.Dropout(dropout)
-  def forward(self, x):
-    B,T,C = x.shape
-    k = self.key(x)
-    q = self.query(x)
-    wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
-    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
-    wei = F.softmax(wei, dim=-1) # (B, T, T)
-    wei = self.dropout(wei)
-    v = self.value(x)
-    out = wei @ v
-    return out
-class MultiUnMasked(nn.Module):
-  def __init__(self, d_model, n_head, dropout):
-    head_size = d_model // n_head
-    super().__init__()
-    self.heads = nn.ModuleList([UnMaskedHead(d_model=d_model, dropout=dropout, head_size=head_size) for _ in range(n_head)])
-    self.proj = nn.Linear(n_head * head_size, d_model)
-    self.dropout = nn.Dropout(dropout)
-  def forward(self, x):
-    out = torch.cat([h(x) for h in self.heads], dim=-1)
-    out = self.dropout(self.proj(out))
-    return out
-class MultiMasked(nn.Module):
-  def __init__(self, d_model, n_head, dropout):
-    head_size = d_model // n_head
-    super().__init__()
-    self.heads = nn.ModuleList([MaskedHead(d_model=d_model, dropout=dropout, head_size=head_size) for _ in range(n_head)])
-    self.proj = nn.Linear(n_head * head_size, d_model)
-    self.dropout = nn.Dropout(dropout)
-  def forward(self, x):
-    out = torch.cat([h(x) for h in self.heads], dim=-1)
-    out = self.dropout(self.proj(out))
-    return out
-class FeedForward(nn.Module):
-  def __init__(self, d_model, dropout):
-    super().__init__()
-    self.net = nn.Sequential(
-      nn.Linear(d_model, 4*d_model),
-      nn.GELU(),
-      nn.Linear(4*d_model, d_model),
-      nn.Dropout(dropout)
-    )
-  def forward(self, x):
-    return self.net(x)
-class Block(nn.Module):
-  def __init__(self, d_model, n_head, norm_eps, dropout):
-    super().__init__()
-    self.sa_masked = MultiMasked(n_head=n_head, d_model=d_model, dropout=dropout)
-    self.sa_unmasked = MultiUnMasked(n_head=n_head, d_model=d_model, dropout=dropout)
-    self.ffwd = FeedForward(d_model, dropout=dropout)
-    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)
-    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)
-  def forward(self, x):
-    x2 = x + self.sa_unmasked(self.norm1(x))
-    x = x2 + self.norm2(self.ffwd(x2))
-    x2 = x + self.sa_masked(self.norm1(x))
-    x = x2 + self.norm2(self.ffwd(x2))
-    return x
-class EnigmaBERT(nn.Module):
-  def __init__(self, vocab_size):
-    super().__init__()
-    self.toked_model = nn.Embedding(vocab_size, d_model)
-    self.pos_encod = nn.Embedding(block_size, d_model)
-    self.block = nn.Sequential(*[Block(d_model=d_model, dropout=dropout, norm_eps=norm_eps, n_head=n_head) for _ in range(n_layer)])
-    self.norm_final = nn.LayerNorm(d_model, eps=norm_eps)
-    self.linear_final = nn.Linear(d_model, vocab_size)
-    self.apply(self._init_weights)
-  def _init_weights(self, module):
-    if isinstance(module, nn.Linear):
-      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-      if module.bias is not None:
-        torch.nn.init.zeros_(module.bias.data)
-    elif isinstance(module, nn.Embedding):
-      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-  def forward(self, idx, targets=None):
-    B, T = idx.shape
-    toked_model = self.toked_model(idx)
-    pos_encod = self.pos_encod(torch.arange(T, device=device))
-    x = toked_model + pos_encod
-    x = self.block(x)
-    x = self.norm_final(x)
-    logits = self.linear_final(x)
-    if targets is None:
-      loss = None
-    else:
-      B, T, C = logits.shape
-      logits = logits.view(B*T, C)
-      targets = targets.view(B*T)
-      loss = F.cross_entropy(logits, targets)
-    return logits, loss
-  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):
-    generated_tokens = []
-    for _ in range(max_new_tokens):
-      idx_cond = idx[:, -block_size:]
-      logits, _ = self(idx_cond)
-      logits = logits[:, -1, :]
-      scaled_logits = logits / temperature
-      if top_k > 0:
-        scaled_logits = self._top_k_filtering(scaled_logits, top_k)
-      probs = F.softmax(scaled_logits, dim=-1)
-      sampled_idx = torch.multinomial(probs, num_samples=1)
-      generated_tokens.append(sampled_idx.item())
-      idx = torch.cat((idx, sampled_idx), dim=1)
-    return generated_tokens
-  def _top_k_filtering(self, logits, top_k):
-    values, indices = torch.topk(logits, top_k, dim=-1)
-    min_value = values[:, -1].unsqueeze(-1).expand_as(logits)
-    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)
-    return filtered_logits

enigma/TrainEnigma.ipynb DELETED Viewed

@@ -1,919 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "WXpJBLyr30Rx",
-        "outputId": "2806070a-648f-42ca-fa8a-9aeb8f99ceb7"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
-          ]
-        }
-      ],
-      "source": [
-        "from google.colab import drive\n",
-        "drive.mount('/content/drive')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "r7WUm0VL4bN4",
-        "outputId": "bfdefb82-479e-4f91-9a01-299ff76756e9"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "485.52 million letters\n"
-          ]
-        }
-      ],
-      "source": [
-        "import torch\n",
-        "\n",
-        "# importing the data\n",
-        "file_path = '/content/drive/MyDrive/train2.txt'\n",
-        "# the with-block closes the file on exit, so no explicit close() is needed\n",
-        "with open(file_path, 'r', encoding='utf-8') as file:\n",
-        "  dna_seq = file.read()\n",
-        "\n",
-        "print(f\"{(len(dna_seq)/1e6):.2f} million letters\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 3,
-      "metadata": {
-        "id": "Cdhybhz9owTK"
-      },
-      "outputs": [],
-      "source": [
-        "class PerCharTokenizer:\n",
-        "  \"\"\"\n",
-        "  character-level tokenizer over a fixed vocabulary of bases and special tokens\n",
-        "\n",
-        "  Attributes:\n",
-        "    - chars (list): all bases along with special tokens represented as characters\n",
-        "    - vocab_size (int): size of vocabulary\n",
-        "\n",
-        "  Working:\n",
-        "    - vocab contains all the bases and ['P', 'M', 'U'] as padding, mask and unknown token\n",
-        "    - encode(): looks up each character in the vocab and returns its position as an\n",
-        "      integer; a character outside the vocab is encoded as the unknown token 'U'\n",
-        "    - decode(): takes a list of integers and returns the matching vocab items joined\n",
-        "      into one string; integers outside the vocab are skipped\n",
-        "  \"\"\"\n",
-        "  def __init__(self):\n",
-        "    super().__init__()\n",
-        "    self.chars = ['\\n', 'A', 'T', 'G', 'C', 'P', 'M', 'U', ' ']\n",
-        "    self.vocab_size = len(self.chars)\n",
-        "    self.string_to_index = {ch: i for i, ch in enumerate(self.chars)}\n",
-        "    self.index_to_string = {i: ch for i, ch in enumerate(self.chars)}\n",
-        "\n",
-        "  def encode(self, string):\n",
-        "    # every id must stay below vocab_size, because the model sizes its embedding\n",
-        "    # table from it; growing the vocab on the fly would produce out-of-range ids\n",
-        "    unknown = self.string_to_index['U']\n",
-        "    return [self.string_to_index.get(char, unknown) for char in string]\n",
-        "\n",
-        "  def decode(self, integer):\n",
-        "    return ''.join(self.index_to_string[i] for i in integer if i in self.index_to_string)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 4,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "6Ou9txgmAdIB",
-        "outputId": "cb5dd462-8b2a-445a-9524-1b484f288c64"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "train data 436.97million, val data 48.55million\n"
-          ]
-        }
-      ],
-      "source": [
-        "# encode the whole corpus once into a 1-D tensor of token ids\n",
-        "token = PerCharTokenizer()\n",
-        "data = torch.tensor(token.encode(dna_seq), dtype=torch.long)\n",
-        "\n",
-        "# the leading 90% of the tokens is the training set, the trailing 10% is held out for validation\n",
-        "n = int(0.9*len(data))\n",
-        "train_data, val_data = data[:n], data[n:]\n",
-        "print(f\"train data {(len(train_data)/1e6):.2f}million, val data {(len(val_data)/1e6):.2f}million\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 5,
-      "metadata": {
-        "id": "ebFKQQ9NAq4e"
-      },
-      "outputs": [],
-      "source": [
-        "# hyperparams\n",
-        "batch_size = 10\n",
-        "block_size = 512\n",
-        "max_iters = 5000\n",
-        "eval_interval = 100\n",
-        "learning_rate = 3e-4\n",
-        "eval_iters = 100\n",
-        "d_model = 384\n",
-        "n_layers = 12\n",
-        "n_head = 12\n",
-        "dropout = 0.25\n",
-        "norm_eps = 1e-4"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {
-        "id": "dZMiYkr37cmU"
-      },
-      "outputs": [],
-      "source": [
-        "import math\n",
-        "import torch.nn as nn\n",
-        "from torch.nn import functional as F\n",
-        "\n",
-        "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
-        "\n",
-        "class AttentionHead(nn.Module):\n",
-        "  \"\"\"\n",
-        "  a single head of scaled dot-product self attention with a learned\n",
-        "  per-position-pair bias added to the attention scores\n",
-        "\n",
-        "  Args:\n",
-        "  - d_model (int): dimensionality of the model's hidden layers\n",
-        "  - head_size (int): dimensionality of each attention head\n",
-        "  - dropout (float): dropout probability applied to the attention weights\n",
-        "  - block_size (int): the maximum sequence length; sizes the causal mask and\n",
-        "    the position-pair embedding table\n",
-        "  \"\"\"\n",
-        "  def __init__(self, d_model, head_size, dropout, block_size):\n",
-        "    super().__init__()\n",
-        "    self.key = nn.Linear(d_model, head_size, bias=True)\n",
-        "    self.query = nn.Linear(d_model, head_size, bias=True)\n",
-        "    self.value = nn.Linear(d_model, head_size, bias=False)\n",
-        "    self.dropout = nn.Dropout(dropout)\n",
-        "    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
-        "\n",
-        "    # NOTE: this table holds block_size * block_size * head_size values per head,\n",
-        "    # so it dominates the parameter count when block_size is large\n",
-        "    self.rel_pos_emb = nn.Parameter(torch.randn(block_size, block_size, head_size))\n",
-        "\n",
-        "  def forward(self, x, mask=False):\n",
-        "    \"\"\"\n",
-        "    forward pass of a single attention head.\n",
-        "\n",
-        "    Args:\n",
-        "      - x (Tensor): input tensor, unpacked as (B, T, C) with T <= block_size\n",
-        "      - mask (bool): when True, positions cannot attend to later positions\n",
-        "    Returns:\n",
-        "      - out (Tensor): output tensor after self attention\n",
-        "    \"\"\"\n",
-        "    B, T, C = x.shape\n",
-        "    key = self.key(x)\n",
-        "    query = self.query(x)\n",
-        "    # scale by 1/sqrt(head_size): dividing by head_size ** -0.5 would instead\n",
-        "    # multiply the scores by sqrt(head_size) and push the softmax into saturation\n",
-        "    scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)\n",
-        "\n",
-        "    # bias for every (query position t, key position v) pair: dot product of the\n",
-        "    # query at t with the learned embedding stored for that pair\n",
-        "    rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_emb[:T, :T])\n",
-        "    scores += rel_pos_scores\n",
-        "\n",
-        "    if mask:\n",
-        "      scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n",
-        "    weights = F.softmax(scores, dim=-1)\n",
-        "    weights = self.dropout(weights)\n",
-        "\n",
-        "    value = self.value(x)\n",
-        "    out = torch.matmul(weights, value)\n",
-        "    return out\n",
-        "\n",
-        "class MultiHeadAttention(nn.Module):\n",
-        "  \"\"\"\n",
-        "    runs n_head AttentionHead modules on the same input, concatenates their\n",
-        "    outputs along the last axis and projects the result to d_model\n",
-        "\n",
-        "    Args:\n",
-        "    - d_model (int): dimensionality of the model's hidden layers\n",
-        "    - n_head (int): no of attention heads; each head gets d_model // n_head features\n",
-        "    - dropout (float): dropout probability\n",
-        "    - block_size (int): context length\n",
-        "  \"\"\"\n",
-        "  def __init__(self, d_model, n_head, dropout, block_size):\n",
-        "    super().__init__()\n",
-        "    head_size = d_model // n_head\n",
-        "    heads = [AttentionHead(d_model=d_model, dropout=dropout, head_size=head_size, block_size=block_size) for _ in range(n_head)]\n",
-        "    self.heads = nn.ModuleList(heads)\n",
-        "    self.proj = nn.Linear(n_head * head_size, d_model)\n",
-        "    self.dropout = nn.Dropout(dropout)\n",
-        "\n",
-        "  def forward(self, x, mask):\n",
-        "    \"\"\"\n",
-        "    apply every head to x, merge the results and project them\n",
-        "\n",
-        "    Args:\n",
-        "      - x (Tensor): input tensor\n",
-        "      - mask (bool): forwarded to every head; True turns on causal masking\n",
-        "\n",
-        "    Returns:\n",
-        "      - out (Tensor): projected, dropout-regularised concatenation of the head outputs\n",
-        "\n",
-        "    \"\"\"\n",
-        "    per_head = [head(x, mask=mask) for head in self.heads]\n",
-        "    merged = torch.cat(per_head, dim=-1)\n",
-        "    return self.dropout(self.proj(merged))\n",
-        "\n",
-        "class FeedForward(nn.Module):\n",
-        "  \"\"\"\n",
-        "    position-wise feedforward network: Linear (d_model -> 5*d_model), GELU,\n",
-        "    Linear (5*d_model -> d_model), Dropout\n",
-        "\n",
-        "    Args:\n",
-        "    - d_model (int): the dimensionality of the model's hidden layers\n",
-        "    - dropout (float): dropout probability\n",
-        "\n",
-        "  \"\"\"\n",
-        "  def __init__(self, d_model, dropout):\n",
-        "    super().__init__()\n",
-        "    hidden = 5*d_model\n",
-        "    layers = [nn.Linear(d_model, hidden), nn.GELU(), nn.Linear(hidden, d_model), nn.Dropout(dropout)]\n",
-        "    # same layer order as before, so the sub-module indices inside `net` are unchanged\n",
-        "    self.net = nn.Sequential(*layers)\n",
-        "\n",
-        "  def forward(self, x):\n",
-        "    \"\"\"\n",
-        "    run x through the feedforward stack\n",
-        "\n",
-        "    Args:\n",
-        "      - x (Tensor): input tensor\n",
-        "\n",
-        "    Returns:\n",
-        "      - out (Tensor): output of the feedforward network\n",
-        "    \"\"\"\n",
-        "    return self.net(x)\n",
-        "\n",
-        "class EncoderNetwork(nn.Module):\n",
-        "  \"\"\"\n",
-        "    one encoder layer: unmasked multi-head self attention followed by a\n",
-        "    feedforward network; each sub-layer is wrapped as LayerNorm(x + dropout(sublayer(x)))\n",
-        "\n",
-        "    Args:\n",
-        "    - d_model (int): dimensionality of the model's hidden layers\n",
-        "    - n_head (int): no of attention heads in multi-head attention layers\n",
-        "    - norm_eps (float): epsilon value for layer normalization\n",
-        "    - dropout (float): dropout probability\n",
-        "    - block_size (int): the maximum sequence length for positional encoding\n",
-        "    \"\"\"\n",
-        "  def __init__(self, d_model, n_head, norm_eps, dropout, block_size):\n",
-        "    super().__init__()\n",
-        "    self.s_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)\n",
-        "    self.ffwd = FeedForward(d_model, dropout)\n",
-        "    self.dropout = nn.Dropout(dropout)\n",
-        "    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)\n",
-        "    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)\n",
-        "\n",
-        "  def forward(self, src):\n",
-        "    \"\"\"\n",
-        "      run src through the attention and feedforward sub-layers\n",
-        "\n",
-        "      Args:\n",
-        "      - src (Tensor): input tensor representing source data\n",
-        "\n",
-        "      Returns:\n",
-        "      - src (Tensor): output tensor after passing through the encoder network\n",
-        "    \"\"\"\n",
-        "    # attention sub-layer: residual add, then normalise\n",
-        "    attended = self.dropout(self.s_att(src, mask=False))\n",
-        "    src = self.norm1(src + attended)\n",
-        "\n",
-        "    # feedforward sub-layer: residual add, then normalise\n",
-        "    transformed = self.dropout(self.ffwd(src))\n",
-        "    return self.norm2(src + transformed)\n",
-        "\n",
-        "class DecoderNetwork(nn.Module):\n",
-        "  \"\"\"\n",
-        "    one decoder layer: a masked self-attention pass, an unmasked attention pass\n",
-        "    over the sum of that result and the encoder output, then a feedforward network\n",
-        "\n",
-        "    Args:\n",
-        "      - d_model (int): dimensionality of the model's hidden layers\n",
-        "      - n_head (int): no of attention heads in multi-head attention layers\n",
-        "      - norm_eps (float): epsilon value for layer normalization\n",
-        "      - dropout (float): dropout probability\n",
-        "      - block_size (int): the maximum sequence length for positional encoding\n",
-        "  \"\"\"\n",
-        "  def __init__(self, d_model, n_head, norm_eps, dropout, block_size):\n",
-        "    super().__init__()\n",
-        "    self.s_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)\n",
-        "    self.ffwd = FeedForward(d_model, dropout)\n",
-        "    self.dropout = nn.Dropout(dropout)\n",
-        "    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)\n",
-        "    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)\n",
-        "\n",
-        "  def forward(self, src, att):\n",
-        "    \"\"\"\n",
-        "      forward pass of the decoder network module.\n",
-        "\n",
-        "      Args:\n",
-        "        - src (Tensor): input tensor, same as the encoder's inputs\n",
-        "        - att (Tensor): output of the encoder stack; added to src before the second attention pass\n",
-        "\n",
-        "      Returns:\n",
-        "        - src_f (Tensor): final output tensor\n",
-        "    \"\"\"\n",
-        "    # pass 1: causal self attention; the same s_att and norm1 modules are reused in pass 2\n",
-        "    masked = self.s_att(src, mask=True)\n",
-        "    src = src + self.dropout(masked)\n",
-        "    src = src + self.norm1(src)\n",
-        "\n",
-        "    # pass 2: unmasked attention over decoder state + encoder output\n",
-        "    mixed = src + att\n",
-        "    attended = mixed + self.dropout(self.s_att(mixed, mask=False))\n",
-        "    trg = attended + self.norm1(attended)\n",
-        "\n",
-        "    # feedforward on the normalised result; the residual is taken from the\n",
-        "    # feedforward output itself, then normalised again with norm2\n",
-        "    ff = self.ffwd(self.norm2(trg))\n",
-        "    return self.norm2(ff + self.dropout(ff))\n",
-        "\n",
-        "class Transformer(nn.Module):\n",
-        "  \"\"\"\n",
-        "    encoder-decoder Transformer over token indices\n",
-        "\n",
-        "    Args:\n",
-        "      - vocab_size (int): size of the vocabulary\n",
-        "\n",
-        "    d_model, block_size, n_layers, n_head, norm_eps and dropout are not constructor\n",
-        "    arguments; they are read from the notebook-level hyperparameters:\n",
-        "      - d_model (int): dimensionality of the model's hidden layers\n",
-        "      - block_size (int): maximum sequence length for positional encoding/context length\n",
-        "      - n_layers (int): number of encoder and decoder layers in the Transformer\n",
-        "      - n_head (int): number of attention heads in multi-head attention layers\n",
-        "      - norm_eps (float): epsilon value for layer normalization\n",
-        "      - dropout (float): dropout probability\n",
-        "  \"\"\"\n",
-        "  def __init__(self, vocab_size):\n",
-        "    super().__init__()\n",
-        "    self.block_size = block_size\n",
-        "    self.toked_model = nn.Embedding(vocab_size, d_model)\n",
-        "    self.pos_encod = nn.Embedding(block_size, d_model)\n",
-        "    self.enc_layer = nn.ModuleList([EncoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])\n",
-        "    self.dec_layer = nn.ModuleList([DecoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])\n",
-        "\n",
-        "    self.norm_final = nn.LayerNorm(d_model)\n",
-        "    self.linear_final = nn.Linear(d_model, vocab_size)\n",
-        "    self.dropout = nn.Dropout(dropout)\n",
-        "    self.apply(self._init_weights)\n",
-        "\n",
-        "  def _init_weights(self, module):\n",
-        "    \"\"\"\n",
-        "      initialize weights of linear and embedding layers\n",
-        "\n",
-        "      Args:\n",
-        "        - module (nn.Module): the module to initialize weights for\n",
-        "    \"\"\"\n",
-        "    if isinstance(module, nn.Linear):\n",
-        "      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
-        "      if module.bias is not None:\n",
-        "        torch.nn.init.zeros_(module.bias.data)\n",
-        "    elif isinstance(module, nn.Embedding):\n",
-        "      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
-        "\n",
-        "  def _encode_decode(self, idx):\n",
-        "    \"\"\"\n",
-        "      embed the token indices and run them through the encoder and decoder stacks\n",
-        "\n",
-        "    Args:\n",
-        "      - idx (Tensor): (B, T) tensor of token indices\n",
-        "\n",
-        "    Returns:\n",
-        "      - x_final (Tensor): decoder output, before the final norm and linear layer\n",
-        "    \"\"\"\n",
-        "    B, T = idx.shape\n",
-        "\n",
-        "    toked_model = self.toked_model(idx)\n",
-        "    # build the position ids on the input's device, so the model keeps working\n",
-        "    # after being moved somewhere other than the notebook-level `device`\n",
-        "    pos_encod = self.pos_encod(torch.arange(T, device=idx.device))\n",
-        "    x = toked_model + pos_encod\n",
-        "\n",
-        "    # each layer consumes the previous layer's output; feeding the raw embeddings\n",
-        "    # to every layer would throw away all but the last layer's work.\n",
-        "    # NOTE: weights trained with the earlier, unchained loops do not match this\n",
-        "    # data flow and need retraining\n",
-        "    x_out = x\n",
-        "    for layer in self.enc_layer:\n",
-        "      x_out = layer(x_out)\n",
-        "\n",
-        "    x_final = x\n",
-        "    for layer in self.dec_layer:\n",
-        "      x_final = layer(x_final, x_out)\n",
-        "\n",
-        "    return x_final\n",
-        "\n",
-        "  def forward(self, idx, targets=None):\n",
-        "    \"\"\"\n",
-        "      forward pass of the transformer model\n",
-        "\n",
-        "    Args:\n",
-        "      - idx (Tensor): input tensor representing token indices\n",
-        "      - targets (Tensor): target tensor for computing loss during training\n",
-        "\n",
-        "    Returns:\n",
-        "      - logits (Tensor): output logits from the final linear layer; flattened to\n",
-        "        (B*T, vocab_size) when targets are provided\n",
-        "      - loss (Tensor): optional. computed cross-entropy loss if targets are provided, else None\n",
-        "    \"\"\"\n",
-        "    x_final = self._encode_decode(idx)\n",
-        "    x_final = self.norm_final(x_final)\n",
-        "    logits = self.linear_final(x_final)\n",
-        "\n",
-        "    if targets is None:\n",
-        "      loss = None\n",
-        "\n",
-        "    else:\n",
-        "      B, T, C = logits.shape\n",
-        "      logits = logits.view(B*T, C)\n",
-        "      targets = targets.view(B*T)\n",
-        "      loss = F.cross_entropy(logits, targets)\n",
-        "\n",
-        "    return logits, loss\n",
-        "\n",
-        "  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):\n",
-        "    \"\"\"\n",
-        "      generate new tokens using the trained model\n",
-        "\n",
-        "    Args:\n",
-        "      - idx (Tensor): input tensor representing initial token indices; each sample\n",
-        "        is read with .item(), so the batch size must be 1\n",
-        "      - max_new_tokens (int): max no of new tokens to generate\n",
-        "      - temperature (float): softmax temperature for sampling\n",
-        "      - top_k (int): no of top tokens to consider in sampling\n",
-        "\n",
-        "    Returns:\n",
-        "      - generated_tokens (list): list of generated token indices\n",
-        "    \"\"\"\n",
-        "    generated_tokens = []\n",
-        "\n",
-        "    for _ in range(max_new_tokens):\n",
-        "      # the position table only covers block_size positions, so keep the most recent ones\n",
-        "      idx_cond = idx[:, -self.block_size:]\n",
-        "      logits, _ = self(idx_cond)\n",
-        "      logits = logits[:, -1, :]\n",
-        "\n",
-        "      scaled_logits = logits / temperature\n",
-        "      if top_k > 0:\n",
-        "        scaled_logits = self._top_k_filtering(scaled_logits, top_k)\n",
-        "\n",
-        "      probs = F.softmax(scaled_logits, dim=-1)\n",
-        "      sampled_idx = torch.multinomial(probs, num_samples=1)\n",
-        "      generated_tokens.append(sampled_idx.item())\n",
-        "      idx = torch.cat((idx, sampled_idx), dim=1)\n",
-        "\n",
-        "    return generated_tokens\n",
-        "\n",
-        "  def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0):\n",
-        "    \"\"\"\n",
-        "      Generate predictions for masked tokens using the trained model.\n",
-        "\n",
-        "      Args:\n",
-        "        - idx (Tensor): input tensor representing token indices\n",
-        "        - masked_indices (Tensor): tensor of indices indicating masked positions\n",
-        "        - temperature (float): softmax temperature for sampling\n",
-        "        - top_k (int): no of top tokens to consider in sampling\n",
-        "\n",
-        "      Returns:\n",
-        "        - predicted_tokens (Tensor): tensor of predicted token indices\n",
-        "    \"\"\"\n",
-        "    x_final = self._encode_decode(idx)\n",
-        "\n",
-        "    # overwrite the decoder output at the masked positions with the embedding of\n",
-        "    # token id 6 ('M' in PerCharTokenizer's vocabulary)\n",
-        "    x_masked = x_final.clone()\n",
-        "    x_masked[masked_indices] = self.toked_model(torch.tensor([6], device=idx.device))\n",
-        "\n",
-        "    x_masked = self.norm_final(x_masked)\n",
-        "    logits = self.linear_final(x_masked)\n",
-        "\n",
-        "    masked_logits = logits[masked_indices].view(-1, logits.size(-1))\n",
-        "    scaled_logits = masked_logits / temperature\n",
-        "    if top_k > 0:\n",
-        "      scaled_logits = self._top_k_filtering(scaled_logits, top_k)\n",
-        "\n",
-        "    probs = F.softmax(scaled_logits, dim=-1)\n",
-        "    predicted_indices = torch.argmax(probs, dim=-1)\n",
-        "\n",
-        "    return predicted_indices\n",
-        "\n",
-        "  def _top_k_filtering(self, logits, top_k):\n",
-        "    \"\"\"\n",
-        "      filter logits to keep only the top-k tokens\n",
-        "\n",
-        "    Args:\n",
-        "      - logits (Tensor): 2-D input tensor of logits, one row per sample\n",
-        "      - top_k (int): no of top tokens to keep; capped at the vocabulary size\n",
-        "\n",
-        "    Returns:\n",
-        "      - filtered_logits (Tensor): logits with everything below the k-th largest value set to -inf\n",
-        "    \"\"\"\n",
-        "    # torch.topk raises when k exceeds the size of the dimension\n",
-        "    top_k = min(top_k, logits.size(-1))\n",
-        "    values, indices = torch.topk(logits, top_k, dim=-1)\n",
-        "    min_value = values[:, -1].unsqueeze(-1).expand_as(logits)\n",
-        "    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)\n",
-        "\n",
-        "    return filtered_logits"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 7,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 816
-        },
-        "id": "X9VOBZFr7g3W",
-        "outputId": "aa376025-0a37-4b93-e90a-9d95c6ef2c11"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "2.5 billion parameters\n",
-            "step 0: train loss 2.2869, val loss 2.2884\n",
-            "step 100: train loss 1.3312, val loss 1.3281\n",
-            "step 200: train loss 1.3233, val loss 1.3181\n",
-            "step 300: train loss 1.3209, val loss 1.3196\n",
-            "step 400: train loss 1.3215, val loss 1.3203\n",
-            "step 500: train loss 1.1974, val loss 1.1994\n",
-            "step 600: train loss 0.3350, val loss 0.3365\n",
-            "step 700: train loss 0.0703, val loss 0.0702\n",
-            "step 800: train loss 0.0143, val loss 0.0143\n",
-            "step 900: train loss 0.0049, val loss 0.0047\n",
-            "step 1000: train loss 0.0041, val loss 0.0037\n",
-            "step 1100: train loss 0.0035, val loss 0.0036\n",
-            "step 1200: train loss 0.0038, val loss 0.0035\n",
-            "step 1300: train loss 0.0035, val loss 0.0033\n",
-            "step 1400: train loss 0.0035, val loss 0.0033\n",
-            "step 1500: train loss 0.0033, val loss 0.0033\n",
-            "step 1600: train loss 0.0033, val loss 0.0034\n",
-            "step 1700: train loss 0.0033, val loss 0.0033\n",
-            "step 1800: train loss 0.0033, val loss 0.0031\n",
-            "step 1900: train loss 0.0031, val loss 0.0031\n",
-            "step 2000: train loss 0.0032, val loss 0.0032\n"
-          ]
-        },
-        {
-          "output_type": "error",
-          "ename": "KeyboardInterrupt",
-          "evalue": "",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-            "\u001b[0;32m<ipython-input-7-44818790f2dc>\u001b[0m in \u001b[0;36m<cell line: 45>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     55\u001b[0m   \u001b[0mxb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0myb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m   \u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0myb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     57\u001b[0m   \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset_to_none\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     58\u001b[0m   \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-6-b2af72f89b89>\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, idx, targets)\u001b[0m\n\u001b[1;32m    261\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    262\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mlayer\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdec_layer\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 263\u001b[0;31m       \u001b[0mx_final\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlayer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_out\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    264\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    265\u001b[0m     \u001b[0mx_final\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm_final\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_final\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-6-b2af72f89b89>\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, src, att)\u001b[0m\n\u001b[1;32m    189\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    190\u001b[0m     \u001b[0matt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msrc\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0matt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 191\u001b[0;31m     \u001b[0matt2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms_att\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0matt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    192\u001b[0m     \u001b[0matt2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0matt\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0matt2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    193\u001b[0m     \u001b[0mtrg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0matt2\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0matt2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-6-b2af72f89b89>\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x, mask)\u001b[0m\n\u001b[1;32m     81\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     82\u001b[0m     \"\"\"\n\u001b[0;32m---> 83\u001b[0;31m     \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mh\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheads\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     84\u001b[0m     \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     85\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-6-b2af72f89b89>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m     81\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     82\u001b[0m     \"\"\"\n\u001b[0;32m---> 83\u001b[0;31m     \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mh\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheads\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     84\u001b[0m     \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     85\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-6-b2af72f89b89>\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x, mask)\u001b[0m\n\u001b[1;32m     48\u001b[0m     \u001b[0mweights\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m     \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     51\u001b[0m     \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatmul\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     52\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    115\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    117\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    118\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mextra_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-          ]
-        }
-      ],
-      "source": [
-        "import timeit\n",
-        "\n",
-        "start_time = timeit.default_timer()\n",
-        "# data loading\n",
-        "def get_batch(split):\n",
-        "\n",
-        "  data = train_data if split == 'train' else val_data\n",
-        "  ix = torch.randint(len(data) - block_size, (batch_size,))\n",
-        "  x = torch.stack([data[i:i+block_size] for i in ix])\n",
-        "  y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
-        "  x, y = x.to(device), y.to(device)\n",
-        "  return x, y\n",
-        "\n",
-        "@torch.no_grad()\n",
-        "def estimate_loss():\n",
-        "  out = {}\n",
-        "  model.eval()\n",
-        "  for split in ['train', 'val']:\n",
-        "    losses = torch.zeros(eval_iters)\n",
-        "    for k in range(eval_iters):\n",
-        "      X, Y = get_batch(split)\n",
-        "      logits, loss = model(X, Y)\n",
-        "      losses[k] = loss.item()\n",
-        "    out[split] = losses.mean()\n",
-        "  model.train()\n",
-        "  return out\n",
-        "\n",
-        "vocab_size = token.vocab_size\n",
-        "model = Transformer(vocab_size)\n",
-        "# checkpoint_path = '/content/drive/MyDrive/enigma-2.5b.pth'\n",
-        "# checkpoint = torch.load(checkpoint_path)\n",
-        "# model.load_state_dict(checkpoint)\n",
-        "m = model.to(device)\n",
-        "\n",
-        "# number of parameters, in billions\n",
-        "n_param = sum(p.numel() for p in m.parameters())/1e9\n",
-        "print(f\"{n_param:.1f} billion parameters\")\n",
-        "\n",
-        "# optimizer\n",
-        "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
-        "steps = []\n",
-        "train_losses = []\n",
-        "val_losses = []\n",
-        "\n",
-        "for iter in range(max_iters):\n",
-        "\n",
-        "  if iter % eval_interval == 0 or iter == max_iters - 1:\n",
-        "    losses = estimate_loss()\n",
-        "    print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
-        "\n",
-        "    steps.append(iter)\n",
-        "    train_losses.append(losses['train'])\n",
-        "    val_losses.append(losses['val'])\n",
-        "\n",
-        "  xb, yb = get_batch('train')\n",
-        "  logits, loss = model(xb, yb)\n",
-        "  optimizer.zero_grad(set_to_none=True)\n",
-        "  loss.backward()\n",
-        "  optimizer.step()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 8,
-      "metadata": {
-        "id": "tzJMKoA35uIV",
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "outputId": "ba527bf5-695c-4a8f-acc4-bd60d549eaad"
-      },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "total parameters: 2.5 billion\n",
-            "trained in 1.82hrs\n"
-          ]
-        }
-      ],
-      "source": [
-        "end_time = timeit.default_timer()\n",
-        "print(f\"total parameters: {n_param:.1f} billion\")\n",
-        "print(f\"trained in {((end_time - start_time)/3600):.2f}hrs\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "model_save_name = f'enigma-{n_param:.1f}b_v1.pth'\n",
-        "path = f\"/content/drive/MyDrive/{model_save_name}\"\n",
-        "torch.save(model.state_dict(), path)"
-      ],
-      "metadata": {
-        "id": "eB47Yn9aNrrO"
-      },
-      "execution_count": 10,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# 8-bit quantization\n",
-        "\n",
-        "import torch\n",
-        "import torch.quantization\n",
-        "\n",
-        "checkpoint_path = '/content/drive/MyDrive/enigma-2.5b.pth'\n",
-        "checkpoint = torch.load(checkpoint_path)\n",
-        "model.load_state_dict(checkpoint)\n",
-        "model = model.to(device)\n",
-        "\n",
-        "quantized_model = torch.quantization.quantize_dynamic(\n",
-        "    model,\n",
-        "    dtype=torch.qint8\n",
-        ")\n",
-        "quantized_model_file = f'/content/drive/MyDrive/enigma-2.5b-quant.pth'\n",
-        "torch.save(quantized_model.state_dict(), quantized_model_file)\n",
-        "\n",
-        "print(\"Quantized model saved successfully.\")"
-      ],
-      "metadata": {
-        "id": "7iGQdNHgms_U"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# pruning\n",
-        "\n",
-        "import torch\n",
-        "from torch import nn\n",
-        "from torch.utils.model_zoo import load_url\n",
-        "import torch.nn.utils.prune as prune\n",
-        "\n",
-        "parameters_to_prune = [(model.encoder.self_attn, 'weight'), (model.encoder.linear1, 'weight')]\n",
-        "prune.global_unstructured(\n",
-        "  parameters_to_prune,\n",
-        "  pruning_method=prune.L1Unstructured,\n",
-        "  amount=0.15,\n",
-        ")\n",
-        "\n",
-        "torch.save(model.state_dict(), 'enigma-2.5b_pruned.pth')"
-      ],
-      "metadata": {
-        "id": "YTJ19n4OFvZj"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "K2FDOp7Quibq"
-      },
-      "outputs": [],
-      "source": [
-        "class Generator(Transformer):\n",
-        "  def __init__(self, vocab_size, block_size):\n",
-        "    super().__init__(vocab_size)\n",
-        "    self.vocab_size = vocab_size\n",
-        "    self.block_size = block_size\n",
-        "\n",
-        "  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):\n",
-        "    \"\"\"\n",
-        "      generate new tokens using the trained model\n",
-        "\n",
-        "    Args:\n",
-        "      - idx (Tensor): input tensor of initial token indices, shape (batch, time)\n",
-        "      - max_new_tokens (int): maximum number of new tokens to generate\n",
-        "      - temperature (float): value the logits are divided by before the softmax\n",
-        "      - top_k (int): number of highest-scoring tokens kept for sampling; 0 disables the filter\n",
-        "\n",
-        "    Returns:\n",
-        "      - generated_tokens (list): generated token indices as Python ints (single sequence only, since each sample is read with .item())\n",
-        "    \"\"\"\n",
-        "    generated_tokens = []\n",
-        "\n",
-        "    for _ in range(max_new_tokens):\n",
-        "      idx_cond = idx[:, -self.block_size:]\n",
-        "      logits, _ = self(idx_cond)\n",
-        "      logits = logits[:, -1, :]\n",
-        "\n",
-        "      scaled_logits = logits / temperature\n",
-        "      if top_k > 0:\n",
-        "        scaled_logits = self._top_k_filtering(scaled_logits, top_k)\n",
-        "\n",
-        "      probs = F.softmax(scaled_logits, dim=-1)\n",
-        "      sampled_idx = torch.multinomial(probs, num_samples=1)\n",
-        "      generated_tokens.append(sampled_idx.item())\n",
-        "      idx = torch.cat((idx, sampled_idx), dim=1)\n",
-        "\n",
-        "    return generated_tokens\n",
-        "\n",
-        "  def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0):\n",
-        "    \"\"\"\n",
-        "      Generate predictions for masked tokens using the trained model.\n",
-        "\n",
-        "      Args:\n",
-        "        - idx (Tensor): input tensor of token indices, shape (batch, time)\n",
-        "        - masked_indices (Tensor): index tensor selecting the masked positions\n",
-        "        - temperature (float): value the logits are divided by before the softmax\n",
-        "        - top_k (int): number of highest-scoring tokens kept; 0 disables the filter\n",
-        "\n",
-        "      Returns:\n",
-        "        - predicted_indices (Tensor): most probable (argmax) token index for each masked position; no sampling is performed\n",
-        "    \"\"\"\n",
-        "    B, T = idx.shape\n",
-        "\n",
-        "    toked_model = self.toked_model(idx)\n",
-        "    pos_encod = self.pos_encod(torch.arange(T, device=device))\n",
-        "    x = toked_model + pos_encod\n",
-        "\n",
-        "    for layer in self.enc_layer:\n",
-        "      x_out = layer(x)\n",
-        "\n",
-        "    for layer in self.dec_layer:\n",
-        "      x_final = layer(x, x_out)\n",
-        "\n",
-        "    x_masked = x_final.clone()\n",
-        "    x_masked[masked_indices] = self.toked_model(torch.tensor([6], device=device))\n",
-        "\n",
-        "    x_masked = self.norm_final(x_masked)\n",
-        "    logits = self.linear_final(x_masked)\n",
-        "\n",
-        "    masked_logits = logits[masked_indices].view(-1, logits.size(-1))\n",
-        "    scaled_logits = masked_logits / temperature\n",
-        "    if top_k > 0:\n",
-        "      scaled_logits = self._top_k_filtering(scaled_logits, top_k)\n",
-        "\n",
-        "    probs = F.softmax(scaled_logits, dim=-1)\n",
-        "    predicted_indices = torch.argmax(probs, dim=-1)\n",
-        "\n",
-        "    return predicted_indices\n",
-        "\n",
-        "  def _top_k_filtering(self, logits, top_k):\n",
-        "    \"\"\"\n",
-        "      filter logits to keep only the top-k tokens\n",
-        "\n",
-        "    Args:\n",
-        "      - logits (Tensor): 2-D tensor of logits with the vocabulary on the last axis (callers pass temperature-scaled logits)\n",
-        "      - top_k (int): number of top tokens to keep\n",
-        "\n",
-        "    Returns:\n",
-        "      - filtered_logits (Tensor): logits with every entry below the row's k-th largest value set to -inf\n",
-        "    \"\"\"\n",
-        "    values, indices = torch.topk(logits, top_k, dim=-1)\n",
-        "    min_value = values[:, -1].unsqueeze(-1).expand_as(logits)\n",
-        "    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)\n",
-        "\n",
-        "    return filtered_logits"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 429
-        },
-        "id": "c5CknylV4S2m",
-        "outputId": "12314d78-9147-4e60-f8b5-84207b97a1c7"
-      },
-      "outputs": [
-        {
-          "output_type": "error",
-          "ename": "RuntimeError",
-          "evalue": "Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
-            "\u001b[0;32m<ipython-input-17-db17ec37b06c>\u001b[0m in \u001b[0;36m<cell line: 5>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mtarget_text\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"AGTTCTGCGAT\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mcontext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlong\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mgenerated_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenerator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgenerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_new_tokens\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemperature\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.9\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtop_k\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{target_text}{generated_output}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-16-39da0e3e4598>\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m(self, idx, max_new_tokens, temperature, top_k)\u001b[0m\n\u001b[1;32m     22\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_new_tokens\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m       \u001b[0midx_cond\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblock_size\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m       \u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx_cond\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     25\u001b[0m       \u001b[0mlogits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-7-b2af72f89b89>\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, idx, targets)\u001b[0m\n\u001b[1;32m    253\u001b[0m     \u001b[0mB\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 255\u001b[0;31m     \u001b[0mtoked_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoked_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    256\u001b[0m     \u001b[0mpos_encod\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpos_encod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    257\u001b[0m     \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtoked_model\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpos_encod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1510\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1511\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1513\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1518\u001b[0m                 \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1519\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1521\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1522\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/sparse.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    162\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 163\u001b[0;31m         return F.embedding(\n\u001b[0m\u001b[1;32m    164\u001b[0m             \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpadding_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_norm\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    165\u001b[0m             self.norm_type, self.scale_grad_by_freq, self.sparse)\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m   2235\u001b[0m         \u001b[0;31m# remove once script supports set_grad_enabled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2236\u001b[0m         \u001b[0m_no_grad_embedding_renorm_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_norm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnorm_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2237\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscale_grad_by_freq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msparse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2238\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2239\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;31mRuntimeError\u001b[0m: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)"
-          ]
-        }
-      ],
-      "source": [
-        "generator = Generator(vocab_size, block_size)\n",
-        "\n",
-        "target_text = \"AGTTCTGCGAT\"\n",
-        "context = torch.tensor([token.encode(target_text)], dtype=torch.long, device=device)\n",
-        "generated_output = token.decode(generator.generate(context, max_new_tokens=100, temperature=0.9, top_k=5))\n",
-        "print(f\"{target_text}{generated_output}\")"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "gpuType": "T4",
-      "machine_shape": "hm",
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}

enigma/config_enigma.json DELETED Viewed

@@ -1,13 +0,0 @@
-{
-  "batch_size": 10,
-  "block_size": 512,
-  "max_iters": 5000,
-  "eval_interval": 50,
-  "learning_rate": 3e-5,
-  "eval_iters": 100,
-  "d_model": 384,
-  "n_head": 12,
-  "n_layer": 12,
-  "dropout": 0.2,
-  "norm_eps": 1e-5
-}

enigma/enigma.cpp DELETED Viewed

@@ -1,364 +0,0 @@
-#include <algorithm>
-#include <cmath>
-#include <iostream>
-#include <limits>
-#include <map>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-#include <torch/torch.h>
-// Define device
-torch::Device device(torch::kCUDA);
-// Define constants
-const int batch_size = 8;
-const int block_size = 32;
-const int max_iters = 1000;
-const int eval_interval = 50;
-const int eval_iters = 5;
-const int d_model = 256;
-const int n_layer = 16;
-const int n_head = 12;
-const float dropout = 0.2;
-const float norm_eps = 1e-5;
-const int vocab_size = 5;
-// sample data
-torch::Tensor train_data = torch::rand({1000, block_size});
-torch::Tensor val_data = torch::rand({500, block_size});
-// Data loading function.
-// Draws `batch_size` random windows of `block_size` tokens from the requested
-// split ("train" selects train_data, anything else val_data) and returns
-// {inputs, targets}, where each target row is its input row shifted one token
-// to the right. Both tensors are int64 and live on `device`.
-std::pair<torch::Tensor, torch::Tensor> get_batch(const std::string& split) {
-    // Treat the split as one contiguous token stream so a window is a plain
-    // 1-D slice regardless of how the data tensor is laid out.
-    torch::Tensor data = ((split == "train") ? train_data : val_data).flatten();
-    // Upper bound is exclusive, so start + block_size + 1 never exceeds data.size(0).
-    torch::Tensor ix = torch::randint(data.size(0) - block_size, {batch_size}, torch::kLong);
-    std::vector<torch::Tensor> xs, ys;
-    xs.reserve(batch_size);
-    ys.reserve(batch_size);
-    for (int i = 0; i < batch_size; ++i) {
-        const int64_t start = ix[i].item<int64_t>();
-        xs.push_back(data.slice(0, start, start + block_size));
-        ys.push_back(data.slice(0, start + 1, start + block_size + 1));
-    }
-    // Embedding lookups and cross-entropy class targets require integer ids.
-    torch::Tensor x = torch::stack(xs).to(torch::kLong);
-    torch::Tensor y = torch::stack(ys).to(torch::kLong);
-    return std::make_pair(x.to(device), y.to(device));
-}
-// Custom classes and functions
-// Parameter-free activation: gate * relu(x) + (1 - gate) * x, with gate = sigmoid(x).
-class SWiGLU : public torch::nn::Module {
-public:
-    SWiGLU() {}
-    torch::Tensor forward(torch::Tensor x) {
-        const torch::Tensor gate = torch::sigmoid(x);
-        // Blend the rectified input and the raw input using the sigmoid gate.
-        return gate * torch::relu(x) + (1 - gate) * x;
-    }
-};
-// Single head of bidirectional (unmasked) scaled dot-product self-attention.
-class UnMaskedHeadImpl : public torch::nn::Module {
-public:
-    // d_model: input feature width; head_size: width of this head's
-    // key/query/value projections; dropout: probability applied to the
-    // attention weights.
-    UnMaskedHeadImpl(int d_model, int head_size, float dropout)
-        : key(register_module("key", torch::nn::Linear(d_model, head_size))),
-          query(register_module("query", torch::nn::Linear(d_model, head_size))),
-          value(register_module("value", torch::nn::Linear(d_model, head_size))),
-          // Registered in the initializer list: inside the constructor body the
-          // name `dropout` refers to the float parameter, not this member.
-          dropout(register_module("dropout", torch::nn::Dropout(dropout))) {}
-    // x is indexed as (..., T, d_model); returns (..., T, head_size).
-    torch::Tensor forward(torch::Tensor x) {
-        torch::Tensor key_out = key->forward(x);
-        torch::Tensor query_out = query->forward(x);
-        // Scaled dot-product attention divides by sqrt(head_size); multiplying
-        // would blow the scores up and saturate the softmax.
-        const double scale = 1.0 / std::sqrt(static_cast<double>(key_out.size(-1)));
-        torch::Tensor weights = query_out.matmul(key_out.transpose(-2, -1)) * scale;
-        weights = torch::softmax(weights, -1);
-        weights = dropout(weights);
-        torch::Tensor value_out = value->forward(x);
-        torch::Tensor out = weights.matmul(value_out);
-        return out;
-    }
-private:
-    torch::nn::Linear key, query, value;
-    torch::nn::Dropout dropout;
-};
-TORCH_MODULE(UnMaskedHead);
-// Single head of causal (masked) scaled dot-product self-attention: each
-// position may attend only to itself and earlier positions.
-class MaskedHeadImpl : public torch::nn::Module {
-public:
-    // Note the argument order: head_size, dropout probability, d_model.
-    MaskedHeadImpl(int head_size, float dropout, int d_model)
-        : key(register_module("key", torch::nn::Linear(d_model, head_size))),
-          query(register_module("query", torch::nn::Linear(d_model, head_size))),
-          value(register_module("value", torch::nn::Linear(d_model, head_size))),
-          dropout(register_module("dropout", torch::nn::Dropout(dropout))),
-          // Keep a handle to the registered buffer so forward() can read it;
-          // the lower-triangular mask covers the global block_size context.
-          tril(register_buffer("tril", torch::tril(torch::ones({block_size, block_size})))) {}
-    // x is indexed as (B, T, d_model) with T <= block_size; returns (B, T, head_size).
-    torch::Tensor forward(torch::Tensor x) {
-        using torch::indexing::Slice;
-        const int64_t T = x.size(1);
-        torch::Tensor key_out = key->forward(x);
-        torch::Tensor query_out = query->forward(x);
-        // Divide (not multiply) by sqrt(head_size) to keep the scores well scaled.
-        const double scale = 1.0 / std::sqrt(static_cast<double>(key_out.size(-1)));
-        torch::Tensor weights = query_out.matmul(key_out.transpose(-2, -1)) * scale;
-        // Crop the mask to the current sequence length and block future
-        // positions; the diagonal is always kept, so no row is fully masked.
-        torch::Tensor causal = tril.index({Slice(0, T), Slice(0, T)});
-        weights = weights.masked_fill(causal == 0, -std::numeric_limits<float>::infinity());
-        weights = torch::softmax(weights, -1);
-        weights = dropout(weights);
-        torch::Tensor value_out = value->forward(x);
-        torch::Tensor out = weights.matmul(value_out);
-        return out;
-    }
-private:
-    torch::nn::Linear key, query, value;
-    torch::nn::Dropout dropout;
-    torch::Tensor tril;
-};
-TORCH_MODULE(MaskedHead);
-// Multi-head bidirectional self-attention: runs n_head UnMaskedHead modules
-// in parallel, concatenates their outputs and projects back to d_model.
-class MultiUnMaskedImpl : public torch::nn::Module {
-public:
-    MultiUnMaskedImpl(int d_model, int n_head, float dropout)
-        // The projection input is n_head * head_size, which is smaller than
-        // d_model whenever n_head does not divide d_model evenly.
-        : proj(register_module("proj", torch::nn::Linear(n_head * (d_model / n_head), d_model))),
-          // Registered so that train()/eval() reach this layer.
-          dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
-        const int head_size = d_model / n_head;
-        heads.reserve(n_head);
-        for (int i = 0; i < n_head; ++i) {
-            heads.push_back(register_module("head" + std::to_string(i), UnMaskedHead(d_model, head_size, dropout)));
-        }
-    }
-    torch::Tensor forward(torch::Tensor x) {
-        std::vector<torch::Tensor> head_outputs;
-        head_outputs.reserve(heads.size());
-        for (auto& head : heads) {
-            head_outputs.push_back(head->forward(x));
-        }
-        // Concatenate along the feature axis, then drop out and project.
-        torch::Tensor out = torch::cat(head_outputs, -1);
-        out = dropout(out);
-        out = proj(out);
-        return out;
-    }
-private:
-    torch::nn::Linear proj;
-    torch::nn::Dropout dropout;
-    std::vector<UnMaskedHead> heads;
-};
-TORCH_MODULE(MultiUnMasked);
-// Multi-head causal self-attention: runs n_head MaskedHead modules in
-// parallel, concatenates their outputs and projects back to d_model.
-class MultiMaskedImpl : public torch::nn::Module {
-public:
-    MultiMaskedImpl(int d_model, int n_head, float dropout)
-        : proj(register_module("proj", torch::nn::Linear(n_head * (d_model / n_head), d_model))),
-          // Registered so that train()/eval() reach this layer.
-          dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
-        const int head_size = d_model / n_head;
-        heads.reserve(n_head);
-        for (int i = 0; i < n_head; ++i) {
-            // MaskedHead takes (head_size, dropout, d_model), which differs
-            // from UnMaskedHead's (d_model, head_size, dropout) order.
-            heads.push_back(register_module("head" + std::to_string(i), MaskedHead(head_size, dropout, d_model)));
-        }
-    }
-    torch::Tensor forward(torch::Tensor x) {
-        std::vector<torch::Tensor> head_outputs;
-        head_outputs.reserve(heads.size());
-        for (auto& head : heads) {
-            head_outputs.push_back(head->forward(x));
-        }
-        // Concatenate along the feature axis, then drop out and project.
-        torch::Tensor out = torch::cat(head_outputs, -1);
-        out = dropout(out);
-        out = proj(out);
-        return out;
-    }
-private:
-    torch::nn::Linear proj;
-    torch::nn::Dropout dropout;
-    std::vector<MaskedHead> heads;
-};
-TORCH_MODULE(MultiMasked);
-// Position-wise MLP: Linear(d_model -> 4*d_model), GELU,
-// Linear(4*d_model -> d_model), Dropout.
-class FeedForwardImpl : public torch::nn::Module {
-public:
-    FeedForwardImpl(int d_model, float dropout) {
-        const int hidden = 4 * d_model;
-        net = register_module("net", torch::nn::Sequential(
-            torch::nn::Linear(d_model, hidden),
-            torch::nn::GELU(),
-            torch::nn::Linear(hidden, d_model),
-            torch::nn::Dropout(dropout)));
-    }
-    // Applies the registered sequential stack to x.
-    torch::Tensor forward(torch::Tensor x) {
-        torch::Tensor y = net->forward(x);
-        return y;
-    }
-private:
-    torch::nn::Sequential net;
-};
-TORCH_MODULE(FeedForward);
-// One model layer: an unmasked attention sub-layer followed by a masked
-// attention sub-layer, each followed by the feed-forward network, with
-// pre-layer-norm and residual connections throughout.
-class BlockImpl : public torch::nn::Module {
-public:
-    // Every submodule goes through register_module so its parameters appear in
-    // parameters(), follow to(device), and receive train()/eval() switches.
-    BlockImpl(int d_model, int n_head, float norm_eps, float dropout)
-        : sa_masked(register_module("sa_masked", MultiMasked(d_model, n_head, dropout))),
-          sa_unmasked(register_module("sa_unmasked", MultiUnMasked(d_model, n_head, dropout))),
-          ffwd(register_module("ffwd", FeedForward(d_model, dropout))),
-          norm1(register_module("norm1", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))),
-          norm2(register_module("norm2", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))) {}
-    torch::Tensor forward(torch::Tensor x) {
-        // Bidirectional attention + feed-forward.
-        torch::Tensor x2 = x + sa_unmasked->forward(norm1->forward(x));
-        x = x2 + ffwd->forward(norm2->forward(x2));
-        // Causal attention + feed-forward. norm1, norm2 and ffwd are the same
-        // module instances as above, i.e. their weights are shared.
-        // NOTE(review): confirm the sharing is intentional.
-        x2 = x + sa_masked->forward(norm1->forward(x));
-        x = x2 + ffwd->forward(norm2->forward(x2));
-        return x;
-    }
-private:
-    MultiMasked sa_masked;
-    MultiUnMasked sa_unmasked;
-    FeedForward ffwd;
-    torch::nn::LayerNorm norm1, norm2;
-};
-TORCH_MODULE(Block);
-// Full model: token + learned positional embeddings, a stack of Block layers,
-// a final LayerNorm and a linear projection to vocabulary logits.
-class EnigmaImpl : public torch::nn::Module {
-public:
-    EnigmaImpl(int vocab_size, int block_size, int d_model, int n_layer, int n_head, float dropout, float norm_eps)
-        : toked_model(register_module("toked_model", torch::nn::Embedding(vocab_size, d_model))),
-          pos_encod(register_module("pos_encod", torch::nn::Embedding(block_size, d_model))),
-          norm_final(register_module("norm_final", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))),
-          linear_final(register_module("linear_final", torch::nn::Linear(d_model, vocab_size))),
-          block_size(block_size) {
-        block_layers.reserve(n_layer);
-        for (int i = 0; i < n_layer; ++i) {
-            block_layers.push_back(register_module("block" + std::to_string(i), Block(d_model, n_head, norm_eps, dropout)));
-        }
-        // Kept so the set of registered buffers is unchanged.
-        register_buffer("block_size", torch::tensor(block_size));
-        _init_weights(this);
-    }
-    // Initialise Linear and Embedding weights from N(0, 0.02) and zero the
-    // Linear biases. LayerNorm parameters are deliberately left at their
-    // defaults (weight 1, bias 0); drawing them from N(0, 0.02) would scale
-    // every normalised activation to almost zero.
-    void _init_weights(torch::nn::Module* module) {
-        for (auto& child : module->modules(/*include_self=*/false)) {
-            if (auto* linear = child->as<torch::nn::Linear>()) {
-                torch::nn::init::normal_(linear->weight, 0.0, 0.02);
-                if (linear->bias.defined()) {
-                    torch::nn::init::zeros_(linear->bias);
-                }
-            } else if (auto* embedding = child->as<torch::nn::Embedding>()) {
-                torch::nn::init::normal_(embedding->weight, 0.0, 0.02);
-            }
-        }
-    }
-    // idx: (B, T) integer token ids with T <= block_size.
-    // targets: optional (B, T) integer ids.
-    // Returns {logits, loss}. Without targets, logits is (B, T, vocab) and
-    // loss is an undefined tensor; with targets, logits is flattened to
-    // (B*T, vocab) and loss is the mean cross-entropy.
-    std::pair<torch::Tensor, torch::Tensor> forward(torch::Tensor idx, torch::Tensor targets=torch::Tensor()) {
-        const int64_t T = idx.size(1);
-        // Build the position indices on the same device as the input;
-        // a CPU arange fed to a CUDA embedding raises a device-mismatch error.
-        torch::Tensor positions = torch::arange(
-            T, torch::TensorOptions().dtype(torch::kLong).device(idx.device()));
-        torch::Tensor x = toked_model->forward(idx) + pos_encod->forward(positions);
-        for (auto& block : block_layers) {
-            x = block->forward(x);
-        }
-        torch::Tensor logits = linear_final->forward(norm_final->forward(x));
-        if (!targets.defined() || targets.numel() == 0) {
-            return {logits, torch::Tensor()};
-        }
-        logits = logits.view({-1, logits.size(-1)});
-        targets = targets.reshape({-1});
-        torch::Tensor loss = torch::nn::functional::cross_entropy(logits, targets);
-        return {logits, loss};
-    }
-    // Stochastic beam search. Starting from idx (B, T0), extends every beam by
-    // beam_width sampled tokens per step, keeps the beam_width candidates with
-    // the highest accumulated log-probability, and returns the final beam as
-    // the single element of the outer vector. Each candidate is
-    // {token ids (B, T0 + max_new_tokens), score}.
-    // NOTE(review): scores are summed over the batch dimension, so the ranking
-    // is only meaningful for B == 1.
-    std::vector<std::vector<std::pair<torch::Tensor, float>>> complex_generate(torch::Tensor idx, int max_new_tokens, float temperature=1.0, int top_k=3, int beam_width=5) {
-        using Candidate = std::pair<torch::Tensor, float>;
-        using torch::indexing::Slice;
-        torch::NoGradGuard no_grad;  // inference only
-        std::vector<std::vector<Candidate>> completed_beams;
-        std::vector<Candidate> beam = {Candidate(idx.clone(), 0.0f)};
-        for (int step = 0; step < max_new_tokens; ++step) {
-            std::vector<Candidate> new_beam;
-            for (const auto& beam_item : beam) {
-                const torch::Tensor& seq = beam_item.first;
-                // The positional embedding has only block_size rows, so feed
-                // at most the last block_size tokens.
-                const int64_t T = seq.size(1);
-                torch::Tensor context = seq.slice(1, std::max<int64_t>(0, T - block_size), T);
-                torch::Tensor logits = forward(context).first;
-                logits = logits.index({Slice(), -1}) / temperature;  // (B, vocab) for the last position
-                // Filter in logit space, before the softmax, so the masked
-                // entries become probability 0.
-                if (top_k > 0) {
-                    logits = top_k_filtering(logits, top_k);
-                }
-                torch::Tensor probs = torch::softmax(logits, -1);
-                // Sampling with replacement: top-k may leave fewer than
-                // beam_width tokens with non-zero probability.
-                torch::Tensor sampled_idx = torch::multinomial(probs, beam_width, true);  // (B, beam_width)
-                for (int j = 0; j < beam_width; ++j) {
-                    torch::Tensor next_token = sampled_idx.slice(1, j, j + 1);  // (B, 1), ready to cat on dim 1
-                    torch::Tensor new_idx = torch::cat({seq, next_token}, 1);
-                    const float log_prob = torch::log(probs.gather(1, next_token)).sum().item<float>();
-                    new_beam.emplace_back(new_idx, beam_item.second + log_prob);
-                }
-            }
-            // Sort new beam by log probabilities, best first
-            std::sort(new_beam.begin(), new_beam.end(), [](const Candidate& a, const Candidate& b) {
-                return a.second > b.second;
-            });
-            // Only keep top beams
-            const size_t keep = std::min(new_beam.size(), static_cast<size_t>(std::max(beam_width, 0)));
-            new_beam.erase(new_beam.begin() + keep, new_beam.end());
-            beam = std::move(new_beam);
-        }
-        completed_beams.push_back(beam);
-        return completed_beams;
-    }
-    // Returns a copy of `logits` in which every entry smaller than the k-th
-    // largest value along the last dimension is replaced by -infinity.
-    // top_k is clamped to the size of the last dimension.
-    torch::Tensor top_k_filtering(torch::Tensor logits, int top_k) {
-        const int64_t k = std::min<int64_t>(top_k, logits.size(-1));
-        torch::Tensor top_values = std::get<0>(torch::topk(logits, k, -1));
-        // k-th largest value per row, kept as a trailing size-1 dim for broadcasting.
-        torch::Tensor min_value = top_values.select(-1, k - 1).unsqueeze(-1);
-        return torch::where(logits < min_value,
-                            torch::full_like(logits, -std::numeric_limits<float>::infinity()),
-                            logits);
-    }
-private:
-    torch::nn::Embedding toked_model, pos_encod;
-    std::vector<Block> block_layers;
-    torch::nn::LayerNorm norm_final;
-    torch::nn::Linear linear_final;
-    int block_size;
-};
-TORCH_MODULE(Enigma);
-// Mean loss over eval_iters random batches for the "train" and "val" splits,
-// computed with dropout disabled and without building autograd graphs. The
-// model is switched back to training mode before returning.
-static std::map<std::string, float> estimate_loss(Enigma& model) {
-    std::map<std::string, float> out;
-    torch::NoGradGuard no_grad;
-    model->eval();
-    for (const char* split : {"train", "val"}) {
-        double total = 0.0;
-        for (int k = 0; k < eval_iters; ++k) {
-            auto batch = get_batch(split);
-            total += model->forward(batch.first, batch.second).second.item<double>();
-        }
-        out[split] = static_cast<float>(total / eval_iters);
-    }
-    model->train();
-    return out;
-}
-// Builds the model, then trains it for max_iters steps with AdamW, printing
-// the estimated train/val loss every eval_interval steps and on the last step.
-int main() {
-    // Set seed
-    torch::manual_seed(1400);
-    // Create model
-    Enigma model(vocab_size, block_size, d_model, n_layer, n_head, dropout, norm_eps);
-    model->to(device);
-    // Define optimizer. 3e-4 mirrors the learning rate of the Python EnBERT script.
-    const double lr = 3e-4;
-    torch::optim::AdamW optimizer(model->parameters(), torch::optim::AdamWOptions(lr));
-    // Training loop
-    std::vector<float> train_losses, val_losses;
-    for (int iter = 0; iter < max_iters; ++iter) {
-        if (iter % eval_interval == 0 || iter == max_iters - 1) {
-            // Evaluate and print losses
-            auto losses = estimate_loss(model);
-            std::cout << "step " << iter << ": train loss " << losses["train"] << ", val loss " << losses["val"] << std::endl;
-            // Save losses for plotting
-            train_losses.push_back(losses["train"]);
-            val_losses.push_back(losses["val"]);
-        }
-        // Get batch, forward pass, loss calculation, backward pass, optimizer step
-        auto [xb, yb] = get_batch("train");
-        torch::Tensor logits, loss;
-        std::tie(logits, loss) = model->forward(xb, yb);
-        optimizer.zero_grad();
-        loss.backward();
-        optimizer.step();
-    }
-    return 0;
-}

enigma/generate.py DELETED Viewed

@@ -1,126 +0,0 @@
-import os
-# Run relative to this file's directory so the relative paths below resolve
-# no matter where the script is launched from.
-current_directory = os.path.dirname(os.path.abspath(__file__))
-os.chdir(current_directory)
-# Read the whole text file into memory; it is only used to report its size.
-with open('../parquet files/new_dna.txt', 'r', encoding='utf-8') as file:
-  captions = file.read()
-print(f"{(len(captions)/1e6):.2f} million letters")
-from tokenizer import PerCharTokenizer
-# The tokenizer supplies the vocabulary size the model is built with.
-tokenizer = PerCharTokenizer()
-vocab_size = tokenizer.vocab_size
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-# Prefer the GPU when one is available.
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-from model import Transformer
-# Model instance that later receives the checkpoint weights.
-model = Transformer(vocab_size=vocab_size)
-class Generator(Transformer):
-  def __init__(self, vocab_size):
-    super().__init__()
-    self.vocab_size = vocab_size
-    self.block_size = Transformer.block_size
-  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):
-    """
-      generate new tokens using the trained model
-    Args:
-      - idx (Tensor): input tensor representing initial token indices
-      - max_new_tokens (int): max no of new tokens to generate
-      - temperature (float): softmax temperature for sampling
-      - top_k (int): no of top tokens to consider in sampling
-    Returns:
-      - generated_tokens (list): list of generated token indices
-    """
-    generated_tokens = []
-    for _ in range(max_new_tokens):
-      idx_cond = idx[:, -self.block_size:]
-      logits, _ = self(idx_cond)
-      logits = logits[:, -1, :]
-      scaled_logits = logits / temperature
-      if top_k > 0:
-        scaled_logits = self._top_k_filtering(scaled_logits, top_k)
-      probs = F.softmax(scaled_logits, dim=-1)
-      sampled_idx = torch.multinomial(probs, num_samples=1)
-      generated_tokens.append(sampled_idx.item())
-      idx = torch.cat((idx, sampled_idx), dim=1)
-    return generated_tokens
-  def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0):
-    """
-      Generate predictions for masked tokens using the trained model.
-      Args:
-        - idx (Tensor): input tensor representing token indices
-        - masked_indices (Tensor): tensor of indices indicating masked positions
-        - temperature (float): softmax temperature for sampling
-        - top_k (int): no of top tokens to consider in sampling
-      Returns:
-        - predicted_tokens (Tensor): tensor of predicted token indices
-    """
-    B, T = idx.shape
-    toked_model = self.toked_model(idx)
-    pos_encod = self.pos_encod(torch.arange(T, device=device))
-    x = toked_model + pos_encod
-    for layer in self.enc_layer:
-      x_out = layer(x)
-    for layer in self.dec_layer:
-      x_final = layer(x, x_out)
-    x_masked = x_final.clone()
-    x_masked[masked_indices] = self.toked_model(torch.tensor([6], device=device))
-    x_masked = self.norm_final(x_masked)
-    logits = self.linear_final(x_masked)
-    masked_logits = logits[masked_indices].view(-1, logits.size(-1))
-    scaled_logits = masked_logits / temperature
-    if top_k > 0:
-      scaled_logits = self._top_k_filtering(scaled_logits, top_k)
-    probs = F.softmax(scaled_logits, dim=-1)
-    predicted_indices = torch.argmax(probs, dim=-1)
-    return predicted_indices
-  def _top_k_filtering(self, logits, top_k):
-    """
-      filter logits to keep only the top-k tokens
-    Args:
-      - logits (Tensor): input tensor representing unscaled logits
-      - top_k (int): no of top tokens to keep
-    Returns:
-      - filtered_logits (Tensor): filtered logits with only top-k tokens remaining
-    """
-    values, indices = torch.topk(logits, top_k, dim=-1)
-    min_value = values[:, -1].unsqueeze(-1).expand_as(logits)
-    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)
-    return filtered_logits
-checkpoint_path = '../trained models/enigma_47m.pth'
-checkpoint = torch.load(checkpoint_path)
-model.load_state_dict(checkpoint)
-m = model.to(device)
-target_text = "AGTTCTGCGAT"
-context = torch.tensor([tokenizer.encode(target_text)], dtype=torch.long, device=device)
-generated_output = tokenizer.decode(Generator.generate(context, max_new_tokens=10, temperature=0.5, top_k=5))
-print(f"{target_text}{generated_output}")

enigma/model.py DELETED Viewed

@@ -1,388 +0,0 @@
-"""
-  transformer based model, but with few minimal tweaks
-  a 2.5 billion parameter model was trained with the current configuration
-"""
-import torch
-import json
-import os
-current_directory = os.path.dirname(os.path.abspath(__file__))
-os.chdir(current_directory)
-import torch.nn as nn
-from torch.nn import functional as F
-# hyper-parameters are read once, at import time, from the JSON config; the
-# relative path works because the working directory was switched to this
-# file's directory above
-with open('config_enigma.json', 'r', encoding='utf-8') as file:
-  params = json.load(file)
-(batch_size, block_size, n_head, d_model, n_layers, dropout, norm_eps) = (
-  params[name] for name in
-  ('batch_size', 'block_size', 'n_head', 'd_model', 'n_layer', 'dropout', 'norm_eps')
-)
-# use the GPU whenever torch reports one as available
-if torch.cuda.is_available():
-  device = 'cuda'
-else:
-  device = 'cpu'
-class AttentionHead(nn.Module):
-  """
-    a single head of scaled dot-product self attention, with a learned
-    relative-position term added to the attention scores.
-    Args:
-    - d_model (int): dimensionality of the model's hidden layers
-    - head_size (int): dimensionality of each attention head
-    - dropout (float): dropout probability applied to the attention weights
-    - block_size (int): the maximum sequence length; sizes the causal mask and
-      the relative-position table
-  """
-  def __init__(self, d_model, head_size, dropout, block_size):
-    super().__init__()
-    self.key = nn.Linear(d_model, head_size, bias=True)
-    self.query = nn.Linear(d_model, head_size, bias=True)
-    self.value = nn.Linear(d_model, head_size, bias=False)
-    self.dropout = nn.Dropout(dropout)
-    # lower-triangular ones: position t may only attend to positions <= t
-    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-    # one learned head_size vector per (query position, key position) pair
-    self.rel_pos_emb = nn.Parameter(torch.randn(block_size, block_size, head_size))
-  def forward(self, x, mask=False):
-    """
-    forward pass of a single attention head.
-    Args:
-      - x (Tensor): input tensor of shape (B, T, d_model), with T <= block_size
-      - mask (bool): when True, apply the causal (lower-triangular) mask
-    Returns:
-      - out (Tensor): output tensor of shape (B, T, head_size)
-    """
-    B, T, C = x.shape
-    key = self.key(x)
-    query = self.query(x)
-    # scale the dot products by 1/sqrt(head_size); the earlier code divided by
-    # head_size ** -0.5, which multiplied by sqrt(head_size) instead
-    scores = torch.matmul(query, key.transpose(-2, -1)) * key.shape[-1] ** -0.5
-    # relative-position term: dot product of each query with the vector stored
-    # for its (query position, key position) pair
-    rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_emb[:T, :T])
-    scores = scores + rel_pos_scores
-    if mask:
-      # -inf scores become zero weights after the softmax
-      scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
-    weights = F.softmax(scores, dim=-1)
-    weights = self.dropout(weights)
-    value = self.value(x)
-    out = torch.matmul(weights, value)
-    return out
-class MultiHeadAttention(nn.Module):
-  """
-    several AttentionHead modules applied to the same input; their outputs are
-    concatenated along the last dimension and projected back to d_model.
-    Args:
-    - d_model (int): dimensionality of the model's hidden layers
-    - n_head (int): no of attention heads
-    - dropout (float): dropout probability
-    - block_size (int): context length
-  """
-  def __init__(self, d_model, n_head, dropout, block_size):
-    per_head = d_model // n_head
-    super().__init__()
-    head_list = []
-    for _ in range(n_head):
-      head_list.append(AttentionHead(d_model=d_model, dropout=dropout, head_size=per_head, block_size=block_size))
-    self.heads = nn.ModuleList(head_list)
-    # the concatenated width is n_head * per_head, which can be smaller than
-    # d_model when d_model is not divisible by n_head
-    self.proj = nn.Linear(n_head * per_head, d_model)
-    self.dropout = nn.Dropout(dropout)
-  def forward(self, x, mask):
-    """
-    run every head on x and merge the results
-    Args:
-      - x (Tensor): input tensor
-      - mask (bool): forwarded to every head; True requests causal masking
-    Returns:
-      - out (Tensor): projected, dropout-regularised concatenation of all heads
-    """
-    head_outputs = [head(x, mask=mask) for head in self.heads]
-    merged = torch.cat(head_outputs, dim=-1)
-    projected = self.proj(merged)
-    return self.dropout(projected)
-class FeedForward(nn.Module):
-  """
-    position-wise feedforward network: Linear -> GELU -> Linear -> Dropout
-    Args:
-    - d_model (int): the dimensionality of the model's hidden layers
-    - dropout (float): dropout probability
-    - hidden_mult (int): width of the inner layer as a multiple of d_model;
-      defaults to 10, the value that used to be hard-coded
-  """
-  def __init__(self, d_model, dropout, hidden_mult=10):
-    super().__init__()
-    hidden = hidden_mult * d_model
-    self.net = nn.Sequential(
-      nn.Linear(d_model, hidden),
-      nn.GELU(),
-      nn.Linear(hidden, d_model),
-      nn.Dropout(dropout)
-    )
-  def forward(self, x):
-    """
-    forward pass of the feedforward network module
-    Args:
-      - x (Tensor): input tensor whose last dimension is d_model
-    Returns:
-      - out (Tensor): output tensor with the same shape as x
-    """
-    return self.net(x)
-class EncoderNetwork(nn.Module):
-  """
-    one encoder layer: unmasked multi-head self attention followed by a
-    feedforward network, each wrapped in a residual connection and followed by
-    layer normalization
-    Args:
-    - d_model (int): dimensionality of the model's hidden layers
-    - n_head (int): no of attention heads in multi-head attention layers
-    - norm_eps (float): epsilon value for layer normalization
-    - dropout (float): dropout probability
-    - block_size (int): the maximum sequence length for positional encoding
-  """
-  def __init__(self, d_model, n_head, norm_eps, dropout, block_size):
-    super().__init__()
-    self.s_att = MultiHeadAttention(d_model=d_model, n_head=n_head, dropout=dropout, block_size=block_size)
-    self.ffwd = FeedForward(d_model, dropout)
-    self.dropout = nn.Dropout(dropout)
-    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)
-    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)
-  def forward(self, src):
-    """
-      forward pass of the encoder network module.
-      Args:
-      - src (Tensor): input tensor representing source data
-      Returns:
-      - src (Tensor): output tensor after passing through the encoder network
-    """
-    # attention sub-layer: residual add, then normalize
-    attended = self.s_att(src, mask=False)
-    src = self.norm1(src + self.dropout(attended))
-    # feedforward sub-layer: residual add, then normalize
-    transformed = self.ffwd(src)
-    return self.norm2(src + self.dropout(transformed))
-class DecoderNetwork(nn.Module):
-  """
-    one decoder layer: causally masked self attention, a second unmasked
-    attention pass over the sum of the decoder state and the encoder output,
-    and a feedforward network.
-    The same attention module (s_att) and the same norm1 are used for both
-    attention passes.
-    Args:
-      - d_model (int): dimensionality of the model's hidden layers
-      - n_head (int): no of attention heads in multi-head attention layers
-      - norm_eps (float): epsilon value for layer normalization
-      - dropout (float): dropout probability
-      - block_size (int): the maximum sequence length for positional encoding
-  """
-  def __init__(self, d_model, n_head, norm_eps, dropout, block_size):
-    super().__init__()
-    self.s_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
-    self.ffwd = FeedForward(d_model, dropout)
-    self.dropout = nn.Dropout(dropout)
-    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)
-    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)
-  def forward(self, src, att):
-    """
-      forward pass of the decoder network module.
-      Args:
-        - src (Tensor): decoder input tensor
-        - att (Tensor): encoder output, added to the decoder state before the
-          second attention pass; must be broadcastable with src
-      Returns:
-        - src_f (Tensor): final output tensor
-    """
-    # masked self attention with a residual connection
-    src2 = self.s_att(src, mask=True)
-    src = src + self.dropout(src2)
-    src = src + self.norm1(src)
-    # mix in the encoder output, then attend again without the causal mask
-    att = src + att
-    att2 = self.s_att(att, mask=False)
-    att2 = att + self.dropout(att2)
-    trg = att2 + self.norm1(att2)
-    # feedforward sub-layer; the residual base is trg (the earlier code read
-    # src_f before it was assigned, which raised UnboundLocalError on every call)
-    src_f2 = self.ffwd(self.norm2(trg))
-    src_f = trg + self.dropout(src_f2)
-    src_f = self.norm2(src_f)
-    return src_f
-class Transformer(nn.Module):
-  """
-    encoder-decoder Transformer over token indices.
-    Only vocab_size is passed to the constructor; d_model, block_size, n_layers,
-    n_head, norm_eps and dropout are the module-level values loaded from
-    config_enigma.json.
-    Args:
-      - vocab_size (int): size of the vocabulary
-  """
-  def __init__(self, vocab_size):
-    super().__init__()
-    self.block_size = block_size
-    self.toked_model = nn.Embedding(vocab_size, d_model)
-    self.pos_encod = nn.Embedding(block_size, d_model)
-    self.enc_layer = nn.ModuleList([EncoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])
-    self.dec_layer = nn.ModuleList([DecoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])
-    self.norm_final = nn.LayerNorm(d_model)
-    self.linear_final = nn.Linear(d_model, vocab_size)
-    self.dropout = nn.Dropout(dropout)
-    self.apply(self._init_weights)
-  def _init_weights(self, module):
-    """
-      initialize weights of linear and embedding layers
-      Args:
-        - module (nn.Module): the module to initialize weights for
-    """
-    if isinstance(module, nn.Linear):
-      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-      if module.bias is not None:
-        torch.nn.init.zeros_(module.bias.data)
-    elif isinstance(module, nn.Embedding):
-      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-  def _run_stack(self, idx):
-    """
-      embed the tokens and run them through the encoder and decoder stacks
-    Args:
-      - idx (Tensor): token indices of shape (B, T), with T <= block_size
-    Returns:
-      - x_final (Tensor): decoder output before the final norm and linear layer
-    """
-    B, T = idx.shape
-    # build the positions on the input's own device rather than the module-level default
-    positions = torch.arange(T, device=idx.device)
-    x = self.toked_model(idx) + self.pos_encod(positions)
-    # chain the layers: each one consumes the previous layer's output (the
-    # earlier loops fed the raw embeddings to every layer and kept only the
-    # last result, so all but one layer per stack had no effect)
-    x_out = x
-    for layer in self.enc_layer:
-      x_out = layer(x_out)
-    x_final = x
-    for layer in self.dec_layer:
-      x_final = layer(x_final, x_out)
-    return x_final
-  def forward(self, idx, targets=None):
-    """
-      forward pass of the transformer model
-    Args:
-      - idx (Tensor): input tensor representing token indices
-      - targets (Tensor): target tensor for computing loss during training
-    Returns:
-      - logits (Tensor): output logits from the final linear layer; flattened to
-        (B*T, C) when targets are provided, (B, T, C) otherwise
-      - loss (Tensor): optional. computed cross-entropy loss if targets are provided, else None
-    """
-    x_final = self.norm_final(self._run_stack(idx))
-    logits = self.linear_final(x_final)
-    if targets is None:
-      loss = None
-    else:
-      B, T, C = logits.shape
-      logits = logits.view(B*T, C)
-      targets = targets.view(B*T)
-      loss = F.cross_entropy(logits, targets)
-    return logits, loss
-  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):
-    """
-      generate new tokens using the trained model
-    Args:
-      - idx (Tensor): input tensor representing initial token indices; a single
-        sequence (batch of one), because each sampled token is read with .item()
-      - max_new_tokens (int): max no of new tokens to generate
-      - temperature (float): softmax temperature for sampling
-      - top_k (int): no of top tokens to consider in sampling; 0 disables filtering
-    Returns:
-      - generated_tokens (list): list of generated token indices
-    """
-    generated_tokens = []
-    # sampling needs no gradients; skip building the autograd graph
-    with torch.no_grad():
-      for _ in range(max_new_tokens):
-        # the position table only covers block_size positions
-        idx_cond = idx[:, -self.block_size:]
-        logits, _ = self(idx_cond)
-        logits = logits[:, -1, :]
-        scaled_logits = logits / temperature
-        if top_k > 0:
-          scaled_logits = self._top_k_filtering(scaled_logits, top_k)
-        probs = F.softmax(scaled_logits, dim=-1)
-        sampled_idx = torch.multinomial(probs, num_samples=1)
-        generated_tokens.append(sampled_idx.item())
-        idx = torch.cat((idx, sampled_idx), dim=1)
-    return generated_tokens
-  def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0, mask_token_id=6):
-    """
-      Generate predictions for masked tokens using the trained model.
-      Args:
-        - idx (Tensor): input tensor representing token indices
-        - masked_indices (Tensor): tensor of indices indicating masked positions
-        - temperature (float): softmax temperature applied to the logits
-        - top_k (int): no of top tokens to keep before the softmax; 0 disables filtering
-        - mask_token_id (int): token index whose embedding stands in for masked
-          positions; defaults to 6, the value that used to be hard-coded
-      Returns:
-        - predicted_tokens (Tensor): tensor of predicted token indices
-    """
-    x_final = self._run_stack(idx)
-    x_masked = x_final.clone()
-    # NOTE(review): the decoder output at the masked positions is overwritten
-    # with the mask-token embedding before the final norm/linear, so every
-    # masked position receives the same logits - confirm this is intended
-    mask_embedding = self.toked_model(torch.tensor([mask_token_id], device=idx.device))
-    x_masked[masked_indices] = mask_embedding
-    x_masked = self.norm_final(x_masked)
-    logits = self.linear_final(x_masked)
-    masked_logits = logits[masked_indices].view(-1, logits.size(-1))
-    scaled_logits = masked_logits / temperature
-    if top_k > 0:
-      scaled_logits = self._top_k_filtering(scaled_logits, top_k)
-    probs = F.softmax(scaled_logits, dim=-1)
-    # greedy choice: the most probable token at each masked position
-    predicted_indices = torch.argmax(probs, dim=-1)
-    return predicted_indices
-  def _top_k_filtering(self, logits, top_k):
-    """
-      filter logits to keep only the top-k tokens
-    Args:
-      - logits (Tensor): logits whose last dimension indexes the vocabulary
-      - top_k (int): no of top tokens to keep; values larger than the size of the
-        last dimension are clamped to it
-    Returns:
-      - filtered_logits (Tensor): same shape as logits, with every entry smaller than
-        the k-th largest value of its row replaced by -inf
-    """
-    # torch.topk raises when k exceeds the size of the selected dimension
-    top_k = min(top_k, logits.size(-1))
-    values, _ = torch.topk(logits, top_k, dim=-1)
-    # k-th largest value per row is the cut-off; entries equal to it are kept
-    min_value = values[..., -1:]
-    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)
-    return filtered_logits

enigma/run.py DELETED Viewed

@@ -1,100 +0,0 @@
-"""
-  use this file to train the model
-  working:
-    - imports various dependencies first, and then loads the training data
-    - tokenizes it, per-character basis
-    - loads the required hyper-parameters and the model file
-    - trains it till 'max_iters' and saves the model state, and generates outputs
-  with the current set configuration, model can reach upto ~60million parameters
-  and can become ~99% accurate with next token prediction
-"""
-import torch
-import json
-import os
-current_directory = os.path.dirname(os.path.abspath(__file__))
-os.chdir(current_directory)
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-# the whole training corpus is read into memory as one string
-with open('../parquet files/new_dna.txt', 'r', encoding='utf-8') as file:
-  captions = file.read()
-print(f"{(len(captions)/1e6):.2f} million letters")
-# the relative import only resolves when this file is loaded as part of a
-# package; when it is run directly as a script, fall back to importing the
-# tokenizer from the parent directory
-try:
-  from ..tokenizer import PerCharTokenizer
-except ImportError:
-  import sys
-  sys.path.insert(0, os.path.dirname(current_directory))
-  from tokenizer import PerCharTokenizer
-tokenizer = PerCharTokenizer()
-vocab_size = tokenizer.vocab_size
-# Train and test splits
-data = torch.tensor(tokenizer.encode(captions), dtype=torch.long)
-n = int(0.9*len(data)) # first 90% will be train, rest val
-train_data = data[:n]
-val_data = data[n:]
-# the config sits next to this script (the working directory was switched to
-# the script's directory above); the earlier '/config_enigma.json' pointed at
-# the filesystem root
-with open('config_enigma.json', 'r', encoding='utf-8') as file:
-  params = json.load(file)
-# required parameters
-batch_size = params['batch_size']
-block_size = params['block_size']
-max_iters = params['max_iters']
-eval_interval = params['eval_interval']
-eval_iters = params['eval_iters']
-learning_rate = params['learning_rate']
-# fixed seed so batch sampling and weight initialization are repeatable
-torch.manual_seed(1400)
-# data loading
-def get_batch(split):
-    """
-    sample a random batch of training windows
-    Args:
-      - split (str): 'train' selects train_data, anything else selects val_data
-    Returns:
-      - x (Tensor): batch_size windows of block_size tokens, on `device`
-      - y (Tensor): the same windows shifted one token to the right, on `device`
-    """
-    source = train_data if split == 'train' else val_data
-    # random start offsets; the upper bound leaves room for the shifted target window
-    starts = torch.randint(len(source) - block_size, (batch_size,))
-    inputs, targets = [], []
-    for start in starts:
-        inputs.append(source[start:start+block_size])
-        targets.append(source[start+1:start+block_size+1])
-    return torch.stack(inputs).to(device), torch.stack(targets).to(device)
-@torch.no_grad()
-def estimate_loss():
-    """
-    average the loss over eval_iters random batches of each split
-    Returns:
-      - out (dict): maps 'train' and 'val' to the mean loss (a 0-dim Tensor)
-    """
-    # evaluation mode for the measurement; training mode is restored below
-    model.eval()
-    averages = {}
-    for split in ('train', 'val'):
-        per_batch = torch.zeros(eval_iters)
-        for k in range(eval_iters):
-            xb, yb = get_batch(split)
-            _, batch_loss = model(xb, yb)
-            per_batch[k] = batch_loss.item()
-        averages[split] = per_batch.mean()
-    model.train()
-    return averages
-# build the model for the tokenizer's vocabulary and move it to the selected device
-from model import Transformer
-model = Transformer(vocab_size=vocab_size)
-m = model.to(device)
-# no of parameters
-n_param = sum(p.numel() for p in m.parameters())/1e6
-print(f"{n_param:.2f} million")
-optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-# history of the evaluated steps and the losses measured at them
-steps = []
-train_losses = []
-val_losses = []
-for iter in range(max_iters):
-  # measure train/val loss every eval_interval steps and on the last step
-  if iter % eval_interval == 0 or iter == max_iters - 1:
-    losses = estimate_loss()
-    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
-    steps.append(iter)
-    train_losses.append(losses['train'])
-    val_losses.append(losses['val'])
-  # one optimization step on a fresh random training batch
-  xb, yb = get_batch('train')
-  logits, loss = model(xb, yb)
-  optimizer.zero_grad(set_to_none=True)
-  loss.backward()
-  optimizer.step()
-# save the weights; the file name carries the parameter count in millions
-torch.save(model.state_dict(), f'enigma_{n_param:.0f}m.pth')