File size: 21,063 Bytes

f1f0f7a

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f841af43-faf7-4a7b-ad55-0da226f3220f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\user\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "ds = load_dataset('merve/vqav2-small')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b47b7e33-b5eb-46ec-9e43-ed118c09b290",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = ds['validation']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "877df06d-4384-4442-a8d7-7002706b7afe",
   "metadata": {},
   "outputs": [],
   "source": [
    "split_ds = ds.train_test_split(test_size=0.05) # we'll use a very small split for demo\n",
    "train_ds = split_ds[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "870b515b-d3f5-4638-adbf-70fa39ee2ac5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['multiple_choice_answer', 'question', 'image'],\n",
       "    num_rows: 1072\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "50e42737-ff75-4c90-bdf4-012b45678292",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import PaliGemmaProcessor\n",
    "model_id = r\"D:\\PaliGemma\\paligemma-3b-pt-224\"\n",
    "processor = PaliGemmaProcessor.from_pretrained(model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "83f6c8f7-1960-4ae9-93cc-0a2d25d0d5f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "device = \"cuda\"\n",
    "\n",
    "image_token = processor.tokenizer.convert_tokens_to_ids(\"<image>\")\n",
    "def collate_fn(examples):\n",
    "  texts = [\"answer \" + example[\"question\"] for example in examples]\n",
    "  labels= [example['multiple_choice_answer'] for example in examples]\n",
    "  images = [example[\"image\"].convert(\"RGB\") for example in examples]\n",
    "  tokens = processor(text=texts, images=images, suffix=labels,\n",
    "                    return_tensors=\"pt\", padding=\"longest\",\n",
    "                    #tokenize_newline_separately=False\n",
    "                    )\n",
    "\n",
    "  tokens = tokens.to(torch.bfloat16).to(device)\n",
    "  return tokens\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3fb8260e-c333-4948-8051-c85964409660",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.05s/it]\n"
     ]
    }
   ],
   "source": [
    "from transformers import PaliGemmaForConditionalGeneration\n",
    "import torch\n",
    "\n",
    "model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)\n",
    "\n",
    "for param in model.vision_tower.parameters():\n",
    "    param.requires_grad = False\n",
    "\n",
    "for param in model.multi_modal_projector.parameters():\n",
    "    param.requires_grad = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "7ae939ff-98e7-47f8-af29-8fd1ee8f237c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Unused kwargs: ['bnb_4bit_compute_type']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.\n",
      "Loading checkpoint shards: 100%|██████████| 3/3 [00:22<00:00,  7.45s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trainable params: 11,298,816 || all params: 2,934,765,296 || trainable%: 0.3850\n"
     ]
    }
   ],
   "source": [
    "from transformers import BitsAndBytesConfig\n",
    "from peft import get_peft_model, LoraConfig\n",
    "\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "        load_in_4bit=True,\n",
    "        bnb_4bit_quant_type=\"nf4\",\n",
    "        bnb_4bit_compute_type=torch.bfloat16\n",
    ")\n",
    "\n",
    "lora_config = LoraConfig(\n",
    "    r=8,\n",
    "    target_modules=[\"q_proj\", \"o_proj\", \"k_proj\", \"v_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    "    task_type=\"CAUSAL_LM\",\n",
    ")\n",
    "model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={\"\":0})\n",
    "model = get_peft_model(model, lora_config)\n",
    "model.print_trainable_parameters()\n",
    "#trainable params: 11,298,816 || all params: 2,934,634,224 || trainable%: 0.38501616002417344"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7fe77639-44ab-4747-8ced-343eb06e0efd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import accelerate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "98b996db-e9c5-42bf-b979-fd79a28f7e5e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.26.0\n"
     ]
    }
   ],
   "source": [
    "print(accelerate.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9a6546c4-90b3-4f4d-8de0-1e020883a702",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import TrainingArguments\n",
    "args=TrainingArguments(\n",
    "            num_train_epochs=2,\n",
    "            remove_unused_columns=False,\n",
    "            per_device_train_batch_size=4,\n",
    "            gradient_accumulation_steps=4,\n",
    "            warmup_steps=2,\n",
    "            learning_rate=2e-5,\n",
    "            weight_decay=1e-6,\n",
    "            adam_beta2=0.999,\n",
    "            logging_steps=100,\n",
    "            optim=\"adamw_torch\",\n",
    "            save_strategy=\"steps\",\n",
    "            save_steps=1000,\n",
    "            # push_to_hub=True,\n",
    "            save_total_limit=1,\n",
    "            output_dir=\"paligemma_vqav2\",\n",
    "            bf16=True,\n",
    "            report_to=[\"tensorboard\"],\n",
    "            dataloader_pin_memory=False\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df25a8dc-8ab9-467a-b6ce-dee13addb776",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9a8de871-e869-4daf-a250-0aec6437f076",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Trainer\n",
    "\n",
    "trainer = Trainer(\n",
    "        model=model,\n",
    "        train_dataset=train_ds ,\n",
    "        data_collator=collate_fn,\n",
    "        args=args\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "77d743e5-6b5b-40a0-a2f4-7591f2c8df50",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
      "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.\n",
      "C:\\Users\\user\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\models\\siglip\\modeling_siglip.py:574: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\aten\\src\\ATen\\native\\transformers\\cuda\\sdp_utils.cpp:555.)\n",
      "  attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
      "C:\\Users\\user\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\bitsandbytes\\nn\\modules.py:452: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[21], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\trainer.py:2123\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m   2121\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[0;32m   2122\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 2123\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   2124\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2125\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2126\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2127\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2128\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\trainer.py:2481\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m   2475\u001b[0m context \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m   2476\u001b[0m     functools\u001b[38;5;241m.\u001b[39mpartial(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mno_sync, model\u001b[38;5;241m=\u001b[39mmodel)\n\u001b[0;32m   2477\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(batch_samples) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   2478\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m contextlib\u001b[38;5;241m.\u001b[39mnullcontext\n\u001b[0;32m   2479\u001b[0m )\n\u001b[0;32m   2480\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[1;32m-> 2481\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   2484\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m   2485\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[0;32m   2486\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m   2487\u001b[0m ):\n\u001b[0;32m   2488\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m   2489\u001b[0m     tr_loss \u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m+\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\transformers\\trainer.py:3612\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[1;34m(***failed resolving arguments***)\u001b[0m\n\u001b[0;32m   3610\u001b[0m         scaled_loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[0;32m   3611\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 3612\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mbackward(loss, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   3613\u001b[0m     \u001b[38;5;66;03m# Finally we need to normalize the loss for reporting\u001b[39;00m\n\u001b[0;32m   3614\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m num_items_in_batch \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\accelerate\\accelerator.py:1964\u001b[0m, in \u001b[0;36mAccelerator.backward\u001b[1;34m(self, loss, **kwargs)\u001b[0m\n\u001b[0;32m   1962\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscaler\u001b[38;5;241m.\u001b[39mscale(loss)\u001b[38;5;241m.\u001b[39mbackward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1963\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1964\u001b[0m     loss\u001b[38;5;241m.\u001b[39mbackward(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\torch\\_tensor.py:521\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m    511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m    512\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m    513\u001b[0m         Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m    514\u001b[0m         (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    519\u001b[0m         inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m    520\u001b[0m     )\n\u001b[1;32m--> 521\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    522\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m    523\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\torch\\autograd\\__init__.py:289\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m    284\u001b[0m     retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m    286\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[0;32m    287\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m    288\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 289\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    290\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    291\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    292\u001b[0m \u001b[43m    \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    293\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    294\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    295\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m    296\u001b[0m \u001b[43m    \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m    297\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32m~\\anaconda3\\envs\\Ultralytics\\lib\\site-packages\\torch\\autograd\\graph.py:768\u001b[0m, in \u001b[0;36m_engine_run_backward\u001b[1;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[0;32m    766\u001b[0m     unregister_hooks \u001b[38;5;241m=\u001b[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[0;32m    767\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 768\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m Variable\u001b[38;5;241m.\u001b[39m_execution_engine\u001b[38;5;241m.\u001b[39mrun_backward(  \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[0;32m    769\u001b[0m         t_outputs, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[0;32m    770\u001b[0m     )  \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[0;32m    771\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m    772\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "422d8f32-ecd9-4266-b5a4-bd26d45c4fc7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "365cf997-a80e-407e-9848-74e4d4b6a8a8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}