{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# TinyLlama CLI Q&A: baseline generation and LoRA fine-tuning\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. loads the CLI question/answer pairs from `cli_questions.json`,\n",
    "2. samples baseline answers from `TinyLlama/TinyLlama-1.1B-Chat-v1.0`,\n",
    "3. formats and tokenizes the pairs for instruction tuning,\n",
    "4. fine-tunes LoRA adapters on the full dataset.\n",
    "\n",
    "It is written to run top to bottom on a fresh kernel (Restart Kernel, then Run All)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-setup",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "819b988d-c6a1-4b11-b09d-1f1892e18158",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies before anything is imported. %pip targets the kernel's\n",
    "# own environment, which !pip does not guarantee.\n",
    "# NOTE(review): versions are unpinned, as in the original notebook; pin them\n",
    "# once a known-good combination is confirmed.\n",
    "%pip install --quiet transformers datasets peft accelerate bitsandbytes trl\n",
    "%pip install --quiet -r requirements.txt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ef3e090-1986-4080-827e-fdef2deda5ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this single cell so a fresh kernel has every name it needs.\n",
    "import json\n",
    "import os\n",
    "\n",
    "import torch\n",
    "from datasets import Dataset, load_from_disk\n",
    "from huggingface_hub import login\n",
    "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    BitsAndBytesConfig,\n",
    "    DataCollatorForLanguageModeling,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    "    pipeline,\n",
    "    set_seed,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee142e5a-92ac-400b-a048-89a3df0060f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "DATA_PATH = \"cli_questions.json\"\n",
    "MODEL_ID = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
    "TOKENIZED_DIR = \"tokenized_dataset\"\n",
    "OUTPUT_DIR = \"./lora-tinyllama-output\"\n",
    "\n",
    "MAX_LENGTH = 512         # token budget per training example\n",
    "N_BASELINE_SAMPLES = 50  # baseline generation runs on this many questions\n",
    "USE_4BIT = False         # opt in to bitsandbytes NF4 loading; only honoured when CUDA is available\n",
    "SEED = 42\n",
    "\n",
    "# Seed python/numpy/torch so sampled generations and training are repeatable.\n",
    "set_seed(SEED)\n",
    "\n",
    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "print(f\"Device set to: {DEVICE}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-data",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba2eea5c-108e-4305-a64e-c35800cf9bf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CLI Q&A file once; the entries live under the top-level \"data\" key.\n",
    "with open(DATA_PATH, \"r\", encoding=\"utf-8\") as f:\n",
    "    raw = json.load(f)\n",
    "\n",
    "print(\"Top-level keys:\", list(raw.keys()))\n",
    "qa_list = raw[\"data\"]\n",
    "\n",
    "# Fail early if an entry lacks a field that later cells index into.\n",
    "assert qa_list, \"no entries found under the 'data' key\"\n",
    "assert all(\"question\" in entry and \"answer\" in entry for entry in qa_list), \\\n",
    "    \"every entry needs 'question' and 'answer'\"\n",
    "\n",
    "print(f\"Total entries: {len(qa_list)}\")\n",
    "print(\"Sample entry:\", qa_list[0])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-baseline",
   "metadata": {},
   "source": [
    "## Baseline generation (before fine-tuning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81490ae9-b6f9-4004-b098-c09677c1dcd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_base_model(model_id: str = MODEL_ID, use_4bit: bool = False):\n",
    "    \"\"\"Load the causal LM, either in float32 on DEVICE or 4-bit quantized.\n",
    "\n",
    "    Args:\n",
    "        model_id: Hugging Face model identifier to load.\n",
    "        use_4bit: Request NF4 quantization through bitsandbytes. Only honoured\n",
    "            when CUDA is available; otherwise a note is printed and the model\n",
    "            is loaded in float32 instead.\n",
    "\n",
    "    Returns:\n",
    "        The loaded model.\n",
    "    \"\"\"\n",
    "    if use_4bit and torch.cuda.is_available():\n",
    "        bnb_config = BitsAndBytesConfig(\n",
    "            load_in_4bit=True,\n",
    "            bnb_4bit_use_double_quant=True,\n",
    "            bnb_4bit_quant_type=\"nf4\",\n",
    "            bnb_4bit_compute_dtype=torch.float16,\n",
    "        )\n",
    "        # device_map=\"auto\" lets accelerate place the quantized weights.\n",
    "        return AutoModelForCausalLM.from_pretrained(\n",
    "            model_id,\n",
    "            device_map=\"auto\",\n",
    "            quantization_config=bnb_config,\n",
    "        )\n",
    "\n",
    "    if use_4bit:\n",
    "        print(\"CUDA not available; loading in float32 instead of 4-bit.\")\n",
    "    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)\n",
    "    return model.to(DEVICE)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eb00a02-a5a5-4746-bc1f-685ce4865600",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
    "tokenizer.pad_token = tokenizer.eos_token  # Needed for causal LM padding\n",
    "\n",
    "base_model = load_base_model()\n",
    "\n",
    "# The pipeline must use the same device the model was moved to; a fixed\n",
    "# device=-1 (CPU) would disagree with a model that sits on CUDA.\n",
    "generator = pipeline(\"text-generation\", model=base_model, tokenizer=tokenizer, device=DEVICE)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "def-generate-answer",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_answer(question: str, max_new_tokens: int = 150) -> str:\n",
    "    \"\"\"Sample one answer to `question` from the baseline generator.\n",
    "\n",
    "    Args:\n",
    "        question: The question text, inserted into the \"Q: ...\\nA:\" prompt.\n",
    "        max_new_tokens: Upper bound on the number of generated tokens.\n",
    "\n",
    "    Returns:\n",
    "        The generated continuation only, stripped of surrounding whitespace.\n",
    "    \"\"\"\n",
    "    prompt = f\"Q: {question}\\nA:\"\n",
    "    output = generator(\n",
    "        prompt,\n",
    "        max_new_tokens=max_new_tokens,\n",
    "        do_sample=True,\n",
    "        temperature=0.7,\n",
    "        # Return only the continuation. Splitting the full text on \"A:\" would\n",
    "        # truncate any answer (or mis-handle any question) that contains \"A:\".\n",
    "        return_full_text=False,\n",
    "    )\n",
    "    return output[0][\"generated_text\"].strip()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f52ebb0-e2b9-4971-b66c-5353257b7a1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test on a single, explicitly chosen question (no reliance on a\n",
    "# variable left over from another cell's loop).\n",
    "sample_question = qa_list[0][\"question\"]\n",
    "print(f\"Q: {sample_question}\")\n",
    "print(f\"A: {generate_answer(sample_question, max_new_tokens=100)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49fcf984-bd0d-48b7-857a-e6a6e04585b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline answers for the first N_BASELINE_SAMPLES questions (run on a subset first).\n",
    "results = []\n",
    "for i, item in enumerate(qa_list[:N_BASELINE_SAMPLES], start=1):\n",
    "    question = item[\"question\"]\n",
    "    answer = generate_answer(question)\n",
    "    results.append({\"question\": question, \"answer\": answer})\n",
    "    print(f\"Q{i}: {question}\\nA{i}: {answer}\\n{'-'*60}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-dataset",
   "metadata": {},
   "source": [
    "## Build the training dataset\n",
    "\n",
    "Each pair is rendered into a single `text` field and tokenized to a fixed length.\n",
    "\n",
    "**NOTE(review):** the baseline prompts above use `Q:` / `A:`, while the training text uses `### Question:` / `### Answer:`. Use the training template when evaluating the fine-tuned model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbfa8025-233e-47c5-9044-146f95bb24eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_example(item: dict) -> dict:\n",
    "    \"\"\"Render one Q&A entry into the instruction-tuning text template.\"\"\"\n",
    "    return {\"text\": f\"### Question:\\n{item['question']}\\n\\n### Answer:\\n{item['answer']}\"}\n",
    "\n",
    "\n",
    "# Build new records instead of adding keys to the loaded entries in place.\n",
    "formatted_data = [format_example(item) for item in qa_list]\n",
    "text_dataset = Dataset.from_list(formatted_data)\n",
    "\n",
    "assert len(text_dataset) == len(qa_list)\n",
    "print(f\"Loaded {len(text_dataset)} formatted examples\")\n",
    "print(text_dataset[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "893c412e-0f09-44fd-b6f8-fe3557a071aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(example):\n",
    "    \"\"\"Tokenize a batch of `text` entries, padded/truncated to MAX_LENGTH tokens.\"\"\"\n",
    "    return tokenizer(example[\"text\"], padding=\"max_length\", truncation=True, max_length=MAX_LENGTH)\n",
    "\n",
    "\n",
    "tokenized_dataset = text_dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
    "tokenized_dataset.set_format(type=\"torch\")\n",
    "\n",
    "# Show a summary and per-column shapes rather than dumping MAX_LENGTH-long tensors.\n",
    "print(tokenized_dataset)\n",
    "print({name: tuple(tensor.shape) for name, tensor in tokenized_dataset[0].items()})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09c26c73-e7e8-4610-97d6-6c4a10004785",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the tokenized data, then train from the saved copy.\n",
    "tokenized_dataset.save_to_disk(TOKENIZED_DIR)\n",
    "\n",
    "# Use the full set for training; the dataset is small, so no hold-out split is made.\n",
    "train_dataset = load_from_disk(TOKENIZED_DIR)\n",
    "print(f\"Training examples: {len(train_dataset)}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-training",
   "metadata": {},
   "source": [
    "## LoRA fine-tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec68cba4-8413-4c7d-91de-1fe798dc39fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fresh copy of the base model for fine-tuning; the baseline copy stays\n",
    "# untouched inside `generator`.\n",
    "model = load_base_model(use_4bit=USE_4BIT)\n",
    "if getattr(model, \"is_loaded_in_4bit\", False):\n",
    "    # PEFT's preparation step for k-bit quantized models, applied before adding adapters.\n",
    "    model = prepare_model_for_kbit_training(model)\n",
    "\n",
    "# Setup LoRA config\n",
    "lora_config = LoraConfig(\n",
    "    r=8,\n",
    "    lora_alpha=16,\n",
    "    target_modules=[\"q_proj\", \"v_proj\"],\n",
    "    lora_dropout=0.1,\n",
    "    bias=\"none\",\n",
    "    task_type=\"CAUSAL_LM\",\n",
    ")\n",
    "\n",
    "# Inject LoRA adapters\n",
    "model = get_peft_model(model, lora_config)\n",
    "model.print_trainable_parameters()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "lora-train",
   "metadata": {},
   "outputs": [],
   "source": [
    "# mlm=False selects causal-LM collation (labels are derived from input_ids).\n",
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer,\n",
    "    mlm=False,\n",
    ")\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=OUTPUT_DIR,\n",
    "    per_device_train_batch_size=2,  # Small batch size for CPU\n",
    "    gradient_accumulation_steps=4,\n",
    "    num_train_epochs=1,             # Reduce for quicker runs\n",
    "    logging_steps=10,\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=2e-4,\n",
    "    fp16=False,                     # Don't use fp16 on CPU\n",
    "    seed=SEED,\n",
    "    report_to=\"none\",\n",
    ")\n",
    "\n",
    "# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour\n",
    "# of `processing_class=`; kept as in the original notebook.\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    ")\n",
    "\n",
    "# Start training\n",
    "trainer.train()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-hub-login",
   "metadata": {},
   "source": [
    "## Hugging Face Hub login (optional)\n",
    "\n",
    "The token is read from the `HF_TOKEN` environment variable and is never written into the notebook. If a real token was ever committed in an earlier version of this notebook, revoke it on the Hub."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "075e175f-d164-420a-92fb-75150637d351",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log in only when a token is provided; calling login(token=None) would fall\n",
    "# back to an interactive prompt and stall an unattended Run All.\n",
    "hf_token = os.getenv(\"HF_TOKEN\")\n",
    "if hf_token:\n",
    "    login(token=hf_token)\n",
    "else:\n",
    "    print(\"HF_TOKEN is not set; skipping Hugging Face Hub login.\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}