Harish2002 committed on
Commit
f6bc9c0
·
verified ·
1 Parent(s): 857a939

Upload training.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. training.ipynb +605 -0
training.ipynb ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "7ef3e090-1986-4080-827e-fdef2deda5ba",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline\n",
12
+ "import torch\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "ee142e5a-92ac-400b-a048-89a3df0060f6",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
23
+ "print(f\"Device set to: {device}\")\n"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "id": "ba2eea5c-108e-4305-a64e-c35800cf9bf2",
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# Load CLI Q&A dataset\n",
34
+ "with open(\"cli_questions.json\", \"r\", encoding=\"utf-8\") as f:\n",
35
+ " data = json.load(f)\n",
36
+ "\n",
37
+ "# Access the list of entries inside \"data\" key\n",
38
+ "qa_list = data[\"data\"]\n",
39
+ "\n",
40
+ "# Show a sample\n",
41
+ "print(f\"Total entries: {len(qa_list)}\")\n",
42
+ "print(\"Sample entry:\", qa_list[0])\n"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "id": "81490ae9-b6f9-4004-b098-c09677c1dcd3",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "model_id = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
53
+ "\n",
54
+ "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
55
+ "model = AutoModelForCausalLM.from_pretrained(model_id)\n",
56
+ "model.to(device)\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "id": "5eb00a02-a5a5-4746-bc1f-685ce4865600",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "generator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=-1) # -1 for CPU\n"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "id": "0f2b0688-a24d-4d86-90e5-9b8237620f6c",
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "# Pick sample questions\n",
77
+ "sample_questions = [entry[\"question\"] for entry in qa_list[:5]]\n",
78
+ "\n",
79
+ "# Generate and print answers\n",
80
+ "for i, question in enumerate(sample_questions):\n",
81
+ " print(f\"Q{i+1}: {question}\")\n",
82
+ " output = generator(question, max_new_tokens=150, do_sample=True, temperature=0.7)\n",
83
+ " print(f\"A{i+1}: {output[0]['generated_text']}\\n{'-'*60}\")\n"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "0f52ebb0-e2b9-4971-b66c-5353257b7a1c",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "prompt = f\"Q: {question}\\nA:\"\n",
94
+ "output = generator(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)\n",
95
+ "print(output[0][\"generated_text\"])\n"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "49fcf984-bd0d-48b7-857a-e6a6e04585b8",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "import json\n",
106
+ "\n",
107
+ "# Load the dataset\n",
108
+ "with open(\"cli_questions.json\", \"r\") as f:\n",
109
+ " raw = json.load(f)\n",
110
+ " data = raw[\"data\"] # ensure this matches your JSON structure\n",
111
+ "\n",
112
+ "# Generate answers\n",
113
+ "results = []\n",
114
+ "for i, item in enumerate(data[:50]): # run on subset first\n",
115
+ " question = item[\"question\"]\n",
116
+ " prompt = f\"Q: {question}\\nA:\"\n",
117
+ " output = generator(prompt, max_new_tokens=150, temperature=0.7, do_sample=True)\n",
118
+ " answer = output[0][\"generated_text\"].split(\"A:\")[1].strip() if \"A:\" in output[0][\"generated_text\"] else output[0][\"generated_text\"]\n",
119
+ " results.append({\"question\": question, \"answer\": answer})\n",
120
+ " print(f\"Q{i+1}: {question}\\nA{i+1}: {answer}\\n{'-'*60}\")\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "819b988d-c6a1-4b11-b09d-1f1892e18158",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "!pip install transformers datasets peft accelerate bitsandbytes trl --quiet\n"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "6b3c1312-3499-4462-b435-9fe72f0d6f06",
137
+ "metadata": {
138
+ "scrolled": true
139
+ },
140
+ "outputs": [],
141
+ "source": [
142
+ "print(\"Top-level keys:\", data.keys() if isinstance(data, dict) else \"Not a dict\")\n",
143
+ "print(\"Preview:\", str(data)[:500]) # Print first 500 chars of the content\n"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "id": "96748b74-a5c7-439e-8428-680cba84e06d",
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "import json\n",
154
+ "from datasets import Dataset\n",
155
+ "\n",
156
+ "# Load and extract Q&A list\n",
157
+ "with open(\"cli_questions.json\", \"r\") as f:\n",
158
+ " raw = json.load(f)\n",
159
+ " data_list = raw[\"data\"] # ✅ correct key now\n",
160
+ "\n",
161
+ "# Convert to prompt/response format\n",
162
+ "for sample in data_list:\n",
163
+ " sample[\"prompt\"] = sample[\"question\"]\n",
164
+ " sample[\"response\"] = sample[\"answer\"]\n",
165
+ "\n",
166
+ "# Create HuggingFace Dataset\n",
167
+ "dataset = Dataset.from_list(data_list)\n",
168
+ "dataset = dataset.train_test_split(test_size=0.1)\n",
169
+ "\n",
170
+ "print(\"Loaded dataset:\", dataset)\n"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "id": "7a7560e5-b04f-480c-b989-0bb3d3611701",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
181
+ "\n",
182
+ "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\" # or try \"microsoft/phi-2\"\n",
183
+ "\n",
184
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
185
+ "model = AutoModelForCausalLM.from_pretrained(\n",
186
+ " model_name,\n",
187
+ " device_map=\"auto\",\n",
188
+ " load_in_4bit=True # For LoRA on low-resource\n",
189
+ ")\n"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "ae23057e-b741-4541-946d-77f9c5b8c9dc",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
200
+ "\n",
201
+ "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
202
+ "\n",
203
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
204
+ "model = AutoModelForCausalLM.from_pretrained(\n",
205
+ " model_name,\n",
206
+ " torch_dtype=\"auto\", # or torch.float32 if you get another dtype error\n",
207
+ " device_map=\"cpu\" # force CPU since no supported GPU found\n",
208
+ ")\n"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": null,
214
+ "id": "ac99fe95-b5f3-4591-bc7c-793e195eeb86",
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
219
+ "\n",
220
+ "bnb_config = BitsAndBytesConfig(\n",
221
+ " load_in_4bit=True,\n",
222
+ " bnb_4bit_use_double_quant=True,\n",
223
+ " bnb_4bit_quant_type=\"nf4\",\n",
224
+ " bnb_4bit_compute_dtype=torch.float16,\n",
225
+ ")\n",
226
+ "\n",
227
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
228
+ "model = AutoModelForCausalLM.from_pretrained(\n",
229
+ " model_name,\n",
230
+ " device_map=\"auto\",\n",
231
+ " quantization_config=bnb_config\n",
232
+ ")\n"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "id": "7bde0e33-3bed-4940-907f-e0c2e7af1cd3",
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "import torch\n",
243
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
244
+ "\n",
245
+ "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
246
+ "\n",
247
+ "bnb_config = BitsAndBytesConfig(\n",
248
+ " load_in_4bit=True,\n",
249
+ " bnb_4bit_use_double_quant=True,\n",
250
+ " bnb_4bit_quant_type=\"nf4\",\n",
251
+ " bnb_4bit_compute_dtype=torch.float16,\n",
252
+ ")\n",
253
+ "\n",
254
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
255
+ "model = AutoModelForCausalLM.from_pretrained(\n",
256
+ " model_name,\n",
257
+ " device_map=\"auto\",\n",
258
+ " quantization_config=bnb_config\n",
259
+ ")\n"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": null,
265
+ "id": "51e0d14a-18c7-410f-9821-0eb00d3d1bbc",
266
+ "metadata": {},
267
+ "outputs": [],
268
+ "source": [
269
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
270
+ "\n",
271
+ "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
272
+ "\n",
273
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
274
+ "model = AutoModelForCausalLM.from_pretrained(\n",
275
+ " model_name,\n",
276
+ " device_map=\"auto\", # This will still use CPU if no GPU is found\n",
277
+ ")\n"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "id": "f4e4786e-e67c-4c0f-b169-6996a2966558",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "model = AutoModelForCausalLM.from_pretrained(\n",
288
+ " model_name,\n",
289
+ " device_map=\"auto\",\n",
290
+ " torch_dtype=torch.float32 # or float16 if your CPU supports it\n",
291
+ ")\n"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "dfd328ef-9362-426b-894e-923e70c7ace3",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
302
+ "print(f\"Device set to: {device}\")\n"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "id": "6743ec8e-8bd9-4a73-8786-fd71a6790d78",
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "import json\n",
313
+ "import torch\n",
314
+ "from datasets import Dataset\n",
315
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
316
+ "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "id": "4252cc0c-62fe-4871-8095-ab07959b7884",
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "import json\n",
327
+ "import torch\n",
328
+ "from datasets import Dataset\n",
329
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
330
+ "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "id": "7153b443-8059-42d1-96fa-699d0f19f9cf",
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": [
340
+ "import json\n",
341
+ "\n",
342
+ "with open(\"cli_questions.json\") as f:\n",
343
+ " data = json.load(f)\n",
344
+ "\n",
345
+ "# Check the top-level structure\n",
346
+ "print(type(data)) # Should print <class 'dict'>\n",
347
+ "print(data.keys()) # See what keys are at the top\n"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "fbfa8025-233e-47c5-9044-146f95bb24eb",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "import json\n",
358
+ "from datasets import Dataset\n",
359
+ "\n",
360
+ "# Load the JSON and extract the list\n",
361
+ "with open(\"cli_questions.json\") as f:\n",
362
+ " raw = json.load(f)\n",
363
+ "\n",
364
+ "qa_list = raw[\"data\"] # access the list inside the 'data' key\n",
365
+ "\n",
366
+ "# Format for instruction tuning\n",
367
+ "formatted_data = [\n",
368
+ " {\"text\": f\"### Question:\\n{item['question']}\\n\\n### Answer:\\n{item['answer']}\"}\n",
369
+ " for item in qa_list\n",
370
+ "]\n",
371
+ "\n",
372
+ "# Convert to Hugging Face dataset\n",
373
+ "dataset = Dataset.from_list(formatted_data)\n",
374
+ "\n",
375
+ "# Preview\n",
376
+ "print(f\"Loaded {len(dataset)} formatted examples\")\n",
377
+ "print(dataset[0])\n"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "id": "893c412e-0f09-44fd-b6f8-fe3557a071aa",
384
+ "metadata": {},
385
+ "outputs": [],
386
+ "source": [
387
+ "from transformers import AutoTokenizer\n",
388
+ "\n",
389
+ "model_id = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\" # You can switch to Phi-2 if you prefer\n",
390
+ "\n",
391
+ "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
392
+ "tokenizer.pad_token = tokenizer.eos_token # Needed for causal LM padding\n",
393
+ "\n",
394
+ "# Tokenization function\n",
395
+ "def tokenize(example):\n",
396
+ " return tokenizer(example[\"text\"], padding=\"max_length\", truncation=True, max_length=512)\n",
397
+ "\n",
398
+ "tokenized_dataset = dataset.map(tokenize, batched=True)\n",
399
+ "tokenized_dataset = tokenized_dataset.remove_columns([\"text\"])\n",
400
+ "\n",
401
+ "tokenized_dataset.set_format(type=\"torch\")\n",
402
+ "print(tokenized_dataset[0])\n"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "id": "fb49d005-c57c-422f-8bc5-b4037a6bb40f",
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": [
412
+ "train_dataset = tokenized_dataset\n"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "a3fb419b-703f-43c8-9be0-a71815b3da82",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "# Use entire dataset as training set\n",
423
+ "train_dataset = tokenized_dataset\n"
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": null,
429
+ "id": "09c26c73-e7e8-4610-97d6-6c4a10004785",
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "tokenized_dataset.save_to_disk(\"tokenized_dataset\")\n"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "id": "e66f130b-b80b-42fd-9f79-60f245f2c114",
440
+ "metadata": {},
441
+ "outputs": [],
442
+ "source": [
443
+ "from datasets import load_from_disk\n",
444
+ "\n",
445
+ "# Load the saved dataset\n",
446
+ "tokenized_dataset = load_from_disk(\"tokenized_dataset\")\n"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "id": "2dbe3f16-4d82-40c8-be84-b1f85910620f",
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": [
456
+ "train_dataset = tokenized_dataset # Use full set for training since it's only 172 examples\n"
457
+ ]
458
+ },
459
+ {
460
+ "cell_type": "code",
461
+ "execution_count": null,
462
+ "id": "7f05e8d5-fcdf-4a11-9c51-7e8ecd255848",
463
+ "metadata": {},
464
+ "outputs": [],
465
+ "source": [
466
+ "from transformers import DataCollatorForLanguageModeling\n",
467
+ "\n",
468
+ "data_collator = DataCollatorForLanguageModeling(\n",
469
+ " tokenizer=tokenizer,\n",
470
+ " mlm=False\n",
471
+ ")\n"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": null,
477
+ "id": "ec68cba4-8413-4c7d-91de-1fe798dc39fc",
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
482
+ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n",
483
+ "from datasets import load_from_disk\n",
484
+ "import torch\n",
485
+ "\n",
486
+ "# Load model and tokenizer (TinyLlama)\n",
487
+ "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n",
488
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
489
+ "tokenizer.pad_token = tokenizer.eos_token # Important for Trainer padding\n",
490
+ "\n",
491
+ "model = AutoModelForCausalLM.from_pretrained(model_name)\n",
492
+ "\n",
493
+ "# Setup LoRA config\n",
494
+ "lora_config = LoraConfig(\n",
495
+ " r=8,\n",
496
+ " lora_alpha=16,\n",
497
+ " target_modules=[\"q_proj\", \"v_proj\"],\n",
498
+ " lora_dropout=0.1,\n",
499
+ " bias=\"none\",\n",
500
+ " task_type=\"CAUSAL_LM\"\n",
501
+ ")\n",
502
+ "\n",
503
+ "# Inject LoRA adapters\n",
504
+ "model = get_peft_model(model, lora_config)\n",
505
+ "\n",
506
+ "# Load the tokenized dataset\n",
507
+ "dataset = load_from_disk(\"tokenized_dataset\")\n",
508
+ "\n",
509
+ "# Setup data collator\n",
510
+ "data_collator = DataCollatorForLanguageModeling(\n",
511
+ " tokenizer=tokenizer,\n",
512
+ " mlm=False\n",
513
+ ")\n",
514
+ "\n",
515
+ "# Training args\n",
516
+ "training_args = TrainingArguments(\n",
517
+ " output_dir=\"./lora-tinyllama-output\",\n",
518
+ " per_device_train_batch_size=2, # Small batch size for CPU\n",
519
+ " gradient_accumulation_steps=4,\n",
520
+ " num_train_epochs=1, # Reduce for quicker runs\n",
521
+ " logging_steps=10,\n",
522
+ " save_strategy=\"epoch\",\n",
523
+ " learning_rate=2e-4,\n",
524
+ " fp16=False, # Don't use fp16 on CPU\n",
525
+ " report_to=\"none\"\n",
526
+ ")\n",
527
+ "\n",
528
+ "# Define Trainer\n",
529
+ "trainer = Trainer(\n",
530
+ " model=model,\n",
531
+ " args=training_args,\n",
532
+ " train_dataset=dataset,\n",
533
+ " tokenizer=tokenizer,\n",
534
+ " data_collator=data_collator\n",
535
+ ")\n",
536
+ "\n",
537
+ "# Start training\n",
538
+ "trainer.train()\n"
539
+ ]
540
+ },
541
+ {
542
+ "cell_type": "code",
543
+ "execution_count": null,
544
+ "id": "2eaf9fa5-540c-4bd2-b6e1-9ea60c820004",
545
+ "metadata": {},
546
+ "outputs": [],
547
+ "source": [
548
+ "pip install -r requirements.txt\n"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "execution_count": null,
554
+ "id": "fad00764-e047-4fd0-b703-c9bbd343ce46",
555
+ "metadata": {},
556
+ "outputs": [],
557
+ "source": [
558
+ "login(token=\"REMOVED_TOKEN_...\")\n"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "id": "075e175f-d164-420a-92fb-75150637d351",
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": [
568
+ "from huggingface_hub import login\n",
569
+ "import os\n",
570
+ "\n",
571
+ "# Safer login using environment variable (no token exposed in notebook)\n",
572
+ "login(token=os.getenv(\"HF_TOKEN\"))\n"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "code",
577
+ "execution_count": null,
578
+ "id": "def2deab-147c-4445-8e62-96c397d72f12",
579
+ "metadata": {},
580
+ "outputs": [],
581
+ "source": []
582
+ }
583
+ ],
584
+ "metadata": {
585
+ "kernelspec": {
586
+ "display_name": "Python 3 (ipykernel)",
587
+ "language": "python",
588
+ "name": "python3"
589
+ },
590
+ "language_info": {
591
+ "codemirror_mode": {
592
+ "name": "ipython",
593
+ "version": 3
594
+ },
595
+ "file_extension": ".py",
596
+ "mimetype": "text/x-python",
597
+ "name": "python",
598
+ "nbconvert_exporter": "python",
599
+ "pygments_lexer": "ipython3",
600
+ "version": "3.12.7"
601
+ }
602
+ },
603
+ "nbformat": 4,
604
+ "nbformat_minor": 5
605
+ }