# Install these before running:
# pip install torch transformers datasets safetensors accelerate
"""Fine-tune a small GPT-2 model on Python source code and sample from it.

Pipeline: load a slice of a code dataset, tokenize it to fixed-length
sequences, fine-tune a causal language model with the Hugging Face
``Trainer``, save the result in safetensors format, then generate a short
completion for a demo prompt.
"""

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

MODEL_NAME = "distilgpt2"  # small GPT-2 model
DATASET_NAME = "codeparrot/codeparrot-clean"
DATASET_SPLIT = "train[:1%]"  # only 1% for demo
TEXT_COLUMN = "content"
MAX_SEQ_LENGTH = 128
OUTPUT_DIR = "./mini_gpt_code"
LOGGING_DIR = "./logs"
SAVE_PATH = "./mini_gpt_code_safetensors"
PROMPT = "Write a Python function to calculate factorial:\n"


def _load_tokenizer(model_name):
    """Return the tokenizer for *model_name* with a pad token configured."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the pad token so padding="max_length" does not raise
    # a "no pad token" error.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def _tokenize_dataset(dataset, tokenizer):
    """Tokenize the text column of *dataset* into fixed-length sequences."""

    def tokenize_fn(examples):
        return tokenizer(
            examples[TEXT_COLUMN],
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LENGTH,
        )

    # Drop every original column (not just the text one) so the result
    # holds only the tokenizer outputs.
    return dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=dataset.column_names,
    )


def _build_trainer(model, tokenizer, train_dataset):
    """Create the ``Trainer`` used for causal-LM fine-tuning."""
    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # NOTE: no evaluation-strategy keyword is passed. The default is already
    # "no", and the keyword's name differs between transformers releases
    # (evaluation_strategy vs. eval_strategy), so omitting it is version-safe.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        per_device_train_batch_size=2,
        num_train_epochs=1,
        save_strategy="epoch",
        logging_dir=LOGGING_DIR,
        save_safetensors=True,  # Save in safetensors format
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
    )

    # NOTE: the tokenizer is deliberately not handed to Trainer. Newer
    # transformers releases deprecate that keyword in favour of
    # `processing_class`; the collator is passed explicitly and main()
    # saves the tokenizer itself, so it is not needed here.
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )


def _generate(model, tokenizer, prompt):
    """Sample a completion for *prompt* and return the decoded text."""
    encoded = tokenizer(prompt, return_tensors="pt")
    # After training the model may live on the GPU while the tokenizer
    # returns CPU tensors; move the inputs to the model's device to avoid
    # a device-mismatch error in generate().
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=100,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            # Explicit because the tokenizer reuses EOS as its pad token.
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def main():
    """Run the full load -> tokenize -> train -> save -> generate pipeline."""
    # 1. Load a Python code dataset (small subset for quick training)
    print("📥 Loading dataset...")
    dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

    # 2. Load tokenizer and tokenize the dataset
    tokenizer = _load_tokenizer(MODEL_NAME)
    print("🔤 Tokenizing dataset...")
    tokenized_dataset = _tokenize_dataset(dataset, tokenizer)

    # 3. Load GPT model
    print("⚙️ Loading model...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # 4. Train
    trainer = _build_trainer(model, tokenizer, tokenized_dataset)
    print("🚀 Training started...")
    trainer.train()

    # 5. Save final safetensors model together with its tokenizer
    trainer.save_model(SAVE_PATH)
    tokenizer.save_pretrained(SAVE_PATH)
    print(f"✅ Training complete. Model saved at {SAVE_PATH}")

    # 6. Inference (code generation)
    print("💻 Generating Python code...")
    generated = _generate(model, tokenizer, PROMPT)
    print("\nGenerated Code:\n")
    print(generated)


if __name__ == "__main__":
    main()