# pytorch-distilbert-imdb-safetensors / mini_gpt_train_and_infer.py
# Uploaded by ankitkushwaha90 -- commit bb2544d (verified):
# "Create mini_gpt_train_and_infer.py"
# Install these before running:
# pip install torch transformers datasets safetensors accelerate
import torch
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
DataCollatorForLanguageModeling
)
# Step 1: fetch the training corpus.
# Only the first 1% of the "train" split is requested so the demo stays quick.
print("📥 Loading dataset...")
dataset = load_dataset(
    "codeparrot/codeparrot-clean",
    split="train[:1%]",
)

# Step 2: load the tokenizer that matches the checkpoint being fine-tuned.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so that the padded
# tokenization requested later does not fail for lack of a pad token.
tokenizer.pad_token = tokenizer.eos_token
def tokenize_fn(examples):
    """Tokenize the ``"content"`` field of a batch of dataset rows.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Batch passed in by ``dataset.map(..., batched=True)``;
            only its ``"content"`` entry is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    source_texts = examples["content"]
    return tokenizer(
        source_texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )


print("🔤 Tokenizing dataset...")
# The raw "content" column is dropped once it has been tokenized.
tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["content"],
)
# Step 3: batch collation. mlm=False selects causal (next-token) language
# modeling rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Step 4: load the pretrained causal LM named by `model_name`.
print("⚙️ Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name)
# 5️⃣ Training arguments
# NOTE: `evaluation_strategy="no"` is intentionally not passed. "no" is the
# default, and the keyword was renamed to `eval_strategy` in newer
# transformers releases, where the old spelling is rejected. Omitting it
# keeps the same behaviour (no evaluation during training) on every version.
training_args = TrainingArguments(
    output_dir="./mini_gpt_code",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",  # write one checkpoint at the end of each epoch
    logging_dir="./logs",
    save_safetensors=True,  # save weights in safetensors format
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    push_to_hub=False,
)
# Step 6: wire the model, hyper-parameters, data, and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# Step 7: run the fine-tuning loop.
print("🚀 Training started...")
trainer.train()

# Step 8: write the final model and its tokenizer to the same directory so
# the directory holds both.
save_path = "./mini_gpt_code_safetensors"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Training complete. Model saved at {save_path}")
# 9️⃣ Inference (code generation)
print("💻 Generating Python code...")
prompt = "Write a Python function to calculate factorial:\n"

# After training, the model lives on whatever device the Trainer used
# (e.g. CUDA). The tokenizer produces CPU tensors, so move them to the
# model's device; otherwise generate() fails with a device mismatch.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

model.eval()  # disable dropout for generation
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=100,  # total length, prompt tokens included
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        # GPT-2 has no dedicated pad token; state the fallback explicitly
        # instead of relying on generate()'s implicit default and warning.
        pad_token_id=tokenizer.eos_token_id,
    )

print("\nGenerated Code:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))