"""Fine-tune a DistilBERT extractive-QA model on a personal-facts dataset.

Loads the dataset and a SQuAD-pretrained checkpoint from the HuggingFace Hub,
converts character-level answer annotations into token-level labels, trains,
and pushes checkpoints back to the Hub.
"""
from datasets import load_dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# 1) Load dataset from the Hub
ds = load_dataset("HarshMortal/personal-facts")

# 2) Tokenizer & model
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


# 3) Preprocess
def preprocess(ex):
    """Tokenize one question/context example and attach token-level answer labels.

    BUG FIX: the original assigned raw *character* offsets from
    ``ex["answers"]`` straight into ``start_positions``/``end_positions``,
    but the QA model expects *token* indices into the tokenized input.
    We use the fast tokenizer's offset mapping to convert char -> token.

    Args:
        ex: one dataset row with ``question``, ``context`` and a SQuAD-style
            ``answers`` dict (``answer_start``/``text`` lists).

    Returns:
        Tokenized features plus integer ``start_positions``/``end_positions``.
    """
    inputs = tokenizer(
        ex["question"],
        ex["context"],
        truncation="only_second",  # truncate only the context, never the question
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,  # needed for char -> token conversion
    )

    # Character span of the first (gold) answer in the context.
    start_char = ex["answers"]["answer_start"][0]
    end_char = start_char + len(ex["answers"]["text"][0])

    offsets = inputs.pop("offset_mapping")  # Trainer must not see this column
    sequence_ids = inputs.sequence_ids(0)   # None=special, 0=question, 1=context

    # First and last tokens that belong to the context segment.
    ctx_start = sequence_ids.index(1)
    ctx_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    if offsets[ctx_start][0] > start_char or offsets[ctx_end][1] < end_char:
        # Answer was truncated out of the window: label the CLS token (index 0),
        # the SQuAD-style convention for "no answer in this span".
        inputs["start_positions"] = 0
        inputs["end_positions"] = 0
    else:
        # Walk right to the first token starting after start_char, step back one.
        idx = ctx_start
        while idx <= ctx_end and offsets[idx][0] <= start_char:
            idx += 1
        inputs["start_positions"] = idx - 1

        # Walk left to the last token ending before end_char, step forward one.
        idx = ctx_end
        while idx >= ctx_start and offsets[idx][1] >= end_char:
            idx -= 1
        inputs["end_positions"] = idx + 1

    return inputs


# Drop the raw string columns (question/context/answers): the default data
# collator can only tensorize numeric features, so leaving them in would
# crash trainer.train().
tokenized = ds["train"].map(
    preprocess,
    batched=False,
    remove_columns=ds["train"].column_names,
)

# 4) Train
args = TrainingArguments(
    output_dir="results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    logging_steps=5,
    push_to_hub=True,  # requires a valid Hub auth token in the environment
    hub_model_id="HarshMortal/personal-qa",
    hub_strategy="every_save",
)
trainer = Trainer(model=model, args=args, train_dataset=tokenized)
trainer.train()