"""Fine-tune a DistilBERT extractive-QA model on a personal-facts dataset.

Loads the dataset and a SQuAD-pretrained checkpoint from the HuggingFace Hub,
converts character-level answer annotations into token-level labels, trains,
and pushes checkpoints back to the Hub.
"""
from datasets import load_dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# 1) Load dataset from the Hub
ds = load_dataset("HarshMortal/personal-facts")

# 2) Tokenizer & model
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


# 3) Preprocess
def preprocess(ex):
    """Tokenize one question/context example and attach token-level answer labels.

    BUG FIX: the original assigned raw *character* offsets from
    ``ex["answers"]`` straight into ``start_positions``/``end_positions``,
    but the QA model expects *token* indices into the tokenized input.
    We use the fast tokenizer's offset mapping to convert char -> token.

    Args:
        ex: one dataset row with ``question``, ``context`` and a SQuAD-style
            ``answers`` dict (``answer_start``/``text`` lists).

    Returns:
        Tokenized features plus integer ``start_positions``/``end_positions``.
    """
    inputs = tokenizer(
        ex["question"],
        ex["context"],
        truncation="only_second",  # truncate only the context, never the question
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,  # needed for char -> token conversion
    )

    # Character span of the first (gold) answer in the context.
    start_char = ex["answers"]["answer_start"][0]
    end_char = start_char + len(ex["answers"]["text"][0])

    offsets = inputs.pop("offset_mapping")  # Trainer must not see this column
    sequence_ids = inputs.sequence_ids(0)   # None=special, 0=question, 1=context

    # First and last tokens that belong to the context segment.
    ctx_start = sequence_ids.index(1)
    ctx_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    if offsets[ctx_start][0] > start_char or offsets[ctx_end][1] < end_char:
        # Answer was truncated out of the window: label the CLS token (index 0),
        # the SQuAD-style convention for "no answer in this span".
        inputs["start_positions"] = 0
        inputs["end_positions"] = 0
    else:
        # Walk right to the first token starting after start_char, step back one.
        idx = ctx_start
        while idx <= ctx_end and offsets[idx][0] <= start_char:
            idx += 1
        inputs["start_positions"] = idx - 1

        # Walk left to the last token ending before end_char, step forward one.
        idx = ctx_end
        while idx >= ctx_start and offsets[idx][1] >= end_char:
            idx -= 1
        inputs["end_positions"] = idx + 1

    return inputs


# Drop the raw string columns (question/context/answers): the default data
# collator can only tensorize numeric features, so leaving them in would
# crash trainer.train().
tokenized = ds["train"].map(
    preprocess,
    batched=False,
    remove_columns=ds["train"].column_names,
)

# 4) Train
args = TrainingArguments(
    output_dir="results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    logging_steps=5,
    push_to_hub=True,  # requires a valid Hub auth token in the environment
    hub_model_id="HarshMortal/personal-qa",
    hub_strategy="every_save",
)
trainer = Trainer(model=model, args=args, train_dataset=tokenized)
trainer.train()