File size: 1,178 Bytes
e9661a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
# 1) Fetch the question-answering dataset from the Hugging Face Hub.
ds = load_dataset(path="HarshMortal/personal-facts")

# 2) Instantiate the tokenizer and the QA model from one shared checkpoint
#    name, so both are loaded from the same pretrained artifact.
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# 3) Preprocess
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Convert a character span in the context into a token span.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs as returned by the
            tokenizer with ``return_offsets_mapping=True``.
        sequence_ids: Per-token sequence id (``0`` question, ``1`` context,
            ``None`` special/padding tokens).
        start_char: Inclusive character index where the answer starts.
        end_char: Exclusive character index where the answer ends.

    Returns:
        ``(start_token, end_token)`` as inclusive token indices, or ``(0, 0)``
        when the answer is not fully contained in the (possibly truncated)
        context tokens.
    """
    context_tokens = [i for i, sid in enumerate(sequence_ids) if sid == 1]
    if not context_tokens:
        return 0, 0
    first, last = context_tokens[0], context_tokens[-1]

    # Answer lies (partly) outside the tokens that survived truncation.
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return 0, 0

    # Last context token that begins at or before the answer's first char.
    start_token = first
    while start_token < last and offsets[start_token + 1][0] <= start_char:
        start_token += 1

    # First context token that ends at or after the answer's last char.
    end_token = last
    while end_token > first and offsets[end_token - 1][1] >= end_char:
        end_token -= 1

    # Guard against a degenerate (e.g. zero-length) answer span.
    return start_token, max(start_token, end_token)


def preprocess(ex):
    """Tokenize one QA example and attach token-level answer labels.

    The model's ``start_positions`` / ``end_positions`` labels are indices
    into the tokenized input, not character offsets, so the answer's
    character span is mapped onto tokens via the tokenizer's offset mapping.

    Args:
        ex: A single example with ``question``, ``context`` and an ``answers``
            dict holding ``text`` and ``answer_start`` lists (assumed to be
            SQuAD-style, with ``answer_start`` a character offset into
            ``context`` -- verify against the dataset card).

    Returns:
        The tokenizer output (padded/truncated to 384 tokens) with integer
        ``start_positions`` and ``end_positions`` added. Examples with no
        answer, or whose answer was truncated away, are labelled ``(0, 0)``.
    """
    inputs = tokenizer(
        ex["question"],
        ex["context"],
        # Only ever truncate the context; the question must stay intact.
        truncation="only_second",
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
    )
    # The offsets are only needed for label construction; drop them so they
    # are not stored in the dataset or forwarded towards the model.
    offsets = inputs.pop("offset_mapping")

    answers = ex["answers"]
    if answers["text"] and answers["answer_start"]:
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])
        start_token, end_token = _answer_token_span(
            offsets, inputs.sequence_ids(), start_char, end_char
        )
    else:
        start_token = end_token = 0

    inputs["start_positions"] = start_token
    inputs["end_positions"] = end_token
    return inputs


tokenized = ds["train"].map(preprocess, batched=False)
# 4) Train
args = TrainingArguments(
    output_dir="results",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    logging_steps=5,
    push_to_hub=True,
    hub_model_id="HarshMortal/personal-qa",
    hub_strategy="every_save",
)
trainer = Trainer(model=model, args=args, train_dataset=tokenized)
trainer.train()

# With push_to_hub=True the Trainer only uploads when a checkpoint save is
# triggered (every 500 steps by default), so a short run may never publish
# anything and the final weights are not guaranteed to be pushed. Save the
# tokenizer next to the model and push the final state explicitly so the Hub
# repository is complete and directly loadable.
tokenizer.save_pretrained(args.output_dir)
trainer.push_to_hub()
|