datasetsANDmodels committed on
Commit 1399970 · verified · 1 Parent(s): 4e589db

Upload finetune.py

Files changed (1)
  1. finetune.py +96 -0
finetune.py ADDED
@@ -0,0 +1,96 @@
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import LoraConfig, PeftModel, TaskType, get_peft_model

# name_dataset.csv is expected to hold two columns (text, label) plus a header row;
# the header is skipped via skiprows=1 because the column names are given explicitly.
file_dict = {
    "train": "name_dataset.csv",
    "test": "name_dataset.csv",
}

dataset = load_dataset(
    "csv",
    data_files=file_dict,
    delimiter=",",
    column_names=["text", "label"],
    skiprows=1,
)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

model_id = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(example):
    # Tokenize inputs and targets; the target token ids become the labels.
    model_inputs = tokenizer(example["text"], truncation=True)
    targets = tokenizer(example["label"], truncation=True)
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"])

model = T5ForConditionalGeneration.from_pretrained(model_id)

# Wrap the base model with LoRA adapters on the query and value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Pad labels with -100 so padded positions are ignored by the loss.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

output_dir = "lora-t5"
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=100,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    # report_to="tensorboard",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)
model.config.use_cache = False  # disable the generation cache during training
trainer.train()

# Save the LoRA adapter and tokenizer.
peft_model_id = "name-peft"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# Reload the base model, merge the adapter weights into it, and save a standalone checkpoint.
base_model = T5ForConditionalGeneration.from_pretrained(model_id)
peft_model = PeftModel.from_pretrained(base_model, "name-peft")
peft_model = peft_model.merge_and_unload()
peft_model.save_pretrained("name-extraction")
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained("name-extraction")
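A minimal inference sketch, not part of the uploaded file, assuming the merged "name-extraction" checkpoint produced by the script above; the input sentence is hypothetical and the real expected format depends on name_dataset.csv:

from transformers import AutoTokenizer, T5ForConditionalGeneration

# Load the merged checkpoint written at the end of finetune.py.
tokenizer = AutoTokenizer.from_pretrained("name-extraction")
model = T5ForConditionalGeneration.from_pretrained("name-extraction")

# Hypothetical input text.
text = "John Smith visited Paris last week."
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))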