Commit ceab688 by hllj (1 parent: f99587e)

Model save

README.md CHANGED
@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.1997
+- Loss: 0.8877
 
 ## Model description
 
@@ -48,8 +48,8 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 1.5219 | 0.02 | 25 | 1.2539 |
-| 1.3156 | 0.03 | 50 | 1.1997 |
+| 1.1665 | 0.02 | 25 | 0.9468 |
+| 0.8357 | 0.03 | 50 | 0.8877 |
 
 
 ### Framework versions
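The README in this commit records the metrics but no usage snippet. Below is a minimal loading sketch, assuming this repo hosts a PEFT (LoRA) adapter on top of zephyr-7b-beta, as `use_peft: true` in the config further down suggests; the prompt text is only illustrative:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "HuggingFaceH4/zephyr-7b-beta"
adapter_id = "hllj/non-qa-sft-zephyr-7b-beta-v1"  # hub_model_id from the training config

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base, adapter_id)  # attach the fine-tuned adapter

inputs = tokenizer("Hello!", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```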
all_results.json CHANGED
@@ -1,13 +1,13 @@
 {
     "epoch": 0.03,
     "eval_loss": 0.8876652717590332,
-    "eval_runtime": 112.9915,
+    "eval_runtime": 113.7341,
     "eval_samples": 650,
-    "eval_samples_per_second": 5.753,
-    "eval_steps_per_second": 1.443,
+    "eval_samples_per_second": 5.715,
+    "eval_steps_per_second": 1.433,
     "train_loss": 1.0970729541778566,
-    "train_runtime": 356.1922,
+    "train_runtime": 357.3025,
     "train_samples": 5845,
-    "train_samples_per_second": 0.561,
+    "train_samples_per_second": 0.56,
     "train_steps_per_second": 0.14
 }
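The throughput fields here are derived from the runtimes. A quick sanity check against the updated values, assuming a single GPU with batch size 4 and no gradient accumulation as in config_argument.yaml below:

```python
import math

# eval: 650 samples, per_device_eval_batch_size 4 -> ceil(650 / 4) = 163 steps
eval_runtime = 113.7341
assert round(650 / eval_runtime, 3) == 5.715                 # eval_samples_per_second
assert round(math.ceil(650 / 4) / eval_runtime, 3) == 1.433  # eval_steps_per_second

# train: max_steps 50 at batch size 4 -> only 200 of the 5845 samples are seen
train_runtime = 357.3025
assert round(50 / train_runtime, 2) == 0.14                  # train_steps_per_second
assert round(50 * 4 / train_runtime, 2) == 0.56              # train_samples_per_second
```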
config_argument.yaml CHANGED
@@ -1,188 +1,49 @@
-!!python/tuple
-- !!python/object:__main__.ModelArguments
-  bnb_4bit_quant_type: nf4
-  cache_dir: ./cache
-  device_map: auto
-  load_in_4bit: true
-  load_in_8bit: false
-  model_name_or_path: HuggingFaceH4/zephyr-7b-beta
-  model_revision: main
-  model_type: auto
-  neft_alpha: 0
-  rope_scaling: null
-  shift_attn: false
-  tokenizer_name_or_path: null
-  torch_dtype: float16
-  trust_remote_code: true
-  use_bnb_nested_quant: false
-  use_fast_tokenizer: false
-  use_flash_attention_2: false
-- !!python/object:__main__.DataArguments
-  dataset_config_name: null
-  dataset_name: null
-  ignore_pad_token_for_loss: true
-  max_eval_samples: null
-  max_train_samples: null
-  overwrite_cache: false
-  preprocessing_num_workers: 4
-  template_name: vicuna
-  train_file_dir: datasets/finetune
-  validation_file_dir: null
-  validation_split_percentage: 10
-- !!python/object:__main__.SFTConfig
-  __cached__setup_devices: !!python/object/apply:torch.device
-  - cuda
-  - 0
-  _n_gpu: 1
-  adafactor: false
-  adam_beta1: 0.9
-  adam_beta2: 0.999
-  adam_epsilon: 1.0e-08
-  auto_find_batch_size: false
-  bf16: false
-  bf16_full_eval: false
-  data_seed: null
-  dataloader_drop_last: false
-  dataloader_num_workers: 0
-  dataloader_pin_memory: true
-  ddp_backend: null
-  ddp_broadcast_buffers: null
-  ddp_bucket_cap_mb: null
-  ddp_find_unused_parameters: false
-  ddp_timeout: 30000
-  debug: []
-  deepspeed: null
-  deepspeed_plugin: null
-  disable_tqdm: false
-  dispatch_batches: null
-  distributed_state: !!python/object:accelerate.state.PartialState
-    _cpu: false
-    backend: null
-    debug: false
-    device: !!python/object/apply:torch.device
-    - cuda
-    - 0
-    distributed_type: !!python/object/apply:accelerate.utils.dataclasses.DistributedType
-    - MULTI_GPU
-    fork_launched: false
-    local_process_index: 0
-    num_processes: 1
-    process_index: 0
-  do_eval: true
-  do_predict: false
-  do_train: true
-  eval_accumulation_steps: null
-  eval_delay: 0
-  eval_steps: 25
-  evaluation_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
-  - steps
-  fp16: false
-  fp16_backend: auto
-  fp16_full_eval: false
-  fp16_opt_level: O1
-  fsdp: []
-  fsdp_config:
-    min_num_params: 0
-    xla: false
-    xla_fsdp_grad_ckpt: false
-  fsdp_min_num_params: 0
-  fsdp_transformer_layer_cls_to_wrap: null
-  full_determinism: false
-  gradient_accumulation_steps: 1
-  gradient_checkpointing: true
-  gradient_checkpointing_kwargs:
-    use_reentrant: false
-  greater_is_better: null
-  group_by_length: false
-  half_precision_backend: auto
-  hub_always_push: false
-  hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
-  hub_private_repo: false
-  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
-  - every_save
-  hub_token: null
-  ignore_data_skip: false
-  include_inputs_for_metrics: false
-  include_tokens_per_second: false
-  jit_mode_eval: false
-  label_names: null
-  label_smoothing_factor: 0.0
-  learning_rate: 3.0e-05
-  length_column_name: length
-  load_best_model_at_end: false
-  local_rank: 0
-  log_level: info
-  log_level_replica: warning
-  log_on_each_node: true
-  logging_dir: outputs-sft-zephyr-beta-v1/runs/Nov22_05-52-29_a72e59c0abac
-  logging_first_step: true
-  logging_nan_inf_filter: true
-  logging_steps: 10
-  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
-  - steps
-  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
-  - cosine
-  max_grad_norm: 1.0
-  max_seq_length: 512
-  max_steps: 50
-  metric_for_best_model: null
-  mp_parameters: ''
-  neftune_noise_alpha: null
-  no_cuda: false
-  num_train_epochs: 3.0
-  optim: !!python/object/apply:transformers.training_args.OptimizerNames
-  - adamw_torch
-  optim_args: null
-  output_dir: outputs-sft-zephyr-beta-v1
-  overwrite_output_dir: true
-  past_index: -1
-  per_device_eval_batch_size: 4
-  per_device_train_batch_size: 4
-  per_gpu_eval_batch_size: null
-  per_gpu_train_batch_size: null
-  prediction_loss_only: false
-  push_to_hub: true
-  push_to_hub_model_id: null
-  push_to_hub_organization: null
-  push_to_hub_token: null
-  ray_scope: last
-  remove_unused_columns: true
-  report_to:
-  - wandb
-  resume_from_checkpoint: null
-  run_name: sft-zephyr-7b-beta-v1
-  save_on_each_node: false
-  save_safetensors: true
-  save_steps: 25
-  save_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
-  - steps
-  save_total_limit: 13
-  seed: 42
-  skip_memory_metrics: true
-  split_batches: false
-  tf32: null
-  torch_compile: false
-  torch_compile_backend: null
-  torch_compile_mode: null
-  torchdynamo: null
-  tpu_metrics_debug: false
-  tpu_num_cores: null
-  use_cpu: false
-  use_ipex: false
-  use_legacy_prediction_loop: false
-  use_mps_device: false
-  warmup_ratio: 0.05
-  warmup_steps: 0
-  weight_decay: 0.05
-- !!python/object:__main__.ScriptArguments
-  lora_alpha: 16
-  lora_dropout: 0.1
-  lora_modules_to_save: null
-  lora_r: 64
-  lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-  peft_path: null
-  use_peft: true
+cache_dir: ./cache
+ddp_find_unused_parameters: false
+ddp_timeout: 30000
+device_map: auto
+do_eval: true
+do_train: true
+eval_steps: 25
+evaluation_strategy: steps
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
+hub_strategy: every_save
+learning_rate: 3.0e-05
+load_in_4bit: true
+log_level: info
+logging_first_step: true
+logging_steps: 10
+logging_strategy: steps
+lora_alpha: 128
+lora_dropout: 0.1
+lora_r: 256
+lora_target_modules:
+- q_proj
+- k_proj
+- v_proj
+- o_proj
+lr_scheduler_type: cosine
+max_seq_length: 512
+max_steps: 50
+model_name_or_path: HuggingFaceH4/zephyr-7b-beta
+model_type: auto
+output_dir: outputs-sft-zephyr-beta-v1
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+preprocessing_num_workers: 4
+push_to_hub: true
+report_to: wandb
+run_name: sft-zephyr-7b-beta-v1
+save_steps: 25
+save_strategy: steps
+save_total_limit: 13
+seed: 42
+train_file_dir: datasets/finetune
+use_peft: true
+warmup_ratio: 0.05
+weight_decay: 0.05
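Beyond dropping the verbose `!!python/object` dump, the new config also raises the LoRA rank and scaling (`lora_r` 64 → 256, `lora_alpha` 16 → 128). The training script itself is not part of this commit; as a sketch only, these settings map onto standard peft/transformers objects roughly as follows (the nf4 quant type and float16 dtype are taken from the removed verbose config):

```python
import torch
from peft import LoraConfig
from transformers import BitsAndBytesConfig

# 4-bit base-model loading (load_in_4bit: true; nf4/float16 from the old config)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# LoRA adapter on the attention projections (use_peft: true)
lora_config = LoraConfig(
    r=256,                 # lora_r, raised from 64
    lora_alpha=128,        # lora_alpha, raised from 16
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
```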
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 0.03,
     "eval_loss": 0.8876652717590332,
-    "eval_runtime": 112.9915,
+    "eval_runtime": 113.7341,
     "eval_samples": 650,
-    "eval_samples_per_second": 5.753,
-    "eval_steps_per_second": 1.443
+    "eval_samples_per_second": 5.715,
+    "eval_steps_per_second": 1.433
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "epoch": 0.03,
     "train_loss": 1.0970729541778566,
-    "train_runtime": 356.1922,
+    "train_runtime": 357.3025,
     "train_samples": 5845,
-    "train_samples_per_second": 0.561,
+    "train_samples_per_second": 0.56,
     "train_steps_per_second": 0.14
 }
trainer_state.json CHANGED
@@ -29,7 +29,7 @@
     {
       "epoch": 0.02,
       "eval_loss": 0.9467611908912659,
-      "eval_runtime": 113.3532,
+      "eval_runtime": 113.3606,
       "eval_samples_per_second": 5.734,
       "eval_steps_per_second": 1.438,
       "step": 25
@@ -55,9 +55,9 @@
     {
       "epoch": 0.03,
      "eval_loss": 0.8876652717590332,
-      "eval_runtime": 114.2498,
-      "eval_samples_per_second": 5.689,
-      "eval_steps_per_second": 1.427,
+      "eval_runtime": 114.0086,
+      "eval_samples_per_second": 5.701,
+      "eval_steps_per_second": 1.43,
       "step": 50
     },
     {
@@ -65,8 +65,8 @@
       "step": 50,
       "total_flos": 4008716634423296.0,
       "train_loss": 1.0970729541778566,
-      "train_runtime": 356.1922,
-      "train_samples_per_second": 0.561,
+      "train_runtime": 357.3025,
+      "train_samples_per_second": 0.56,
       "train_steps_per_second": 0.14
     }
   ],
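trainer_state.json keeps the full `log_history`, so the loss table in the README can be regenerated from it; a minimal sketch:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Print every evaluation entry; training-loss entries carry "loss" instead.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f'step {entry["step"]}: eval_loss {entry["eval_loss"]:.4f}')
# Expected from this commit: step 25 -> 0.9468, step 50 -> 0.8877
```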