hllj committed on
Commit
dbeae30
·
1 Parent(s): 2f1348a

Model save

Browse files
README.md CHANGED
@@ -1,6 +1,5 @@
1
  ---
2
- license: mit
3
- base_model: HuggingFaceH4/zephyr-7b-beta
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,9 +12,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # non-qa-sft-zephyr-7b-beta-v1
15
 
16
- This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.8877
19
 
20
  ## Model description
21
 
@@ -42,14 +41,11 @@ The following hyperparameters were used during training:
42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
  - lr_scheduler_type: cosine
44
  - lr_scheduler_warmup_ratio: 0.05
45
- - training_steps: 50
 
46
 
47
  ### Training results
48
 
49
- | Training Loss | Epoch | Step | Validation Loss |
50
- |:-------------:|:-----:|:----:|:---------------:|
51
- | 1.1665 | 0.02 | 25 | 0.9468 |
52
- | 0.8357 | 0.03 | 50 | 0.8877 |
53
 
54
 
55
  ### Framework versions
 
1
  ---
2
+ base_model: hllj/zephyr-7b-beta-vi-math
 
3
  tags:
4
  - generated_from_trainer
5
  model-index:
 
12
 
13
  # non-qa-sft-zephyr-7b-beta-v1
14
 
15
+ This model is a fine-tuned version of [hllj/zephyr-7b-beta-vi-math](https://huggingface.co/hllj/zephyr-7b-beta-vi-math) on the None dataset.
16
  It achieves the following results on the evaluation set:
17
+ - Loss: 0.9442
18
 
19
  ## Model description
20
 
 
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: cosine
43
  - lr_scheduler_warmup_ratio: 0.05
44
+ - training_steps: 10
45
+ - mixed_precision_training: Native AMP
46
 
47
  ### Training results
48
 
 
 
 
 
49
 
50
 
51
  ### Framework versions
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "HuggingFaceH4/zephyr-7b-beta",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
@@ -9,17 +9,17 @@
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
  "lora_alpha": 128,
12
- "lora_dropout": 0.1,
13
  "modules_to_save": null,
14
  "peft_type": "LORA",
15
  "r": 256,
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "v_proj",
20
- "k_proj",
21
  "q_proj",
22
- "o_proj"
 
 
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "hllj/zephyr-7b-beta-vi-math",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
 
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
  "lora_alpha": 128,
12
+ "lora_dropout": 0.05,
13
  "modules_to_save": null,
14
  "peft_type": "LORA",
15
  "r": 256,
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
 
19
  "q_proj",
20
+ "k_proj",
21
+ "o_proj",
22
+ "v_proj"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4507cecfab4aed7b850ede3e6b20862e39e7aafc5cefba9750cc95b8301e63a4
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32b4b241e08bb7a57725bd8062508e4bb722e0f5da90b1c6400db2fc88e3231f
3
  size 872450448
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.03,
3
- "eval_loss": 0.8876652717590332,
4
- "eval_runtime": 113.7754,
5
  "eval_samples": 650,
6
- "eval_samples_per_second": 5.713,
7
- "eval_steps_per_second": 1.433,
8
- "train_loss": 1.0970729541778566,
9
- "train_runtime": 345.3617,
10
  "train_samples": 5845,
11
- "train_samples_per_second": 0.579,
12
- "train_steps_per_second": 0.145
13
  }
 
1
  {
2
+ "epoch": 0.01,
3
+ "eval_loss": 0.9441541433334351,
4
+ "eval_runtime": 76.9742,
5
  "eval_samples": 650,
6
+ "eval_samples_per_second": 8.444,
7
+ "eval_steps_per_second": 2.118,
8
+ "train_loss": 0.8675599694252014,
9
+ "train_runtime": 15.492,
10
  "train_samples": 5845,
11
+ "train_samples_per_second": 2.582,
12
+ "train_steps_per_second": 0.645
13
  }
config_argument.yaml CHANGED
@@ -4,8 +4,9 @@ ddp_timeout: 30000
4
  device_map: auto
5
  do_eval: true
6
  do_train: true
7
- eval_steps: 25
8
  evaluation_strategy: steps
 
9
  gradient_accumulation_steps: 1
10
  gradient_checkpointing: true
11
  gradient_checkpointing_kwargs:
@@ -13,13 +14,12 @@ gradient_checkpointing_kwargs:
13
  hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
14
  hub_strategy: every_save
15
  learning_rate: 3.0e-05
16
- load_in_4bit: true
17
  log_level: info
18
  logging_first_step: true
19
  logging_steps: 10
20
  logging_strategy: steps
21
  lora_alpha: 128
22
- lora_dropout: 0.1
23
  lora_r: 256
24
  lora_target_modules:
25
  - q_proj
@@ -28,8 +28,8 @@ lora_target_modules:
28
  - o_proj
29
  lr_scheduler_type: cosine
30
  max_seq_length: 512
31
- max_steps: 50
32
- model_name_or_path: HuggingFaceH4/zephyr-7b-beta
33
  model_type: auto
34
  output_dir: outputs-sft-zephyr-beta-v1
35
  overwrite_output_dir: true
@@ -39,10 +39,11 @@ preprocessing_num_workers: 4
39
  push_to_hub: true
40
  report_to: wandb
41
  run_name: sft-zephyr-7b-beta-v1
42
- save_steps: 25
43
  save_strategy: steps
44
  save_total_limit: 13
45
  seed: 42
 
46
  train_file_dir: datasets/finetune
47
  use_peft: true
48
  warmup_ratio: 0.05
 
4
  device_map: auto
5
  do_eval: true
6
  do_train: true
7
+ eval_steps: 500
8
  evaluation_strategy: steps
9
+ fp16: true
10
  gradient_accumulation_steps: 1
11
  gradient_checkpointing: true
12
  gradient_checkpointing_kwargs:
 
14
  hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
15
  hub_strategy: every_save
16
  learning_rate: 3.0e-05
 
17
  log_level: info
18
  logging_first_step: true
19
  logging_steps: 10
20
  logging_strategy: steps
21
  lora_alpha: 128
22
+ lora_dropout: 0.05
23
  lora_r: 256
24
  lora_target_modules:
25
  - q_proj
 
28
  - o_proj
29
  lr_scheduler_type: cosine
30
  max_seq_length: 512
31
+ max_steps: 10
32
+ model_name_or_path: hllj/zephyr-7b-beta-vi-math
33
  model_type: auto
34
  output_dir: outputs-sft-zephyr-beta-v1
35
  overwrite_output_dir: true
 
39
  push_to_hub: true
40
  report_to: wandb
41
  run_name: sft-zephyr-7b-beta-v1
42
+ save_steps: 500
43
  save_strategy: steps
44
  save_total_limit: 13
45
  seed: 42
46
+ torch_dtype: float16
47
  train_file_dir: datasets/finetune
48
  use_peft: true
49
  warmup_ratio: 0.05
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.03,
3
- "eval_loss": 0.8876652717590332,
4
- "eval_runtime": 113.7754,
5
  "eval_samples": 650,
6
- "eval_samples_per_second": 5.713,
7
- "eval_steps_per_second": 1.433
8
  }
 
1
  {
2
+ "epoch": 0.01,
3
+ "eval_loss": 0.9441541433334351,
4
+ "eval_runtime": 76.9742,
5
  "eval_samples": 650,
6
+ "eval_samples_per_second": 8.444,
7
+ "eval_steps_per_second": 2.118
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.03,
3
- "train_loss": 1.0970729541778566,
4
- "train_runtime": 345.3617,
5
  "train_samples": 5845,
6
- "train_samples_per_second": 0.579,
7
- "train_steps_per_second": 0.145
8
  }
 
1
  {
2
+ "epoch": 0.01,
3
+ "train_loss": 0.8675599694252014,
4
+ "train_runtime": 15.492,
5
  "train_samples": 5845,
6
+ "train_samples_per_second": 2.582,
7
+ "train_steps_per_second": 0.645
8
  }
trainer_state.json CHANGED
@@ -1,80 +1,40 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03419972640218878,
5
- "eval_steps": 25,
6
- "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 9.999999999999999e-06,
14
- "loss": 1.4953,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.01,
19
- "learning_rate": 2.838778253789822e-05,
20
- "loss": 1.5421,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
- "learning_rate": 2.1314021436425026e-05,
26
- "loss": 1.1665,
27
- "step": 20
28
- },
29
- {
30
- "epoch": 0.02,
31
- "eval_loss": 0.9467611908912659,
32
- "eval_runtime": 113.3365,
33
- "eval_samples_per_second": 5.735,
34
- "eval_steps_per_second": 1.438,
35
- "step": 25
36
- },
37
- {
38
- "epoch": 0.02,
39
- "learning_rate": 1.1522697745987076e-05,
40
- "loss": 0.9964,
41
- "step": 30
42
- },
43
- {
44
- "epoch": 0.03,
45
- "learning_rate": 3.2280092208200853e-06,
46
- "loss": 0.9493,
47
- "step": 40
48
- },
49
- {
50
- "epoch": 0.03,
51
- "learning_rate": 0.0,
52
- "loss": 0.8357,
53
- "step": 50
54
- },
55
- {
56
- "epoch": 0.03,
57
- "eval_loss": 0.8876652717590332,
58
- "eval_runtime": 114.0209,
59
- "eval_samples_per_second": 5.701,
60
- "eval_steps_per_second": 1.43,
61
- "step": 50
62
- },
63
- {
64
- "epoch": 0.03,
65
- "step": 50,
66
- "total_flos": 4008716634423296.0,
67
- "train_loss": 1.0970729541778566,
68
- "train_runtime": 345.3617,
69
- "train_samples_per_second": 0.579,
70
- "train_steps_per_second": 0.145
71
  }
72
  ],
73
  "logging_steps": 10,
74
- "max_steps": 50,
75
  "num_train_epochs": 1,
76
- "save_steps": 25,
77
- "total_flos": 4008716634423296.0,
78
  "trial_name": null,
79
  "trial_params": null
80
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.006839945280437756,
5
+ "eval_steps": 500,
6
+ "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 3e-05,
14
+ "loss": 0.8018,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.01,
19
+ "learning_rate": 9.046106882113753e-07,
20
+ "loss": 0.8749,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
+ "step": 10,
26
+ "total_flos": 835654992789504.0,
27
+ "train_loss": 0.8675599694252014,
28
+ "train_runtime": 15.492,
29
+ "train_samples_per_second": 2.582,
30
+ "train_steps_per_second": 0.645
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  ],
33
  "logging_steps": 10,
34
+ "max_steps": 10,
35
  "num_train_epochs": 1,
36
+ "save_steps": 500,
37
+ "total_flos": 835654992789504.0,
38
  "trial_name": null,
39
  "trial_params": null
40
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df6692a6e6bd05c8592eb9e565104b9ee05c4f45c8501abed71f58102acd7b74
3
  size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60808214d213e204fe40c4d698afc2e10b81e0c57389918246b4358b3e82dd00
3
  size 4664