Model save

Changed files:

- README.md +5 -9
- adapter_config.json +5 -5
- adapter_model.safetensors +1 -1
- all_results.json +9 -9
- config_argument.yaml +7 -6
- eval_results.json +5 -5
- train_results.json +5 -5
- trainer_state.json +16 -56
- training_args.bin +1 -1
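For local inspection, the commit's artifacts can be pulled from the Hub. A minimal sketch with huggingface_hub, assuming the repo id hllj/non-qa-sft-zephyr-7b-beta-v1 taken from hub_model_id in config_argument.yaml below:

```python
# Sketch: download the artifacts listed above from the Hub.
# The repo id comes from hub_model_id in config_argument.yaml; adjust if the
# adapter lives elsewhere.
from huggingface_hub import hf_hub_download

REPO_ID = "hllj/non-qa-sft-zephyr-7b-beta-v1"
FILES = [
    "README.md",
    "adapter_config.json",
    "adapter_model.safetensors",
    "all_results.json",
    "config_argument.yaml",
    "eval_results.json",
    "train_results.json",
    "trainer_state.json",
    "training_args.bin",
]

for name in FILES:
    path = hf_hub_download(repo_id=REPO_ID, filename=name)
    print(path)  # local cache path of each artifact
```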
README.md CHANGED

@@ -1,6 +1,5 @@
 ---
-license: mit
-base_model: HuggingFaceH4/zephyr-7b-beta
+base_model: hllj/zephyr-7b-beta-vi-math
 tags:
 - generated_from_trainer
 model-index:
@@ -13,9 +12,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # non-qa-sft-zephyr-7b-beta-v1
 
-This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
+This model is a fine-tuned version of [hllj/zephyr-7b-beta-vi-math](https://huggingface.co/hllj/zephyr-7b-beta-vi-math) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.8877
+- Loss: 0.9442
 
 ## Model description
 
@@ -42,14 +41,11 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.05
-- training_steps: 50
+- training_steps: 10
+- mixed_precision_training: Native AMP
 
 ### Training results
 
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 1.1665 | 0.02 | 25 | 0.9468 |
-| 0.8357 | 0.03 | 50 | 0.8877 |
 
 
 ### Framework versions
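A minimal inference sketch for the card above, assuming the adapter is published under hllj/non-qa-sft-zephyr-7b-beta-v1 and that peft's AutoPeftModelForCausalLM resolves the base model from adapter_config.json (the example prompt is hypothetical):

```python
# Sketch: load the fine-tuned LoRA adapter for inference with peft.
# AutoPeftModelForCausalLM reads adapter_config.json to find the base model
# (hllj/zephyr-7b-beta-vi-math) and applies the adapter weights on top.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

ADAPTER_ID = "hllj/non-qa-sft-zephyr-7b-beta-v1"

model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_ID, torch_dtype=torch.float16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("hllj/zephyr-7b-beta-vi-math")

# Hypothetical prompt ("Solve the equation x + 2 = 5."), matching the
# Vietnamese-math base model.
inputs = tokenizer("Giải phương trình x + 2 = 5.", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```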
adapter_config.json CHANGED

@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "HuggingFaceH4/zephyr-7b-beta",
+  "base_model_name_or_path": "hllj/zephyr-7b-beta-vi-math",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
@@ -9,17 +9,17 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "lora_alpha": 128,
-  "lora_dropout": 0.
+  "lora_dropout": 0.05,
   "modules_to_save": null,
   "peft_type": "LORA",
   "r": 256,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "v_proj",
-    "k_proj",
     "q_proj",
-    "o_proj"
+    "k_proj",
+    "o_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
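For reference, the new adapter settings expressed as a peft LoraConfig — a sketch of how this adapter_config.json could be regenerated in code:

```python
# Sketch: the LoRA settings above, reconstructed as a peft LoraConfig.
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,                      # "r": 256
    lora_alpha=128,             # "lora_alpha": 128
    lora_dropout=0.05,          # "lora_dropout": 0.05
    target_modules=["q_proj", "k_proj", "o_proj", "v_proj"],
    bias="none",                # "bias": "none"
    task_type="CAUSAL_LM",      # "task_type": "CAUSAL_LM"
)

# lora_config.save_pretrained("outputs-sft-zephyr-beta-v1") would write an
# adapter_config.json equivalent to the new version shown above.
```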
adapter_model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:32b4b241e08bb7a57725bd8062508e4bb722e0f5da90b1c6400db2fc88e3231f
 size 872450448
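Since this is a Git LFS pointer file, a downloaded blob can be checked against the oid above; a small sketch:

```python
# Sketch: verify a downloaded LFS blob against the pointer's sha256 oid.
import hashlib

EXPECTED = "32b4b241e08bb7a57725bd8062508e4bb722e0f5da90b1c6400db2fc88e3231f"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of("adapter_model.safetensors") == EXPECTED
```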
all_results.json CHANGED

@@ -1,13 +1,13 @@
 {
-    "epoch": 0.03,
-    "eval_loss": 0.8876652717590332,
-    "eval_runtime":
+    "epoch": 0.01,
+    "eval_loss": 0.9441541433334351,
+    "eval_runtime": 76.9742,
     "eval_samples": 650,
-    "eval_samples_per_second":
-    "eval_steps_per_second":
-    "train_loss": 1.0970729541778566,
-    "train_runtime": 345.3617,
+    "eval_samples_per_second": 8.444,
+    "eval_steps_per_second": 2.118,
+    "train_loss": 0.8675599694252014,
+    "train_runtime": 15.492,
     "train_samples": 5845,
-    "train_samples_per_second": 0.579,
-    "train_steps_per_second": 0.145
+    "train_samples_per_second": 2.582,
+    "train_steps_per_second": 0.645
 }
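The new numbers are internally consistent: HF's Trainer reports epoch = global_step / ceil(train_samples / effective_batch_size), which together with the fractional epoch in trainer_state.json below implies an effective batch size of 4 (inferred here, not stated in the config):

```python
# Sketch: sanity-check the fractional epoch reported in trainer_state.json.
# epoch = global_step / num_update_steps_per_epoch, where
# num_update_steps_per_epoch = ceil(train_samples / effective_batch_size).
import math

train_samples = 5845
global_step = 10
effective_batch_size = 4  # assumption: per_device_batch * grad_accum * world_size

steps_per_epoch = math.ceil(train_samples / effective_batch_size)  # 1462
epoch = global_step / steps_per_epoch
print(epoch)  # 0.006839945280437756 -- matches trainer_state.json below
```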
config_argument.yaml CHANGED

@@ -4,8 +4,9 @@ ddp_timeout: 30000
 device_map: auto
 do_eval: true
 do_train: true
-eval_steps: 25
+eval_steps: 500
 evaluation_strategy: steps
+fp16: true
 gradient_accumulation_steps: 1
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
@@ -13,13 +14,12 @@ gradient_checkpointing_kwargs:
 hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
 hub_strategy: every_save
 learning_rate: 3.0e-05
-load_in_4bit: true
 log_level: info
 logging_first_step: true
 logging_steps: 10
 logging_strategy: steps
 lora_alpha: 128
-lora_dropout: 0.
+lora_dropout: 0.05
 lora_r: 256
 lora_target_modules:
 - q_proj
@@ -28,8 +28,8 @@ lora_target_modules:
 - o_proj
 lr_scheduler_type: cosine
 max_seq_length: 512
-max_steps: 50
-model_name_or_path: HuggingFaceH4/zephyr-7b-beta
+max_steps: 10
+model_name_or_path: hllj/zephyr-7b-beta-vi-math
 model_type: auto
 output_dir: outputs-sft-zephyr-beta-v1
 overwrite_output_dir: true
@@ -39,10 +39,11 @@ preprocessing_num_workers: 4
 push_to_hub: true
 report_to: wandb
 run_name: sft-zephyr-7b-beta-v1
-save_steps:
+save_steps: 500
 save_strategy: steps
 save_total_limit: 13
 seed: 42
+torch_dtype: float16
 train_file_dir: datasets/finetune
 use_peft: true
 warmup_ratio: 0.05
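A sketch of one way to feed such a flat YAML to transformers, assuming a recent HfArgumentParser whose parse_dict accepts allow_extra_keys so the non-Trainer keys (lora_*, model_name_or_path, train_file_dir, ...) are ignored:

```python
# Sketch: load the flat YAML above and route the Trainer-relevant keys into
# transformers.TrainingArguments; allow_extra_keys drops the custom ones.
import yaml
from transformers import HfArgumentParser, TrainingArguments

with open("config_argument.yaml") as f:
    raw = yaml.safe_load(f)

parser = HfArgumentParser(TrainingArguments)
(training_args,) = parser.parse_dict(raw, allow_extra_keys=True)

print(training_args.max_steps)      # 10
print(training_args.learning_rate)  # 3e-05
print(training_args.warmup_ratio)   # 0.05
```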
eval_results.json CHANGED

@@ -1,8 +1,8 @@
 {
-    "epoch": 0.03,
-    "eval_loss": 0.8876652717590332,
-    "eval_runtime":
+    "epoch": 0.01,
+    "eval_loss": 0.9441541433334351,
+    "eval_runtime": 76.9742,
     "eval_samples": 650,
-    "eval_samples_per_second":
-    "eval_steps_per_second":
+    "eval_samples_per_second": 8.444,
+    "eval_steps_per_second": 2.118
 }
train_results.json CHANGED

@@ -1,8 +1,8 @@
 {
-    "epoch": 0.03,
-    "train_loss": 1.0970729541778566,
-    "train_runtime": 345.3617,
+    "epoch": 0.01,
+    "train_loss": 0.8675599694252014,
+    "train_runtime": 15.492,
     "train_samples": 5845,
-    "train_samples_per_second": 0.579,
-    "train_steps_per_second": 0.145
+    "train_samples_per_second": 2.582,
+    "train_steps_per_second": 0.645
 }
trainer_state.json CHANGED

@@ -1,80 +1,40 @@
 {
     "best_metric": null,
     "best_model_checkpoint": null,
-    "epoch": 0.03,
-    "eval_steps": 25,
-    "global_step": 50,
+    "epoch": 0.006839945280437756,
+    "eval_steps": 500,
+    "global_step": 10,
     "is_hyper_param_search": false,
     "is_local_process_zero": true,
     "is_world_process_zero": true,
     "log_history": [
         {
             "epoch": 0.0,
-            "learning_rate":
-            "loss":
+            "learning_rate": 3e-05,
+            "loss": 0.8018,
             "step": 1
         },
         {
             "epoch": 0.01,
-            "learning_rate":
-            "loss":
+            "learning_rate": 9.046106882113753e-07,
+            "loss": 0.8749,
             "step": 10
         },
         {
             "epoch": 0.01,
-            "learning_rate":
-            "loss":
-            "step": 20
-        },
-        {
-            "epoch": 0.02,
-            "eval_loss": 0.9467611908912659,
-            "eval_runtime": 113.3365,
-            "eval_samples_per_second": 5.735,
-            "eval_steps_per_second": 1.438,
-            "step": 25
-        },
-        {
-            "epoch": 0.02,
-            "learning_rate": 1.1522697745987076e-05,
-            "loss": 0.9964,
-            "step": 30
-        },
-        {
-            "epoch": 0.03,
-            "learning_rate": 3.2280092208200853e-06,
-            "loss": 0.9493,
-            "step": 40
-        },
-        {
-            "epoch": 0.03,
-            "learning_rate": 0.0,
-            "loss": 0.8357,
-            "step": 50
-        },
-        {
-            "epoch": 0.03,
-            "eval_loss": 0.8876652717590332,
-            "eval_runtime": 114.0209,
-            "eval_samples_per_second": 5.701,
-            "eval_steps_per_second": 1.43,
-            "step": 50
-        },
-        {
-            "epoch": 0.03,
-            "step": 50,
-            "total_flos": 4008716634423296.0,
-            "train_loss": 1.0970729541778566,
-            "train_runtime": 345.3617,
-            "train_samples_per_second": 0.579,
-            "train_steps_per_second": 0.145
+            "step": 10,
+            "total_flos": 835654992789504.0,
+            "train_loss": 0.8675599694252014,
+            "train_runtime": 15.492,
+            "train_samples_per_second": 2.582,
+            "train_steps_per_second": 0.645
         }
     ],
     "logging_steps": 10,
-    "max_steps": 50,
+    "max_steps": 10,
     "num_train_epochs": 1,
-    "save_steps":
-    "total_flos": 4008716634423296.0,
+    "save_steps": 500,
+    "total_flos": 835654992789504.0,
     "trial_name": null,
     "trial_params": null
 }
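The logged learning rates follow transformers' cosine schedule with warmup_steps = ceil(warmup_ratio × max_steps) = ceil(0.05 × 10) = 1; the value recorded on the step-10 log line matches the schedule evaluated one step before the end (where exactly Trainer samples the lr is an off-by-one detail of its logging). A sketch reproducing it:

```python
# Sketch: reproduce the logged lr under the cosine schedule transformers uses
# (get_cosine_schedule_with_warmup), assuming warmup_steps = 1 as above.
import math

peak_lr, max_steps, warmup_steps = 3.0e-05, 10, 1

def cosine_lr(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    return peak_lr * max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

print(cosine_lr(9))  # ~9.0461e-07, matching the lr logged at step 10
```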
training_args.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:60808214d213e204fe40c4d698afc2e10b81e0c57389918246b4358b3e82dd00
 size 4664
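Finally, training_args.bin is a pickled TrainingArguments saved by Trainer; a quick inspection sketch (it is an arbitrary pickle, so only load trusted files):

```python
# Sketch: inspect the pickled TrainingArguments. weights_only=False is
# required on newer torch because this is a full pickled object.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.max_steps)          # 10
print(args.save_steps)         # 500
print(args.lr_scheduler_type)  # cosine
```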