Model save

Browse files

Files changed (6) hide show

README.md +3 -3
all_results.json +5 -5
config_argument.yaml +49 -188
eval_results.json +3 -3
train_results.json +2 -2
trainer_state.json +6 -6

README.md CHANGED Viewed

@@ -15,7 +15,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.1997
 ## Model description
@@ -48,8 +48,8 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 1.5219        | 0.02  | 25   | 1.2539          |
-| 1.3156        | 0.03  | 50   | 1.1997          |
 ### Framework versions

 This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.8877
 ## Model description
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 1.1665        | 0.02  | 25   | 0.9468          |
+| 0.8357        | 0.03  | 50   | 0.8877          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 0.03,
     "eval_loss": 0.8876652717590332,
-    "eval_runtime": 112.9915,
     "eval_samples": 650,
-    "eval_samples_per_second": 5.753,
-    "eval_steps_per_second": 1.443,
     "train_loss": 1.0970729541778566,
-    "train_runtime": 356.1922,
     "train_samples": 5845,
-    "train_samples_per_second": 0.561,
     "train_steps_per_second": 0.14
 }

 {
     "epoch": 0.03,
     "eval_loss": 0.8876652717590332,
+    "eval_runtime": 113.7341,
     "eval_samples": 650,
+    "eval_samples_per_second": 5.715,
+    "eval_steps_per_second": 1.433,
     "train_loss": 1.0970729541778566,
+    "train_runtime": 357.3025,
     "train_samples": 5845,
+    "train_samples_per_second": 0.56,
     "train_steps_per_second": 0.14
 }

config_argument.yaml CHANGED Viewed

@@ -1,188 +1,49 @@
-!!python/tuple
-- !!python/object:__main__.ModelArguments
-  bnb_4bit_quant_type: nf4
-  cache_dir: ./cache
-  device_map: auto
-  load_in_4bit: true
-  load_in_8bit: false
-  model_name_or_path: HuggingFaceH4/zephyr-7b-beta
-  model_revision: main
-  model_type: auto
-  neft_alpha: 0
-  rope_scaling: null
-  shift_attn: false
-  tokenizer_name_or_path: null
-  torch_dtype: float16
-  trust_remote_code: true
-  use_bnb_nested_quant: false
-  use_fast_tokenizer: false
-  use_flash_attention_2: false
-- !!python/object:__main__.DataArguments
-  dataset_config_name: null
-  dataset_name: null
-  ignore_pad_token_for_loss: true
-  max_eval_samples: null
-  max_train_samples: null
-  overwrite_cache: false
-  preprocessing_num_workers: 4
-  template_name: vicuna
-  train_file_dir: datasets/finetune
-  validation_file_dir: null
-  validation_split_percentage: 10
-- !!python/object:__main__.SFTConfig
-  __cached__setup_devices: !!python/object/apply:torch.device
-  - cuda
-  - 0
-  _n_gpu: 1
-  adafactor: false
-  adam_beta1: 0.9
-  adam_beta2: 0.999
-  adam_epsilon: 1.0e-08
-  auto_find_batch_size: false
-  bf16: false
-  bf16_full_eval: false
-  data_seed: null
-  dataloader_drop_last: false
-  dataloader_num_workers: 0
-  dataloader_pin_memory: true
-  ddp_backend: null
-  ddp_broadcast_buffers: null
-  ddp_bucket_cap_mb: null
-  ddp_find_unused_parameters: false
-  ddp_timeout: 30000
-  debug: []
-  deepspeed: null
-  deepspeed_plugin: null
-  disable_tqdm: false
-  dispatch_batches: null
-  distributed_state: !!python/object:accelerate.state.PartialState
-    _cpu: false
-    backend: null
-    debug: false
-    device: !!python/object/apply:torch.device
-    - cuda
-    - 0
-    distributed_type: !!python/object/apply:accelerate.utils.dataclasses.DistributedType
-    - MULTI_GPU
-    fork_launched: false
-    local_process_index: 0
-    num_processes: 1
-    process_index: 0
-  do_eval: true
-  do_predict: false
-  do_train: true
-  eval_accumulation_steps: null
-  eval_delay: 0
-  eval_steps: 25
-  evaluation_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
-  - steps
-  fp16: false
-  fp16_backend: auto
-  fp16_full_eval: false
-  fp16_opt_level: O1
-  fsdp: []
-  fsdp_config:
-    min_num_params: 0
-    xla: false
-    xla_fsdp_grad_ckpt: false
-  fsdp_min_num_params: 0
-  fsdp_transformer_layer_cls_to_wrap: null
-  full_determinism: false
-  gradient_accumulation_steps: 1
-  gradient_checkpointing: true
-  gradient_checkpointing_kwargs:
-    use_reentrant: false
-  greater_is_better: null
-  group_by_length: false
-  half_precision_backend: auto
-  hub_always_push: false
-  hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
-  hub_private_repo: false
-  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
-  - every_save
-  hub_token: null
-  ignore_data_skip: false
-  include_inputs_for_metrics: false
-  include_tokens_per_second: false
-  jit_mode_eval: false
-  label_names: null
-  label_smoothing_factor: 0.0
-  learning_rate: 3.0e-05
-  length_column_name: length
-  load_best_model_at_end: false
-  local_rank: 0
-  log_level: info
-  log_level_replica: warning
-  log_on_each_node: true
-  logging_dir: outputs-sft-zephyr-beta-v1/runs/Nov22_05-52-29_a72e59c0abac
-  logging_first_step: true
-  logging_nan_inf_filter: true
-  logging_steps: 10
-  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
-  - steps
-  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
-  - cosine
-  max_grad_norm: 1.0
-  max_seq_length: 512
-  max_steps: 50
-  metric_for_best_model: null
-  mp_parameters: ''
-  neftune_noise_alpha: null
-  no_cuda: false
-  num_train_epochs: 3.0
-  optim: !!python/object/apply:transformers.training_args.OptimizerNames
-  - adamw_torch
-  optim_args: null
-  output_dir: outputs-sft-zephyr-beta-v1
-  overwrite_output_dir: true
-  past_index: -1
-  per_device_eval_batch_size: 4
-  per_device_train_batch_size: 4
-  per_gpu_eval_batch_size: null
-  per_gpu_train_batch_size: null
-  prediction_loss_only: false
-  push_to_hub: true
-  push_to_hub_model_id: null
-  push_to_hub_organization: null
-  push_to_hub_token: null
-  ray_scope: last
-  remove_unused_columns: true
-  report_to:
-  - wandb
-  resume_from_checkpoint: null
-  run_name: sft-zephyr-7b-beta-v1
-  save_on_each_node: false
-  save_safetensors: true
-  save_steps: 25
-  save_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
-  - steps
-  save_total_limit: 13
-  seed: 42
-  skip_memory_metrics: true
-  split_batches: false
-  tf32: null
-  torch_compile: false
-  torch_compile_backend: null
-  torch_compile_mode: null
-  torchdynamo: null
-  tpu_metrics_debug: false
-  tpu_num_cores: null
-  use_cpu: false
-  use_ipex: false
-  use_legacy_prediction_loop: false
-  use_mps_device: false
-  warmup_ratio: 0.05
-  warmup_steps: 0
-  weight_decay: 0.05
-- !!python/object:__main__.ScriptArguments
-  lora_alpha: 16
-  lora_dropout: 0.1
-  lora_modules_to_save: null
-  lora_r: 64
-  lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-  peft_path: null
-  use_peft: true

+cache_dir: ./cache
+ddp_find_unused_parameters: false
+ddp_timeout: 30000
+device_map: auto
+do_eval: true
+do_train: true
+eval_steps: 25
+evaluation_strategy: steps
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
+hub_strategy: every_save
+learning_rate: 3.0e-05
+load_in_4bit: true
+log_level: info
+logging_first_step: true
+logging_steps: 10
+logging_strategy: steps
+lora_alpha: 128
+lora_dropout: 0.1
+lora_r: 256
+lora_target_modules:
+- q_proj
+- k_proj
+- v_proj
+- o_proj
+lr_scheduler_type: cosine
+max_seq_length: 512
+max_steps: 50
+model_name_or_path: HuggingFaceH4/zephyr-7b-beta
+model_type: auto
+output_dir: outputs-sft-zephyr-beta-v1
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+preprocessing_num_workers: 4
+push_to_hub: true
+report_to: wandb
+run_name: sft-zephyr-7b-beta-v1
+save_steps: 25
+save_strategy: steps
+save_total_limit: 13
+seed: 42
+train_file_dir: datasets/finetune
+use_peft: true
+warmup_ratio: 0.05
+weight_decay: 0.05

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 0.03,
     "eval_loss": 0.8876652717590332,
-    "eval_runtime": 112.9915,
     "eval_samples": 650,
-    "eval_samples_per_second": 5.753,
-    "eval_steps_per_second": 1.443
 }

 {
     "epoch": 0.03,
     "eval_loss": 0.8876652717590332,
+    "eval_runtime": 113.7341,
     "eval_samples": 650,
+    "eval_samples_per_second": 5.715,
+    "eval_steps_per_second": 1.433
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 0.03,
     "train_loss": 1.0970729541778566,
-    "train_runtime": 356.1922,
     "train_samples": 5845,
-    "train_samples_per_second": 0.561,
     "train_steps_per_second": 0.14
 }

 {
     "epoch": 0.03,
     "train_loss": 1.0970729541778566,
+    "train_runtime": 357.3025,
     "train_samples": 5845,
+    "train_samples_per_second": 0.56,
     "train_steps_per_second": 0.14
 }

trainer_state.json CHANGED Viewed

@@ -29,7 +29,7 @@
     {
       "epoch": 0.02,
       "eval_loss": 0.9467611908912659,
-      "eval_runtime": 113.3532,
       "eval_samples_per_second": 5.734,
       "eval_steps_per_second": 1.438,
       "step": 25
@@ -55,9 +55,9 @@
     {
       "epoch": 0.03,
       "eval_loss": 0.8876652717590332,
-      "eval_runtime": 114.2498,
-      "eval_samples_per_second": 5.689,
-      "eval_steps_per_second": 1.427,
       "step": 50
     },
     {
@@ -65,8 +65,8 @@
       "step": 50,
       "total_flos": 4008716634423296.0,
       "train_loss": 1.0970729541778566,
-      "train_runtime": 356.1922,
-      "train_samples_per_second": 0.561,
       "train_steps_per_second": 0.14
     }
   ],

     {
       "epoch": 0.02,
       "eval_loss": 0.9467611908912659,
+      "eval_runtime": 113.3606,
       "eval_samples_per_second": 5.734,
       "eval_steps_per_second": 1.438,
       "step": 25
     {
       "epoch": 0.03,
       "eval_loss": 0.8876652717590332,
+      "eval_runtime": 114.0086,
+      "eval_samples_per_second": 5.701,
+      "eval_steps_per_second": 1.43,
       "step": 50
     },
     {
       "step": 50,
       "total_flos": 4008716634423296.0,
       "train_loss": 1.0970729541778566,
+      "train_runtime": 357.3025,
+      "train_samples_per_second": 0.56,
       "train_steps_per_second": 0.14
     }
   ],