| cache_dir: ./cache | |
| ddp_find_unused_parameters: false | |
| ddp_timeout: 30000 | |
| device_map: auto | |
| do_eval: true | |
| do_train: true | |
| eval_steps: 25 | |
| evaluation_strategy: steps | |
| gradient_accumulation_steps: 1 | |
| gradient_checkpointing: true | |
| gradient_checkpointing_kwargs: | |
| &nbsp;&nbsp;use_reentrant: false | |
| hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1 | |
| hub_strategy: every_save | |
| learning_rate: 3.0e-05 | |
| load_in_4bit: true | |
| log_level: info | |
| logging_first_step: true | |
| logging_steps: 10 | |
| logging_strategy: steps | |
| lora_alpha: 128 | |
| lora_dropout: 0.1 | |
| lora_r: 256 | |
| lora_target_modules: | |
| &nbsp;&nbsp;- q_proj | |
| &nbsp;&nbsp;- k_proj | |
| &nbsp;&nbsp;- v_proj | |
| &nbsp;&nbsp;- o_proj | |
| lr_scheduler_type: cosine | |
| max_seq_length: 512 | |
| max_steps: 50 | |
| model_name_or_path: HuggingFaceH4/zephyr-7b-beta | |
| model_type: auto | |
| output_dir: outputs-sft-zephyr-beta-v1 | |
| overwrite_output_dir: true | |
| per_device_eval_batch_size: 4 | |
| per_device_train_batch_size: 4 | |
| preprocessing_num_workers: 4 | |
| push_to_hub: true | |
| report_to: wandb | |
| run_name: sft-zephyr-7b-beta-v1 | |
| save_steps: 25 | |
| save_strategy: steps | |
| save_total_limit: 13 | |
| seed: 42 | |
| train_file_dir: datasets/finetune | |
| use_peft: true | |
| warmup_ratio: 0.05 | |
| weight_decay: 0.05 | |
