!!python/tuple
- !!python/object:__main__.ModelArguments
  bnb_4bit_quant_type: nf4
  cache_dir: ./cache
  device_map: auto
  load_in_4bit: true
  load_in_8bit: false
  model_name_or_path: HuggingFaceH4/zephyr-7b-beta
  model_revision: main
  model_type: auto
  neft_alpha: 0
  rope_scaling: null
  shift_attn: false
  tokenizer_name_or_path: null
  torch_dtype: float16
  trust_remote_code: true
  use_bnb_nested_quant: false
  use_fast_tokenizer: false
  use_flash_attention_2: false
- !!python/object:__main__.DataArguments
  dataset_config_name: null
  dataset_name: null
  ignore_pad_token_for_loss: true
  max_eval_samples: null
  max_train_samples: null
  overwrite_cache: false
  preprocessing_num_workers: 4
  template_name: vicuna
  train_file_dir: datasets/finetune
  validation_file_dir: null
  validation_split_percentage: 10
- !!python/object:__main__.SFTConfig
  __cached__setup_devices: !!python/object/apply:torch.device
  - cuda
  - 0
  _n_gpu: 1
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  auto_find_batch_size: false
  bf16: false
  bf16_full_eval: false
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 0
  dataloader_pin_memory: true
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: false
  ddp_timeout: 30000
  debug: []
  deepspeed: null
  deepspeed_plugin: null
  disable_tqdm: false
  dispatch_batches: null
  distributed_state: !!python/object:accelerate.state.PartialState
    _cpu: false
    backend: null
    debug: false
    device: !!python/object/apply:torch.device
    - cuda
    - 0
    distributed_type: !!python/object/apply:accelerate.utils.dataclasses.DistributedType
    - MULTI_GPU
    fork_launched: false
    local_process_index: 0
    num_processes: 1
    process_index: 0
  do_eval: true
  do_predict: false
  do_train: true
  eval_accumulation_steps: null
  eval_delay: 0
  eval_steps: 25
  evaluation_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  gradient_accumulation_steps: 1
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  greater_is_better: null
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  hub_model_id: hllj/non-qa-sft-zephyr-7b-beta-v1
  hub_private_repo: false
  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
  - every_save
  hub_token: null
  ignore_data_skip: false
  include_inputs_for_metrics: false
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 3.0e-05
  length_column_name: length
  load_best_model_at_end: false
  local_rank: 0
  log_level: info
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: outputs-sft-zephyr-beta-v1/runs/Nov22_05-52-29_a72e59c0abac
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 10
  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
  - cosine
  max_grad_norm: 1.0
  max_seq_length: 512
  max_steps: 50
  metric_for_best_model: null
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_train_epochs: 3.0
  optim: !!python/object/apply:transformers.training_args.OptimizerNames
  - adamw_torch
  optim_args: null
  output_dir: outputs-sft-zephyr-beta-v1
  overwrite_output_dir: true
  past_index: -1
  per_device_eval_batch_size: 4
  per_device_train_batch_size: 4
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  push_to_hub: true
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_token: null
  ray_scope: last
  remove_unused_columns: true
  report_to:
  - wandb
  resume_from_checkpoint: null
  run_name: sft-zephyr-7b-beta-v1
  save_on_each_node: false
  save_safetensors: true
  save_steps: 25
  save_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  save_total_limit: 13
  seed: 42
  skip_memory_metrics: true
  split_batches: false
  tf32: null
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torchdynamo: null
  tpu_metrics_debug: false
  tpu_num_cores: null
  use_cpu: false
  use_ipex: false
  use_legacy_prediction_loop: false
  use_mps_device: false
  warmup_ratio: 0.05
  warmup_steps: 0
  weight_decay: 0.05
- !!python/object:__main__.ScriptArguments
  lora_alpha: 16
  lora_dropout: 0.1
  lora_modules_to_save: null
  lora_r: 64
  lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  peft_path: null
  use_peft: true