ryanmarten commited on
Commit
712ef1c
·
verified ·
1 Parent(s): 27ae82d

Training in progress, epoch 1

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "/data/horse/ws/ryma833h-DCFT_Shared/huggingface/hub/Qwen/Qwen2.5-7B-Instruct",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:561b72ae3bd9daa58dbd310a6f0a024e6243b2545174f14cedd14107ad65524c
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f2ad04589b6dc72ec76b95b02db4661b9d5e834313eaaa4cf9463e5f43b36d8
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fc5d5f4503d877e957d0e1f80a4db682d8bfd00ab5134b100ff9b2ff05f4851
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b260d59e122a42972315dc3416b98012d73031a51a39246953bd0cc2927dc43f
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b59b8b9793b1f91c1c59d296bfb5249f4f179f5db8331b3a668fd96cb8411646
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2eb44b4c5053339b770301f7d87fbc4d7263f68083fca77ec9183e972b49d09
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53bc45428cbda968ebaddc622f57110e74a6a6203a91b475e3a1ad93b580da57
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8100a42157599910b6ec01fec7ad69d8f9fbdcef534c419fae525b1f853f650a
3
  size 1089994880
trainer_log.jsonl CHANGED
@@ -1,3 +1,3 @@
1
- {"current_steps": 1, "total_steps": 2, "loss": 1.2459, "lr": 2e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:01:52", "remaining_time": "0:01:52"}
2
- {"current_steps": 2, "total_steps": 2, "loss": 1.2028, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:03:34", "remaining_time": "0:00:00"}
3
- {"current_steps": 2, "total_steps": 2, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:06:35", "remaining_time": "0:00:00"}
 
1
+ {"current_steps": 1, "total_steps": 2, "loss": 1.2459, "lr": 2e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:02:03", "remaining_time": "0:02:03"}
2
+ {"current_steps": 2, "total_steps": 2, "loss": 1.2028, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:03:44", "remaining_time": "0:00:00"}
3
+ {"current_steps": 2, "total_steps": 2, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:06:21", "remaining_time": "0:00:00"}
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17fb8d58f0092a0ab77418a0643622015fc3f3a34cf3d26034253f51f4af6424
3
  size 7288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a090da2170c8c3fe4545affa6dd128b771479f14fa1ee3dede9a1a44101e360e
3
  size 7288
wandb/debug-internal.log CHANGED
@@ -1,7 +1,7 @@
1
- {"time":"2025-04-12T23:40:30.119603299+02:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250412_234030-6n803w1v/logs/debug-core.log"}
2
- {"time":"2025-04-12T23:40:30.400735107+02:00","level":"INFO","msg":"created new stream","id":"6n803w1v"}
3
- {"time":"2025-04-12T23:40:30.400829687+02:00","level":"INFO","msg":"stream: started","id":"6n803w1v"}
4
- {"time":"2025-04-12T23:40:30.400876007+02:00","level":"INFO","msg":"writer: Do: started","stream_id":"6n803w1v"}
5
- {"time":"2025-04-12T23:40:30.400943917+02:00","level":"INFO","msg":"sender: started","stream_id":"6n803w1v"}
6
- {"time":"2025-04-12T23:40:30.400957198+02:00","level":"INFO","msg":"handler: started","stream_id":"6n803w1v"}
7
- {"time":"2025-04-12T23:40:30.678619044+02:00","level":"INFO","msg":"Starting system monitor"}
 
1
+ {"time":"2025-04-13T00:22:04.825520919+02:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-core.log"}
2
+ {"time":"2025-04-13T00:22:05.128494964+02:00","level":"INFO","msg":"created new stream","id":"33xvut2k"}
3
+ {"time":"2025-04-13T00:22:05.128573564+02:00","level":"INFO","msg":"stream: started","id":"33xvut2k"}
4
+ {"time":"2025-04-13T00:22:05.128604494+02:00","level":"INFO","msg":"writer: Do: started","stream_id":"33xvut2k"}
5
+ {"time":"2025-04-13T00:22:05.128621084+02:00","level":"INFO","msg":"sender: started","stream_id":"33xvut2k"}
6
+ {"time":"2025-04-13T00:22:05.128629264+02:00","level":"INFO","msg":"handler: started","stream_id":"33xvut2k"}
7
+ {"time":"2025-04-13T00:22:05.425669006+02:00","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log CHANGED
@@ -1,25 +1,25 @@
1
- 2025-04-12 23:40:30,108 INFO MainThread:400396 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
- 2025-04-12 23:40:30,109 INFO MainThread:400396 [wandb_setup.py:_flush():67] Configure stats pid to 400396
3
- 2025-04-12 23:40:30,109 INFO MainThread:400396 [wandb_setup.py:_flush():67] Loading settings from /home/ryma833h/.config/wandb/settings
4
- 2025-04-12 23:40:30,109 INFO MainThread:400396 [wandb_setup.py:_flush():67] Loading settings from /data/horse/ws/ryma833h-DCFT_Shared/dcft_private/wandb/settings
5
- 2025-04-12 23:40:30,109 INFO MainThread:400396 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
- 2025-04-12 23:40:30,110 INFO MainThread:400396 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250412_234030-6n803w1v/logs/debug.log
7
- 2025-04-12 23:40:30,110 INFO MainThread:400396 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250412_234030-6n803w1v/logs/debug-internal.log
8
- 2025-04-12 23:40:30,110 INFO MainThread:400396 [wandb_init.py:init():761] calling init triggers
9
- 2025-04-12 23:40:30,110 INFO MainThread:400396 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
  config: {'_wandb': {}}
11
- 2025-04-12 23:40:30,110 INFO MainThread:400396 [wandb_init.py:init():784] starting backend
12
- 2025-04-12 23:40:30,110 INFO MainThread:400396 [wandb_init.py:init():788] sending inform_init request
13
- 2025-04-12 23:40:30,116 INFO MainThread:400396 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
- 2025-04-12 23:40:30,116 INFO MainThread:400396 [wandb_init.py:init():798] backend started and connected
15
- 2025-04-12 23:40:30,118 INFO MainThread:400396 [wandb_init.py:init():891] updated telemetry
16
- 2025-04-12 23:40:30,152 INFO MainThread:400396 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
- 2025-04-12 23:40:30,674 INFO MainThread:400396 [wandb_init.py:init():990] starting run threads in backend
18
- 2025-04-12 23:40:31,056 INFO MainThread:400396 [wandb_run.py:_console_start():2375] atexit reg
19
- 2025-04-12 23:40:31,057 INFO MainThread:400396 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
- 2025-04-12 23:40:31,057 INFO MainThread:400396 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
- 2025-04-12 23:40:31,057 INFO MainThread:400396 [wandb_run.py:_redirect():2315] Redirects installed.
22
- 2025-04-12 23:40:31,063 INFO MainThread:400396 [wandb_init.py:init():1032] run started, returning control to user process
23
- 2025-04-12 23:40:31,064 INFO MainThread:400396 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 3584, 'intermediate_size': 18944, 'num_hidden_layers': 28, 'num_attention_heads': 28, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/data/horse/ws/ryma833h-DCFT_Shared/huggingface/hub/Qwen/Qwen2.5-7B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.46.1', 'model_type': 'qwen2', 'output_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 24, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/runs/Apr12_23-40-04_c126', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'dcft/train/zero3.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 's1k-11-test-192', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 16384, 'generation_num_beams': None, 'generation_config': None}
24
- 2025-04-12 23:40:31,066 INFO MainThread:400396 [wandb_config.py:__setitem__():154] config set model/num_parameters = 0 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7efb74038d00>>
25
- 2025-04-12 23:40:31,066 INFO MainThread:400396 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 0 None
 
1
+ 2025-04-13 00:22:04,770 INFO MainThread:402765 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Configure stats pid to 402765
3
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /home/ryma833h/.config/wandb/settings
4
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /data/horse/ws/ryma833h-DCFT_Shared/dcft_private/wandb/settings
5
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug.log
7
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log
8
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():761] calling init triggers
9
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
  config: {'_wandb': {}}
11
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():784] starting backend
12
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-04-13 00:22:04,822 INFO MainThread:402765 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-13 00:22:04,823 INFO MainThread:402765 [wandb_init.py:init():798] backend started and connected
15
+ 2025-04-13 00:22:04,824 INFO MainThread:402765 [wandb_init.py:init():891] updated telemetry
16
+ 2025-04-13 00:22:04,991 INFO MainThread:402765 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-04-13 00:22:05,420 INFO MainThread:402765 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-04-13 00:22:07,461 INFO MainThread:402765 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-04-13 00:22:07,505 INFO MainThread:402765 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-04-13 00:22:07,506 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 3584, 'intermediate_size': 18944, 'num_hidden_layers': 28, 'num_attention_heads': 28, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-7B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.46.1', 'model_type': 'qwen2', 'output_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 24, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/runs/Apr13_00-20-44_c126', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'dcft/train/zero3.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 's1k-11-test-192', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 16384, 'generation_num_beams': None, 'generation_config': None}
24
+ 2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_config.py:__setitem__():154] config set model/num_parameters = 0 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7faa58768340>>
25
+ 2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 0 None
wandb/run-20250413_002204-33xvut2k/files/output.log ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 100%|██████████| 2/2 [03:44<00:00, 110.11s/it][INFO|trainer.py:3801] 2025-04-13 00:25:53,959 >> Saving model checkpoint to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2
2
+ {'loss': 1.2459, 'grad_norm': 6.925258567695526, 'learning_rate': 2e-05, 'epoch': 0.5}
3
+ {'loss': 1.2028, 'grad_norm': 7.060579782684043, 'learning_rate': 0.0, 'epoch': 1.0}
4
+ [INFO|configuration_utils.py:414] 2025-04-13 00:25:53,969 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/config.json
5
+ [INFO|configuration_utils.py:865] 2025-04-13 00:25:53,972 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/generation_config.json
6
+ [INFO|modeling_utils.py:3043] 2025-04-13 00:26:09,813 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/model.safetensors.index.json.
7
+ [INFO|tokenization_utils_base.py:2646] 2025-04-13 00:26:09,816 >> tokenizer config file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/tokenizer_config.json
8
+ [INFO|tokenization_utils_base.py:2655] 2025-04-13 00:26:09,817 >> Special tokens file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/special_tokens_map.json
9
+ [2025-04-13 00:26:10,062] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2 is about to be saved!
10
+ [2025-04-13 00:26:10,107] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt
11
+ [2025-04-13 00:26:10,107] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt...
12
+ [2025-04-13 00:26:10,187] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt.
13
+ [2025-04-13 00:26:10,189] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
14
+ [2025-04-13 00:26:46,139] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
15
+ [2025-04-13 00:26:46,161] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
16
+ [2025-04-13 00:26:47,801] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2 is ready now!
17
+ [INFO|tokenization_utils_base.py:2646] 2025-04-13 00:27:40,232 >> tokenizer config file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/tokenizer_config.json
18
+ [INFO|tokenization_utils_base.py:2655] 2025-04-13 00:27:40,234 >> Special tokens file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/special_tokens_map.json
19
+ [INFO|trainer.py:3801] 2025-04-13 00:27:42,542 >> Saving model checkpoint to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2
20
+ [INFO|configuration_utils.py:414] 2025-04-13 00:27:42,547 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/config.json
21
+ [INFO|configuration_utils.py:865] 2025-04-13 00:27:42,549 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/generation_config.json
22
+ [INFO|modeling_utils.py:3043] 2025-04-13 00:27:57,600 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/model.safetensors.index.json.
23
+ [INFO|tokenization_utils_base.py:2646] 2025-04-13 00:27:57,602 >> tokenizer config file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/tokenizer_config.json
24
+ [INFO|tokenization_utils_base.py:2655] 2025-04-13 00:27:57,603 >> Special tokens file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/special_tokens_map.json
25
+ [2025-04-13 00:27:57,807] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2 is about to be saved!
26
+ [2025-04-13 00:27:57,815] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt
27
+ [2025-04-13 00:27:57,815] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt...
28
+ [2025-04-13 00:27:57,827] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt.
29
+ [2025-04-13 00:27:57,844] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
30
+ [2025-04-13 00:28:28,367] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
31
+ [2025-04-13 00:28:28,370] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
32
+ [2025-04-13 00:28:28,921] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2 is ready now!
33
+ [INFO|trainer.py:2584] 2025-04-13 00:28:28,929 >>
34
+
35
+ Training completed. Do not forget to share your model on huggingface.co/models =)
36
+
37
+
38
+ 100%|██████████| 2/2 [06:21<00:00, 190.72s/it]
39
+ {'train_runtime': 384.9677, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.005, 'train_loss': 1.2243931889533997, 'epoch': 1.0}
40
+ [INFO|trainer.py:4582] 2025-04-13 00:28:28,948 >> Waiting for the current checkpoint push to be finished, this might take a couple of minutes.
wandb/run-20250413_002204-33xvut2k/files/requirements.txt ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ nvidia-cusolver-cu12==11.6.1.9
2
+ greenlet==3.1.1
3
+ Jinja2==3.1.6
4
+ GitPython==3.1.44
5
+ lm_eval==0.4.8
6
+ semantic-version==2.10.0
7
+ Pygments==2.19.1
8
+ nvidia-cufft-cu12==11.2.1.3
9
+ sympy==1.13.1
10
+ charset-normalizer==3.4.1
11
+ pillow==10.4.0
12
+ wandb==0.19.8
13
+ h11==0.14.0
14
+ aiohttp==3.11.14
15
+ datasets==3.1.0
16
+ fonttools==4.56.0
17
+ huggingface-hub==0.29.3
18
+ chardet==5.2.0
19
+ colorama==0.4.6
20
+ sse-starlette==2.2.1
21
+ trl==0.9.6
22
+ tzdata==2025.2
23
+ aiosignal==1.3.2
24
+ Markdown==3.7
25
+ zstandard==0.23.0
26
+ nvidia-nccl-cu12==2.21.5
27
+ tensorboard-data-server==0.7.2
28
+ filelock==3.18.0
29
+ liger_kernel==0.3.1
30
+ msgpack==1.1.0
31
+ gitdb==4.0.12
32
+ wheel==0.45.1
33
+ peft==0.12.0
34
+ mbstrdecoder==1.1.4
35
+ cycler==0.12.1
36
+ tyro==0.9.17
37
+ av==14.2.0
38
+ httpx==0.28.1
39
+ typepy==1.3.4
40
+ pytz==2025.2
41
+ py-cpuinfo==9.0.0
42
+ pydantic==2.10.6
43
+ requests==2.32.3
44
+ typeguard==4.4.2
45
+ dcft==0.1.0
46
+ exceptiongroup==1.2.2
47
+ fsspec==2024.9.0
48
+ nvidia-nvjitlink-cu12==12.4.127
49
+ tensorboard==2.19.0
50
+ tabulate==0.9.0
51
+ tokenizers==0.20.3
52
+ multidict==6.2.0
53
+ python-multipart==0.0.20
54
+ multiprocess==0.70.16
55
+ packaging==24.2
56
+ propcache==0.3.1
57
+ rich==13.9.4
58
+ nltk==3.9.1
59
+ rouge_score==0.1.2
60
+ psutil==7.0.0
61
+ deepspeed==0.15.2
62
+ nvidia-cuda-runtime-cu12==12.4.127
63
+ contourpy==1.3.1
64
+ yarl==1.18.3
65
+ tcolorpy==0.1.7
66
+ mpmath==1.3.0
67
+ Werkzeug==3.1.3
68
+ triton==3.2.0
69
+ xxhash==3.5.0
70
+ pydub==0.25.1
71
+ nvidia-cuda-cupti-cu12==12.4.127
72
+ typer==0.15.2
73
+ joblib==1.4.2
74
+ threadpoolctl==3.6.0
75
+ fire==0.7.0
76
+ kiwisolver==1.4.8
77
+ mdurl==0.1.2
78
+ SQLAlchemy==2.0.39
79
+ PyYAML==6.0.2
80
+ torch==2.6.0
81
+ attrs==25.3.0
82
+ sqlitedict==2.1.0
83
+ portalocker==3.1.1
84
+ setproctitle==1.3.5
85
+ tabledata==1.3.4
86
+ click==8.1.8
87
+ scipy==1.15.2
88
+ tiktoken==0.9.0
89
+ scikit-learn==1.6.1
90
+ pathvalidate==3.2.3
91
+ grpcio==1.71.0
92
+ nvidia-cublas-cu12==12.4.5.8
93
+ lxml==5.3.1
94
+ six==1.17.0
95
+ smmap==5.0.2
96
+ pytablewriter==1.2.1
97
+ nvidia-cudnn-cu12==9.1.0.70
98
+ numexpr==2.10.2
99
+ python-dateutil==2.9.0.post0
100
+ more-itertools==10.6.0
101
+ setuptools==75.8.0
102
+ nvidia-cusparse-cu12==12.3.1.170
103
+ nvidia-cuda-nvrtc-cu12==12.4.127
104
+ certifi==2025.1.31
105
+ protobuf==5.29.4
106
+ importlib_resources==6.5.2
107
+ gradio_client==1.3.0
108
+ starlette==0.46.1
109
+ gradio==4.44.1
110
+ uvicorn==0.34.0
111
+ pandas==2.2.3
112
+ numpy==1.26.4
113
+ markdown-it-py==3.0.0
114
+ torchvision==0.21.0
115
+ ruff==0.11.2
116
+ hjson==3.1.0
117
+ pyarrow==19.0.1
118
+ websockets==12.0
119
+ absl-py==2.2.1
120
+ ffmpy==0.5.0
121
+ termcolor==2.5.0
122
+ sentry-sdk==2.24.1
123
+ tomlkit==0.12.0
124
+ frozenlist==1.5.0
125
+ tqdm-multiprocess==0.0.11
126
+ urllib3==2.3.0
127
+ sentencepiece==0.2.0
128
+ tqdm==4.67.1
129
+ dill==0.3.8
130
+ nvidia-nvtx-cu12==12.4.127
131
+ pyparsing==3.2.3
132
+ fastapi==0.115.12
133
+ shellingham==1.5.4
134
+ annotated-types==0.7.0
135
+ psycopg2-binary==2.9.10
136
+ pybind11==2.13.6
137
+ safetensors==0.5.3
138
+ bitsandbytes==0.45.4
139
+ aiofiles==23.2.1
140
+ matplotlib==3.10.1
141
+ einops==0.8.1
142
+ pip==25.0
143
+ orjson==3.10.16
144
+ idna==3.10
145
+ typing_extensions==4.13.0
146
+ docstring_parser==0.16
147
+ nvidia-cusparselt-cu12==0.6.2
148
+ platformdirs==4.3.7
149
+ pydantic_core==2.27.2
150
+ MarkupSafe==2.1.5
151
+ async-timeout==5.0.1
152
+ word2number==1.1
153
+ accelerate==1.0.1
154
+ anyio==4.9.0
155
+ docker-pycreds==0.4.0
156
+ nvidia-curand-cu12==10.3.5.147
157
+ httpcore==1.0.7
158
+ shtab==1.7.1
159
+ transformers==4.46.1
160
+ DataProperty==1.1.0
161
+ sniffio==1.3.1
162
+ regex==2024.11.6
163
+ jsonlines==4.0.0
164
+ ninja==1.11.1.4
165
+ aiohappyeyeballs==2.6.1
166
+ python-dotenv==1.1.0
167
+ networkx==3.4.2
168
+ evaluate==0.4.3
169
+ sacrebleu==2.5.1
wandb/run-20250413_002204-33xvut2k/files/wandb-metadata.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.14.0-427.33.1.el9_4.x86_64-x86_64-with-glibc2.34",
3
+ "python": "CPython 3.10.16",
4
+ "startedAt": "2025-04-12T22:22:04.823396Z",
5
+ "args": [
6
+ "DCFT_experiments/configs/s1k-11-test-192_train_config.yaml"
7
+ ],
8
+ "program": "/data/horse/ws/ryma833h-DCFT_Shared/dcft_private/dcft/train/llamafactory/src/train.py",
9
+ "codePath": "dcft/train/llamafactory/src/train.py",
10
+ "git": {
11
+ "remote": "[email protected]:mlfoundations/dcft_private.git",
12
+ "commit": "aa9216d55a4bbf475343afa165e6a9dd8e34241b"
13
+ },
14
+ "email": "[email protected]",
15
+ "root": "/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192",
16
+ "host": "c126",
17
+ "executable": "/data/horse/ws/ryma833h-DCFT_Shared/dcft_private/env/dcft_private/bin/python",
18
+ "codePathLocal": "dcft/train/llamafactory/src/train.py",
19
+ "cpu_count": 64,
20
+ "cpu_count_logical": 64,
21
+ "gpu": "NVIDIA H100",
22
+ "gpu_count": 4,
23
+ "disk": {
24
+ "/": {
25
+ "total": "42882564096",
26
+ "used": "39661744128"
27
+ }
28
+ },
29
+ "memory": {
30
+ "total": "810822434816"
31
+ },
32
+ "cpu": {
33
+ "count": 64,
34
+ "countLogical": 64
35
+ },
36
+ "gpu_nvidia": [
37
+ {
38
+ "name": "NVIDIA H100",
39
+ "memoryTotal": "100485038080",
40
+ "cudaCores": 16896,
41
+ "architecture": "Hopper"
42
+ },
43
+ {
44
+ "name": "NVIDIA H100",
45
+ "memoryTotal": "100485038080",
46
+ "cudaCores": 16896,
47
+ "architecture": "Hopper"
48
+ },
49
+ {
50
+ "name": "NVIDIA H100",
51
+ "memoryTotal": "100485038080",
52
+ "cudaCores": 16896,
53
+ "architecture": "Hopper"
54
+ },
55
+ {
56
+ "name": "NVIDIA H100",
57
+ "memoryTotal": "100485038080",
58
+ "cudaCores": 16896,
59
+ "architecture": "Hopper"
60
+ }
61
+ ],
62
+ "slurm": {
63
+ "cluster_name": "capella",
64
+ "conf": "/etc/slurm/slurm.conf",
65
+ "cpu_bind": "quiet,mask_cpu:0x0000000FFFFF3F3F",
66
+ "cpu_bind_list": "0x0000000FFFFF3F3F",
67
+ "cpu_bind_type": "mask_cpu:",
68
+ "cpu_bind_verbose": "quiet",
69
+ "cpus_on_node": "32",
70
+ "cpus_per_task": "32",
71
+ "distribution": "cyclic",
72
+ "gpus_on_node": "4",
73
+ "gtids": "0",
74
+ "hint": "nomultithread",
75
+ "job_account": "p_finetuning",
76
+ "job_cpus_per_node": "56",
77
+ "job_end_time": "1744499811",
78
+ "job_gid": "203360",
79
+ "job_gpus": "0,1,2,3",
80
+ "job_id": "200676",
81
+ "job_name": "s1k-11-test-192",
82
+ "job_nodelist": "c126",
83
+ "job_num_nodes": "1",
84
+ "job_partition": "capella",
85
+ "job_qos": "normal",
86
+ "job_start_time": "1744496211",
87
+ "job_uid": "2215941",
88
+ "job_user": "ryma833h",
89
+ "jobid": "200676",
90
+ "launch_node_ipaddr": "172.24.74.136",
91
+ "localid": "0",
92
+ "mem_per_node": "727040",
93
+ "mpi_type": "pmix",
94
+ "nnodes": "1",
95
+ "nodeid": "0",
96
+ "nodelist": "c126",
97
+ "nprocs": "1",
98
+ "ntasks": "1",
99
+ "pmix_mapping_serv": "(vector,(0,1,1))",
100
+ "pmixp_abort_agent_port": "61937",
101
+ "prio_process": "0",
102
+ "procid": "0",
103
+ "srun_comm_host": "172.24.74.136",
104
+ "srun_comm_port": "61938",
105
+ "step_gpus": "0,1,2,3",
106
+ "step_id": "0",
107
+ "step_launcher_port": "61938",
108
+ "step_nodelist": "c126",
109
+ "step_num_nodes": "1",
110
+ "step_num_tasks": "1",
111
+ "step_tasks_per_node": "1",
112
+ "stepid": "0",
113
+ "submit_dir": "/data/horse/ws/ryma833h-DCFT_Shared/dcft_private",
114
+ "submit_host": "c2",
115
+ "task_pid": "402713",
116
+ "tasks_per_node": "1",
117
+ "threads_per_core": "1",
118
+ "topology_addr": "spines.leaf8.c126",
119
+ "topology_addr_pattern": "switch.switch.node",
120
+ "tres_per_task": "cpu:32",
121
+ "umask": "0022"
122
+ },
123
+ "cudaVersion": "12.6"
124
+ }
wandb/run-20250413_002204-33xvut2k/logs/debug-core.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2025-04-13T00:22:04.257805259+02:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpoela46y5/port-402765.txt","pid":402765,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-13T00:22:04.25833727+02:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":402765}
3
+ {"time":"2025-04-13T00:22:04.25833602+02:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36323,"Zone":""}}
4
+ {"time":"2025-04-13T00:22:04.372792088+02:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:39750"}
5
+ {"time":"2025-04-13T00:22:04.824089349+02:00","level":"INFO","msg":"handleInformInit: received","streamId":"33xvut2k","id":"127.0.0.1:39750"}
6
+ {"time":"2025-04-13T00:22:05.128600864+02:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"33xvut2k","id":"127.0.0.1:39750"}
wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-13T00:22:04.825520919+02:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-core.log"}
2
+ {"time":"2025-04-13T00:22:05.128494964+02:00","level":"INFO","msg":"created new stream","id":"33xvut2k"}
3
+ {"time":"2025-04-13T00:22:05.128573564+02:00","level":"INFO","msg":"stream: started","id":"33xvut2k"}
4
+ {"time":"2025-04-13T00:22:05.128604494+02:00","level":"INFO","msg":"writer: Do: started","stream_id":"33xvut2k"}
5
+ {"time":"2025-04-13T00:22:05.128621084+02:00","level":"INFO","msg":"sender: started","stream_id":"33xvut2k"}
6
+ {"time":"2025-04-13T00:22:05.128629264+02:00","level":"INFO","msg":"handler: started","stream_id":"33xvut2k"}
7
+ {"time":"2025-04-13T00:22:05.425669006+02:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250413_002204-33xvut2k/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-13 00:22:04,770 INFO MainThread:402765 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Configure stats pid to 402765
3
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /home/ryma833h/.config/wandb/settings
4
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /data/horse/ws/ryma833h-DCFT_Shared/dcft_private/wandb/settings
5
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug.log
7
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log
8
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():761] calling init triggers
9
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():784] starting backend
12
+ 2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-04-13 00:22:04,822 INFO MainThread:402765 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-13 00:22:04,823 INFO MainThread:402765 [wandb_init.py:init():798] backend started and connected
15
+ 2025-04-13 00:22:04,824 INFO MainThread:402765 [wandb_init.py:init():891] updated telemetry
16
+ 2025-04-13 00:22:04,991 INFO MainThread:402765 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-04-13 00:22:05,420 INFO MainThread:402765 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-04-13 00:22:07,461 INFO MainThread:402765 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-04-13 00:22:07,505 INFO MainThread:402765 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-04-13 00:22:07,506 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 3584, 'intermediate_size': 18944, 'num_hidden_layers': 28, 'num_attention_heads': 28, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-7B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.46.1', 'model_type': 'qwen2', 'output_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 24, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/runs/Apr13_00-20-44_c126', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'dcft/train/zero3.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 's1k-11-test-192', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 16384, 'generation_num_beams': None, 'generation_config': None}
24
+ 2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_config.py:__setitem__():154] config set model/num_parameters = 0 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7faa58768340>>
25
+ 2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 0 None
wandb/run-20250413_002204-33xvut2k/run-33xvut2k.wandb ADDED
Binary file (98.3 kB). View file