Training in progress, epoch 1
- config.json +1 -1
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- trainer_log.jsonl +3 -3
- training_args.bin +1 -1
- wandb/debug-internal.log +7 -7
- wandb/debug.log +24 -24
- wandb/run-20250413_002204-33xvut2k/files/output.log +40 -0
- wandb/run-20250413_002204-33xvut2k/files/requirements.txt +169 -0
- wandb/run-20250413_002204-33xvut2k/files/wandb-metadata.json +124 -0
- wandb/run-20250413_002204-33xvut2k/logs/debug-core.log +6 -0
- wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log +7 -0
- wandb/run-20250413_002204-33xvut2k/logs/debug.log +25 -0
- wandb/run-20250413_002204-33xvut2k/run-33xvut2k.wandb +0 -0
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
   "architectures": [
     "Qwen2ForCausalLM"
   ],
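The only change to config.json is that `_name_or_path` now records the base model, Qwen/Qwen2.5-7B-Instruct. A minimal sketch of loading the uploaded checkpoint with transformers follows; the repo id below is a hypothetical placeholder, not something taken from this commit.

```python
# Minimal sketch (not part of this commit): load the uploaded checkpoint with
# transformers. "your-org/s1k-11-test-192" is a hypothetical placeholder repo id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-org/s1k-11-test-192"  # hypothetical; substitute the actual Hub repo
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
```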
model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0f2ad04589b6dc72ec76b95b02db4661b9d5e834313eaaa4cf9463e5f43b36d8
 size 4877660776
model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b260d59e122a42972315dc3416b98012d73031a51a39246953bd0cc2927dc43f
 size 4932751008
model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f2eb44b4c5053339b770301f7d87fbc4d7263f68083fca77ec9183e972b49d09
 size 4330865200
model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8100a42157599910b6ec01fec7ad69d8f9fbdcef534c419fae525b1f853f650a
 size 1089994880
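Each safetensors shard is stored through Git LFS, so the repository only tracks a small pointer file holding the spec version, the SHA-256 of the blob, and its size in bytes. A minimal sketch for verifying a downloaded shard against its pointer, using the values recorded above for model-00001-of-00004.safetensors:

```python
# Minimal sketch, assuming the shard has been downloaded locally: check that a
# model shard matches the sha256 and size recorded in its Git LFS pointer.
import hashlib
import os

def verify_shard(path: str, expected_sha256: str, expected_size: int) -> bool:
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_sha256

# Values copied from the pointer for model-00001-of-00004.safetensors:
print(verify_shard("model-00001-of-00004.safetensors",
                   "0f2ad04589b6dc72ec76b95b02db4661b9d5e834313eaaa4cf9463e5f43b36d8",
                   4877660776))
```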
trainer_log.jsonl
CHANGED
@@ -1,3 +1,3 @@
-{"current_steps": 1, "total_steps": 2, "loss": 1.2459, "lr": 2e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:
-{"current_steps": 2, "total_steps": 2, "loss": 1.2028, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:03:
-{"current_steps": 2, "total_steps": 2, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:06:
+{"current_steps": 1, "total_steps": 2, "loss": 1.2459, "lr": 2e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:02:03", "remaining_time": "0:02:03"}
+{"current_steps": 2, "total_steps": 2, "loss": 1.2028, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:03:44", "remaining_time": "0:00:00"}
+{"current_steps": 2, "total_steps": 2, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:06:21", "remaining_time": "0:00:00"}
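trainer_log.jsonl holds one JSON object per logging step (loss, learning rate, epoch, timing). A minimal sketch for reading it, assuming the file has been downloaded locally:

```python
# Minimal sketch: read trainer_log.jsonl (one JSON object per line) and print
# the logged loss for each training step; the final summary record has no loss.
import json

with open("trainer_log.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        if "loss" in rec:
            print(f"step {rec['current_steps']}/{rec['total_steps']}: "
                  f"loss={rec['loss']}, lr={rec['lr']}, epoch={rec['epoch']}")
```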
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a090da2170c8c3fe4545affa6dd128b771479f14fa1ee3dede9a1a44101e360e
 size 7288
wandb/debug-internal.log
CHANGED
@@ -1,7 +1,7 @@
-{"time":"2025-04-
-{"time":"2025-04-
-{"time":"2025-04-
-{"time":"2025-04-
-{"time":"2025-04-
-{"time":"2025-04-
-{"time":"2025-04-
+{"time":"2025-04-13T00:22:04.825520919+02:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-core.log"}
+{"time":"2025-04-13T00:22:05.128494964+02:00","level":"INFO","msg":"created new stream","id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128573564+02:00","level":"INFO","msg":"stream: started","id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128604494+02:00","level":"INFO","msg":"writer: Do: started","stream_id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128621084+02:00","level":"INFO","msg":"sender: started","stream_id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128629264+02:00","level":"INFO","msg":"handler: started","stream_id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.425669006+02:00","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log
CHANGED
@@ -1,25 +1,25 @@
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
+2025-04-13 00:22:04,770 INFO MainThread:402765 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Configure stats pid to 402765
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /home/ryma833h/.config/wandb/settings
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /data/horse/ws/ryma833h-DCFT_Shared/dcft_private/wandb/settings
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from environment variables
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug.log
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():761] calling init triggers
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
 config: {'_wandb': {}}
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
-2025-04-
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():784] starting backend
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():788] sending inform_init request
+2025-04-13 00:22:04,822 INFO MainThread:402765 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-13 00:22:04,823 INFO MainThread:402765 [wandb_init.py:init():798] backend started and connected
+2025-04-13 00:22:04,824 INFO MainThread:402765 [wandb_init.py:init():891] updated telemetry
+2025-04-13 00:22:04,991 INFO MainThread:402765 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
+2025-04-13 00:22:05,420 INFO MainThread:402765 [wandb_init.py:init():990] starting run threads in backend
+2025-04-13 00:22:07,461 INFO MainThread:402765 [wandb_run.py:_console_start():2375] atexit reg
+2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2227] redirect: wrap_raw
+2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2292] Wrapping output streams.
+2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2315] Redirects installed.
+2025-04-13 00:22:07,505 INFO MainThread:402765 [wandb_init.py:init():1032] run started, returning control to user process
+2025-04-13 00:22:07,506 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 3584, 'intermediate_size': 18944, 'num_hidden_layers': 28, 'num_attention_heads': 28, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-7B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.46.1', 'model_type': 'qwen2', 'output_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 24, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/runs/Apr13_00-20-44_c126', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'dcft/train/zero3.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 's1k-11-test-192', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 16384, 'generation_num_beams': None, 'generation_config': None}
+2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_config.py:__setitem__():154] config set model/num_parameters = 0 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7faa58768340>>
+2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 0 None
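The config_cb entry above records the effective training setup: Qwen2.5-7B-Instruct fine-tuned for one epoch with per-device batch size 1, gradient accumulation 24, a cosine schedule with 0.1 warmup ratio, peak learning rate 2e-05, bf16, and DeepSpeed ZeRO-3, with the result pushed to the Hub as s1k-11-test-192. An approximate reconstruction of those values as transformers.TrainingArguments is sketched below; the actual run was launched from a YAML config via the llamafactory train.py entry point, so this is illustrative only.

```python
# Approximate reconstruction (a sketch, not the actual launch config): the key
# hyperparameters recorded in the config_cb entry above, expressed as
# transformers.TrainingArguments.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=24,
    learning_rate=2e-5,
    num_train_epochs=1.0,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    logging_steps=1,
    save_strategy="epoch",
    deepspeed="dcft/train/zero3.json",
    report_to=["wandb"],
    push_to_hub=True,
    hub_model_id="s1k-11-test-192",
)
```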
wandb/run-20250413_002204-33xvut2k/files/output.log
ADDED
@@ -0,0 +1,40 @@
+100%|██████████| 2/2 [03:44<00:00, 110.11s/it][INFO|trainer.py:3801] 2025-04-13 00:25:53,959 >> Saving model checkpoint to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2
+{'loss': 1.2459, 'grad_norm': 6.925258567695526, 'learning_rate': 2e-05, 'epoch': 0.5}
+{'loss': 1.2028, 'grad_norm': 7.060579782684043, 'learning_rate': 0.0, 'epoch': 1.0}
+[INFO|configuration_utils.py:414] 2025-04-13 00:25:53,969 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/config.json
+[INFO|configuration_utils.py:865] 2025-04-13 00:25:53,972 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/generation_config.json
+[INFO|modeling_utils.py:3043] 2025-04-13 00:26:09,813 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/model.safetensors.index.json.
+[INFO|tokenization_utils_base.py:2646] 2025-04-13 00:26:09,816 >> tokenizer config file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2655] 2025-04-13 00:26:09,817 >> Special tokens file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/special_tokens_map.json
+[2025-04-13 00:26:10,062] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2 is about to be saved!
+[2025-04-13 00:26:10,107] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-04-13 00:26:10,107] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-04-13 00:26:10,187] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-04-13 00:26:10,189] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-04-13 00:26:46,139] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-04-13 00:26:46,161] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-04-13 00:26:47,801] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2 is ready now!
+[INFO|tokenization_utils_base.py:2646] 2025-04-13 00:27:40,232 >> tokenizer config file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2655] 2025-04-13 00:27:40,234 >> Special tokens file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/special_tokens_map.json
+[INFO|trainer.py:3801] 2025-04-13 00:27:42,542 >> Saving model checkpoint to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2
+[INFO|configuration_utils.py:414] 2025-04-13 00:27:42,547 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/config.json
+[INFO|configuration_utils.py:865] 2025-04-13 00:27:42,549 >> Configuration saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/generation_config.json
+[INFO|modeling_utils.py:3043] 2025-04-13 00:27:57,600 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/model.safetensors.index.json.
+[INFO|tokenization_utils_base.py:2646] 2025-04-13 00:27:57,602 >> tokenizer config file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2655] 2025-04-13 00:27:57,603 >> Special tokens file saved in /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/special_tokens_map.json
+[2025-04-13 00:27:57,807] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2 is about to be saved!
+[2025-04-13 00:27:57,815] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-04-13 00:27:57,815] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-04-13 00:27:57,827] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-04-13 00:27:57,844] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-04-13 00:28:28,367] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-04-13 00:28:28,370] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2/global_step2/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-04-13 00:28:28,921] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2 is ready now!
+[INFO|trainer.py:2584] 2025-04-13 00:28:28,929 >>
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+100%|██████████| 2/2 [06:21<00:00, 190.72s/it]
+{'train_runtime': 384.9677, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.005, 'train_loss': 1.2243931889533997, 'epoch': 1.0}
+[INFO|trainer.py:4582] 2025-04-13 00:28:28,948 >> Waiting for the current checkpoint push to be finished, this might take a couple of minutes.
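The output log shows DeepSpeed writing ZeRO-partitioned model and optimizer states under checkpoint-2/global_step2 alongside the consolidated safetensors shards. If only the ZeRO shards were available, they could be merged back into a single fp32 state dict with DeepSpeed's zero_to_fp32 utility; the sketch below assumes the installed deepspeed (0.15.2 per the requirements list in this commit) exposes that helper as usual.

```python
# Minimal sketch, assuming deepspeed's usual zero_to_fp32 helper is available:
# consolidate the sharded ZeRO checkpoint under checkpoint-2/global_step2 into
# a single fp32 state dict.
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

ckpt_dir = "/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/checkpoint-2"
state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir, tag="global_step2")
print(sum(t.numel() for t in state_dict.values()), "parameters consolidated")
```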
wandb/run-20250413_002204-33xvut2k/files/requirements.txt
ADDED
@@ -0,0 +1,169 @@
+nvidia-cusolver-cu12==11.6.1.9
+greenlet==3.1.1
+Jinja2==3.1.6
+GitPython==3.1.44
+lm_eval==0.4.8
+semantic-version==2.10.0
+Pygments==2.19.1
+nvidia-cufft-cu12==11.2.1.3
+sympy==1.13.1
+charset-normalizer==3.4.1
+pillow==10.4.0
+wandb==0.19.8
+h11==0.14.0
+aiohttp==3.11.14
+datasets==3.1.0
+fonttools==4.56.0
+huggingface-hub==0.29.3
+chardet==5.2.0
+colorama==0.4.6
+sse-starlette==2.2.1
+trl==0.9.6
+tzdata==2025.2
+aiosignal==1.3.2
+Markdown==3.7
+zstandard==0.23.0
+nvidia-nccl-cu12==2.21.5
+tensorboard-data-server==0.7.2
+filelock==3.18.0
+liger_kernel==0.3.1
+msgpack==1.1.0
+gitdb==4.0.12
+wheel==0.45.1
+peft==0.12.0
+mbstrdecoder==1.1.4
+cycler==0.12.1
+tyro==0.9.17
+av==14.2.0
+httpx==0.28.1
+typepy==1.3.4
+pytz==2025.2
+py-cpuinfo==9.0.0
+pydantic==2.10.6
+requests==2.32.3
+typeguard==4.4.2
+dcft==0.1.0
+exceptiongroup==1.2.2
+fsspec==2024.9.0
+nvidia-nvjitlink-cu12==12.4.127
+tensorboard==2.19.0
+tabulate==0.9.0
+tokenizers==0.20.3
+multidict==6.2.0
+python-multipart==0.0.20
+multiprocess==0.70.16
+packaging==24.2
+propcache==0.3.1
+rich==13.9.4
+nltk==3.9.1
+rouge_score==0.1.2
+psutil==7.0.0
+deepspeed==0.15.2
+nvidia-cuda-runtime-cu12==12.4.127
+contourpy==1.3.1
+yarl==1.18.3
+tcolorpy==0.1.7
+mpmath==1.3.0
+Werkzeug==3.1.3
+triton==3.2.0
+xxhash==3.5.0
+pydub==0.25.1
+nvidia-cuda-cupti-cu12==12.4.127
+typer==0.15.2
+joblib==1.4.2
+threadpoolctl==3.6.0
+fire==0.7.0
+kiwisolver==1.4.8
+mdurl==0.1.2
+SQLAlchemy==2.0.39
+PyYAML==6.0.2
+torch==2.6.0
+attrs==25.3.0
+sqlitedict==2.1.0
+portalocker==3.1.1
+setproctitle==1.3.5
+tabledata==1.3.4
+click==8.1.8
+scipy==1.15.2
+tiktoken==0.9.0
+scikit-learn==1.6.1
+pathvalidate==3.2.3
+grpcio==1.71.0
+nvidia-cublas-cu12==12.4.5.8
+lxml==5.3.1
+six==1.17.0
+smmap==5.0.2
+pytablewriter==1.2.1
+nvidia-cudnn-cu12==9.1.0.70
+numexpr==2.10.2
+python-dateutil==2.9.0.post0
+more-itertools==10.6.0
+setuptools==75.8.0
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cuda-nvrtc-cu12==12.4.127
+certifi==2025.1.31
+protobuf==5.29.4
+importlib_resources==6.5.2
+gradio_client==1.3.0
+starlette==0.46.1
+gradio==4.44.1
+uvicorn==0.34.0
+pandas==2.2.3
+numpy==1.26.4
+markdown-it-py==3.0.0
+torchvision==0.21.0
+ruff==0.11.2
+hjson==3.1.0
+pyarrow==19.0.1
+websockets==12.0
+absl-py==2.2.1
+ffmpy==0.5.0
+termcolor==2.5.0
+sentry-sdk==2.24.1
+tomlkit==0.12.0
+frozenlist==1.5.0
+tqdm-multiprocess==0.0.11
+urllib3==2.3.0
+sentencepiece==0.2.0
+tqdm==4.67.1
+dill==0.3.8
+nvidia-nvtx-cu12==12.4.127
+pyparsing==3.2.3
+fastapi==0.115.12
+shellingham==1.5.4
+annotated-types==0.7.0
+psycopg2-binary==2.9.10
+pybind11==2.13.6
+safetensors==0.5.3
+bitsandbytes==0.45.4
+aiofiles==23.2.1
+matplotlib==3.10.1
+einops==0.8.1
+pip==25.0
+orjson==3.10.16
+idna==3.10
+typing_extensions==4.13.0
+docstring_parser==0.16
+nvidia-cusparselt-cu12==0.6.2
+platformdirs==4.3.7
+pydantic_core==2.27.2
+MarkupSafe==2.1.5
+async-timeout==5.0.1
+word2number==1.1
+accelerate==1.0.1
+anyio==4.9.0
+docker-pycreds==0.4.0
+nvidia-curand-cu12==10.3.5.147
+httpcore==1.0.7
+shtab==1.7.1
+transformers==4.46.1
+DataProperty==1.1.0
+sniffio==1.3.1
+regex==2024.11.6
+jsonlines==4.0.0
+ninja==1.11.1.4
+aiohappyeyeballs==2.6.1
+python-dotenv==1.1.0
+networkx==3.4.2
+evaluate==0.4.3
+sacrebleu==2.5.1
wandb/run-20250413_002204-33xvut2k/files/wandb-metadata.json
ADDED
@@ -0,0 +1,124 @@
+{
+    "os": "Linux-5.14.0-427.33.1.el9_4.x86_64-x86_64-with-glibc2.34",
+    "python": "CPython 3.10.16",
+    "startedAt": "2025-04-12T22:22:04.823396Z",
+    "args": [
+        "DCFT_experiments/configs/s1k-11-test-192_train_config.yaml"
+    ],
+    "program": "/data/horse/ws/ryma833h-DCFT_Shared/dcft_private/dcft/train/llamafactory/src/train.py",
+    "codePath": "dcft/train/llamafactory/src/train.py",
+    "git": {
+        "remote": "[email protected]:mlfoundations/dcft_private.git",
+        "commit": "aa9216d55a4bbf475343afa165e6a9dd8e34241b"
+    },
+    "email": "[email protected]",
+    "root": "/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192",
+    "host": "c126",
+    "executable": "/data/horse/ws/ryma833h-DCFT_Shared/dcft_private/env/dcft_private/bin/python",
+    "codePathLocal": "dcft/train/llamafactory/src/train.py",
+    "cpu_count": 64,
+    "cpu_count_logical": 64,
+    "gpu": "NVIDIA H100",
+    "gpu_count": 4,
+    "disk": {
+        "/": {
+            "total": "42882564096",
+            "used": "39661744128"
+        }
+    },
+    "memory": {
+        "total": "810822434816"
+    },
+    "cpu": {
+        "count": 64,
+        "countLogical": 64
+    },
+    "gpu_nvidia": [
+        {
+            "name": "NVIDIA H100",
+            "memoryTotal": "100485038080",
+            "cudaCores": 16896,
+            "architecture": "Hopper"
+        },
+        {
+            "name": "NVIDIA H100",
+            "memoryTotal": "100485038080",
+            "cudaCores": 16896,
+            "architecture": "Hopper"
+        },
+        {
+            "name": "NVIDIA H100",
+            "memoryTotal": "100485038080",
+            "cudaCores": 16896,
+            "architecture": "Hopper"
+        },
+        {
+            "name": "NVIDIA H100",
+            "memoryTotal": "100485038080",
+            "cudaCores": 16896,
+            "architecture": "Hopper"
+        }
+    ],
+    "slurm": {
+        "cluster_name": "capella",
+        "conf": "/etc/slurm/slurm.conf",
+        "cpu_bind": "quiet,mask_cpu:0x0000000FFFFF3F3F",
+        "cpu_bind_list": "0x0000000FFFFF3F3F",
+        "cpu_bind_type": "mask_cpu:",
+        "cpu_bind_verbose": "quiet",
+        "cpus_on_node": "32",
+        "cpus_per_task": "32",
+        "distribution": "cyclic",
+        "gpus_on_node": "4",
+        "gtids": "0",
+        "hint": "nomultithread",
+        "job_account": "p_finetuning",
+        "job_cpus_per_node": "56",
+        "job_end_time": "1744499811",
+        "job_gid": "203360",
+        "job_gpus": "0,1,2,3",
+        "job_id": "200676",
+        "job_name": "s1k-11-test-192",
+        "job_nodelist": "c126",
+        "job_num_nodes": "1",
+        "job_partition": "capella",
+        "job_qos": "normal",
+        "job_start_time": "1744496211",
+        "job_uid": "2215941",
+        "job_user": "ryma833h",
+        "jobid": "200676",
+        "launch_node_ipaddr": "172.24.74.136",
+        "localid": "0",
+        "mem_per_node": "727040",
+        "mpi_type": "pmix",
+        "nnodes": "1",
+        "nodeid": "0",
+        "nodelist": "c126",
+        "nprocs": "1",
+        "ntasks": "1",
+        "pmix_mapping_serv": "(vector,(0,1,1))",
+        "pmixp_abort_agent_port": "61937",
+        "prio_process": "0",
+        "procid": "0",
+        "srun_comm_host": "172.24.74.136",
+        "srun_comm_port": "61938",
+        "step_gpus": "0,1,2,3",
+        "step_id": "0",
+        "step_launcher_port": "61938",
+        "step_nodelist": "c126",
+        "step_num_nodes": "1",
+        "step_num_tasks": "1",
+        "step_tasks_per_node": "1",
+        "stepid": "0",
+        "submit_dir": "/data/horse/ws/ryma833h-DCFT_Shared/dcft_private",
+        "submit_host": "c2",
+        "task_pid": "402713",
+        "tasks_per_node": "1",
+        "threads_per_core": "1",
+        "topology_addr": "spines.leaf8.c126",
+        "topology_addr_pattern": "switch.switch.node",
+        "tres_per_task": "cpu:32",
+        "umask": "0022"
+    },
+    "cudaVersion": "12.6"
+}
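wandb-metadata.json captures the execution environment: a single Slurm job on node c126 of the capella cluster with 4 NVIDIA H100 GPUs, CUDA 12.6, and Python 3.10.16. A minimal sketch for pulling a few of those fields back out of the file, assuming it has been downloaded locally:

```python
# Minimal sketch: inspect the run metadata captured by wandb (file added above).
import json

with open("wandb-metadata.json") as f:
    meta = json.load(f)

print(meta["gpu"], "x", meta["gpu_count"], "| CUDA", meta["cudaVersion"])
print("Slurm job", meta["slurm"]["job_id"], "on", meta["slurm"]["job_nodelist"])
```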
wandb/run-20250413_002204-33xvut2k/logs/debug-core.log
ADDED
@@ -0,0 +1,6 @@
+{"time":"2025-04-13T00:22:04.257805259+02:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpoela46y5/port-402765.txt","pid":402765,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
+{"time":"2025-04-13T00:22:04.25833727+02:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":402765}
+{"time":"2025-04-13T00:22:04.25833602+02:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36323,"Zone":""}}
+{"time":"2025-04-13T00:22:04.372792088+02:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:39750"}
+{"time":"2025-04-13T00:22:04.824089349+02:00","level":"INFO","msg":"handleInformInit: received","streamId":"33xvut2k","id":"127.0.0.1:39750"}
+{"time":"2025-04-13T00:22:05.128600864+02:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"33xvut2k","id":"127.0.0.1:39750"}
wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
+{"time":"2025-04-13T00:22:04.825520919+02:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-core.log"}
+{"time":"2025-04-13T00:22:05.128494964+02:00","level":"INFO","msg":"created new stream","id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128573564+02:00","level":"INFO","msg":"stream: started","id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128604494+02:00","level":"INFO","msg":"writer: Do: started","stream_id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128621084+02:00","level":"INFO","msg":"sender: started","stream_id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.128629264+02:00","level":"INFO","msg":"handler: started","stream_id":"33xvut2k"}
+{"time":"2025-04-13T00:22:05.425669006+02:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250413_002204-33xvut2k/logs/debug.log
ADDED
@@ -0,0 +1,25 @@
+2025-04-13 00:22:04,770 INFO MainThread:402765 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Configure stats pid to 402765
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /home/ryma833h/.config/wandb/settings
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from /data/horse/ws/ryma833h-DCFT_Shared/dcft_private/wandb/settings
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_setup.py:_flush():67] Loading settings from environment variables
+2025-04-13 00:22:04,778 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug.log
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/wandb/run-20250413_002204-33xvut2k/logs/debug-internal.log
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():761] calling init triggers
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():784] starting backend
+2025-04-13 00:22:04,779 INFO MainThread:402765 [wandb_init.py:init():788] sending inform_init request
+2025-04-13 00:22:04,822 INFO MainThread:402765 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-04-13 00:22:04,823 INFO MainThread:402765 [wandb_init.py:init():798] backend started and connected
+2025-04-13 00:22:04,824 INFO MainThread:402765 [wandb_init.py:init():891] updated telemetry
+2025-04-13 00:22:04,991 INFO MainThread:402765 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
+2025-04-13 00:22:05,420 INFO MainThread:402765 [wandb_init.py:init():990] starting run threads in backend
+2025-04-13 00:22:07,461 INFO MainThread:402765 [wandb_run.py:_console_start():2375] atexit reg
+2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2227] redirect: wrap_raw
+2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2292] Wrapping output streams.
+2025-04-13 00:22:07,462 INFO MainThread:402765 [wandb_run.py:_redirect():2315] Redirects installed.
+2025-04-13 00:22:07,505 INFO MainThread:402765 [wandb_init.py:init():1032] run started, returning control to user process
+2025-04-13 00:22:07,506 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb None None {'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 3584, 'intermediate_size': 18944, 'num_hidden_layers': 28, 'num_attention_heads': 28, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-7B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.46.1', 'model_type': 'qwen2', 'output_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 24, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192/runs/Apr13_00-20-44_c126', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 4, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/data/horse/ws/ryma833h-DCFT_Shared/checkpoints/s1k-11-test-192', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'dcft/train/zero3.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 's1k-11-test-192', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 16384, 'generation_num_beams': None, 'generation_config': None}
+2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_config.py:__setitem__():154] config set model/num_parameters = 0 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7faa58768340>>
+2025-04-13 00:22:07,508 INFO MainThread:402765 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 0 None
wandb/run-20250413_002204-33xvut2k/run-33xvut2k.wandb
ADDED
Binary file (98.3 kB)