craffel HF Staff committed on
Commit
fa674ef
·
verified ·
1 Parent(s): 7c6cc0b

Upload ipt_fineinstructions_all_exp_chat_100b/config.yaml with huggingface_hub

Browse files
ipt_fineinstructions_all_exp_chat_100b/config.yaml ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: fineinstructions_ipt_fineinstructions_all_exp_chat_100b
2
+ dump_dir: /fsx/craffel/fineinstructions/pretraining/ipt_fineinstructions_all_exp_chat_100b/
3
+ seed: 777
4
+ grad_acc_steps: 8
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 88000
8
+ data:
9
+ root_dir: /scratch/craffel/lingua/data/fineinstructions/
10
+ sources:
11
+ ipt_fineinstructions_all_exp_chat: 1.0
12
+ batch_size: 4
13
+ seq_len: 4096
14
+ n_views: 2
15
+ seed: 42
16
+ add_bos: true
17
+ add_eos: true
18
+ load_async: true
19
+ prefetch_size: 1024
20
+ tokenizer:
21
+ name: tiktoken
22
+ path: /fsx/craffel/lingua/tokenizers/llama3.model
23
+ n_words: null
24
+ optim:
25
+ lr: 0.001
26
+ weight_decay: 0.1
27
+ epsilon: 1.0e-08
28
+ beta1: 0.9
29
+ beta2: 0.95
30
+ clip: 1.0
31
+ scheduler: cosine
32
+ warmup: 2000
33
+ lr_min_ratio: 1.0e-06
34
+ cycle_length: 1.0
35
+ cosine_theta: 1.0
36
+ annealing_step: 1000
37
+ decay_fraction: 0.1
38
+ exp_factor: 0.5
39
+ model:
40
+ dim: 2048
41
+ n_layers: 25
42
+ head_dim: null
43
+ n_heads: 16
44
+ n_kv_heads: null
45
+ ffn_dim_multiplier: null
46
+ multiple_of: 256
47
+ norm_eps: 1.0e-05
48
+ rope_theta: 10000.0
49
+ init_base_std: null
50
+ init_std_factor: disabled
51
+ max_seqlen: 4096
52
+ seed: 42
53
+ vocab_size: 128256
54
+ weight_tying: false
55
+ sliding_window: null
56
+ distributed:
57
+ dp_shard: 1
58
+ dp_replicate: 8
59
+ tp_size: 1
60
+ selective_activation_checkpointing: false
61
+ compile: true
62
+ fsdp_type: full_shard
63
+ model_dtype: bf16
64
+ float8_recipe: null
65
+ float8_filter: layers\.[0-9]+\.
66
+ matmul_allow_tf32: false
67
+ detect_anomaly: false
68
+ compile_cache_size_limit: 8
69
+ spawn_method: forkserver
70
+ env:
71
+ MKL_SERVICE_FORCE_INTEL: GNU
72
+ OMP_NUM_THREADS: '1'
73
+ MKL_NUM_THREADS: '1'
74
+ ENABLE_INTRA_NODE_COMM: '1'
75
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
76
+ NCCL_IB_TIMEOUT: '22'
77
+ NCCL_DEBUG: INFO
78
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
79
+ checkpoint:
80
+ dump:
81
+ every: 2000
82
+ keep: -1
83
+ eval:
84
+ every: 2000
85
+ keep: -1
86
+ path: /fsx/craffel/fineinstructions/pretraining/ipt_fineinstructions_all_exp_chat_100b/checkpoints
87
+ init_ckpt_path: null
88
+ load_init_optimizer_state: false
89
+ save_init_ckpt: false
90
+ profiling:
91
+ run: true
92
+ trace_folder: profiling
93
+ mem_warmup: 0
94
+ mem_steps: 4
95
+ profile_warmup: 100
96
+ profile_steps: 4
97
+ logging:
98
+ freq: 1
99
+ acc_freq: null
100
+ wandb: null
101
+ async_eval_gpus: 8
102
+ eval:
103
+ harness:
104
+ apply_chat_template: true
105
+ tasks:
106
+ - hellaswag
107
+ - mmlu
108
+ - commonsense_qa
109
+ - sciq
110
+ confirm_run_unsafe_code: true
111
+ generator:
112
+ max_tokens: 8192
113
+ dtype: bf16