# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
model_name: 'tangled-alpha-0.10-core'
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
# NOTE(review): ``model_name`` and ``model_config`` are both set in this file even though the
# comments describe them as mutually exclusive -- confirm the litgpt version in use accepts both.
model_config:
  name: 'tangled-alpha-0.10-core'
  block_size: 131072
  vocab_size: 131072
  padded_vocab_size: 131072
  n_layer: 32
  n_head: 12
  n_embd: 768
  n_query_groups: 4
  rotary_percentage: 1.0
  # Canonical lowercase booleans, matching the ``false``/``true`` spelling used elsewhere in this file.
  parallel_residual: false
  bias: false
  norm_class_name: 'RMSNorm'
  mlp_class_name: 'LLaMAMLP'
  intermediate_size: 2048  # n_embd * 2.666 (768 * 2.666 ~= 2047.5, rounded up to 2048)
  # Written with an explicit mantissa dot so that plain YAML 1.1 loaders also resolve it as a float.
  norm_eps: 1.0e-5
  rope_base: 4300  # https://arxiv.org/pdf/2405.14591
  head_size: 64  # n_embd / n_head (768 / 12)
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: '../out/pretrain-core-0/'
# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true".
# (type: Optional[str], default: null)
# Alternative setting, kept commented out for reference:
# precision: bf16-mixed
precision: bf16-true
# Optional path to a checkpoint directory to initialize the model from.
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
# Intentionally unset; spelled as an explicit ``null`` instead of a bare empty value.
initial_checkpoint_dir: null
# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
resume: 'auto'
# Data-related arguments. When this section is omitted, ``litgpt.data.TinyLlama`` is used.
data:
  # Data module class to instantiate.
  class_path: LitData
  # Arguments handed to the class named in ``class_path``.
  init_args:
    data_path: '../core-data-0-0-1073741824-1025-16000/'
    # NOTE(review): presumably the number of data-loading worker processes -- confirm against LitData.
    num_workers: 32
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 50
  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1
  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
  global_batch_size: 512
  # Number of samples per data-parallel rank (type: int, default: 4)
  micro_batch_size: 8
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  lr_warmup_steps: 2000
  # Number of epochs to train on (type: Optional[int], default: null)
  # Unset; written as an explicit ``null`` instead of a bare empty value.
  epochs: null
  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  max_tokens: 11186775175
  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  # Unset; written as an explicit ``null`` instead of a bare empty value.
  max_steps: null
  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 1025
  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
  tie_embeddings: false
  # NOTE(review): the generated config carries no description for this field; presumably the
  # gradient-norm clipping threshold -- confirm against ``litgpt.args.TrainArgs``.
  # (type: Optional[float], default: 1.0)
  max_norm: 1.0
  # NOTE(review): the generated config carries no description for this field; presumably the
  # floor of the learning-rate schedule -- confirm against ``litgpt.args.TrainArgs``.
  # (type: float, default: 4e-05)
  # Written with an explicit mantissa dot so that plain YAML 1.1 loaders also resolve it as a float.
  min_lr: 1.0e-5
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  interval: 50
  # Number of tokens to generate (type: Optional[int], default: null)
  # Unset; written as an explicit ``null`` instead of a bare empty value.
  max_new_tokens: null
  # Number of iterations (type: int, default: 100)
  max_iters: 100
  # Whether to evaluate on the validation set at the beginning of the training
  initial_validation: false
  # Whether to evaluate on the validation set at the end of the training
  final_validation: true
# Optimizer-related arguments
# Only the last ``optimizer`` mapping below is active; the two commented-out variants before it
# are earlier alternatives kept for reference.
#
# Alternative 1 (disabled): AdamW
# optimizer:
#   class_path: torch.optim.AdamW
#   init_args:
#     # (type: float, default: 0.001)
#     lr: 3e-4
#     # (type: float, default: 0.01)
#     weight_decay: 0.01
#     # (type: tuple, default: (0.9,0.999))
#     betas:
#       - 0.9
#       - 0.999
#
# Alternative 2 (disabled): SophiaG with a different set of hyperparameters
# optimizer:
#   class_path: sophia_opt.SophiaG
#   init_args:
#     lr: 3e-4
#     betas:
#       - 0.9
#       - 0.95
#     rho: 0.05
#     weight_decay: 0.1
#
# Active optimizer. Floats are written with an explicit mantissa dot (or as plain decimals) so
# that plain YAML 1.1 loaders also resolve them as numbers rather than strings.
optimizer:
  class_path: sophia_opt.SophiaG
  init_args:
    lr: 1.0e-4
    betas:
      - 0.965
      - 0.99
    rho: 0.04
    weight_decay: 0.1
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
devices: auto
# How many nodes to use. (type: int, default: 1)
num_nodes: 1
# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
# modules require this. (type: Optional[Path], default: null)
tokenizer_dir: '../tokenizer'
# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: 'wandb'
# The random seed to use for reproducibility. (type: int, default: 42)
seed: 23
