|
---
datasets:
  - PJMixers-Dev/Nelathan_synthetic-sugar-quill-cleaner
base_model:
  - meta-llama/Llama-3.2-1B
---
|
# PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B |
|
|
|
The model was trained at a 16,384-token max length, so it can potentially handle around 8K tokens of input and 8K tokens of output. It will likely *heavily* reformat text, but should hopefully end up with a cleaner result.
|
|
|
Probably not a good fit for cleaning text that needs to stay 100% faithful to the original, like educational texts, but probably fine for cleaning creative writing datasets.
|
|
|
## Prompt format |
|
```
<|begin_of_text|><|unclean_text|>Put your uncleaned text here.<|clean_text|>The model will respond with a cleaned version here.<|end_of_text|>
```
|
|
|
Example using a sample from [PJMixers/RyokoAI_Honeyfeed3600](https://huggingface.co/datasets/PJMixers/RyokoAI_Honeyfeed3600), which the model has not been trained on:
|
```
<|begin_of_text|><|unclean_text|>MODEL STILL TRAINING<|clean_text|>MODEL STILL TRAINING<|end_of_text|>
```
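
In practice the format can be driven directly with `transformers`. Below is a minimal inference sketch, not an official usage snippet: the generation settings are assumptions, and the prompt string is built by hand to match the format above.

```python
# Minimal inference sketch (assumes a standard HF checkpoint layout).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="bfloat16")

unclean = "Put your uncleaned text here."
# Build the prompt to match the documented format; add_special_tokens=False
# avoids the tokenizer prepending a second <|begin_of_text|>.
prompt = f"<|begin_of_text|><|unclean_text|>{unclean}<|clean_text|>"
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

# ~8K new tokens keeps input + output inside the 16,384 training length.
output = model.generate(**inputs, max_new_tokens=8192)
cleaned = tokenizer.decode(
    output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(cleaned)
```

`add_special_tokens=False` matters because the Llama 3 tokenizer otherwise adds its own `<|begin_of_text|>` on top of the one written into the prompt.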
|
|
|
## Axolotl Config |
|
|
|
```yaml
# Requirements before running
# - Get latest commit of axolotl (currently c0a0c75)
# - Download this to axolotl/src/axolotl/prompt_strategies
#   - https://github.com/xzuyn/axolotl/blob/came-plus-formatters/src/axolotl/prompt_strategies/text-cleaner.py
# - pip install ftfy
# - pip install git+https://github.com/xzuyn/CAME.git@sr-grams-cautious-8bit

# Weights and Biases logging config
wandb_project: LLaMa-3.2-1B
wandb_entity:
wandb_watch:
wandb_name: LLaMa-3.2-Text-Cleaner-v0.1-1B-FFT-run4
wandb_log_model:

# Model checkpointing config
output_dir: ./Outputs/LLaMa-3.2-Text-Cleaner-v0.1-1B-FFT-run4
save_steps: 10
save_safetensors: true
save_total_limit: 2
save_only_model: true

# Model architecture config
base_model: meta-llama/Llama-3.2-1B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
chat_template_jinja: "{{- bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}{{ raise_exception('Model does not support system turns.') }}{% elif message['role'] == 'user' %}{{ '<|unclean_text|>' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '<|clean_text|>' + message['content'] | trim + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|clean_text|>' }}{% endif %}"

# Mixed precision training config
bf16: true
fp16: false
tf32: false

# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false

# Sequence config
sequence_len: 16384
min_sample_len: 256
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: false
train_on_inputs: false
group_by_length: false

# Dataset config
datasets:
  - path: PJMixers-Dev/Nelathan_synthetic-sugar-quill-cleaner
    type: text-cleaner
val_set_size: 128
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/LLaMa-3.2-Text-Cleaner-v0.1-1B-seed42
shuffle_merged_datasets: true
dataset_exact_deduplication: true

# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 8
eval_batch_size: 8
warmup_steps: 0
optimizer: came_pytorch
optim_args:
  enable_stochastic_rounding: true
  enable_cautious: true
  enable_8bit: true
lr_scheduler: rex
learning_rate: 1e-6
cosine_min_lr_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 0.5
logging_steps: 1

# Model optimization
embeddings_skip_upcast: true
gradient_checkpointing: offload
sdp_attention: true
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false

# Garbage Collection
gc_steps: 1

# Debug config
debug: true
seed: 42

# Token config
added_tokens_overrides:
  128011: "<|unclean_text|>"
  128012: "<|clean_text|>"
special_tokens:
  bos_token: "<|begin_of_text|>"
  eos_token: "<|end_of_text|>"
  pad_token: "<|finetune_right_pad_id|>"
tokens:
```
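
The token config reuses two reserved Llama-3.2 special-token IDs (128011 and 128012) for the cleaner tags rather than resizing the embedding table. A quick sanity-check sketch, assuming the published tokenizer ships with these overrides and the `chat_template_jinja` above:

```python
# Sanity check (assumes the uploaded tokenizer carries the token overrides
# and chat template from the training config).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B")

# The reserved slots 128011/128012 should decode to the cleaner tags.
assert tokenizer.convert_ids_to_tokens(128011) == "<|unclean_text|>"
assert tokenizer.convert_ids_to_tokens(128012) == "<|clean_text|>"

# The chat template should reproduce the documented prompt format.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Put your uncleaned text here."}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
# <|begin_of_text|><|unclean_text|>Put your uncleaned text here.<|clean_text|>
```

If the assertions fail, the overrides did not make it into the uploaded tokenizer, and the prompt should be built by hand as shown in the prompt format section.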
|
|