---
datasets:
- PJMixers-Dev/Nelathan_synthetic-sugar-quill-cleaner
base_model:
- meta-llama/Llama-3.2-1B
---
# PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B
The model was trained at a max sequence length of 16,384, so roughly 8K tokens of input and 8K tokens of output. It will likely *heavily* reformat text, but should hopefully end up with a cleaner result. The training data totals 36,603,909 tokens, 18,552,131 of them supervised.

Probably not suited for cleaning text that must stay 100% faithful to the original, like educational texts, but probably fine for cleaning creative writing datasets.
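Because the 16K context is split roughly in half between input and output, it may help to check that a passage fits the input half before cleaning it. A minimal sketch, assuming the tokenizer loads with `transformers.AutoTokenizer`; the helper name and the ~8K budget are illustrative assumptions, not values from the training setup:

```python
from transformers import AutoTokenizer

# Keep the unclean text around half of the 16,384 training length so the
# cleaned version has room to be generated (assumed split, see note above).
tokenizer = AutoTokenizer.from_pretrained("PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B")

def fits_input_budget(text: str, budget: int = 8192) -> bool:
    n_tokens = len(tokenizer.encode(text, add_special_tokens=False))
    return n_tokens <= budget
```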
## Prompt format
```
<|begin_of_text|><|unclean_text|>Put your uncleaned text here.<|clean_text|>The model will respond with a cleaned version here.<|end_of_text|>
```
Example using a sample from [PJMixers/RyokoAI_Honeyfeed3600](https://huggingface.co/datasets/PJMixers/RyokoAI_Honeyfeed3600), which the model has not been trained on:
```
<|begin_of_text|><|unclean_text|>MODEL STILL TRAINING<|clean_text|>MODEL STILL TRAINING<|end_of_text|>
```
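A minimal inference sketch following the format above, assuming the checkpoint loads with `transformers`; the generation settings are illustrative placeholders, not settings recommended for this model:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

unclean = "Put your uncleaned text here."
# Build the prompt manually: BOS, the unclean text, then the <|clean_text|> tag
# so the model continues with the cleaned version.
prompt = f"<|begin_of_text|><|unclean_text|>{unclean}<|clean_text|>"

inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
output = model.generate(
    **inputs,
    max_new_tokens=8192,  # assumed output budget (half of the 16,384 training length)
    eos_token_id=tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
)
cleaned = tokenizer.decode(output[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(cleaned)
```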
## Axolotl Config
```yaml
# Requirements before running
# - Get latest commit of axolotl (currently c0a0c75)
# - Download this to axolotl/src/axolotl/prompt_strategies
# - https://github.com/xzuyn/axolotl/blob/came-plus-formatters/src/axolotl/prompt_strategies/text-cleaner.py
# - pip install ftfy
# - pip install git+https://github.com/xzuyn/CAME.git@sr-grams-cautious-8bit
# Weights and Biases logging config
wandb_project: LLaMa-3.2-1B
wandb_entity:
wandb_watch:
wandb_name: LLaMa-3.2-Text-Cleaner-v0.1-1B-FFT-run4
wandb_log_model:
# Model checkpointing config
output_dir: ./Outputs/LLaMa-3.2-Text-Cleaner-v0.1-1B-FFT-run4
save_steps: 10
save_safetensors: true
save_total_limit: 2
save_only_model: true
# Model architecture config
base_model: meta-llama/Llama-3.2-1B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
chat_template_jinja: "{{- bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}{{ raise_exception('Model does not support system turns.') }}{% elif message['role'] == 'user' %}{{ '<|unclean_text|>' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '<|clean_text|>' + message['content'] | trim + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|clean_text|>' }}{% endif %}"
# Mixed precision training config
bf16: true
fp16: false
tf32: false
# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false
# Sequence config
sequence_len: 16384
min_sample_len: 256
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: false
train_on_inputs: false
group_by_length: false
# Dataset config
datasets:
- path: PJMixers-Dev/Nelathan_synthetic-sugar-quill-cleaner
type: text-cleaner
val_set_size: 128
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/LLaMa-3.2-Text-Cleaner-v0.1-1B-seed42
shuffle_merged_datasets: true
dataset_exact_deduplication: true
# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 8
eval_batch_size: 8
warmup_steps: 0
optimizer: came_pytorch
optim_args:
enable_stochastic_rounding: true
enable_cautious: true
enable_8bit: true
lr_scheduler: rex
learning_rate: 1e-6
cosine_min_lr_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 0.5
logging_steps: 1
# Model optimization
embeddings_skip_upcast: true
gradient_checkpointing: offload
sdp_attention: true
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false
# Garbage Collection
gc_steps: 1
# Debug config
debug: true
seed: 42
# Token config
added_tokens_overrides:
128011: "<|unclean_text|>"
128012: "<|clean_text|>"
special_tokens:
bos_token: "<|begin_of_text|>"
eos_token: "<|end_of_text|>"
pad_token: "<|finetune_right_pad_id|>"
tokens:
```
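The token overrides above repurpose two reserved Llama 3.2 token IDs as the unclean/clean tags. A quick sanity check, assuming the released tokenizer carries those overrides (the IDs 128011 and 128012 come straight from the config above):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B")

# Confirm the renamed reserved tokens resolve to the IDs set in the config.
assert tokenizer.convert_tokens_to_ids("<|unclean_text|>") == 128011
assert tokenizer.convert_tokens_to_ids("<|clean_text|>") == 128012
assert tokenizer.eos_token == "<|end_of_text|>"
```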