---
datasets:
  - PJMixers-Dev/Nelathan_synthetic-sugar-quill-cleaner
base_model:
  - meta-llama/Llama-3.2-1B
---

# PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B

The model was trained at a 16,384-token max length, so potentially around 8K tokens of input and 8K tokens of output.

The model will likely *heavily* reformat text, but should hopefully end up with a cleaner result. It is probably not a good fit for text that must stay 100% faithful to the original, like educational material, but should be fine for cleaning creative writing datasets.

## Prompt format

```
<|begin_of_text|><|unclean_text|>Put your uncleaned text here.<|clean_text|>The model will respond with a cleaned version here.<|end_of_text|>
```

Example using a sample from [PJMixers/RyokoAI_Honeyfeed3600](https://huggingface.co/datasets/PJMixers/RyokoAI_Honeyfeed3600), which the model has not been trained on:

```
<|begin_of_text|><|unclean_text|>MODEL STILL TRAINING<|clean_text|>MODEL STILL TRAINING<|end_of_text|>
```
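A minimal inference sketch with Hugging Face Transformers is shown below. The repo id, generation settings, and stopping logic are assumptions for illustration, not tested or recommended values:

```python
# Minimal sketch, assuming the repo id above and that <|unclean_text|> /
# <|clean_text|> are registered in the released tokenizer. Generation
# settings are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

unclean = "Put your uncleaned text here."
# Build the prompt exactly as in the format above:
# BOS + unclean turn + the <|clean_text|> prefix the model completes.
prompt = f"<|begin_of_text|><|unclean_text|>{unclean.strip()}<|clean_text|>"
inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=8192,
    do_sample=False,
    eos_token_id=tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
)
cleaned = tokenizer.decode(
    output[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(cleaned)
```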
## Axolotl Config

```yaml
# Requirements before running
# - Get latest commit of axolotl (currently c0a0c75)
# - Download these to axolotl/src/axolotl/prompt_strategies
# - https://github.com/xzuyn/axolotl/blob/came-plus-formatters/src/axolotl/prompt_strategies/text-cleaner.py
# - pip install ftfy
# - pip install git+https://github.com/xzuyn/CAME.git@sr-grams-cautious-8bit

# Weights and Biases logging config
wandb_project: LLaMa-3.2-1B
wandb_entity:
wandb_watch:
wandb_name: LLaMa-3.2-Text-Cleaner-v0.1-1B-FFT-run4
wandb_log_model:

# Model checkpointing config
output_dir: ./Outputs/LLaMa-3.2-Text-Cleaner-v0.1-1B-FFT-run4
save_steps: 10
save_safetensors: true
save_total_limit: 2
save_only_model: true

# Model architecture config
base_model: meta-llama/Llama-3.2-1B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
chat_template_jinja: "{{- bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}{{ raise_exception('Model does not support system turns.') }}{% elif message['role'] == 'user' %}{{ '<|unclean_text|>' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '<|clean_text|>' + message['content'] | trim + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|clean_text|>' }}{% endif %}"

# Mixed precision training config
bf16: true
fp16: false
tf32: false

# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false

# Sequence config
sequence_len: 16384
min_sample_len: 256
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: false
train_on_inputs: false
group_by_length: false

# Dataset config
datasets:
  - path: PJMixers-Dev/Nelathan_synthetic-sugar-quill-cleaner
    type: text-cleaner
val_set_size: 128
eval_strategy: steps
eval_steps: 10
dataset_prepared_path: ./00-Tokenized-Datasets/LLaMa-3.2-Text-Cleaner-v0.1-1B-seed42
shuffle_merged_datasets: true
dataset_exact_deduplication: true

# Training hyperparameters
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 8
eval_batch_size: 8
warmup_steps: 0
optimizer: came_pytorch
optim_args:
  enable_stochastic_rounding: true
  enable_cautious: true
  enable_8bit: true
lr_scheduler: rex
learning_rate: 1e-6
cosine_min_lr_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 0.5
logging_steps: 1

# Model optimization
embeddings_skip_upcast: true
gradient_checkpointing: offload
sdp_attention: true
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false
liger_fused_linear_cross_entropy: false

# Garbage Collection
gc_steps: 1

# Debug config
debug: true
seed: 42

# Token config
added_tokens_overrides:
  128011: "<|unclean_text|>"
  128012: "<|clean_text|>"
special_tokens:
  bos_token: "<|begin_of_text|>"
  eos_token: "<|end_of_text|>"
  pad_token: "<|finetune_right_pad_id|>"
tokens:
```
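For reference, the `chat_template_jinja` above produces the same prompt as the hand-written format. A small sketch, assuming the released tokenizer ships with this template (otherwise assign the string to `tokenizer.chat_template` first):

```python
# Sketch of building the prompt via the chat template from the config above.
# Assumes the released tokenizer carries this template; if not, set
# tokenizer.chat_template to the chat_template_jinja string first.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PJMixers-Dev/LLaMa-3.2-Text-Cleaner-v0.1-1B")

messages = [{"role": "user", "content": "Put your uncleaned text here."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected:
# <|begin_of_text|><|unclean_text|>Put your uncleaned text here.<|clean_text|>
```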