Download a parquet file to your Google Drive and load it from there into this notebook.

Parquet files: https://huggingface.co/datasets/codeShare/chroma_prompts/tree/main

E621 JSON files: https://huggingface.co/datasets/lodestones/e621-captions/tree/main

In [None]:
%%capture
import os
# Choose the install recipe from the runtime: if no environment variable name
# contains "COLAB_", do a plain install; otherwise use the Colab recipe below.
if "COLAB_" not in "".join(os.environ.keys()):
    # Plain install: pip resolves and installs unsloth's dependencies itself.
    !pip install unsloth
else:
    # --no-deps installs only the listed packages without pulling in their
    # dependencies (presumably to leave the preinstalled stack untouched —
    # TODO confirm). xformers is pinned to an exact version.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    # These packages are installed normally, with version bounds on datasets
    # and huggingface_hub.
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    # unsloth itself is installed last, again without dependency resolution.
    !pip install --no-deps unsloth

# Point the `datasets` cache at a writable directory BEFORE `datasets` is
# imported (directly below, and presumably also indirectly through
# unsloth/trl). The library reads HF_DATASETS_CACHE when it is imported, so
# setting the variable after the imports would have no effect.
dataset_cache_dir = "/kaggle/working/dataset_cache"
os.makedirs(dataset_cache_dir, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = dataset_cache_dir

# unsloth stays the first ML import, as in the original ordering
# (NOTE(review): unsloth asks to be imported before transformers/trl/peft so
# that it can patch them — confirm against the installed version).
from unsloth import FastVisionModel, get_chat_template
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from datasets import load_from_disk
import torch

# Load the 4-bit quantized model together with its processor, then attach
# the "gemma-3" chat template to the processor.
model, processor = FastVisionModel.from_pretrained(
    model_name="codeShare/flux_chroma_image_captioner",
    load_in_4bit=True,
)
processor = get_chat_template(processor, "gemma-3")

# Load the dataset that was previously written with `save_to_disk`.
dataset_path = '/kaggle/input/image-caption-dataset'
dataset = load_from_disk(dataset_path)

# Preprocess dataset
def preprocess_data(example, instruction="Describe this image.", max_length=512):
    """Convert one image/caption record into fixed-length model inputs.

    Args:
        example: A dataset row with an ``"image"`` entry and a ``"text"``
            entry holding the caption.
        instruction: User-turn text that accompanies the image.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A dict with one entry per processor output, each with its leading
        batch dimension of size 1 squeezed away.

    Relies on the module-level ``processor``.
    """
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": instruction}],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": example["text"]}],
        },
    ]
    # The assistant answer is already part of the conversation, so no
    # generation prompt may be appended: doing so would leave an empty,
    # unanswered model turn at the end of every training sample.
    prompt_text = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    # Keyword arguments make the call independent of the processor's
    # positional (images, text) ordering.
    # add_special_tokens=False: presumably the chat template already emits
    # the BOS token — TODO confirm for the template in use.
    encoded = processor(
        images=example["image"],
        text=prompt_text,
        add_special_tokens=False,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

# Apply preprocessing one example at a time. The result is written to an
# explicit cache file in a writable directory, and any existing cache is
# ignored so the mapping is always recomputed.
tokenized_dataset = dataset.map(
    preprocess_data,
    batched=False,
    remove_columns=dataset.column_names,
    load_from_cache_file=False,
    cache_file_name="/kaggle/working/dataset_cache/tokenized_cache.arrow",
)

# Debug: summarise the first example instead of dumping it. Rows read back
# from the dataset are plain Python lists rather than tensors, so printing
# them verbatim would emit every token id and every pixel value.
if len(tokenized_dataset) > 0:
    first_example = tokenized_dataset[0]
    print({key: len(val) if hasattr(val, "__len__") else val for key, val in first_example.items()})
else:
    print("tokenized_dataset is empty")

# Configure training.
# Choose the mixed-precision mode from the GPU: compute capability 8.x and
# newer (Ampere+) has native bfloat16; older cards such as the T4 keep using
# float16. A hard-coded fp16=True conflicts with a model that was loaded in
# bfloat16 on newer GPUs.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="/kaggle/working/continued_lora_model",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    save_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    save_total_limit=2,  # keep only the two most recent checkpoints
    push_to_hub=False,
    report_to="none",  # no experiment-tracker integration
)

# Configure data collator (causal LM objective: mlm=False).
# The collator needs a real tokenizer (it calls `.pad` and reads
# `.pad_token_id`), so pass the tokenizer wrapped by the multimodal
# processor when there is one, and fall back to the object itself otherwise.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=getattr(processor, "tokenizer", processor),
    mlm=False,
    pad_to_multiple_of=8,
)

# Switch the Unsloth model into training mode before building the trainer.
FastVisionModel.for_training(model)

# Initialize trainer on the already-tokenized dataset.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=processor,
    max_seq_length=512,
    data_collator=data_collator,
)

# Train
trainer.train()

# Save the model and the processor side by side so they can be reloaded
# together from the same directory.
model.save_pretrained("/kaggle/working/continued_lora_model")
processor.save_pretrained("/kaggle/working/continued_lora_model")
# Optional: Push to Hugging Face
# from huggingface_hub import login
# login()  # Enter your Hugging Face token
# model.push_to_hub("your_username/new_flux_chroma_image_captioner")
# processor.push_to_hub("your_username/new_flux_chroma_image_captioner")