Download a Parquet file to your Google Drive and load it from there into this notebook.

Parquet files: https://huggingface.co/datasets/codeShare/chroma_prompts/tree/main

E621 JSON files: https://huggingface.co/datasets/lodestones/e621-captions/tree/main

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@markdown Build a dataset from ALL images in /content/ with EXIF metadata (using exiftool) as separate columns and WebM files, saving to Google Drive

# Step 1: Install required libraries and exiftool
!pip install Pillow imageio[ffmpeg] datasets pandas
!apt-get update && apt-get install -y libimage-exiftool-perl

# Step 2: Import required libraries
import os
import glob
import subprocess
from PIL import Image
import imageio.v3 as iio
import pandas as pd
from datasets import Dataset, Features, Image as HFImage, Value
from google.colab import drive

# Step 3: Mount Google Drive
drive.mount('/content/drive')
output_dir = '/content/drive/My Drive/exif_dataset' #@param {type:'string'}

# Step 4: Define function to extract metadata using exiftool
def get_exif_data(image_path):
    """Return the metadata exiftool reports for one file as a dict.

    Runs ``exiftool -j <image_path>`` and parses the JSON it prints with
    ``json.loads``. The output is JSON rather than Python source: literals such
    as ``true``, ``false`` and ``null`` cannot be evaluated with ``eval``, and
    evaluating tool output that embeds strings taken from the file is unsafe.

    Args:
        image_path: Path of the file to inspect.

    Returns:
        The first record of the JSON list printed by exiftool. On any failure a
        dict with a single ``"Error"`` key describing the problem is returned
        instead, so callers can always iterate over ``.items()``.
    """
    import json  # local import: this cell's import block does not bring in json

    try:
        # Run exiftool to extract all metadata as JSON
        result = subprocess.run(
            ['exiftool', '-j', image_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
        # exiftool -j prints a list with one dictionary per input file
        records = json.loads(result.stdout)
        if not records:
            return {"Error": "Failed to read metadata: exiftool returned no records"}
        return records[0]
    except subprocess.CalledProcessError as e:
        print(f"exiftool error for {image_path}: {e.stderr}")
        return {"Error": f"exiftool failed: {str(e)}"}
    except Exception as e:
        return {"Error": f"Failed to read metadata: {str(e)}"}

# Step 5: Define function to convert image to WebM
def convert_to_webm(image_path, output_path):
    """Read one image and write it to ``output_path`` through imageio.

    The write call passes ``codec='vp8'``, ``fps=1`` and ``quality=8``.

    Args:
        image_path: Path of the source image.
        output_path: Destination path of the WebM file.

    Returns:
        True when both the read and the write finished without raising,
        False otherwise (the error is printed).
    """
    try:
        frame = iio.imread(image_path)
        iio.imwrite(output_path, frame, codec='vp8', fps=1, quality=8)
    except Exception as err:
        print(f"Error converting {image_path} to WebM: {str(err)}")
        return False
    return True

# Step 6: Collect ALL images from /content/
image_dir = "/content/"
image_extensions = ["*.jpg", "*.jpeg", "*.png"]
image_paths = []
for ext in image_extensions:
    image_paths.extend(glob.glob(os.path.join(image_dir, ext)))

# Defined up front so the name exists whichever branch below runs.
all_metadata_keys = []

if not image_paths:
    print("No images found in /content/")
else:
    # Step 7: Process all images; the three lists stay aligned, one entry per
    # image that was both loaded and converted successfully.
    images = []
    webm_paths = []
    metadata_list = []

    for img_path in image_paths:
        print(f"\nProcessing {img_path}:")

        # Load image. The context manager closes the file handle as soon as
        # the independent RGB copy has been created.
        try:
            with Image.open(img_path) as source_image:
                img = source_image.convert('RGB')
        except Exception as e:
            print(f"Error loading image {img_path}: {str(e)}")
            continue

        # Extract metadata with exiftool
        metadata = get_exif_data(img_path)
        print("Metadata (via exiftool):")
        for key, value in metadata.items():
            print(f"  {key}: {value}")

        # Convert to WebM; images that fail to convert are left out entirely
        webm_path = os.path.splitext(img_path)[0] + ".webm"
        if not convert_to_webm(img_path, webm_path):
            print(f"  Skipped WebM conversion for {img_path}")
            continue

        print(f"  Saved WebM: {webm_path}")
        images.append(img)
        webm_paths.append(webm_path)
        metadata_list.append(metadata)

    # Step 8: Check if any images were processed
    if not images:
        print("No images were successfully processed.")
    else:
        # Step 9: Prepare dataset dictionary with separate columns for each metadata key.
        # Keys are taken only from images that were kept (skipped images would
        # otherwise add all-None columns) and sorted so the column order is the
        # same on every run.
        all_metadata_keys = sorted({key for metadata in metadata_list for key in metadata})
        data_dict = {'image': images, 'webm_path': webm_paths}

        # Initialize columns for each metadata key with None
        for key in all_metadata_keys:
            data_dict[key] = [None] * len(images)

        # Populate metadata values as strings; a missing value stays None
        # instead of becoming the text "None".
        for i, metadata in enumerate(metadata_list):
            for key, value in metadata.items():
                data_dict[key][i] = None if value is None else str(value)

        # Step 10: Define dataset features
        features = Features({
            'image': HFImage(),
            'webm_path': Value("string"),
            **{key: Value("string") for key in all_metadata_keys}  # Dynamic columns for metadata keys
        })

        # Step 11: Create Hugging Face Dataset
        dataset = Dataset.from_dict(data_dict, features=features)

        # Step 12: Verify the dataset
        print("\nDataset Summary:")
        print(dataset)
        if len(dataset) > 0:
            # Fetch the first row once rather than indexing whole columns.
            first_item = dataset[0]
            print("\nExample of accessing first item:")
            print("WebM Path:", first_item['webm_path'])
            print("Image type:", type(first_item['image']))
            print("Image size:", first_item['image'].size)
            print("Metadata columns (first item):")
            for key in all_metadata_keys:
                if first_item[key] is not None:
                    print(f"  {key}: {first_item[key]}")

        # Step 13: Save dataset to Google Drive
        try:
            os.makedirs(output_dir, exist_ok=True)
            dataset.save_to_disk(output_dir)
            print(f"\nDataset saved to {output_dir}")
        except Exception as e:
            print(f"Error saving dataset to Google Drive: {str(e)}")

In [None]:
#@markdown Create a new dataset with 'image' and 'text' from the original dataset

# Step 1: Install required libraries (if not already installed)
# !pip install datasets

# Step 2: Import required libraries
from datasets import Dataset, load_from_disk
import json
from PIL import Image

# Step 3: Define the path to the original dataset on Google Drive
dataset_path = '/content/drive/My Drive/exif_dataset'  #@param {type:'string'}

# Step 4: Load the original dataset
try:
    dataset = load_from_disk(dataset_path)
    print("Original dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Step 5: Function to extract 'text' from the 'Prompt' dictionary
def extract_text_from_prompt(prompt):
    """Return the first non-empty text stored on a 'CLIPTextEncode' node.

    Args:
        prompt: Either a JSON string or an already parsed mapping of
            node key -> node dict. A matching node has
            ``class_type == 'CLIPTextEncode'`` and a string at
            ``inputs['text']``.

    Returns:
        The text as a ``str``, or ``""`` when the prompt is missing, cannot be
        parsed, or contains no matching node with a non-empty string text.
    """
    if prompt is None:
        return ""
    try:
        # Parse the prompt when it arrives as a JSON string
        prompt_dict = json.loads(prompt) if isinstance(prompt, str) else prompt
    except Exception as e:
        print(f"Error parsing prompt: {e}")
        return ""
    if not isinstance(prompt_dict, dict):
        print(f"Error parsing prompt: expected a mapping of nodes, got {type(prompt_dict).__name__}")
        return ""

    # Each node is checked on its own, so one malformed node (no 'inputs',
    # no 'text') no longer hides a valid node that comes after it.
    for node_data in prompt_dict.values():
        if not isinstance(node_data, dict):
            continue
        if node_data.get('class_type') != 'CLIPTextEncode':
            continue
        inputs = node_data.get('inputs')
        text = inputs.get('text') if isinstance(inputs, dict) else None
        # Only plain strings are usable as a caption; other value types are skipped.
        if isinstance(text, str) and text:
            return text
    return ""  # Return empty string if no valid text is found

# Step 6: Create lists for the new dataset
new_data = {
    'image': [],
    'text': []
}

# Step 7: Process each item in the dataset
# Rows are visited one at a time so each image is decoded once. Indexing
# dataset['image'][i] inside a loop may materialise the whole column on every
# pass, depending on the installed `datasets` version.
for row in dataset:
    new_data['image'].append(row['image'])
    new_data['text'].append(extract_text_from_prompt(row['Prompt']))

# Step 8: Create a new Hugging Face dataset
new_dataset = Dataset.from_dict(new_data)

# Step 9: Define the path to save the new dataset
new_dataset_path = '/content/drive/MyDrive/custom_dataset2'  #@param {type:'string'}

# Step 10: Save the new dataset
try:
    new_dataset.save_to_disk(new_dataset_path)
    print(f"New dataset saved successfully to {new_dataset_path}!")
except Exception as e:
    # The loop index is unrelated to a save failure, so it is not reported here.
    print(f"Error saving new dataset: {e}")
    raise

# Step 11: Verify the new dataset
print("\nNew dataset info:")
print(new_dataset)

# Step 12: Example of accessing an item in the new dataset
index = 4  #@param {type:'slider', max:200}
if index < len(new_dataset):
    example = new_dataset[index]
    print("\nExample of accessing item at index", index)
    print("Text:", example['text'])
    print("Image type:", type(example['image']))
    print("Image size:", example['image'].size)
    example_image = example['image']
else:
    # The slider can point past the end of a small dataset.
    print(f"\nIndex {index} is out of range for a dataset with {len(new_dataset)} items.")
    example_image = None

# Optional: Display the image (the last expression of a cell is rendered)
example_image

In [None]:
#@markdown Load a dataset from Drive

# Step 1: Install required libraries (if not already installed)
# !pip install datasets

# Step 2: Mount Google Drive (only needed in Google Colab)
#from google.colab import drive
#drive.mount('/content/drive')

# Step 3: Import required library
from datasets import load_from_disk

# Step 4: Define the path to the saved dataset on Google Drive
dataset_path = '/content/drive/MyDrive/custom_dataset'#@param {type:'string'}

# Step 5: Load the dataset
try:
    dataset = load_from_disk(dataset_path)
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Step 6: Verify the dataset
print(dataset)

# Step 7: Example of accessing an image and text
#print("\nExample of accessing first item:")
#print("Text:", dataset['text'][0])
#print("Image type:", type(dataset['image'][0]))
#print("Image size:", dataset['image'][0].size)

In [None]:
#@markdown Display an image from the dataset
index = 85 #@param {type:'slider',max:200}
dataset['image'][index]




In [None]:
#@markdown Display matching text from the dataset
#index = 85 #@param {type:'slider',max:200}
dataset['Prompt'][index]




In [None]:
# Unassign memory
# Rebinding these names to empty strings drops the notebook's references to
# whatever they held before, so those objects can be reclaimed.
# After this cell the three names hold '' (not datasets) until a later cell
# assigns them again.
dataset=''
dataset1=''
dataset2=''

In [None]:

#@markdown Build a dataset for training using a .parquet file

num_dataset_items = 10 #@param {type:'slider',max:1000}

output_name='/content/drive/MyDrive/mini_dataset'#@param {type:'string'}

# Step 1: Install required libraries (if not already installed)
# !pip install datasets pandas pillow requests

# Step 2: Import required libraries
import pandas as pd
from datasets import Dataset
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import math,random

# Step 3: Define the path to the Parquet file
file_path = '/content/drive/MyDrive/Saved from Chrome/vlm_captions_cc12m_00.parquet' #@param {type:'string'}

# Step 4: Read the Parquet file
df = pd.read_parquet(file_path)

# Step 5: Randomly select 1.5x the requested number of rows to leave headroom
# for image downloads that fail. The size is capped at len(df) because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement).
sample_size = min(len(df), math.floor(num_dataset_items*1.5))
df_sample = df.sample(n=sample_size, random_state=math.floor(random.random()*10000)).reset_index(drop=True)

# Step 6: Function to download, resize, and process images
def load_and_resize_image_from_url(url, max_size=(1024, 1024)):
    """Download an image, convert it to RGB and fit it inside ``max_size``.

    Args:
        url: Address of the image to fetch (10 second request timeout).
        max_size: ``(width, height)`` bound handed to ``Image.thumbnail``.

    Returns:
        The resized RGB image, or ``None`` when the download, decoding or
        resizing raised (the error is printed).
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()  # bad HTTP status codes are treated as failures
        picture = Image.open(BytesIO(reply.content)).convert('RGB')
        # Fit within max_size while maintaining aspect ratio
        picture.thumbnail(max_size, Image.Resampling.LANCZOS)
    except Exception as err:
        print(f"Error loading image from {url}: {err}")
        return None
    return picture

# Step 7: Create lists for images and captions
images = []
texts = []
num=0  # rows attempted so far; printed as a simple progress counter
for index, row in df_sample.iterrows():
    if len(images) >= num_dataset_items:  # Stop once we have enough valid images
        break
    url = row['url']
    # Missing captions are treated as empty strings so that a null value cannot
    # raise a TypeError during concatenation.
    original_caption = row['original_caption'] if pd.notnull(row['original_caption']) else ''
    vlm_caption = row['vlm_caption'] if pd.notnull(row['vlm_caption']) else ''
    vlm_caption = vlm_caption.replace('This image displays:','').replace('This image displays','')
    # Join only the parts that have content, so an empty side leaves no stray comma.
    caption = ', '.join(part for part in (original_caption, vlm_caption) if part)
    num=num+1
    print(f'{num}')
    # Load and resize image
    img = load_and_resize_image_from_url(url)
    if img is not None:
        images.append(img)
        texts.append(caption)
    else:
        print(f"Skipping row {index} due to image loading failure.")

# Step 8: Check if we have enough images
# The loop stops as soon as num_dataset_items images are collected, so the
# lists can never be longer than requested and need no truncation.
if len(images) < num_dataset_items:
    print(f"Warning: Only {len(images)} images were successfully loaded.")

# Step 9: Create a Hugging Face Dataset
dataset = Dataset.from_dict({
    'image': images,
    'text': texts
})

# Step 10: Verify the dataset
print(dataset)

# Step 11: Example of accessing an image and text
if len(dataset) > 0:
    first_item = dataset[0]
    print("\nExample of accessing first item:")
    print("Text:", first_item['text'])
    print("Image type:", type(first_item['image']))
    print("Image size:", first_item['image'].size)
else:
    print("\nThe dataset is empty; there is no example item to show.")

#Optional: Save the dataset to disk (if needed)
dataset.save_to_disk(output_name)

In [None]:

#@markdown Convert Tensor Art style dataset into a training dataset

#@markdown Create a new dataset with 'image' and 'text' from the original dataset

# Step 1: Install required libraries (if not already installed)
# !pip install datasets

# Step 2: Import required libraries
from datasets import Dataset, load_from_disk
import json
from PIL import Image

# Step 3: Define the path to the original dataset on Google Drive
dataset_path = '/content/drive/MyDrive/raw_dataset'  #@param {type:'string'}

# Step 4: Load the original dataset
try:
    dataset = load_from_disk(dataset_path)
    print("Original dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Step 5: Function to extract 'text' from the 'Prompt' dictionary
def extract_text_from_prompt(prompt):
    """Return the first non-empty text stored on a 'CLIPTextEncode' node.

    Args:
        prompt: Either a JSON string or an already parsed mapping of
            node key -> node dict. A matching node has
            ``class_type == 'CLIPTextEncode'`` and a string at
            ``inputs['text']``.

    Returns:
        The text as a ``str``, or ``""`` when the prompt is missing, cannot be
        parsed, or contains no matching node with a non-empty string text.
    """
    if prompt is None:
        return ""
    try:
        # Parse the prompt when it arrives as a JSON string
        prompt_dict = json.loads(prompt) if isinstance(prompt, str) else prompt
    except Exception as e:
        print(f"Error parsing prompt: {e}")
        return ""
    if not isinstance(prompt_dict, dict):
        print(f"Error parsing prompt: expected a mapping of nodes, got {type(prompt_dict).__name__}")
        return ""

    # Each node is checked on its own, so one malformed node (no 'inputs',
    # no 'text') no longer hides a valid node that comes after it.
    for node_data in prompt_dict.values():
        if not isinstance(node_data, dict):
            continue
        if node_data.get('class_type') != 'CLIPTextEncode':
            continue
        inputs = node_data.get('inputs')
        text = inputs.get('text') if isinstance(inputs, dict) else None
        # Only plain strings are usable as a caption; other value types are skipped.
        if isinstance(text, str) and text:
            return text
    return ""  # Return empty string if no valid text is found

# Step 6: Create lists for the new dataset
new_data = {
    'image': [],
    'text': []
}

# Step 7: Process each item in the dataset
# Rows are visited one at a time so each image is decoded once. Indexing
# dataset['image'][i] inside a loop may materialise the whole column on every
# pass, depending on the installed `datasets` version.
for row in dataset:
    new_data['image'].append(row['image'])
    new_data['text'].append(extract_text_from_prompt(row['Prompt']))

# Step 8: Create a new Hugging Face dataset
new_dataset = Dataset.from_dict(new_data)

# Step 9: Define the path to save the new dataset
new_dataset_path = '/content/drive/MyDrive/processed_dataset'  #@param {type:'string'}

# Step 10: Save the new dataset
try:
    new_dataset.save_to_disk(new_dataset_path)
    print(f"New dataset saved successfully to {new_dataset_path}!")
except Exception as e:
    print(f"Error saving new dataset: {e}")
    raise

# Step 11: Verify the new dataset
print("\nNew dataset info:")
print(new_dataset)

# Step 12: Example of accessing an item in the new dataset
index = 85  #@param {type:'slider', max:200}
if index < len(new_dataset):
    example = new_dataset[index]
    print("\nExample of accessing item at index", index)
    print("Text:", example['text'])
    print("Image type:", type(example['image']))
    print("Image size:", example['image'].size)
    example_image = example['image']
else:
    # The slider can point past the end of a small dataset.
    print(f"\nIndex {index} is out of range for a dataset with {len(new_dataset)} items.")
    example_image = None

# Optional: Display the image (the last expression of a cell is rendered)
example_image



In [None]:
#@markdown Merge the two datasets into one

# Step 1: Import required libraries
from datasets import load_from_disk, concatenate_datasets
from google.colab import drive

# Step 2: Mount Google Drive (only needed in Google Colab)
drive.mount('/content/drive')

# Step 3: Define paths for the datasets
dataset1_path = '' #@param {type:'string'}
dataset2_path = '' #@param {type:'string'}
merged_dataset_path = ''  #@param {type:'string'}

# Step 4: Load the datasets
try:
    dataset1 = load_from_disk(dataset1_path)
    dataset2 = load_from_disk(dataset2_path)
    print("Datasets loaded successfully!")
except Exception as e:
    print(f"Error loading datasets: {e}")
    raise

# Step 5: Verify the datasets
print("Dataset 1:", dataset1)
print("Dataset 2:", dataset2)

# Step 6: Merge the datasets
try:
    dataset = concatenate_datasets([dataset1, dataset2])
    print("Datasets merged successfully!")
except Exception as e:
    print(f"Error merging datasets: {e}")
    raise

# Step 7: Verify the merged dataset
print("Merged Dataset:", dataset)
dataset1=''
dataset2=''
# Step 8: Save the merged dataset to Google Drive
try:
    dataset.save_to_disk(merged_dataset_path)
    print(f"Merged dataset saved successfully to {merged_dataset_path}")
except Exception as e:
    print(f"Error saving merged dataset: {e}")
    raise

# Step 9: Optional - Verify the saved dataset by loading it back
try:
    dataset = load_from_disk(merged_dataset_path)
    print("Saved merged dataset loaded successfully for verification:")
    print(dataset)
except Exception as e:
    print(f"Error loading saved merged dataset: {e}")
    raise

In [None]:
dataset.save_to_disk('/content/dataset2')






In [None]:
#@markdown Build a dataset for training using a .jsonl file

num_dataset_items = 800 #@param {type:'slider', max:10000}

# Step 1: Install required libraries (if not already installed)
# !pip install datasets pandas pillow requests

# Step 2: Import required libraries
import json
import pandas as pd
from datasets import Dataset
from PIL import Image
import requests
from io import BytesIO
import math,random

# Step 3: Define the path to the JSONL file
file_path = '/content/drive/MyDrive/Saved from Chrome/2022-08_grouped.jsonl' #@param {type:'string'}

# Step 4: Read the JSONL file
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            json_obj = json.loads(line.strip())
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON line: {e}")
            continue

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 5: Randomly select 1.1x the requested number of rows to leave headroom
# for image downloads that fail. The size is capped at len(df) because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement).
sample_size = min(len(df), math.floor(num_dataset_items * 1.1))
df_sample = df.sample(n=sample_size, random_state=math.floor(random.random()*10000)).reset_index(drop=True)
# Step 6: Function to download, resize, and process images
def load_and_resize_image_from_url(url, max_size=(1024, 1024)):
    """Download an image, convert it to RGB and fit it inside ``max_size``.

    Args:
        url: Address of the image to fetch (10 second request timeout).
        max_size: ``(width, height)`` bound handed to ``Image.thumbnail``.

    Returns:
        The resized RGB image, or ``None`` when the download, decoding or
        resizing raised (the error is printed).
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()  # bad HTTP status codes are treated as failures
        picture = Image.open(BytesIO(reply.content)).convert('RGB')
        # Fit within max_size while maintaining aspect ratio
        picture.thumbnail(max_size, Image.Resampling.LANCZOS)
    except Exception as err:
        print(f"Error loading image from {url}: {err}")
        return None
    return picture

# Step 7: Create lists for images and captions
images = []
texts = []
num=0  # rows attempted so far; printed as a simple progress counter
for index, row in df_sample.iterrows():
    if len(images) >= num_dataset_items:  # Stop once we have enough valid images
        break
    url = row['url']
    # Combine description and tag_string for caption, ensuring no missing values
    description = row['description'] if pd.notnull(row['description']) else ''
    tag_string = row['tag_string'] if pd.notnull(row['tag_string']) else ''
    caption = f"{description}, {tag_string}".strip(', ')

    num=num+1
    print(f'{num}')

    # Load and resize image
    img = load_and_resize_image_from_url(url)
    if img is not None:
        images.append(img)
        texts.append(caption)
    else:
        print(f"Skipping row {index} due to image loading failure.")

# Step 8: Check if we have enough images
# The loop stops as soon as num_dataset_items images are collected, so the
# lists can never be longer than requested and need no truncation.
if len(images) < num_dataset_items:
    print(f"Warning: Only {len(images)} images were successfully loaded.")

# Step 9: Create a Hugging Face Dataset
dataset = Dataset.from_dict({
    'image': images,
    'text': texts
})

# Step 10: Verify the dataset
print(dataset)

# Step 11: Example of accessing an image and text
if len(dataset) > 0:
    first_item = dataset[0]
    print("\nExample of accessing first item:")
    print("Text:", first_item['text'])
    print("Image type:", type(first_item['image']))
    print("Image size:", first_item['image'].size)
else:
    print("\nThe dataset is empty; there is no example item to show.")
output_name='dataset1'#@param {type:'string'}
# Optional: Save the dataset to disk (if needed)
dataset.save_to_disk(f'/content/{output_name}')

In [None]:
#@markdown load two datasets for merging

# Step 1: Install required libraries (if not already installed)
# !pip install datasets

# Step 2: Mount Google Drive (only needed in Google Colab)
#from google.colab import drive
#drive.mount('/content/drive')

# Step 3: Import required library
from datasets import load_from_disk

# Step 4: Define the path to the saved dataset on Google Drive
dataset1_path = '' #@param {type: 'string'}

dataset2_path = '' #@param {type:'string'}

# Step 5: Load the dataset
try:
    dataset1 = load_from_disk(dataset1_path)
    dataset2 = load_from_disk(dataset2_path)
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Step 6: Verify the dataset
print(dataset1)
print(dataset2)

# Step 7: Example of accessing an image and text
#print("\nExample of accessing first item:")
#print("Text:", redcaps_dataset['text'][0])
#print("Image type:", type(dataset['image'][0]))
#print("Image size:", dataset['image'][0].size)

In [None]:
dataset.save_to_disk(f'/content/drive/MyDrive/{output_name}')





In [None]:
#@markdown Investigate a json file

import json
import pandas as pd

# Path to the uploaded .jsonl file
file_path = '' #@param {type:'string'}

# Initialize lists to store data
data = []

# Read the .jsonl file line by line
with open(file_path, 'r') as file:
    for line in file:
        try:
            # Parse each line as a JSON object
            json_obj = json.loads(line.strip())
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON line: {e}")
            continue

# Convert the list of JSON objects to a Pandas DataFrame for easier exploration
df = pd.DataFrame(data)

# Display basic information about the DataFrame
print("=== File Overview ===")
print(f"Number of records: {len(df)}")
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)

# Display the first few rows
print("\n=== First 5 Rows ===")
print(df.head())

# Display basic statistics
print("\n=== Basic Statistics ===")
print(df.describe(include='all'))

# Check for missing values
print("\n=== Missing Values ===")
print(df.isnull().sum())

# Optional: Display unique values in each column
print("\n=== Unique Values per Column ===")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

In [None]:
#@markdown Investigate a json file pt 2

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

# Set up plotting style
plt.style.use('default')
%matplotlib inline

# Path to the uploaded .jsonl file
#file_path = ''

# Read the .jsonl file into a DataFrame
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            json_obj = json.loads(line.strip())
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON line: {e}")
            continue
df = pd.DataFrame(data)

# 1. Rating Distribution
print("=== Rating Distribution ===")
rating_counts = df['rating'].value_counts()
plt.figure(figsize=(8, 5))
sns.barplot(x=rating_counts.index, y=rating_counts.values)
plt.title('Distribution of Image Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()
print(rating_counts)

# 2. Tag Analysis
print("\n=== Top 10 Most Common Tags ===")
# Combine all tags into a single list
all_tags = []
for tags in df['tag_string'].dropna():
    all_tags.extend(tags.split())
tag_counts = Counter(all_tags)
top_tags = pd.DataFrame(tag_counts.most_common(10), columns=['Tag', 'Count'])
plt.figure(figsize=(10, 6))
sns.barplot(x='Count', y='Tag', data=top_tags)
plt.title('Top 10 Most Common Tags')
plt.show()
print(top_tags)

# 3. Image Dimensions Analysis
print("\n=== Image Dimensions Analysis ===")
plt.figure(figsize=(10, 6))
plt.scatter(df['image_width'], df['image_height'], alpha=0.5, s=50)
plt.title('Image Width vs. Height')
plt.xlabel('Width (pixels)')
plt.ylabel('Height (pixels)')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.show()
print(f"Median Width: {df['image_width'].median()}")
print(f"Median Height: {df['image_height'].median()}")
print(f"Aspect Ratio (Width/Height) Stats:\n{df['image_width'].div(df['image_height']).describe()}")

# 4. Score and Voting Analysis
print("\n=== Score and Voting Analysis ===")
plt.figure(figsize=(10, 6))
sns.histplot(df['score'], bins=30, kde=True)
plt.title('Distribution of Image Scores')
plt.xlabel('Score')
plt.ylabel('Count')
plt.show()
print(f"Score Stats:\n{df['score'].describe()}")
print(f"\nCorrelation between Up Score and Down Score: {df['up_score'].corr(df['down_score'])}")

# 5. Summary Length Analysis
print("\n=== Summary Length Analysis ===")
df['summary_length'] = df['regular_summary'].dropna().apply(lambda x: len(str(x).split()))
plt.figure(figsize=(10, 6))
sns.histplot(df['summary_length'], bins=30, kde=True)
plt.title('Distribution of Regular Summary Word Counts')
plt.xlabel('Word Count')
plt.ylabel('Count')
plt.show()
print(f"Summary Length Stats:\n{df['summary_length'].describe()}")

# 6. Missing Data Heatmap
print("\n=== Missing Data Heatmap ===")
plt.figure(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Data Heatmap')
plt.show()

# 7. Source Platform Analysis
print("\n=== Source Platform Analysis ===")
# Extract domain from source URLs
df['source_domain'] = df['source'].dropna().str.extract(r'(https?://[^/]+)')
source_counts = df['source_domain'].value_counts().head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=source_counts.values, y=source_counts.index)
plt.title('Top 10 Source Domains')
plt.xlabel('Count')
plt.ylabel('Domain')
plt.show()
print(source_counts)

# 8. File Size vs. Image Dimensions
print("\n=== File Size vs. Image Dimensions ===")
plt.figure(figsize=(10, 6))
plt.scatter(df['image_width'] * df['image_height'], df['file_size'], alpha=0.5)
plt.title('File Size vs. Image Area')
plt.xlabel('Image Area (Width * Height)')
plt.ylabel('File Size (bytes)')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.show()
print(f"Correlation between Image Area and File Size: {df['file_size'].corr(df['image_width'] * df['image_height'])}")

In [None]:
#@markdown  convert E621 JSON to parquet file

import json,os
import pandas as pd

# Path to the uploaded .jsonl file
file_path = '' #@param {type:'string'}

# Read the .jsonl file into a DataFrame
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            json_obj = json.loads(line.strip())
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON line: {e}")
            continue
df = pd.DataFrame(data)

# Define columns that likely contain prompts/image descriptions
description_columns = [
    'regular_summary',
    'individual_parts',
    'midjourney_style_summary',
    'deviantart_commission_request',
    'brief_summary'
]


def _has_content(value):
    """Return True when a cell holds a non-null, non-empty value.

    Containers and strings are tested by length, because ``pd.notnull`` returns
    an element-wise array for list-like input and that array cannot be used
    directly as a condition.
    """
    if isinstance(value, (str, list, tuple, dict)):
        return len(value) > 0
    return bool(pd.notnull(value)) and bool(value)


# Only use description columns that actually exist in this file, so a missing
# column does not abort the export with a KeyError.
present_columns = [col for col in description_columns if col in df.columns]

# Initialize a list to store all descriptions
all_descriptions = []

# Iterate through each row and collect non-empty descriptions
for index, row in df.iterrows():
    record_descriptions = [
        f"{col}: {row[col]}" for col in present_columns if _has_content(row[col])
    ]
    if record_descriptions:
        all_descriptions.append({
            'id': row['id'],
            'descriptions': '; '.join(record_descriptions)  # Join descriptions with semicolon
        })

# Convert to DataFrame for Parquet
output_df = pd.DataFrame(all_descriptions)

# Save to Parquet file
output_path = '' #@param {type:'string'}
output_df.to_parquet(output_path, index=False)
# Remove the source .jsonl now that the Parquet file has been written. The
# removal is skipped when both paths name the same file, which would otherwise
# delete the freshly written output.
if os.path.abspath(file_path) != os.path.abspath(output_path):
    os.remove(f'{file_path}')
print(f"\nDescriptions have been saved to '{output_path}'.")

In [None]:
#@markdown Build a dataset for training using a .jsonl file

num_dataset_items = 200 #@param {type:'slider', max:10000}

# Step 1: Install required libraries (if not already installed)
# !pip install datasets pandas pillow requests

# Step 2: Import required libraries
import json
import pandas as pd
from datasets import Dataset
from PIL import Image
import requests
from io import BytesIO
import math

# Step 3: Define the path to the JSONL file
file_path = '' #@param {type:'string'}

# Step 4: Read the JSONL file
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            json_obj = json.loads(line.strip())
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON line: {e}")
            continue

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 5: Randomly select 1.2x the requested number of rows to leave headroom
# for image downloads that fail. The size is capped at len(df) because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement). The fixed seed keeps the selection repeatable.
sample_size = min(len(df), math.floor(num_dataset_items * 1.2))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Step 6: Function to download, resize, and process images
def load_and_resize_image_from_url(url, max_size=(1024, 1024)):
    """Download an image, convert it to RGB and fit it inside ``max_size``.

    Args:
        url: Address of the image to fetch (10 second request timeout).
        max_size: ``(width, height)`` bound handed to ``Image.thumbnail``.

    Returns:
        The resized RGB image, or ``None`` when the download, decoding or
        resizing raised (the error is printed).
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()  # bad HTTP status codes are treated as failures
        picture = Image.open(BytesIO(reply.content)).convert('RGB')
        # Fit within max_size while maintaining aspect ratio
        picture.thumbnail(max_size, Image.Resampling.LANCZOS)
    except Exception as err:
        print(f"Error loading image from {url}: {err}")
        return None
    return picture

# Step 7: Create lists for images and captions
images = []
texts = []

for index, row in df_sample.iterrows():
    if len(images) >= num_dataset_items:  # Stop once we have enough valid images
        break
    url = row['url']
    # Combine description and tag_string for caption, ensuring no missing values
    description = row['description'] if pd.notnull(row['description']) else ''
    tag_string = row['tag_string'] if pd.notnull(row['tag_string']) else ''
    caption = f"{description}, {tag_string}".strip(', ')

    # Load and resize image
    img = load_and_resize_image_from_url(url)
    if img is not None:
        images.append(img)
        texts.append(caption)
    else:
        print(f"Skipping row {index} due to image loading failure.")

# Step 8: Check if we have enough images
# The loop stops as soon as num_dataset_items images are collected, so the
# lists can never be longer than requested and need no truncation.
if len(images) < num_dataset_items:
    print(f"Warning: Only {len(images)} images were successfully loaded.")

# Step 9: Create a Hugging Face Dataset
dataset = Dataset.from_dict({
    'image': images,
    'text': texts
})

# Step 10: Verify the dataset
print(dataset)

# Step 11: Example of accessing an image and text
if len(dataset) > 0:
    first_item = dataset[0]
    print("\nExample of accessing first item:")
    print("Text:", first_item['text'])
    print("Image type:", type(first_item['image']))
    print("Image size:", first_item['image'].size)
else:
    print("\nThe dataset is empty; there is no example item to show.")

# Optional: Save the dataset to disk (if needed)
dataset.save_to_disk('/kaggle/output/custom_dataset')

In [None]:
# Step 1: Mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

#@markdown paste .parquet file stored on your Google Drive folder to see its characteristics

# Step 2: Import required libraries
import pandas as pd

# Step 3: Define the path to the Parquet file
file_path = '' #@param {type:'string'}

# Step 4: Read the Parquet file
df = pd.read_parquet(file_path)

# Step 5: Basic exploration of the Parquet file
print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

print("\nBasic Statistics:")
print(df.describe())

print("\nColumn Names:")
print(df.columns.tolist())

print("\nMissing Values:")
print(df.isnull().sum())

# Optional: Display number of rows and columns
print(f"\nShape of the dataset: {df.shape}")

In [None]:
#@markdown Read contents of a .parquet file

# Import pandas
import pandas as pd

# Define the path to the Parquet file
file_path = '' #@param {type:'string'}

parquet_column = 'descriptions' #@param {type:'string'}
# Read the Parquet file
df = pd.read_parquet(file_path)

# Set pandas display options to show full text without truncation
pd.set_option('display.max_colwidth', None)  # Show full content of columns
pd.set_option('display.width', None)         # Use full display width

# Create sliders for selecting the range of captions
#@markdown Caption Range { run: "auto", display_mode: "form" }
start_at = 16814 #@param {type:"slider", min:0, max:33147, step:1}
# Deliberately NOT named `range`: that would shadow the built-in range() for
# every cell executed afterwards in the same kernel.
num_captions = 247 #@param {type:'slider',min:1,max:1000,step:1}
start_index = start_at
end_index = start_at + num_captions
###@param {type:"slider", min:1, max:33148, step:1}

include_either_words = '' #@param {type:'string', placeholder:'item1,item2...'}
#display_only = True #@param {type:'boolean'}

# Expand every requested word into its as-typed, lower-case and Title-Case
# variants so the substring filter below tolerates common capitalisations.
search_terms = []
for include_word in include_either_words.split(','):
  word = include_word.strip()
  if word == '': continue
  for variant in (word, word.lower(), word.title()):
    if variant not in search_terms:
      search_terms.append(variant)
#-----#
# Comma-joined form of the expanded terms, kept under its original name.
_include_either_words = ','.join(search_terms)


# Clamp the requested window to the DataFrame. The checks are independent
# `if`s so that fixing one bound never skips validation of the other.
if start_index < 0:
    print("Error: Start index cannot be negative. Setting to 0.")
    start_index = 0
if end_index > len(df):
    print(f"Error: End index cannot exceed {len(df)}. Setting to maximum value.")
    end_index = len(df)
if end_index <= start_index:
    # The positional slice below is simply empty in this case.
    print("Error: End index must be greater than start index.")

# Display the selected range of captions
tmp =''

# Section markers inside a caption that are put on their own line for readability
categories= ['regular_summary:',';midjourney_style_summary:', 'individual_parts:']

print(f"\nDisplaying captions from index {start_index} to {end_index-1}:")
for index, caption in df[f'{parquet_column}'].iloc[start_index:end_index].items():
  # Missing / non-text cells cannot be substring-searched; skip them.
  if not isinstance(caption, str):
    continue
  # With no filter words every caption is shown; otherwise show a caption
  # exactly once if it contains at least one of the search terms.
  if search_terms and not any(term in caption for term in search_terms):
    continue
  tmp= caption + '\n\n'
  for category in categories:
    tmp = tmp.replace(f'{category}',f'\n\n{category}\n')
  print(f'Index {index}: {tmp}')


In [None]:

#@markdown Build a dataset for training using a .parquet file

num_dataset_items = 200 #@param {type:'slider',max:1000}

# Folder name (under /content/drive/MyDrive/) that the finished dataset is
# saved to at the end of this cell.
output_name='dataset1'#@param {type:'string'}

# Step 1: Install required libraries (if not already installed)
# !pip install datasets pandas pillow requests

# Step 2: Import required libraries
import math
import pandas as pd
from datasets import Dataset
from PIL import Image
import requests
from io import BytesIO
import numpy as np

# Step 3: Define the path to the Parquet file
file_path = '/content/drive/MyDrive/dataset1.parquet' #@param {type:'string'}

# Step 4: Read the Parquet file
df = pd.read_parquet(file_path)

# Step 5: Randomly select 20% more rows than requested to account for
# potential image loading failures. The sample size is capped at the number
# of available rows because DataFrame.sample() raises when asked for more
# rows than exist (sampling without replacement).
sample_size = min(len(df), math.floor(num_dataset_items*1.2))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Step 6: Function to download, resize, and process images
def load_and_resize_image_from_url(url, max_size=(1024, 1024)):
    """Download an image and shrink it in place to fit inside ``max_size``.

    Args:
        url: Address of the image to fetch (10 second request timeout).
        max_size: ``(width, height)`` bounding box handed to ``thumbnail``.

    Returns:
        The RGB-converted, resized PIL image, or ``None`` if any step
        (request, HTTP status check, decoding, resizing) raised; the error
        is printed in that case.
    """
    try:
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into exceptions so they are reported below
        reply.raise_for_status()
        raw_bytes = BytesIO(reply.content)
        picture = Image.open(raw_bytes).convert('RGB')
        # thumbnail() keeps the aspect ratio while fitting within max_size
        picture.thumbnail(max_size, Image.Resampling.LANCZOS)
    except Exception as e:
        print(f"Error loading image from {url}: {e}")
        return None
    return picture

# Step 7: Create lists for images and captions
images = []
texts = []

for index, row in df_sample.iterrows():
    if len(images) >= num_dataset_items:  # Stop once enough valid images are collected
        break
    url = row['url']
    # A missing caption (None/NaN) would make string concatenation raise,
    # so treat it as empty text instead.
    original_caption = row['original_caption'] if pd.notnull(row['original_caption']) else ''
    vlm_caption = row['vlm_caption'] if pd.notnull(row['vlm_caption']) else ''
    # Drop the boilerplate lead-in from the VLM caption
    vlm_caption = vlm_caption.replace('This image displays:','').replace('This image displays','')
    # Join only the parts that are present so no dangling ', ' is produced
    caption = ', '.join(part for part in (original_caption, vlm_caption) if part)

    # Load and resize image
    img = load_and_resize_image_from_url(url)
    if img is not None:
        images.append(img)
        texts.append(caption)
    else:
        print(f"Skipping row {index} due to image loading failure.")

# Step 8: Check if we have enough images
if len(images) < num_dataset_items:
    print(f"Warning: Only {len(images)} images were successfully loaded.")
else:
    # Truncate to exactly num_dataset_items if we have more
    images = images[:num_dataset_items]
    texts = texts[:num_dataset_items]

# Step 9: Create a Hugging Face Dataset
dataset = Dataset.from_dict({
    'image': images,
    'text': texts
})

# Step 10: Verify the dataset
print(dataset)

# Step 11: Example of accessing an image and text.
# Index the row first (dataset[0]) so only that one image is decoded, and
# skip the preview when nothing was loaded (indexing would raise).
if len(dataset) > 0:
    first_item = dataset[0]
    print("\nExample of accessing first item:")
    print("Text:", first_item['text'])
    print("Image type:", type(first_item['image']))
    print("Image size:", first_item['image'].size)
else:
    print("\nDataset is empty; nothing to preview.")

#Optional: Save the dataset to disk (if needed)
dataset.save_to_disk(f'/content/drive/MyDrive/{output_name}')

In [None]:
#@markdown Save a dataset to Drive
dataset_name=''#@param {type:'string'}

# A blank (or whitespace-only) form field falls back to a default folder name;
# any other value is used exactly as typed.
dataset_name = dataset_name if dataset_name.strip() else 'my_dataset'

# Persist the current `dataset` into the chosen folder on Google Drive
dataset.save_to_disk(
    f'/content/drive/MyDrive/{dataset_name}'
)




In [None]:
#@markdown Display an image from the dataset
index = 85 #@param {type:'slider',max:200}
# Index the row first: dataset[index]['image'] decodes only this one image,
# whereas dataset['image'][index] would decode the entire image column.
dataset[index]['image']




In [None]:
#@markdown Display matching prompt text caption
# `index` is not assigned in this cell; it must already exist in the kernel
# (the image-display cell sets it), so both cells refer to the same row.
dataset['text'][index]

In [None]:
#@markdown Display an image from the dataset
index = 85 #@param {type:'slider',max:200}
# Index the row first: dataset[index]['image'] decodes only this one image,
# whereas dataset['image'][index] would decode the entire image column.
dataset[index]['image']




In [None]:
#@markdown Display matching prompt text caption
# `index` is not assigned in this cell; it must already exist in the kernel
# (the image-display cell sets it), so both cells refer to the same row.
dataset['text'][index]

🔄 Change to T4 Runtime: Past this point you can train a LoRA on the dataset, but you first need to change the runtime to T4.

See original file at: https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(4B)-Vision.ipynb

In [None]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@markdown Test the merged dataset

# Step 1: Install required libraries (if not already installed)
# !pip install datasets

# Step 2: Mount Google Drive (only needed in Google Colab)
#from google.colab import drive
#drive.mount('/content/drive')

# Step 3: Import required library
from datasets import load_from_disk

# Step 4: Define the path to the saved dataset on Google Drive
dataset_path = ''#@param {type:'string'}

# Step 5: Load the dataset. The error is printed for readability and then
# re-raised so the cell stops instead of continuing without a dataset.
try:
    dataset = load_from_disk(dataset_path)
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Step 6: Verify the dataset
print(dataset)

# Step 7: Example of accessing an image and text.
# Index the row first (dataset[0]) so only that one image is decoded, and
# skip the preview for an empty dataset (indexing would raise).
if len(dataset) > 0:
    first_item = dataset[0]
    print("\nExample of accessing first item:")
    print("Text:", first_item['text'])
    print("Image type:", type(first_item['image']))
    print("Image size:", first_item['image'].size)
else:
    print("\nDataset is empty; nothing to preview.")

In [None]:
#@markdown Display an image from the dataset
index = 85 #@param {type:'slider',max:200}
# Index the row first: dataset[index]['image'] decodes only this one image,
# whereas dataset['image'][index] would decode the entire image column.
dataset[index]['image']




In [None]:
#@markdown Display matching prompt text caption
# `index` is not assigned in this cell; it must already exist in the kernel
# (the image-display cell sets it), so both cells refer to the same row.
dataset['text'][index]

To format the dataset, all vision fine-tuning tasks should follow this format:

```python
[
    {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": answer},
        ],
    },
]
```

In [None]:
#@markdown Convert the merged dataset to the 'correct' format for training the Gemma LoRa model

instruction = "Describe this image." # <- Select the prompt for your use case here

def convert_to_conversation(sample):
    """Wrap one dataset row into a two-turn chat record.

    Args:
        sample: Mapping with an ``"image"`` entry and a ``"text"`` entry.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn
        carries the module-level ``instruction`` followed by the image, and
        the assistant turn carries the row's text as the expected answer.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }
    return {"messages": [user_turn, assistant_turn]}

In [None]:
# Materialise the chat-formatted version of every row as a plain Python list
converted_dataset = list(map(convert_to_conversation, dataset))

In [None]:
# Show the first converted record to check its structure
converted_dataset[0]

### Installation

In [None]:
%%capture
# Install Unsloth and its dependencies; the %%capture magic above suppresses
# the cell's (long) pip output.
import os
# Colab is detected by looking for "COLAB_" in the names of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Packages are installed with --no-deps, so pip does not pull in (or
    # change) any of their dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Load the base vision-language model and its processor through Unsloth.
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is reference material only; it is not passed to
# from_pretrained() below, which loads "unsloth/gemma-3-4b-pt".
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Returns the model together with its processor; both names are reused by
# the later cells of this notebook.
model, processor = FastVisionModel.from_pretrained(
    "unsloth/gemma-3-4b-pt",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
from unsloth import get_chat_template

# Rebind `processor` to the version configured with the "gemma-3" chat template
processor = get_chat_template(processor, "gemma-3")

We now add LoRA adapters for parameter efficient fine-tuning, allowing us to train only 1% of all model parameters efficiently.

**[NEW]** We also support fine-tuning only the vision component, only the language component, or both. Additionally, you can choose to fine-tune the attention modules, the MLP layers, or both!

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the result.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,                           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,                  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3408,
    use_rslora = False,               # We support rank stabilized LoRA
    loftq_config = None,               # And LoftQ
    target_modules = "all-linear",    # Optional now! Can specify a list if needed
    # Modules listed here are passed as `modules_to_save`
    # NOTE(review): presumably these are trained/saved in full rather than via LoRA — confirm against the PEFT docs
    modules_to_save=[
        "lm_head",
        "embed_tokens",
    ],
)

Before fine-tuning, let us evaluate the base model's performance. We do not expect strong results, as it has not encountered this chat template before.

In [None]:
from transformers import TextStreamer

FastVisionModel.for_inference(model)  # Enable for inference!

# Prompt and the dataset row used for this baseline check
instruction = "Describe this image."
image = dataset[2]["image"]

# Single user turn: an image placeholder followed by the text instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]

# Render the chat template to text, then build tensors on the GPU
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

# Stream the generated tokens to the cell output as they are produced
text_streamer = TextStreamer(processor, skip_prompt=True)
result = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.0,
    top_p=0.95,
    top_k=64,
)

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The configuration below trains for 5 epochs (`num_train_epochs = 5`); for a quick test run, uncomment `max_steps` (e.g. `max_steps = 30`) instead. We also support TRL's `DPOTrainer`!

We use our new `UnslothVisionDataCollator` which will help in our vision finetuning setup.

In [None]:
# Build the supervised fine-tuning trainer over the chat-formatted dataset.
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model=model,
    train_dataset=converted_dataset,
    processing_class=processor.tokenizer,
    data_collator=UnslothVisionDataCollator(model, processor),
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,  # 1 sample x 4 accumulation steps per optimizer update
        gradient_checkpointing = True,

        # use non-reentrant checkpointing (use_reentrant is set to False)
        gradient_checkpointing_kwargs = {"use_reentrant": False},
        max_grad_norm = 0.3,              # max gradient norm based on QLoRA paper
        warmup_ratio = 0.03,
        #max_steps = 30,
        num_train_epochs = 5,          # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        logging_steps = 1,
        save_strategy="steps",
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",             # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        max_length = 2048,
    )
)

In [None]:
# @title Show current memory stats
# Sizes are converted from bytes to GB (divide by 1024**3) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
# Baseline of reserved memory before training; the final-stats cell subtracts it.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training; the returned object is kept because the stats cell reads
# trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()


In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory in GB, and the share of it added since the
# baseline (`start_gpu_memory`) recorded before training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Training duration, in seconds and in minutes
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

# Memory summary
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can modify the instruction and input—just leave the output blank.

We'll use the best hyperparameters for inference on Gemma: `top_p=0.95`, `top_k=64`, and `temperature=1.0`.

In [None]:
from transformers import TextStreamer

FastVisionModel.for_inference(model)  # Enable for inference!

# Prompt and the dataset row to describe
instruction = "Describe this image."
image = dataset[10]["image"]

# Single user turn: an image placeholder followed by the text instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]

# Render the chat template to text, then build tensors on the GPU
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

# Stream the generated tokens to the cell output as they are produced
text_streamer = TextStreamer(processor, skip_prompt=True)
result = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.0,
    top_p=0.95,
    top_k=64,
)

In [None]:
# Describe an image uploaded by the user with the already-loaded model.
# Requires `model`, `processor` and `FastVisionModel` from earlier cells.
import io

import torch
from PIL import Image
from google.colab import files  # For file upload in Colab
from transformers import TextStreamer

FastVisionModel.for_inference(model)  # Enable for inference!

# Ask for a file through Colab's upload widget
print("Please upload an image file (e.g., .jpg, .png):")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file uploaded. Please upload an image.")

# Only the first uploaded file is used; decode it from memory as RGB
file_name = next(iter(uploaded))
try:
    image = Image.open(io.BytesIO(uploaded[file_name])).convert('RGB')
except Exception as e:
    raise ValueError(f"Error loading image: {e}")

# Single user turn: an image placeholder followed by the text instruction
instruction = "Describe this image."
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]

# Render the chat template to text, then build tensors on the GPU
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

# Generate up to 512 new tokens, streaming them to the cell output
text_streamer = TextStreamer(processor, skip_prompt=True)
result = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512,
                        use_cache=True, temperature=1.0, top_p=0.95, top_k=64)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, use Hugging Face’s `push_to_hub` for online saving, or `save_pretrained` for local storage.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Write the model (per the note above: the LoRA adapters only) and the
# processor into the local "lora_model" directory.
model.save_pretrained("lora_model")  # Local saving
processor.save_pretrained("lora_model")
# Uncomment to upload to the Hugging Face Hub instead (needs your repo name and token):
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# processor.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, change `False` to `True`:

In [None]:
# Optionally reload the saved LoRA adapters, then describe one dataset image.
if False:
    from unsloth import FastVisionModel

    model, processor = FastVisionModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=True,  # Set to False for 16bit LoRA
    )
    FastVisionModel.for_inference(model)  # Enable for inference!

FastVisionModel.for_inference(model)  # Enable for inference!

sample = dataset[1]
image = sample["image"].convert("RGB")
# Prompt with the instruction, NOT with sample["text"]: that field is the
# caption the model was trained to produce, so using it as the prompt would
# hand the model its own expected answer.
instruction = "Describe this image."
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": instruction,
            },
            {
                "type": "image",
            },
        ],
    },
]
# Render the chat template to text, then build tensors on the GPU
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

# Stream the generated tokens to the cell output as they are produced
text_streamer = TextStreamer(processor.tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache=True, temperature = 1.0, top_p = 0.95, top_k = 64)

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.