merve (HF Staff) committed 26e8458 (verified) · Parent(s): b1f181c

Upload gemma3n_fine_tuning_on_all_modalities.py

gemma3n_fine_tuning_on_all_modalities.py (added):
# -*- coding: utf-8 -*-
"""Gemma3n Fine-tuning on All Modalities.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1iEZUJuvKJpGU8t50BqfkiCQmGkaR6gd4

# Fine-tune Gemma3n on FineVideo

In this notebook, we will see how to fine-tune Gemma3n on videos that contain audio.
Using all three modalities is very costly compute-wise, so keep in mind that this is an educational tutorial that fits the model into 40GB of VRAM.
"""

!pip install -U -q timm transformers trl peft datasets

import io
import os
import zipfile

import torch
from datasets import load_dataset
from PIL import Image
from transformers import AutoProcessor, Gemma3nForConditionalGeneration

from trl import (
    SFTConfig,
    SFTTrainer,
)

"""## Download videos and preprocessing

FineVideo is quite a large dataset and we don't need a ton of examples, so we stream the dataset, check each sample's duration, and only download the videos shorter than 30 seconds.
"""

from datasets import load_dataset
import json
import os

dataset = load_dataset("HuggingFaceFV/finevideo", split="train", streaming=True)


os.makedirs("videos", exist_ok=True)
os.makedirs("metadata", exist_ok=True)

for idx, sample in enumerate(dataset):
    data = sample["json"]
    duration = data.get("duration_seconds", 0)
    if duration < 30:
        video_filename = f"videos/sample_{idx}.mp4"
        with open(video_filename, 'wb') as video_file:
            video_file.write(sample['mp4'])

        json_filename = f"metadata/sample_{idx}.json"
        with open(json_filename, 'w') as json_file:
            json.dump(sample['json'], json_file)

print(f"Number of items in content/videos: {len(os.listdir('videos'))}")

"""Some frames in FineVideo are dark, so we downsample each video to a handful of frames and remove the videos where we can't extract enough meaningful frames."""

import cv2
from PIL import Image
import numpy as np

def is_dark(frame, threshold=10):
    return np.max(frame) < threshold  # all pixels are very close to 0

def downsample_video(video_path):
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)

    frames = []

    # Generate 8 evenly spaced indices and drop the first and last (6 target frames)
    full_indices = np.linspace(0, total_frames - 1, 8, dtype=int)[1:-1]

    for i in full_indices:
        found_valid = False
        for offset in [0, -1, 1, -2, 2]:  # Try nearby frames if the original is dark
            candidate_idx = i + offset
            if 0 <= candidate_idx < total_frames:
                vidcap.set(cv2.CAP_PROP_POS_FRAMES, candidate_idx)
                success, image = vidcap.read()
                if success:
                    if not is_dark(image):
                        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                        pil_image = Image.fromarray(image)
                        timestamp = round(candidate_idx / fps, 2)
                        frames.append((pil_image, timestamp))
                        found_valid = True
                        break
        if not found_valid:
            print(f"Warning: Could not find non-dark frame near index {i}")

    # If we still have fewer than 6 frames, try to top off by scanning from the start
    if len(frames) < 6:
        print("Trying to top off with additional non-dark frames...")
        idx = 0
        while len(frames) < 8 and idx < total_frames:
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            success, image = vidcap.read()
            if success and not is_dark(image):
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(image)
                timestamp = round(idx / fps, 2)
                # Avoid adding duplicate timestamps
                if not any(ts == timestamp for _, ts in frames):
                    frames.append((pil_image, timestamp))
            idx += 1

    vidcap.release()  # release only after the top-off pass, which still reads from vidcap

    return frames[:8]  # Cap at 8 frames

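"""To see what the helper returns, we can run it on one of the downloaded clips (an optional check, not part of the original notebook); each entry is a (PIL image, timestamp in seconds) pair."""

sample_frames = downsample_video(os.path.join("videos", os.listdir("videos")[0]))
print([timestamp for _, timestamp in sample_frames])
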
import os
import glob

def remove_dark_videos(video_dir, metadata_dir, audio_dir):
    """
    Remove videos (and their metadata/audio files) if all frames are dark.
    """
    video_paths = glob.glob(os.path.join(video_dir, "*.mp4"))

    for video_path in video_paths:
        filename = os.path.basename(video_path)
        base_name = os.path.splitext(filename)[0]

        frames = downsample_video(video_path)
        if len(frames) < 6:
            try:
                os.remove(video_path)
                print(f"Deleted: {video_path}")
            except Exception as e:
                print(f"Failed to delete {video_path}: {e}")

            metadata_path = os.path.join(metadata_dir, f"{base_name}.json")
            if os.path.exists(metadata_path):
                os.remove(metadata_path)

            # Remove audio
            audio_path = os.path.join(audio_dir, f"{base_name}.wav")
            if os.path.exists(audio_path):
                os.remove(audio_path)

remove_dark_videos(
    video_dir="videos",
    metadata_dir="metadata",
    audio_dir="audios"
)

"""Gemma-3n accepts video (image frames) and audio separately, so we strip the audio track from each video."""

import os
import subprocess

video_dir = "videos"
audio_dir = "audios"
os.makedirs(audio_dir, exist_ok=True)

for filename in os.listdir(video_dir):
    if not filename.endswith(".mp4"):
        continue

    idx = filename.split("_")[1].split(".")[0]
    video_path = os.path.join(video_dir, filename)
    audio_path = os.path.join(audio_dir, f"sample_{idx}.wav")

    subprocess.run([
        "ffmpeg", "-i", video_path,
        "-q:a", "0", "-map", "a",
        audio_path,
        "-y"
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

"""Construct a new dataset with video, audio, and metadata (video categories). FineVideo is a very rich dataset: it also contains question-answer pairs, captions and more, so get creative if you have the GPU VRAM for it. Here we solve an easier task for educational purposes."""

from datasets import Dataset
import json

def gen():
    meta_dir = "metadata"
    for filename in os.listdir(meta_dir):
        if not filename.endswith(".json"):
            continue

        idx = filename.split("_")[1].split(".")[0]
        if os.path.exists(f"videos/sample_{idx}.mp4"):
            video_filename = f"sample_{idx}.mp4"
            audio_filename = f"sample_{idx}.wav"
            json_path = os.path.join(meta_dir, filename)

            with open(json_path, "r") as f:
                metadata = json.load(f)

            yield {
                "video": video_filename,
                "audio": audio_filename,
                "content_parent_category": metadata["content_parent_category"],
                "sample_index": int(idx)
            }

dataset = Dataset.from_generator(gen)

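"""Peeking at one row is a quick way to verify that the generator matched video, audio and category correctly (an optional check, not in the original notebook)."""

print(dataset[0])
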
"""We speed up, downsample and truncate the audios to save memory during training."""

import torchaudio
from torchaudio.transforms import Resample
import os
import torch

def preprocess_audio(audio_path, target_sample_rate=16000, max_duration_sec=5, speedup_factor=1.25):
    waveform, sample_rate = torchaudio.load(audio_path)

    # Convert to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the target sample rate
    if sample_rate != target_sample_rate:
        resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    # Naive speed-up by dropping samples
    if speedup_factor > 1.0:
        indices = torch.arange(0, waveform.shape[1], step=speedup_factor).long()
        if indices[-1] >= waveform.shape[1]:
            indices = indices[:-1]
        waveform = waveform[:, indices]

    # Truncate to the maximum duration
    max_length = int(target_sample_rate * max_duration_sec)
    if waveform.shape[1] > max_length:
        waveform = waveform[:, :max_length]

    torchaudio.save(audio_path, waveform, sample_rate)

for file_name in os.listdir("audios"):
    if file_name.lower().endswith(".wav"):
        audio_path = os.path.join("audios", file_name)
        preprocess_audio(audio_path)

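"""As a quick sanity check (a small sketch added here, not in the original notebook), we can confirm that a processed clip is mono, 16 kHz, and at most about 5 seconds long."""

example_wav = os.path.join("audios", os.listdir("audios")[0])
waveform_check, sr_check = torchaudio.load(example_wav)
print(waveform_check.shape, sr_check, round(waveform_check.shape[1] / sr_check, 2), "seconds")
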
dataset = dataset.train_test_split(test_size=0.10, seed=42)

"""### Load the model

Make sure you have your Hugging Face token in your Colab secrets.
"""

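"""If you are not running this in Colab with the token stored as a secret, you can log in programmatically instead (an optional step, not part of the original notebook); `login()` is the standard `huggingface_hub` helper and will prompt for a token."""

from huggingface_hub import login

login()  # needed to access the gated Gemma weights and to push to the Hub later
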
model = Gemma3nForConditionalGeneration.from_pretrained(
    "google/gemma-3n-E2B-it", torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(
    "google/gemma-3n-E2B-it",
)
processor.tokenizer.padding_side = "right"

# Inspect the special token ids; we will mask them out of the labels in the collator
processor.tokenizer.all_special_ids

"""Write our dataset collator. We will train the model to predict the category of a video, which is a relatively easy task. You can do much more: FineVideo also has a QnA section, so you could train this model for open-ended QnA if you have a lot of VRAM and patience. Open-ended tasks are harder to work with, and this notebook is meant as an educational example of feeding different modalities.

In the collator we also downsample each video to 6 frames with the helper we wrote above. For better results you need more frames.
"""

def collate_fn(examples):
    video_path = examples[0]["video"]
    audio_path = examples[0]["audio"]
    sample_idx = video_path.split("_")[1].split(".")[0]
    frames = downsample_video(f"videos/{video_path}")

    text = "Based on the video, predict the category of it."
    message = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": text}
            ],
        },
    ]
    # this is how video inference should be formatted in Gemma3n:
    # interleave "Frame <timestamp>:" texts with the corresponding frame images
    for frame in frames:
        image, timestamp = frame
        message[0]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
        timestamp = str(timestamp).replace(".", "_")
        image.save(f"image_idx_{sample_idx}_{timestamp}.png")
        message[0]["content"].append({"type": "image", "url": f"image_idx_{sample_idx}_{timestamp}.png"})

    message[0]["content"].append({"type": "audio", "audio": f"audios/{audio_path}"})
    message.append({"role": "assistant", "content": [{"type": "text", "text": examples[0]["content_parent_category"]}]})
    inputs = processor.apply_chat_template(
        message,
        add_generation_prompt=False,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # Mask special tokens (image/audio placeholders, padding, etc.) out of the labels
    labels = inputs["input_ids"].clone()
    special_token_ids = processor.tokenizer.all_special_ids

    special_token_ids_tensor = torch.tensor(special_token_ids, device=labels.device)
    mask = torch.isin(labels, special_token_ids_tensor)
    labels[mask] = -100

    inputs["labels"] = labels
    if torch.all(inputs["pixel_values"] == 0):
        print("Frames are dark")

    return inputs

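"""A quick way to sanity-check the collator (an optional sketch, not part of the original notebook) is to run it on a single training example and inspect the tensors it produces."""

sample_batch = collate_fn([dataset["train"][0]])
print({k: v.shape for k, v in sample_batch.items() if hasattr(v, "shape")})
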
"""## Training

We do LoRA fine-tuning to save memory.
"""

from peft import LoraConfig
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    target_modules="all-linear",
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_rslora=False,
    use_dora=False,
    modules_to_save=None
)

model.gradient_checkpointing_disable()

model.config.use_cache = False

training_args = SFTConfig(
    output_dir="/content/gemma-3n-finevideo",
    eval_strategy='epoch',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    learning_rate=1e-05,
    num_train_epochs=3.0,
    logging_steps=10,
    save_steps=100,
    bf16=True,
    report_to=["tensorboard"],
    dataset_kwargs={'skip_prepare_dataset': True},
    remove_unused_columns=False,
    max_seq_length=None,
    push_to_hub=True,
    dataloader_pin_memory=False,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"] if training_args.eval_strategy != "no" else None,
    processing_class=processor.tokenizer,
    peft_config=peft_config,
)

trainer.train()

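"""Because `push_to_hub=True` is set, the trainer pushes checkpoints to the Hub as it saves them; you can also save the LoRA adapter to a local directory explicitly (an optional step, sketched here with an arbitrary path)."""

trainer.save_model("/content/gemma-3n-finevideo-adapter")
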
"""Test the model with a video of snowboarding."""

!wget https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4

model = trainer.model  # the trainer's model carries the LoRA adapter

"""Strip the audio and downsample the video, just like we did for training."""

audio_path = "/content/test_audio.wav"
subprocess.run([
    "ffmpeg", "-i", "/content/IMG_8137.mp4",
    "-q:a", "0", "-map", "a",
    f"{audio_path}",
    "-y"
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

frames = downsample_video("/content/IMG_8137.mp4")

# repeat the chat template used during training
text = "Based on the video, predict the category of it."
message = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": text}
        ],
    },
]
for frame in frames:
    image, timestamp = frame
    message[0]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
    timestamp = str(timestamp).replace(".", "_")
    image.save(f"test_frame_{timestamp}.png")
    message[0]["content"].append({"type": "image", "url": f"test_frame_{timestamp}.png"})

message[0]["content"].append({"type": "audio", "audio": f"{audio_path}"})

message  # display the constructed message (notebook cell output)

inputs = processor.apply_chat_template(
    message,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    padding=True,
).to(model.device).to(model.dtype)

input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    generation = generation[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)

"""Thanks a lot for reading! Keep training the model further with more data or unfreeze the layers for better performance 💗"""