Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

.ipynb_checkpoints/image_embed-checkpoint.py +116 -0
image_embed.py +116 -0
raw_embeds/raw_embeds_device_0.pkl +3 -0
raw_embeds/raw_embeds_device_1.pkl +3 -0
shifted_embeds/embeds_device_0.pkl +3 -0
shifted_embeds/embeds_device_1.pkl +3 -0

.ipynb_checkpoints/image_embed-checkpoint.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import os
+import json
+import pickle
+import torch
+import numpy as np
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel
+import torch.multiprocessing as mp
+# Paths and settings
+INPUT_JSON = "Pretrain.json"
+mean_shift = True  # Enable full-pass mean shifting
+CKPT = "/root/autodl-tmp/model/siglip2"
+BATCH_SIZE = 512
+LOAD_LIMIT = None  # Limit number of items, or None for all
+# Output directories
+RAW_DIR = "raw_embeds"
+SHIFTED_DIR = "shifted_embeds"
+# Create output directories if they don't exist
+os.makedirs(RAW_DIR, exist_ok=True)
+os.makedirs(SHIFTED_DIR, exist_ok=True)
+# 1. Load data
+with open(INPUT_JSON, "r", encoding="utf-8") as f:
+    items = json.load(f)
+if LOAD_LIMIT is not None:
+    items = items[:LOAD_LIMIT]
+# 2. Initialize tokenizer
+tokenizer = AutoTokenizer.from_pretrained(CKPT)
+# 3. Split data among GPUs
+num_gpus = torch.cuda.device_count()
+chunks = np.array_split(items, num_gpus)
+# Function to compute raw embeddings (no shift)
+def compute_raw_embeddings(device, data_chunk, gpu_id):
+    device = torch.device(device)
+    model = AutoModel.from_pretrained(CKPT).to(device).eval()
+    results = []  # To store raw embeddings
+    for i in tqdm(range(0, len(data_chunk), BATCH_SIZE), desc=f"Device {gpu_id} Raw Batches"):
+        batch = data_chunk[i:i + BATCH_SIZE]
+        ids = [it['id'] for it in batch]
+        captions = [it.get('caption', '') for it in batch]
+        inputs = tokenizer(
+            captions,
+            padding="max_length",
+            truncation=True,
+            max_length=64,
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            embs = model.get_text_features(**inputs)
+        embs_np = embs.cpu().numpy()
+        for idx, item_id in enumerate(ids):
+            results.append({'id': item_id, 'embed': embs_np[idx]})
+    # Save raw embeddings
+    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
+    with open(raw_file, 'wb') as f:
+        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
+    print(f"Device {gpu_id} saved {len(results)} raw embeddings to {raw_file}")
+# Function to apply mean shift and save final embeddings
+def apply_mean_shift_and_save(global_mean, gpu_id):
+    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
+    out_file = os.path.join(SHIFTED_DIR, f"embeds_device_{gpu_id}.pkl")
+    with open(raw_file, 'rb') as f:
+        data = pickle.load(f)
+    # Subtract global mean
+    for item in data:
+        item['embed'] = item['embed'] - global_mean
+    # Save shifted embeddings
+    with open(out_file, 'wb') as f:
+        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+    print(f"Device {gpu_id} saved {len(data)} shifted embeddings to {out_file}")
+# Main entry
+def main():
+    # 1st pass: compute raw embeddings in parallel
+    procs = []
+    for i in range(num_gpus):
+        p = mp.Process(target=compute_raw_embeddings, args=(f"cuda:{i}", chunks[i], i))
+        p.start()
+        procs.append(p)
+    for p in procs:
+        p.join()
+    if mean_shift:
+        # Load all raw embeddings to compute global mean
+        all_embeds = []
+        for i in range(num_gpus):
+            raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{i}.pkl")
+            with open(raw_file, 'rb') as f:
+                data = pickle.load(f)
+            all_embeds.extend([item['embed'] for item in data])
+        all_embeds = np.stack(all_embeds, axis=0)
+        global_mean = np.mean(all_embeds, axis=0)
+        print("Computed global mean of shape", global_mean.shape)
+        # 2nd pass: subtract mean and save shifted embeddings
+        for i in range(num_gpus):
+            apply_mean_shift_and_save(global_mean, i)
+if __name__ == "__main__":
+    main()

image_embed.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import os
+import json
+import pickle
+import torch
+import numpy as np
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel
+import torch.multiprocessing as mp
+# Paths and settings
+INPUT_JSON = "Pretrain.json"
+mean_shift = True  # Enable full-pass mean shifting
+CKPT = "/root/autodl-tmp/model/siglip2"
+BATCH_SIZE = 512
+LOAD_LIMIT = None  # Limit number of items, or None for all
+# Output directories
+RAW_DIR = "raw_embeds"
+SHIFTED_DIR = "shifted_embeds"
+# Create output directories if they don't exist
+os.makedirs(RAW_DIR, exist_ok=True)
+os.makedirs(SHIFTED_DIR, exist_ok=True)
+# 1. Load data
+with open(INPUT_JSON, "r", encoding="utf-8") as f:
+    items = json.load(f)
+if LOAD_LIMIT is not None:
+    items = items[:LOAD_LIMIT]
+# 2. Initialize tokenizer
+tokenizer = AutoTokenizer.from_pretrained(CKPT)
+# 3. Split data among GPUs
+num_gpus = torch.cuda.device_count()
+chunks = np.array_split(items, num_gpus)
+# Function to compute raw embeddings (no shift)
+def compute_raw_embeddings(device, data_chunk, gpu_id):
+    device = torch.device(device)
+    model = AutoModel.from_pretrained(CKPT).to(device).eval()
+    results = []  # To store raw embeddings
+    for i in tqdm(range(0, len(data_chunk), BATCH_SIZE), desc=f"Device {gpu_id} Raw Batches"):
+        batch = data_chunk[i:i + BATCH_SIZE]
+        ids = [it['id'] for it in batch]
+        captions = [it.get('caption', '') for it in batch]
+        inputs = tokenizer(
+            captions,
+            padding="max_length",
+            truncation=True,
+            max_length=64,
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            embs = model.get_text_features(**inputs)
+        embs_np = embs.cpu().numpy()
+        for idx, item_id in enumerate(ids):
+            results.append({'id': item_id, 'embed': embs_np[idx]})
+    # Save raw embeddings
+    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
+    with open(raw_file, 'wb') as f:
+        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
+    print(f"Device {gpu_id} saved {len(results)} raw embeddings to {raw_file}")
+# Function to apply mean shift and save final embeddings
+def apply_mean_shift_and_save(global_mean, gpu_id):
+    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
+    out_file = os.path.join(SHIFTED_DIR, f"embeds_device_{gpu_id}.pkl")
+    with open(raw_file, 'rb') as f:
+        data = pickle.load(f)
+    # Subtract global mean
+    for item in data:
+        item['embed'] = item['embed'] - global_mean
+    # Save shifted embeddings
+    with open(out_file, 'wb') as f:
+        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+    print(f"Device {gpu_id} saved {len(data)} shifted embeddings to {out_file}")
+# Main entry
+def main():
+    # 1st pass: compute raw embeddings in parallel
+    procs = []
+    for i in range(num_gpus):
+        p = mp.Process(target=compute_raw_embeddings, args=(f"cuda:{i}", chunks[i], i))
+        p.start()
+        procs.append(p)
+    for p in procs:
+        p.join()
+    if mean_shift:
+        # Load all raw embeddings to compute global mean
+        all_embeds = []
+        for i in range(num_gpus):
+            raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{i}.pkl")
+            with open(raw_file, 'rb') as f:
+                data = pickle.load(f)
+            all_embeds.extend([item['embed'] for item in data])
+        all_embeds = np.stack(all_embeds, axis=0)
+        global_mean = np.mean(all_embeds, axis=0)
+        print("Computed global mean of shape", global_mean.shape)
+        # 2nd pass: subtract mean and save shifted embeddings
+        for i in range(num_gpus):
+            apply_mean_shift_and_save(global_mean, i)
+if __name__ == "__main__":
+    main()

raw_embeds/raw_embeds_device_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63d187ce40096e63b74d154149a57618a542d0fafa074035f4767ae047bb2012
+size 29844542625

raw_embeds/raw_embeds_device_1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2335055511cde0b646f3ab10072951ac283e0b6a09768a14b2c30d33410e8c19
+size 29844673953

shifted_embeds/embeds_device_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abd270a46450d7339737bd0b9655a758026e2b2b5d2413f1ab75519a7956ce8a
+size 29844542625

shifted_embeds/embeds_device_1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dee0e5d96e3b26c42ef33b0ecaa024399dcc18a481e085e80e0f3bc8b69d3fe6
+size 29844673953