Upload folder using huggingface_hub
Browse files
.ipynb_checkpoints/image_embed-checkpoint.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import pickle
|
4 |
+
import torch
|
5 |
+
import numpy as np
|
6 |
+
from tqdm import tqdm
|
7 |
+
from transformers import AutoTokenizer, AutoModel
|
8 |
+
import torch.multiprocessing as mp
|
9 |
+
|
10 |
+
# Paths and settings
INPUT_JSON = "Pretrain.json"
mean_shift = True  # Enable full-pass mean shifting
CKPT = "/root/autodl-tmp/model/siglip2"
BATCH_SIZE = 512
LOAD_LIMIT = None  # Limit number of items, or None for all

# Output directories
RAW_DIR = "raw_embeds"
SHIFTED_DIR = "shifted_embeds"

# Create output directories if they don't exist
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(SHIFTED_DIR, exist_ok=True)

# 1. Load data (sliced below, so the JSON top level is expected to be a list)
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    items = json.load(f)
if LOAD_LIMIT is not None:
    items = items[:LOAD_LIMIT]

# 2. Initialize tokenizer (shared, as a module global, by the embedding workers)
tokenizer = AutoTokenizer.from_pretrained(CKPT)

# 3. Split data among GPUs
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
    # np.array_split rejects 0 sections with an opaque "number sections must
    # be larger than 0" ValueError; report the actual cause instead.
    raise RuntimeError(
        "No CUDA devices are visible; at least one GPU is required to compute embeddings."
    )
chunks = np.array_split(items, num_gpus)
|
37 |
+
|
38 |
+
# Function to compute raw embeddings (no shift)
def compute_raw_embeddings(device, data_chunk, gpu_id):
    """Embed the captions of ``data_chunk`` on one device and pickle the result.

    Loads the model from ``CKPT`` onto ``device``, tokenizes captions in
    batches of ``BATCH_SIZE`` with the module-level ``tokenizer``, calls
    ``model.get_text_features`` on each batch, and writes a list of
    ``{'id': ..., 'embed': ...}`` dicts to
    ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl``.

    Args:
        device: Device spec accepted by ``torch.device`` (e.g. ``"cuda:0"``).
        data_chunk: Sliceable sequence of dicts; each must have an ``'id'``
            key and may have a ``'caption'`` key.
        gpu_id: Integer worker index; used for the progress-bar label and
            row, and for the output file name.

    Raises:
        KeyError: If an item has no ``'id'`` key.
    """
    device = torch.device(device)
    model = AutoModel.from_pretrained(CKPT).to(device).eval()
    results = []  # One {'id', 'embed'} record per input item, in input order

    batch_starts = range(0, len(data_chunk), BATCH_SIZE)
    # Each worker draws its bar on its own row so that bars from concurrently
    # running workers do not overwrite one another.
    for start in tqdm(batch_starts, desc=f"Device {gpu_id} Raw Batches", position=gpu_id):
        batch = data_chunk[start:start + BATCH_SIZE]
        ids = [it['id'] for it in batch]
        # A missing *or null* caption is embedded as the empty string;
        # passing None through would make the tokenizer call fail.
        captions = [it.get('caption') or '' for it in batch]

        inputs = tokenizer(
            captions,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            embs = model.get_text_features(**inputs)
        embs_np = embs.cpu().numpy()

        # Iterating the batch array yields one row per caption.
        results.extend(
            {'id': item_id, 'embed': vec} for item_id, vec in zip(ids, embs_np)
        )

    # Save raw embeddings
    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
    with open(raw_file, 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Device {gpu_id} saved {len(results)} raw embeddings to {raw_file}")
|
69 |
+
|
70 |
+
# Function to apply mean shift and save final embeddings
def apply_mean_shift_and_save(global_mean, gpu_id):
    """Subtract ``global_mean`` from one worker's raw embeddings and save them.

    Reads ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl``, replaces every record's
    ``'embed'`` value with ``embed - global_mean``, and pickles the records to
    ``SHIFTED_DIR/embeds_device_{gpu_id}.pkl``.

    Args:
        global_mean: Value subtracted from every embedding.
        gpu_id: Worker index identifying the input and output file names.
    """
    file_suffix = f"device_{gpu_id}.pkl"
    src_path = os.path.join(RAW_DIR, "raw_embeds_" + file_suffix)
    dst_path = os.path.join(SHIFTED_DIR, "embeds_" + file_suffix)

    with open(src_path, 'rb') as src:
        records = pickle.load(src)

    # Rebind (rather than update in place) so each record gets a new array.
    for record in records:
        centered = record['embed'] - global_mean
        record['embed'] = centered

    with open(dst_path, 'wb') as dst:
        pickle.dump(records, dst, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Device {gpu_id} saved {len(records)} shifted embeddings to {dst_path}")
|
86 |
+
|
87 |
+
# Main entry
def _compute_global_mean(num_files, rows_per_step=65536):
    """Return the mean embedding over the first ``num_files`` raw files."""
    total = None
    count = 0
    embed_dtype = None
    for i in range(num_files):
        raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{i}.pkl")
        with open(raw_file, 'rb') as f:
            data = pickle.load(f)
        # Reduce one slice at a time so only a small stacked copy exists
        # alongside the loaded file, instead of a second copy of everything.
        for start in range(0, len(data), rows_per_step):
            block = np.stack(
                [item['embed'] for item in data[start:start + rows_per_step]], axis=0
            )
            if total is None:
                total = np.zeros(block.shape[1:], dtype=np.float64)
                embed_dtype = block.dtype
            # float64 accumulation: summing millions of float32 rows in
            # float32 loses precision.
            total += block.sum(axis=0, dtype=np.float64)
            count += block.shape[0]
        del data  # Release this file before loading the next one

    if count == 0:
        raise ValueError(f"No raw embeddings found in {RAW_DIR}; cannot compute a global mean.")

    mean = total / count
    # Keep the shifted embeddings in their original floating dtype.
    if np.issubdtype(embed_dtype, np.floating):
        mean = mean.astype(embed_dtype)
    return mean


def main():
    """Run the embedding pipeline: parallel raw pass, then optional mean shift.

    Starts one process per GPU running ``compute_raw_embeddings`` on its
    entry of ``chunks`` and waits for all of them. When ``mean_shift`` is
    true, computes the mean over every raw embedding file and calls
    ``apply_mean_shift_and_save`` for each worker index.

    Raises:
        RuntimeError: If any worker process exits with a non-zero exit code.
        ValueError: If ``mean_shift`` is enabled but no embeddings were written.
    """
    # 1st pass: compute raw embeddings in parallel
    procs = []
    for i in range(num_gpus):
        p = mp.Process(target=compute_raw_embeddings, args=(f"cuda:{i}", chunks[i], i))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()

    # A crashed worker leaves its raw file missing, or stale from an earlier
    # run; stop here rather than silently building on it.
    failed = [i for i, p in enumerate(procs) if p.exitcode != 0]
    if failed:
        raise RuntimeError(
            f"Raw embedding worker(s) for device(s) {failed} exited abnormally; aborting."
        )

    if mean_shift:
        global_mean = _compute_global_mean(num_gpus)
        print("Computed global mean of shape", global_mean.shape)

        # 2nd pass: subtract mean and save shifted embeddings
        for i in range(num_gpus):
            apply_mean_shift_and_save(global_mean, i)


if __name__ == "__main__":
    main()
|
image_embed.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import pickle
|
4 |
+
import torch
|
5 |
+
import numpy as np
|
6 |
+
from tqdm import tqdm
|
7 |
+
from transformers import AutoTokenizer, AutoModel
|
8 |
+
import torch.multiprocessing as mp
|
9 |
+
|
10 |
+
# Paths and settings
INPUT_JSON = "Pretrain.json"
mean_shift = True  # Enable full-pass mean shifting
CKPT = "/root/autodl-tmp/model/siglip2"
BATCH_SIZE = 512
LOAD_LIMIT = None  # Limit number of items, or None for all

# Output directories
RAW_DIR = "raw_embeds"
SHIFTED_DIR = "shifted_embeds"

# Create output directories if they don't exist
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(SHIFTED_DIR, exist_ok=True)

# 1. Load data (sliced below, so the JSON top level is expected to be a list)
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    items = json.load(f)
if LOAD_LIMIT is not None:
    items = items[:LOAD_LIMIT]

# 2. Initialize tokenizer (shared, as a module global, by the embedding workers)
tokenizer = AutoTokenizer.from_pretrained(CKPT)

# 3. Split data among GPUs
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
    # np.array_split rejects 0 sections with an opaque "number sections must
    # be larger than 0" ValueError; report the actual cause instead.
    raise RuntimeError(
        "No CUDA devices are visible; at least one GPU is required to compute embeddings."
    )
chunks = np.array_split(items, num_gpus)
|
37 |
+
|
38 |
+
# Function to compute raw embeddings (no shift)
def compute_raw_embeddings(device, data_chunk, gpu_id):
    """Embed the captions of ``data_chunk`` on one device and pickle the result.

    Loads the model from ``CKPT`` onto ``device``, tokenizes captions in
    batches of ``BATCH_SIZE`` with the module-level ``tokenizer``, calls
    ``model.get_text_features`` on each batch, and writes a list of
    ``{'id': ..., 'embed': ...}`` dicts to
    ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl``.

    Args:
        device: Device spec accepted by ``torch.device`` (e.g. ``"cuda:0"``).
        data_chunk: Sliceable sequence of dicts; each must have an ``'id'``
            key and may have a ``'caption'`` key.
        gpu_id: Integer worker index; used for the progress-bar label and
            row, and for the output file name.

    Raises:
        KeyError: If an item has no ``'id'`` key.
    """
    device = torch.device(device)
    model = AutoModel.from_pretrained(CKPT).to(device).eval()
    results = []  # One {'id', 'embed'} record per input item, in input order

    batch_starts = range(0, len(data_chunk), BATCH_SIZE)
    # Each worker draws its bar on its own row so that bars from concurrently
    # running workers do not overwrite one another.
    for start in tqdm(batch_starts, desc=f"Device {gpu_id} Raw Batches", position=gpu_id):
        batch = data_chunk[start:start + BATCH_SIZE]
        ids = [it['id'] for it in batch]
        # A missing *or null* caption is embedded as the empty string;
        # passing None through would make the tokenizer call fail.
        captions = [it.get('caption') or '' for it in batch]

        inputs = tokenizer(
            captions,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            embs = model.get_text_features(**inputs)
        embs_np = embs.cpu().numpy()

        # Iterating the batch array yields one row per caption.
        results.extend(
            {'id': item_id, 'embed': vec} for item_id, vec in zip(ids, embs_np)
        )

    # Save raw embeddings
    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
    with open(raw_file, 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Device {gpu_id} saved {len(results)} raw embeddings to {raw_file}")
|
69 |
+
|
70 |
+
# Function to apply mean shift and save final embeddings
def apply_mean_shift_and_save(global_mean, gpu_id):
    """Subtract ``global_mean`` from one worker's raw embeddings and save them.

    Reads ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl``, replaces every record's
    ``'embed'`` value with ``embed - global_mean``, and pickles the records to
    ``SHIFTED_DIR/embeds_device_{gpu_id}.pkl``.

    Args:
        global_mean: Value subtracted from every embedding.
        gpu_id: Worker index identifying the input and output file names.
    """
    file_suffix = f"device_{gpu_id}.pkl"
    src_path = os.path.join(RAW_DIR, "raw_embeds_" + file_suffix)
    dst_path = os.path.join(SHIFTED_DIR, "embeds_" + file_suffix)

    with open(src_path, 'rb') as src:
        records = pickle.load(src)

    # Rebind (rather than update in place) so each record gets a new array.
    for record in records:
        centered = record['embed'] - global_mean
        record['embed'] = centered

    with open(dst_path, 'wb') as dst:
        pickle.dump(records, dst, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Device {gpu_id} saved {len(records)} shifted embeddings to {dst_path}")
|
86 |
+
|
87 |
+
# Main entry
def _compute_global_mean(num_files, rows_per_step=65536):
    """Return the mean embedding over the first ``num_files`` raw files."""
    total = None
    count = 0
    embed_dtype = None
    for i in range(num_files):
        raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{i}.pkl")
        with open(raw_file, 'rb') as f:
            data = pickle.load(f)
        # Reduce one slice at a time so only a small stacked copy exists
        # alongside the loaded file, instead of a second copy of everything.
        for start in range(0, len(data), rows_per_step):
            block = np.stack(
                [item['embed'] for item in data[start:start + rows_per_step]], axis=0
            )
            if total is None:
                total = np.zeros(block.shape[1:], dtype=np.float64)
                embed_dtype = block.dtype
            # float64 accumulation: summing millions of float32 rows in
            # float32 loses precision.
            total += block.sum(axis=0, dtype=np.float64)
            count += block.shape[0]
        del data  # Release this file before loading the next one

    if count == 0:
        raise ValueError(f"No raw embeddings found in {RAW_DIR}; cannot compute a global mean.")

    mean = total / count
    # Keep the shifted embeddings in their original floating dtype.
    if np.issubdtype(embed_dtype, np.floating):
        mean = mean.astype(embed_dtype)
    return mean


def main():
    """Run the embedding pipeline: parallel raw pass, then optional mean shift.

    Starts one process per GPU running ``compute_raw_embeddings`` on its
    entry of ``chunks`` and waits for all of them. When ``mean_shift`` is
    true, computes the mean over every raw embedding file and calls
    ``apply_mean_shift_and_save`` for each worker index.

    Raises:
        RuntimeError: If any worker process exits with a non-zero exit code.
        ValueError: If ``mean_shift`` is enabled but no embeddings were written.
    """
    # 1st pass: compute raw embeddings in parallel
    procs = []
    for i in range(num_gpus):
        p = mp.Process(target=compute_raw_embeddings, args=(f"cuda:{i}", chunks[i], i))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()

    # A crashed worker leaves its raw file missing, or stale from an earlier
    # run; stop here rather than silently building on it.
    failed = [i for i, p in enumerate(procs) if p.exitcode != 0]
    if failed:
        raise RuntimeError(
            f"Raw embedding worker(s) for device(s) {failed} exited abnormally; aborting."
        )

    if mean_shift:
        global_mean = _compute_global_mean(num_gpus)
        print("Computed global mean of shape", global_mean.shape)

        # 2nd pass: subtract mean and save shifted embeddings
        for i in range(num_gpus):
            apply_mean_shift_and_save(global_mean, i)


if __name__ == "__main__":
    main()
|
raw_embeds/raw_embeds_device_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63d187ce40096e63b74d154149a57618a542d0fafa074035f4767ae047bb2012
|
3 |
+
size 29844542625
|
raw_embeds/raw_embeds_device_1.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2335055511cde0b646f3ab10072951ac283e0b6a09768a14b2c30d33410e8c19
|
3 |
+
size 29844673953
|
shifted_embeds/embeds_device_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:abd270a46450d7339737bd0b9655a758026e2b2b5d2413f1ab75519a7956ce8a
|
3 |
+
size 29844542625
|
shifted_embeds/embeds_device_1.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dee0e5d96e3b26c42ef33b0ecaa024399dcc18a481e085e80e0f3bc8b69d3fe6
|
3 |
+
size 29844673953
|