XiN0919 committed on
Commit
cd77de9
·
verified ·
1 Parent(s): 3896721

Upload folder using huggingface_hub

Browse files
.ipynb_checkpoints/image_embed-checkpoint.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pickle
4
+ import torch
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ from transformers import AutoTokenizer, AutoModel
8
+ import torch.multiprocessing as mp
9
+
10
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
INPUT_JSON = "Pretrain.json"  # JSON file holding the list of records to embed
mean_shift = True  # when True, main() also writes mean-subtracted embeddings
CKPT = "/root/autodl-tmp/model/siglip2"  # checkpoint for tokenizer and model
BATCH_SIZE = 512  # number of captions encoded per forward pass
LOAD_LIMIT = None  # keep only the first LOAD_LIMIT records; None keeps all

# ---------------------------------------------------------------------------
# Output locations (created at import time if missing)
# ---------------------------------------------------------------------------
RAW_DIR = "raw_embeds"
SHIFTED_DIR = "shifted_embeds"

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(SHIFTED_DIR, exist_ok=True)

# ---------------------------------------------------------------------------
# Input records
# ---------------------------------------------------------------------------
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    items = json.load(f)
items = items if LOAD_LIMIT is None else items[:LOAD_LIMIT]

# Tokenizer is created once at module level and used by the worker function.
tokenizer = AutoTokenizer.from_pretrained(CKPT)

# One chunk of records per visible CUDA device.
# NOTE(review): np.array_split raises ValueError when num_gpus is 0.
num_gpus = torch.cuda.device_count()
chunks = np.array_split(items, num_gpus)
37
+
38
+ # Function to compute raw embeddings (no shift)
39
def compute_raw_embeddings(device, data_chunk, gpu_id):
    """Embed the captions of ``data_chunk`` on one device and pickle them.

    Args:
        device: Device spec accepted by ``torch.device`` (``main`` passes
            strings such as ``"cuda:0"``).
        data_chunk: Sliceable sequence of dict-like records. Each record must
            have an ``'id'`` key; a missing ``'caption'`` key is treated as
            the empty string.
        gpu_id: Index used in the progress-bar label and output file name.

    Side effects:
        Writes ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl`` containing a list of
        ``{'id': ..., 'embed': <numpy row>}`` dicts in input order, and prints
        a one-line summary.
    """
    target = torch.device(device)
    model = AutoModel.from_pretrained(CKPT)
    model = model.to(target).eval()
    results = []

    batch_starts = range(0, len(data_chunk), BATCH_SIZE)
    for start in tqdm(batch_starts, desc=f"Device {gpu_id} Raw Batches"):
        batch = data_chunk[start:start + BATCH_SIZE]
        ids = [record['id'] for record in batch]
        captions = [record.get('caption', '') for record in batch]

        # Every caption is padded/truncated to exactly 64 tokens.
        encoded = tokenizer(
            captions,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt",
        )
        encoded = encoded.to(target)

        with torch.no_grad():
            features = model.get_text_features(**encoded)
        features_np = features.cpu().numpy()

        # Pair each id with its corresponding row of the batch output.
        results.extend(
            {'id': item_id, 'embed': row}
            for item_id, row in zip(ids, features_np)
        )

    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
    with open(raw_file, 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Device {gpu_id} saved {len(results)} raw embeddings to {raw_file}")
69
+
70
+ # Function to apply mean shift and save final embeddings
71
def apply_mean_shift_and_save(global_mean, gpu_id):
    """Subtract ``global_mean`` from one device's raw embeddings and save them.

    Args:
        global_mean: Value subtracted from every record's ``'embed'`` entry
            (``main`` passes the per-dimension mean over all raw embeddings).
        gpu_id: Index selecting which raw file to read and which shifted file
            to write.

    Side effects:
        Reads ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl``, writes
        ``SHIFTED_DIR/embeds_device_{gpu_id}.pkl`` and prints a one-line
        summary. The raw file itself is left unchanged.
    """
    src_path = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
    dst_path = os.path.join(SHIFTED_DIR, f"embeds_device_{gpu_id}.pkl")

    with open(src_path, 'rb') as src:
        records = pickle.load(src)

    # Centre each embedding; the 'id' entries are carried over untouched.
    for record in records:
        record['embed'] = record['embed'] - global_mean

    with open(dst_path, 'wb') as dst:
        pickle.dump(records, dst, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Device {gpu_id} saved {len(records)} shifted embeddings to {dst_path}")
86
+
87
+ # Main entry
88
def main():
    """Run the embedding pipeline across all visible GPUs.

    Pass 1 starts one worker process per GPU; each worker writes its raw
    embeddings to ``RAW_DIR``. When ``mean_shift`` is enabled, the raw files
    are then reloaded to compute the global per-dimension mean, and pass 2
    writes mean-subtracted copies to ``SHIFTED_DIR``.

    Raises:
        RuntimeError: If any worker process exits with a non-zero exit code.
            Without this check a crashed worker would either surface later as
            a missing file or silently reuse a stale pickle from an earlier
            run.
    """
    # 1st pass: compute raw embeddings in parallel, one process per device.
    procs = []
    for i in range(num_gpus):
        p = mp.Process(
            target=compute_raw_embeddings,
            args=(f"cuda:{i}", chunks[i], i),
        )
        p.start()
        procs.append(p)
    for p in procs:
        p.join()

    # Verify every worker finished cleanly before trusting the files on disk.
    failed = [(i, p.exitcode) for i, p in enumerate(procs) if p.exitcode != 0]
    if failed:
        details = ", ".join(f"device {i} (exit code {code})" for i, code in failed)
        raise RuntimeError(f"Embedding worker(s) failed: {details}")

    if not mean_shift:
        return

    # Load all raw embeddings to compute the global mean.
    all_embeds = []
    for i in range(num_gpus):
        raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{i}.pkl")
        with open(raw_file, 'rb') as f:
            data = pickle.load(f)
        all_embeds.extend(item['embed'] for item in data)

    if not all_embeds:
        # np.stack would raise on an empty list; there is nothing to shift.
        print("No embeddings were produced; skipping mean shift")
        return

    all_embeds = np.stack(all_embeds, axis=0)
    global_mean = np.mean(all_embeds, axis=0)
    print("Computed global mean of shape", global_mean.shape)

    # 2nd pass: subtract the mean and save shifted embeddings per device.
    for i in range(num_gpus):
        apply_mean_shift_and_save(global_mean, i)


if __name__ == "__main__":
    main()
image_embed.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pickle
4
+ import torch
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ from transformers import AutoTokenizer, AutoModel
8
+ import torch.multiprocessing as mp
9
+
10
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
INPUT_JSON = "Pretrain.json"  # JSON file holding the list of records to embed
mean_shift = True  # when True, main() also writes mean-subtracted embeddings
CKPT = "/root/autodl-tmp/model/siglip2"  # checkpoint for tokenizer and model
BATCH_SIZE = 512  # number of captions encoded per forward pass
LOAD_LIMIT = None  # keep only the first LOAD_LIMIT records; None keeps all

# ---------------------------------------------------------------------------
# Output locations (created at import time if missing)
# ---------------------------------------------------------------------------
RAW_DIR = "raw_embeds"
SHIFTED_DIR = "shifted_embeds"

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(SHIFTED_DIR, exist_ok=True)

# ---------------------------------------------------------------------------
# Input records
# ---------------------------------------------------------------------------
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    items = json.load(f)
items = items if LOAD_LIMIT is None else items[:LOAD_LIMIT]

# Tokenizer is created once at module level and used by the worker function.
tokenizer = AutoTokenizer.from_pretrained(CKPT)

# One chunk of records per visible CUDA device.
# NOTE(review): np.array_split raises ValueError when num_gpus is 0.
num_gpus = torch.cuda.device_count()
chunks = np.array_split(items, num_gpus)
37
+
38
+ # Function to compute raw embeddings (no shift)
39
def compute_raw_embeddings(device, data_chunk, gpu_id):
    """Embed the captions of ``data_chunk`` on one device and pickle them.

    Args:
        device: Device spec accepted by ``torch.device`` (``main`` passes
            strings such as ``"cuda:0"``).
        data_chunk: Sliceable sequence of dict-like records. Each record must
            have an ``'id'`` key; a missing ``'caption'`` key is treated as
            the empty string.
        gpu_id: Index used in the progress-bar label and output file name.

    Side effects:
        Writes ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl`` containing a list of
        ``{'id': ..., 'embed': <numpy row>}`` dicts in input order, and prints
        a one-line summary.
    """
    target = torch.device(device)
    model = AutoModel.from_pretrained(CKPT)
    model = model.to(target).eval()
    results = []

    batch_starts = range(0, len(data_chunk), BATCH_SIZE)
    for start in tqdm(batch_starts, desc=f"Device {gpu_id} Raw Batches"):
        batch = data_chunk[start:start + BATCH_SIZE]
        ids = [record['id'] for record in batch]
        captions = [record.get('caption', '') for record in batch]

        # Every caption is padded/truncated to exactly 64 tokens.
        encoded = tokenizer(
            captions,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt",
        )
        encoded = encoded.to(target)

        with torch.no_grad():
            features = model.get_text_features(**encoded)
        features_np = features.cpu().numpy()

        # Pair each id with its corresponding row of the batch output.
        results.extend(
            {'id': item_id, 'embed': row}
            for item_id, row in zip(ids, features_np)
        )

    raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
    with open(raw_file, 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Device {gpu_id} saved {len(results)} raw embeddings to {raw_file}")
69
+
70
+ # Function to apply mean shift and save final embeddings
71
def apply_mean_shift_and_save(global_mean, gpu_id):
    """Subtract ``global_mean`` from one device's raw embeddings and save them.

    Args:
        global_mean: Value subtracted from every record's ``'embed'`` entry
            (``main`` passes the per-dimension mean over all raw embeddings).
        gpu_id: Index selecting which raw file to read and which shifted file
            to write.

    Side effects:
        Reads ``RAW_DIR/raw_embeds_device_{gpu_id}.pkl``, writes
        ``SHIFTED_DIR/embeds_device_{gpu_id}.pkl`` and prints a one-line
        summary. The raw file itself is left unchanged.
    """
    src_path = os.path.join(RAW_DIR, f"raw_embeds_device_{gpu_id}.pkl")
    dst_path = os.path.join(SHIFTED_DIR, f"embeds_device_{gpu_id}.pkl")

    with open(src_path, 'rb') as src:
        records = pickle.load(src)

    # Centre each embedding; the 'id' entries are carried over untouched.
    for record in records:
        record['embed'] = record['embed'] - global_mean

    with open(dst_path, 'wb') as dst:
        pickle.dump(records, dst, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Device {gpu_id} saved {len(records)} shifted embeddings to {dst_path}")
86
+
87
+ # Main entry
88
def main():
    """Run the embedding pipeline across all visible GPUs.

    Pass 1 starts one worker process per GPU; each worker writes its raw
    embeddings to ``RAW_DIR``. When ``mean_shift`` is enabled, the raw files
    are then reloaded to compute the global per-dimension mean, and pass 2
    writes mean-subtracted copies to ``SHIFTED_DIR``.

    Raises:
        RuntimeError: If any worker process exits with a non-zero exit code.
            Without this check a crashed worker would either surface later as
            a missing file or silently reuse a stale pickle from an earlier
            run.
    """
    # 1st pass: compute raw embeddings in parallel, one process per device.
    procs = []
    for i in range(num_gpus):
        p = mp.Process(
            target=compute_raw_embeddings,
            args=(f"cuda:{i}", chunks[i], i),
        )
        p.start()
        procs.append(p)
    for p in procs:
        p.join()

    # Verify every worker finished cleanly before trusting the files on disk.
    failed = [(i, p.exitcode) for i, p in enumerate(procs) if p.exitcode != 0]
    if failed:
        details = ", ".join(f"device {i} (exit code {code})" for i, code in failed)
        raise RuntimeError(f"Embedding worker(s) failed: {details}")

    if not mean_shift:
        return

    # Load all raw embeddings to compute the global mean.
    all_embeds = []
    for i in range(num_gpus):
        raw_file = os.path.join(RAW_DIR, f"raw_embeds_device_{i}.pkl")
        with open(raw_file, 'rb') as f:
            data = pickle.load(f)
        all_embeds.extend(item['embed'] for item in data)

    if not all_embeds:
        # np.stack would raise on an empty list; there is nothing to shift.
        print("No embeddings were produced; skipping mean shift")
        return

    all_embeds = np.stack(all_embeds, axis=0)
    global_mean = np.mean(all_embeds, axis=0)
    print("Computed global mean of shape", global_mean.shape)

    # 2nd pass: subtract the mean and save shifted embeddings per device.
    for i in range(num_gpus):
        apply_mean_shift_and_save(global_mean, i)


if __name__ == "__main__":
    main()
raw_embeds/raw_embeds_device_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63d187ce40096e63b74d154149a57618a542d0fafa074035f4767ae047bb2012
3
+ size 29844542625
raw_embeds/raw_embeds_device_1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2335055511cde0b646f3ab10072951ac283e0b6a09768a14b2c30d33410e8c19
3
+ size 29844673953
shifted_embeds/embeds_device_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abd270a46450d7339737bd0b9655a758026e2b2b5d2413f1ab75519a7956ce8a
3
+ size 29844542625
shifted_embeds/embeds_device_1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dee0e5d96e3b26c42ef33b0ecaa024399dcc18a481e085e80e0f3bc8b69d3fe6
3
+ size 29844673953