AjayP13 committed · Commit 5ff0fd8 · verified · 1 Parent(s): 2d2c449

Upload pooling_coverage.py

Files changed (1): pooling_coverage.py (+160, -0)
pooling_coverage.py ADDED
@@ -0,0 +1,160 @@
import torch


class GaussianCoveragePooling(torch.nn.Module):
    def __init__(self, coverage_chunks, sigma, alpha):
        """
        Custom pooling layer that computes weighted mean pooling using Gaussian-based weights.

        Args:
            coverage_chunks (int): Number of weighted pooling operations (N).
            sigma (float): Standard deviation for Gaussian weighting.
            alpha (float): Weighting factor for merging with standard mean pooling.
        """
        super().__init__()
        self.coverage_chunks = coverage_chunks
        self.sigma = sigma  # Controls the width of the Gaussians
        self.alpha = alpha  # Blends the standard mean with the weighted means

    def forward(self, features, chunk_indicators=None):
        """
        Computes weighted mean pooling using Gaussian-based weights.

        Args:
            features (dict): The token embeddings and attention mask.
            chunk_indicators (tensor[bz]): Index indicators to return a specific chunk;
                leave as None to return embeddings for all chunks. Mainly useful for
                training, not inference. Leave as None for inference.
        """
        # Get token embeddings and attention mask
        token_embeddings = features["token_embeddings"]  # (batch_size, seq_len, hidden_dim)
        attention_mask = features["attention_mask"].float().unsqueeze(-1)  # (batch_size, seq_len, 1)

        # Get shapes and device
        batch_size, seq_len, hidden_dim = token_embeddings.shape
        device = token_embeddings.device

        # Compute actual sequence lengths, ignoring padding  # (batch_size, 1)
        seq_lengths = attention_mask.squeeze(-1).sum(dim=1, keepdim=True)
        max_seq_length = int(torch.max(seq_lengths).item())

        # Standard mean pooling
        sum_embeddings = torch.sum(token_embeddings * attention_mask, dim=1)
        sum_mask = torch.sum(attention_mask, dim=1).clamp(min=1e-9)
        standard_mean = sum_embeddings / sum_mask  # (batch_size, hidden_dim)

        # Compute chunk centers dynamically based on sequence length, excluding 0 and 1
        chunk_positions = torch.linspace(0, 1, self.coverage_chunks + 2, device=device)[1:-1]
        chunk_centers = chunk_positions * seq_lengths  # (batch_size, N)

        # Token positions per sequence  # (1, seq_len)
        token_positions = torch.arange(seq_len, device=device).float().unsqueeze(0)

        # Compute Gaussian weights  # (batch_size, N, seq_len)
        # (assumes the batch is padded to its longest sequence, so seq_len == max_seq_length)
        seq_lengths = seq_lengths.view(seq_lengths.shape[0], 1, 1).repeat(
            1, self.coverage_chunks, max_seq_length
        )
        gaussians = torch.exp(
            -0.5
            * (
                (token_positions.unsqueeze(1) - chunk_centers.unsqueeze(2))
                / (self.sigma * seq_lengths)
            )
            ** 2
        )

        # Mask out padding  # (batch_size, N, seq_len)
        gaussians = gaussians * attention_mask.squeeze(-1).unsqueeze(1)

        # Normalize the Gaussian weights per sequence
        gaussians /= gaussians.sum(dim=2, keepdim=True).clamp(min=1e-9)

        # Compute the weighted mean for each chunk  # (batch_size, N, hidden_dim)
        weighted_means = torch.einsum(
            "bns,bsh->bnh", gaussians.to(token_embeddings.dtype), token_embeddings
        )

        # Blend with standard mean pooling  # (batch_size, N, hidden_dim)
        combined_embeddings = (
            1 - self.alpha
        ) * standard_mean.unsqueeze(1) + self.alpha * weighted_means

        # Add an embedding for the entire document at index 0  # (batch_size, N+1, hidden_dim)
        combined_embeddings = torch.cat(
            [torch.zeros_like(combined_embeddings[:, :1]), combined_embeddings], 1
        )
        combined_embeddings[:, 0:1, :] = standard_mean.unsqueeze(1)

        # Select a specific chunk per example if indicators are provided
        if chunk_indicators is not None:
            combined_embeddings = combined_embeddings[
                torch.arange(combined_embeddings.size(0)), chunk_indicators
            ]

        # L2-normalize all the embeddings
        combined_embeddings = torch.nn.functional.normalize(
            combined_embeddings, p=2, dim=-1
        )

        # Flatten the final embeddings  # (batch_size, hidden_dim * (N+1))
        if chunk_indicators is None:
            sentence_embedding = combined_embeddings.reshape(
                batch_size, hidden_dim * (self.coverage_chunks + 1)
            )
        else:
            sentence_embedding = combined_embeddings

        # Return the final flattened sentence embedding
        features["sentence_embedding"] = sentence_embedding
        return features


def use_gaussian_coverage_pooling(m, coverage_chunks=10, sigma=0.05, alpha=1.0):
    """
    Adds a custom pooling layer that computes weighted mean pooling using Gaussian-based weights.

    Args:
        m (SentenceTransformer): The model to add the pooling layer to.
        coverage_chunks (int): Number of weighted pooling operations (N).
        sigma (float): Standard deviation for Gaussian weighting.
        alpha (float): Weighting factor for merging with standard mean pooling.
    """
    if isinstance(m[1], GaussianCoveragePooling):
        m = unuse_gaussian_coverage_pooling(m)
    word_embedding_model = m[0]
    custom_pooling = GaussianCoveragePooling(
        coverage_chunks=coverage_chunks, sigma=sigma, alpha=alpha
    )
    old_pooling = m[1]
    new_m = m.__class__(modules=[word_embedding_model, custom_pooling])
    new_m.old_pooling = {"old_pooling": old_pooling}
    return new_m


def unuse_gaussian_coverage_pooling(m):
    """
    Removes the custom pooling layer.

    Args:
        m (SentenceTransformer): The model to remove the pooling layer from.
    """
    if isinstance(m[1], GaussianCoveragePooling):
        new_m = m.__class__(modules=[m[0], m.old_pooling["old_pooling"]])
        return new_m
    else:
        return m
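For context, a minimal usage sketch (not part of the uploaded file): it assumes the sentence-transformers package and a standard two-module checkpoint; the checkpoint name and example text are illustrative choices only.

from sentence_transformers import SentenceTransformer

from pooling_coverage import unuse_gaussian_coverage_pooling, use_gaussian_coverage_pooling

# Load a base model; the checkpoint name here is an arbitrary illustrative choice.
base_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
hidden_dim = base_model.get_sentence_embedding_dimension()

# Swap in the Gaussian coverage pooling: module 0 stays the transformer, module 1 becomes
# GaussianCoveragePooling (the original pooling is stashed on `old_pooling`).
model = use_gaussian_coverage_pooling(base_model, coverage_chunks=10, sigma=0.05, alpha=1.0)

# Each output is the whole-document mean at index 0 followed by 10 chunk-focused means,
# flattened to hidden_dim * (coverage_chunks + 1) values.
embeddings = model.encode(
    ["A long document whose beginning, middle, and end cover different topics."]
)
assert embeddings.shape[-1] == hidden_dim * (10 + 1)

# Restore the original pooling layer when chunk coverage is no longer needed.
model = unuse_gaussian_coverage_pooling(model)

With the default alpha=1.0, each chunk slot is purely its Gaussian-weighted mean; index 0 always holds the plain mean-pooled document embedding regardless of alpha.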