ankitkushwaha90 committed on
Commit 6bfa0c9 · verified · 1 Parent(s): c3d203f

Create Large_Action_Model_Transformer.py

Files changed (1)
  1. Large_Action_Model_Transformer.py +329 -0
Large_Action_Model_Transformer.py ADDED
@@ -0,0 +1,329 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+ from torch.utils.data import Dataset, DataLoader
+ import numpy as np
+
+ class PositionalEncoding(nn.Module):
+     """
+     Sinusoidal positional encoding for batch-first Transformer inputs
+     """
+     def __init__(self, d_model, dropout=0.1, max_len=5000):
+         super().__init__()
+         self.dropout = nn.Dropout(p=dropout)
+
+         position = torch.arange(max_len).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+         pe = torch.zeros(1, max_len, d_model)
+         pe[0, :, 0::2] = torch.sin(position * div_term)
+         pe[0, :, 1::2] = torch.cos(position * div_term)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x):
+         """
+         Args:
+             x: Tensor, shape [batch_size, seq_len, embedding_dim]
+         """
+         # Add the encoding along the sequence dimension (dim 1), not the batch dimension
+         x = x + self.pe[:, :x.size(1)]
+         return self.dropout(x)
+
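+ # Note: each position/dimension pair follows the standard sinusoidal scheme,
+ # PE(pos, 2i) = sin(pos / 10000^(2i / d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model)),
+ # so offsets between positions can be expressed as linear functions of the encodings.
+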
+ class MultiHeadAttention(nn.Module):
+     """
+     Multi-head attention mechanism
+     """
+     def __init__(self, d_model, num_heads, dropout=0.1):
+         super().__init__()
+         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.d_k = d_model // num_heads
+
+         self.w_q = nn.Linear(d_model, d_model)
+         self.w_k = nn.Linear(d_model, d_model)
+         self.w_v = nn.Linear(d_model, d_model)
+         self.w_o = nn.Linear(d_model, d_model)
+
+         self.dropout = nn.Dropout(dropout)
+         # Plain Python float; avoids depending on a global `device` at construction time
+         self.scale = math.sqrt(self.d_k)
+
+     def forward(self, q, k, v, mask=None):
+         batch_size = q.size(0)
+
+         # Linear projections, then split into heads: [batch, heads, seq_len, d_k]
+         q = self.w_q(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
+         k = self.w_k(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
+         v = self.w_v(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
+
+         # Scaled dot-product attention
+         attn = torch.matmul(q, k.transpose(-2, -1)) / self.scale
+
+         if mask is not None:
+             attn = attn.masked_fill(mask == 0, -1e10)
+
+         attn = F.softmax(attn, dim=-1)
+         attn = self.dropout(attn)
+
+         # Concatenate heads and project back to d_model
+         output = torch.matmul(attn, v)
+         output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
+         output = self.w_o(output)
+
+         return output
+
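+ # Mask convention used throughout this file: masks are boolean tensors broadcastable to the
+ # attention scores [batch, heads, query_len, key_len]; positions where the mask is 0/False are
+ # blocked (filled with -1e10 before the softmax), positions that are 1/True may be attended.
+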
+ class PositionwiseFeedforward(nn.Module):
+     """
+     Position-wise feedforward network
+     """
+     def __init__(self, d_model, d_ff, dropout=0.1):
+         super().__init__()
+         self.fc1 = nn.Linear(d_model, d_ff)
+         self.fc2 = nn.Linear(d_ff, d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         x = F.relu(self.fc1(x))
+         x = self.dropout(x)
+         x = self.fc2(x)
+         return x
+
+ class EncoderLayer(nn.Module):
+     """
+     Single encoder layer
+     """
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super().__init__()
+         self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
+         self.ffn = PositionwiseFeedforward(d_model, d_ff, dropout)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         # Self attention
+         attn_output = self.self_attn(x, x, x, mask)
+         x = x + self.dropout1(attn_output)
+         x = self.norm1(x)
+
+         # Feedforward
+         ff_output = self.ffn(x)
+         x = x + self.dropout2(ff_output)
+         x = self.norm2(x)
+
+         return x
+
+ class DecoderLayer(nn.Module):
+     """
+     Single decoder layer
+     """
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super().__init__()
+         self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
+         self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
+         self.ffn = PositionwiseFeedforward(d_model, d_ff, dropout)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.norm3 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+         self.dropout3 = nn.Dropout(dropout)
+
+     def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
+         # Self attention
+         attn_output = self.self_attn(x, x, x, tgt_mask)
+         x = x + self.dropout1(attn_output)
+         x = self.norm1(x)
+
+         # Cross attention
+         attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
+         x = x + self.dropout2(attn_output)
+         x = self.norm2(x)
+
+         # Feedforward
+         ff_output = self.ffn(x)
+         x = x + self.dropout3(ff_output)
+         x = self.norm3(x)
+
+         return x
+
+ class Transformer(nn.Module):
+     """
+     Complete encoder-decoder Transformer model
+     """
+     def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8,
+                  num_layers=6, d_ff=2048, dropout=0.1, max_len=5000):
+         super().__init__()
+         self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
+         self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
+         self.pos_encoding = PositionalEncoding(d_model, dropout, max_len)
+
+         self.encoder_layers = nn.ModuleList([
+             EncoderLayer(d_model, num_heads, d_ff, dropout)
+             for _ in range(num_layers)
+         ])
+
+         self.decoder_layers = nn.ModuleList([
+             DecoderLayer(d_model, num_heads, d_ff, dropout)
+             for _ in range(num_layers)
+         ])
+
+         self.fc_out = nn.Linear(d_model, tgt_vocab_size)
+         self.dropout = nn.Dropout(dropout)
+
+     def encode(self, src, src_mask=None):
+         enc_output = self.dropout(self.pos_encoding(self.encoder_embedding(src)))
+         for layer in self.encoder_layers:
+             enc_output = layer(enc_output, src_mask)
+         return enc_output
+
+     def decode(self, tgt, enc_output, src_mask=None, tgt_mask=None):
+         dec_output = self.dropout(self.pos_encoding(self.decoder_embedding(tgt)))
+         for layer in self.decoder_layers:
+             dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)
+         return dec_output
+
+     def forward(self, src, tgt, src_mask=None, tgt_mask=None):
+         # encode/decode are split out so generate_code() can reuse them at inference time
+         enc_output = self.encode(src, src_mask)
+         dec_output = self.decode(tgt, enc_output, src_mask, tgt_mask)
+         return self.fc_out(dec_output)
+
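+ # Expected shapes, for orientation (assuming batch-first token-id inputs):
+ #   src: [batch, src_len]    tgt: [batch, tgt_len]
+ #   model(src, tgt, src_mask, tgt_mask) -> [batch, tgt_len, tgt_vocab_size] unnormalised logits
+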
+ class CodeDataset(Dataset):
+     """
+     Dataset for code token sequences. For this self-contained demo the target is
+     simply a copy of the source sequence, so each item is a (src, tgt) pair.
+     """
+     def __init__(self, sequences, max_len):
+         self.sequences = sequences
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.sequences)
+
+     def __getitem__(self, idx):
+         seq = self.sequences[idx]
+         # Pad (or truncate) sequences to max_len
+         padded = np.zeros(self.max_len, dtype=np.int64)
+         length = min(len(seq), self.max_len)
+         padded[:length] = seq[:length]
+         tokens = torch.tensor(padded, dtype=torch.long)
+         return tokens, tokens
+
+ def create_masks(src, tgt, pad_idx):
+     """
+     Create a padding mask for the source and a combined padding + causal
+     ("no peek ahead") mask for the target
+     """
+     # [batch, 1, 1, src_len]: 1 where the token is not padding
+     src_mask = (src != pad_idx).unsqueeze(1).unsqueeze(2)
+
+     # [batch, 1, tgt_len, tgt_len]: padding mask combined with a lower-triangular causal mask
+     tgt_mask = (tgt != pad_idx).unsqueeze(1).unsqueeze(2)
+     seq_len = tgt.size(1)
+     nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len, device=tgt.device), diagonal=1)).bool()
+     tgt_mask = tgt_mask & nopeak_mask
+
+     return src_mask, tgt_mask
+
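+ # Illustrative example: for a batch of 2 sequences of length 5, create_masks returns
+ # src_mask of shape [2, 1, 1, 5] and tgt_mask of shape [2, 1, 5, 5], where
+ # tgt_mask[b, 0, i, j] is True only if j <= i and tgt[b, j] is not padding.
+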
+ def train_model(model, dataloader, optimizer, criterion, epochs, pad_idx):
+     """
+     Training loop for the transformer model (teacher forcing)
+     """
+     model.train()
+
+     for epoch in range(epochs):
+         total_loss = 0
+         for src, tgt in dataloader:
+             src, tgt = src.to(device), tgt.to(device)
+
+             # Create masks
+             src_mask, tgt_mask = create_masks(src, tgt, pad_idx)
+
+             # Forward pass: decoder input is tgt shifted right; keep all four mask dimensions
+             optimizer.zero_grad()
+             output = model(src, tgt[:, :-1], src_mask, tgt_mask[:, :, :-1, :-1])
+
+             # Calculate loss against the target shifted left
+             output_dim = output.shape[-1]
+             output = output.contiguous().view(-1, output_dim)
+             tgt = tgt[:, 1:].contiguous().view(-1)
+
+             loss = criterion(output, tgt)
+
+             # Backward pass
+             loss.backward()
+             optimizer.step()
+
+             total_loss += loss.item()
+
+         print(f'Epoch: {epoch+1}, Loss: {total_loss / len(dataloader):.4f}')
+
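+ # Note on the shift above: with teacher forcing the decoder reads tgt[:, :-1] and is trained to
+ # predict tgt[:, 1:], i.e. each position learns to emit the next token, while the causal part of
+ # tgt_mask prevents position i from attending to positions greater than i.
+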
+ def generate_code(model, src, max_len, start_symbol, end_symbol, pad_idx):
+     """
+     Greedily generate a code token sequence with the trained model
+     """
+     model.eval()
+     src = src.to(device)
+     src_mask = (src != pad_idx).unsqueeze(1).unsqueeze(2).to(device)
+
+     with torch.no_grad():
+         memory = model.encode(src, src_mask)
+         ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)
+
+         for _ in range(max_len - 1):
+             # Causal mask over the tokens generated so far (same convention as create_masks)
+             seq_len = ys.size(1)
+             tgt_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len, device=device), diagonal=1)).bool()
+
+             out = model.decode(ys, memory, src_mask, tgt_mask)
+             prob = model.fc_out(out[:, -1])
+             next_word = torch.argmax(prob, dim=1).item()
+
+             ys = torch.cat([ys, torch.full((1, 1), next_word, dtype=torch.long, device=device)], dim=1)
+
+             if next_word == end_symbol:
+                 break
+
+     return ys
+
+ # Configuration
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ print(f"Using device: {device}")
+
+ # Hyperparameters
+ VOCAB_SIZE = 10000  # Should be adjusted to your actual vocabulary size
+ D_MODEL = 512
+ NUM_HEADS = 8
+ NUM_LAYERS = 6
+ D_FF = 2048
+ DROPOUT = 0.1
+ BATCH_SIZE = 32
+ EPOCHS = 10
+ MAX_LEN = 100
+ LEARNING_RATE = 0.0001
+ PAD_IDX = 0  # Assuming 0 is the padding index
+
+ # Sample data - in practice you would load your code dataset here.
+ # For demonstration, we create random token sequences.
+ sample_data = [np.random.randint(1, VOCAB_SIZE, size=np.random.randint(10, MAX_LEN)) for _ in range(1000)]
+ dataset = CodeDataset(sample_data, MAX_LEN)
+ dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
+
+ # Initialize model
+ model = Transformer(
+     src_vocab_size=VOCAB_SIZE,
+     tgt_vocab_size=VOCAB_SIZE,
+     d_model=D_MODEL,
+     num_heads=NUM_HEADS,
+     num_layers=NUM_LAYERS,
+     d_ff=D_FF,
+     dropout=DROPOUT,
+     max_len=MAX_LEN
+ ).to(device)
+
+ # Loss and optimizer
+ criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
+ optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
+
+ # Train the model
+ train_model(model, dataloader, optimizer, criterion, EPOCHS, PAD_IDX)
+
+ # Example of generating code
+ start_symbol = 1  # Assuming 1 is the start token
+ end_symbol = 2  # Assuming 2 is the end token
+ sample_input = torch.tensor(sample_data[0][:10], dtype=torch.long).unsqueeze(0)  # First 10 tokens of the first sample
+ generated_code = generate_code(model, sample_input, MAX_LEN, start_symbol, end_symbol, PAD_IDX)
+ print("Generated code sequence:", generated_code)