FlameF0X committed on
Commit
6c3d1f5
·
verified ·
1 Parent(s): 80599d6

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +219 -2
README.md CHANGED
@@ -5,5 +5,222 @@ pipeline_tag: text-generation
5
 
6
  I barely remember anything about this Muffin version, but it's okay. It has 5.8M parameters, and it's an LSTM.
7
 
8
- datasets:
9
- - A book, i dont remember.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  I barely remember anything about this Muffin version, but it's okay. It has 5.8M parameters, and it's an LSTM.
7
 
8
+ datasets: A book, I don't remember which.
9
+
10
+ The code is here:
11
+ ```python
12
+ ################################################################
13
+ # Muffin V5.7l -- VERSION 5 large (code name: Elizabeth) #
14
+ # Now more BIG (5.8M) #
15
+ ################################################################
16
+
17
+ import os
18
+ import random
19
+ from typing import List
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.optim as optim
24
+ from torch.utils.data import DataLoader, Dataset
25
+
26
+
27
class CorpusDataset(Dataset):
    """Sliding-window dataset over a sequence of token indices.

    Item ``i`` is a pair of tensors: the window ``data[i:i + seq_length]``
    and the same window shifted one position to the right, so every input
    position is paired with the token that follows it.
    """

    def __init__(self, data: List[int], seq_length: int):
        """Store the token indices and the window length.

        Args:
            data: Token indices of the whole corpus, in reading order.
            seq_length: Number of tokens in each input/target window.
        """
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        """Return the number of complete input/target windows.

        The value is clamped at zero: a corpus that is not longer than
        ``seq_length`` contains no complete window, and ``len()`` raises
        ``ValueError`` when ``__len__`` returns a negative number.
        """
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair for window ``index``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``.
                Without this check slicing would silently yield truncated
                or empty tensors, and plain iteration over the dataset
                would never terminate.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"window index {index} is out of range")
        window_end = index + self.seq_length
        input_seq = self.data[index:window_end]
        # The target is the input shifted by one token (next-word labels).
        target_seq = self.data[index + 1:window_end + 1]
        return torch.tensor(input_seq), torch.tensor(target_seq)
39
+
40
+
41
class TextGeneratorNN(nn.Module):
    """Language model made of an embedding, a stacked LSTM and a linear head.

    The linear head maps every LSTM output vector to one score per
    vocabulary entry.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, num_layers: int):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct tokens (embedding rows and
                output scores).
            embedding_dim: Size of each token embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        # Attribute names double as state-dict key prefixes, so they must
        # stay stable for saved checkpoints to keep loading.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Score the next token for every position of ``x``.

        Args:
            x: Tensor of token indices; the LSTM is batch-first, so the
                batch dimension comes first.
            hidden: Optional LSTM state from a previous call; ``None``
                lets the LSTM start from its default initial state.

        Returns:
            A ``(scores, hidden)`` tuple: the per-position vocabulary
            scores and the updated LSTM state.
        """
        embedded = self.embedding(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        return self.fc(lstm_out), next_hidden
53
+
54
+
55
class TextGenerator:
    """Word-level LSTM text generator built from one plain-text corpus.

    The corpus is split on whitespace, every distinct word becomes a
    vocabulary entry, and a ``TextGeneratorNN`` is trained to predict the
    next word. Model weights, optimizer state and the vocabulary mappings
    are stored together in ``self.model_path``.
    """

    def __init__(
        self,
        corpus_path: str,
        seq_length: int = 20,
        embedding_dim: int = 128,
        hidden_dim: int = 256,
        num_layers: int = 2,
    ) -> None:
        """Load the corpus, build the model and restore a checkpoint if any.

        Args:
            corpus_path: Path of the UTF-8 text file used as corpus.
            seq_length: Number of words per training window.
            embedding_dim: Size of the word embedding vectors.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        self.seq_length = seq_length
        self.corpus = self.load_corpus(corpus_path)
        self.words = self.split_words(self.corpus)
        # Sorted so the word -> index mapping is identical on every run.
        # The iteration order of a set of strings changes between
        # interpreter sessions because string hashing is randomised.
        self.vocab = sorted(set(self.words))
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

        self.model = TextGeneratorNN(len(self.vocab), embedding_dim, hidden_dim, num_layers)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.loss_fn = nn.CrossEntropyLoss()

        # Directory for saving/loading model
        self.model_path = 'Models/V5/model-main.pth'
        self.training_dir = 'Models/V5'
        os.makedirs(self.training_dir, exist_ok=True)

        if os.path.exists(self.model_path):
            print("Loading saved model from:", self.model_path)
            # load_model() re-encodes the corpus with the restored mapping.
            self.load_model()
        else:
            print("No saved model found. Training from scratch.")
            self._build_dataloader()

    def _build_dataloader(self) -> None:
        """Encode the corpus with the current mapping and wrap it for training.

        Raises:
            ValueError: If a corpus word is missing from ``word_to_idx``,
                which means the mapping (for example one restored from a
                checkpoint) does not belong to this corpus.
        """
        try:
            corpus_indices = [self.word_to_idx[word] for word in self.words]
        except KeyError as err:
            raise ValueError(
                f"Corpus word {err.args[0]!r} is not in the vocabulary; "
                "the saved model was created from a different corpus."
            ) from err
        self.dataset = CorpusDataset(corpus_indices, self.seq_length)
        self.dataloader = DataLoader(self.dataset, batch_size=64, shuffle=True)

    def load_corpus(self, file_path: str) -> str:
        """Return the full text of the UTF-8 file at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def split_words(self, input_text: str) -> List[str]:
        """Split ``input_text`` on runs of whitespace.

        Punctuation stays attached to the neighbouring word.
        """
        return input_text.split()

    def train(self, epochs: int = 10) -> None:
        """Train the model for ``epochs`` passes and save a checkpoint.

        Prints the mean batch loss after every epoch.

        Raises:
            ValueError: If the corpus has no complete training window
                (it must be longer than ``seq_length`` words). Without
                this check the mean loss would divide by zero batches.
        """
        if len(self.words) <= self.seq_length:
            raise ValueError("Corpus is too short for the configured sequence length.")

        self.model.train()
        vocab_size = len(self.vocab)
        batch_count = len(self.dataloader)
        for epoch in range(epochs):
            total_loss = 0.0
            for input_seq, target_seq in self.dataloader:
                input_seq, target_seq = input_seq.long(), target_seq.long()
                self.optimizer.zero_grad()

                output, _ = self.model(input_seq)
                # Flatten batch and time so every position is one sample.
                loss = self.loss_fn(output.view(-1, vocab_size), target_seq.view(-1))
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / batch_count}")

        # Save the model after training
        print("Saving trained model to:", self.model_path)
        self.save_model()

    def _sample_next_word(self, input_seq, hidden, temperature: float):
        """Run one model step and sample the index of the next word.

        Returns:
            A ``(word_index, hidden)`` tuple with the sampled vocabulary
            index and the updated LSTM state.
        """
        with torch.no_grad():
            output, hidden = self.model(input_seq, hidden)
            # Scores of the last position of the single batch entry.
            # Indexing (rather than squeeze()) keeps the result 1-D even
            # when the vocabulary has only one word.
            probabilities = torch.softmax(output[0, -1, :] / temperature, dim=-1)
        return torch.multinomial(probabilities, 1).item(), hidden

    def generate(self, start_words: str, length: int, temperature: float, max_extra_words: int = 1000) -> str:
        """Generate text that continues ``start_words``.

        ``length`` words are sampled first. Sampling then goes on until
        the last word ends with '.', '!' or '?', so the text normally
        stops at a sentence end.

        Args:
            start_words: Whitespace-separated seed words; every word must
                be in the vocabulary.
            length: Number of words sampled before looking for sentence-
                ending punctuation.
            temperature: Divisor applied to the scores before softmax;
                smaller values make sampling less random.
            max_extra_words: Upper bound on the words sampled while
                waiting for punctuation. It keeps the call from running
                forever when the model never produces a sentence end; if
                the bound is hit the text may end without punctuation.

        Returns:
            The seed words followed by the generated words, joined by
            single spaces.

        Raises:
            ValueError: If ``temperature`` is not positive or
                ``start_words`` contains no word.
            KeyError: If a seed word is not in the vocabulary.
        """
        if temperature <= 0:
            raise ValueError("temperature must be greater than zero.")
        current_words = start_words.split()
        if not current_words:
            raise ValueError("start_words must contain at least one word.")

        self.model.eval()
        input_seq = torch.tensor([self.word_to_idx[word] for word in current_words]).unsqueeze(0)

        hidden = None
        result = current_words[:]

        def append_next_word(step_input, state):
            # After the seed, only the newest word is fed back; the LSTM
            # state carries the earlier context.
            word_idx, state = self._sample_next_word(step_input, state, temperature)
            result.append(self.idx_to_word[word_idx])
            return torch.tensor([word_idx]).unsqueeze(0), state

        for _ in range(length):
            input_seq, hidden = append_next_word(input_seq, hidden)

        # Continue generating until we hit punctuation after reaching the length limit
        extra_words = 0
        while extra_words < max_extra_words and not self.ends_with_punctuation(result[-1]):
            input_seq, hidden = append_next_word(input_seq, hidden)
            extra_words += 1

        return ' '.join(result)

    @staticmethod
    def ends_with_punctuation(word: str) -> bool:
        """Return True if ``word`` ends with '.', '!' or '?'.

        An empty string returns False.
        """
        return word.endswith(('.', '!', '?'))

    def get_random_starting_words(self, word_count: int = 2) -> str:
        """Return ``word_count`` consecutive corpus words from a random position.

        Raises:
            ValueError: If ``word_count`` is smaller than one or larger
                than the number of words in the corpus.
        """
        if word_count < 1:
            raise ValueError("word_count must be at least 1.")
        if len(self.words) < word_count:
            raise ValueError("Not enough words in the corpus for starting sequence.")
        # randint is inclusive on both ends, so the last valid start is
        # len(words) - word_count.
        start_index = random.randint(0, len(self.words) - word_count)
        return ' '.join(self.words[start_index:start_index + word_count])

    def save_model(self):
        """Write model weights, optimizer state and vocabulary to ``model_path``."""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'vocab': self.vocab,
            'word_to_idx': self.word_to_idx,
            'idx_to_word': self.idx_to_word,
        }, self.model_path)

    def load_model(self):
        """Restore weights, optimizer state and vocabulary from ``model_path``.

        The training data is re-encoded afterwards so that its indices
        follow the restored word -> index mapping.
        """
        # NOTE(review): torch.load unpickles the file; only load
        # checkpoints from a trusted source.
        checkpoint = torch.load(self.model_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.vocab = checkpoint['vocab']
        self.word_to_idx = checkpoint['word_to_idx']
        self.idx_to_word = checkpoint['idx_to_word']
        self._build_dataloader()

    def save_generated_text(self, text: str, file_path: str = './SaveGeneratedText.txt') -> None:
        """Append ``text`` and a trailing newline to ``file_path``."""
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(text + '\n')
188
+
189
+
190
# Corpus used for the vocabulary, for training and for picking start words.
corpus_file_path = 'Snapshots/Datasets/dataset-5-large.txt'

# Initialize the text generator with the LSTM model
generator = TextGenerator(corpus_file_path)

# If model doesn't exist, train the neural network model (adjust epochs as needed)
if not os.path.exists(generator.model_path):
    generator.train(epochs=50)

# Generation settings; they do not change between rounds.
length = 50  # Number of words sampled before looking for a sentence end
temperature = 0.835  # Adjust the randomness (0.835)

# Keep generating texts until the user answers 'cancel' or 'stop'.
while True:
    # Randomly select starting words from the dataset
    start_words = generator.get_random_starting_words(word_count=3)

    # Generate text starting with the randomly selected start_words
    generated_text = generator.generate(start_words, length, temperature)

    print("Starting Words: " + start_words)
    print("Generated Text: " + generated_text)

    # Prompt to save the generated text
    try:
        save_choice = input(">> Do you want to save the generated text? (yes/no/cancel/stop): ").strip().lower()
    except EOFError:
        # stdin was closed (for example piped input ran out); treat it as
        # a request to stop instead of ending with a traceback.
        print("Operation cancelled.")
        break

    if save_choice == 'yes':
        generator.save_generated_text(generated_text)
        print("Generated text saved to './SaveGeneratedText.txt'.")
    elif save_choice == 'no':
        print("Generating a new text...")
    elif save_choice in ('cancel', 'stop'):
        print("Operation cancelled.")
        break
    else:
        print("Invalid input. Please respond with 'yes', 'no' or 'cancel'/'stop'.")
226
+ ```