adricl committed
Commit 0c2af15 · 1 Parent(s): 2a062a8

Some small refactoring for consistent settings

HuggingFace_Mistral_Transformer_Single_Instrument.ipynb CHANGED
@@ -175,6 +175,16 @@
  "Then data augmentation is performed on each subset independently, and the MIDIs are split into smaller chunks that make approximately the desired token sequence length for training."
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sequence_length = 1024 # The maximum sequence length for data samples.\n",
+ "kwargs_dataset = {\"max_seq_len\": sequence_length, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"]}"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,
@@ -204,7 +214,7 @@
  " files_paths=files_paths,\n",
  " tokenizer=tokenizer,\n",
  " save_dir=subset_chunks_dir,\n",
- " max_seq_len=1024,\n",
+ " max_seq_len=sequence_length,\n",
  " num_overlap_bars=2,\n",
  " )\n",
  "\n",
@@ -230,7 +240,7 @@
  "midi_paths_valid = list(root_save.joinpath(Path(\"Maestro_valid\")).glob(\"**/*.mid\")) + list(root_save.joinpath(Path(\"Maestro_valid\")).glob(\"**/*.midi\")) \n",
  "midi_paths_test = list(root_save.joinpath(Path(\"Maestro_test\")).glob(\"**/*.mid\")) + list(root_save.joinpath(Path(\"Maestro_test\")).glob(\"**/*.midi\"))\n",
  "\n",
- "kwargs_dataset = {\"max_seq_len\": 1024, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"]}\n",
+ "\n",
  "\n",
  "dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)\n",
  "dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)\n",
@@ -255,7 +265,7 @@
  },
  "outputs": [],
  "source": [
- "testing_files = \n",
+ "#testing_files = \n",
  "preview_files_path = []\n",
  "for testing_file in testing_files:\n",
  " preview_files_path.append(Path(testing_file))\n",
@@ -265,7 +275,7 @@
  " files_paths=preview_files_path,\n",
  " tokenizer=tokenizer,\n",
  " save_dir=preview_dir,\n",
- " max_seq_len=1024,\n",
+ " max_seq_len=sequence_length,\n",
  " num_overlap_bars=2,\n",
  " )\n"
  ]
@@ -286,7 +296,7 @@
  " file_name_lookup.append(p3.name)\n",
  " return file_name_lookup.index(p3.name)\n",
  " \n",
- "kwargs_dataset = {\"max_seq_len\": 1024, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"], \"func_to_get_labels\" : func_to_get_labels}\n",
+ "kwargs_dataset = {\"max_seq_len\": sequence_length, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"], \"func_to_get_labels\" : func_to_get_labels}\n",
  "dataset_preview = DatasetMIDI(midi_split_preview, **kwargs_dataset)"
  ]
  },
@@ -362,14 +372,14 @@
  "source": [
  "# Creates model\n",
  "model_config = MistralConfig(\n",
- " vocab_size=len(tokenizer),\n",
- " hidden_size=512,\n",
- " intermediate_size=2048,\n",
- " num_hidden_layers=8,\n",
- " num_attention_heads=8,\n",
- " num_key_value_heads=4,\n",
- " sliding_window=256,\n",
- " max_position_embeddings=8192,\n",
+ " vocab_size=len(tokenizer), #from miditok output default 32K\n",
+ " hidden_size=512, # default 4096\n",
+ " intermediate_size=2048, # default 14336\n",
+ " num_hidden_layers=8, # default 32\n",
+ " num_attention_heads=8, # default 32\n",
+ " num_key_value_heads=4, # default 8\n",
+ " sliding_window=256, # default 4096\n",
+ " max_position_embeddings=sequence_length + 256, # 8192 this was before # default 4096*32\n",
  " pad_token_id=tokenizer['PAD_None'],\n",
  " bos_token_id=tokenizer['BOS_None'],\n",
  " eos_token_id=tokenizer['EOS_None'],\n",
@@ -449,7 +459,7 @@
  " learning_rate=1e-4,\n",
  " weight_decay=0.01,\n",
  " max_grad_norm=3.0,\n",
- " max_steps=20000,\n",
+ " max_steps=40000,\n",
  " lr_scheduler_type=\"cosine_with_restarts\",\n",
  " warmup_ratio=0.3,\n",
  " log_level=\"debug\",\n",
@@ -485,10 +495,20 @@
  " compute_metrics=compute_metrics,\n",
  " callbacks=None,\n",
  " preprocess_logits_for_metrics=preprocess_logits,\n",
+ " \n",
  ")\n",
  "\n"
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(model)"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,
README.md CHANGED
@@ -22,6 +22,9 @@ I have trained a MidiTok tokeniser (REMI) and its made by spliting multi-track m
 We then trained in on a small dataset.
 Its using the Mistral model that has been cut down quite a bit.
 
+### What else needs to be done
+Update model training to use small positional embeddings for the model 1024 + a padding amount like 8
+
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
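The new "What else needs to be done" note amounts to a one-line config change. A hedged sketch of what that follow-up could look like, with the value 8 taken from the TODO and everything else assumed from the notebook's existing `MistralConfig` cell:

```python
from transformers import MistralConfig

sequence_length = 1024
padding_margin = 8  # small allowance for BOS/EOS/padding tokens, per the TODO

model_config = MistralConfig(
    # ...other settings as in the notebook (hidden_size=512, num_hidden_layers=8, etc.)...
    max_position_embeddings=sequence_length + padding_margin,  # 1032 instead of 1280
)
```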