Upload HuggingFace_Mistral_Transformer_Single_Instrument.ipynb
HuggingFace_Mistral_Transformer_Single_Instrument.ipynb
ADDED
@@ -0,0 +1,660 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "SiTIpPjArIyr"
   },
   "source": [
    "# Using MIDI training data and the MidiTok REMI tokenizer to generate music with a Mistral model\n",
    "# Splits the music into single-instrument tracks and chunks it into 1024-token sequences\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "gOd93yV0sGd2"
   },
   "source": [
    "## Setup Environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To compile Symusic from source, get g++ 11 or higher, then:\n",
    "#   git clone --recursive https://github.com/Yikai-Liao/symusic\n",
    "#   CXX=/usr/bin/g++-11 pip install ./symusic\n"
   ]
  },
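  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal dependency-install sketch: the package list is inferred from this notebook's imports, and prebuilt symusic wheels on PyPI make the source build above optional on most platforms.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assumed dependency list, inferred from the imports used in this notebook.\n",
    "# scikit-learn backs the accuracy metric loaded via evaluate.\n",
    "%pip install miditok symusic transformers evaluate scikit-learn torch tqdm tensorboard\n"
   ]
  },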
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "fX12Yquyuihc"
   },
   "outputs": [],
   "source": [
    "from copy import deepcopy\n",
    "from pathlib import Path\n",
    "from random import shuffle\n",
    "\n",
    "from evaluate import load as load_metric\n",
    "from miditok import REMI, TokenizerConfig, TokTrainingIterator\n",
    "from miditok.pytorch_data import DatasetMIDI, DataCollator\n",
    "from miditok.utils import split_files_for_training\n",
    "from miditok.data_augmentation import augment_dataset\n",
    "from torch import Tensor, argmax\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.cuda import is_available as cuda_available, is_bf16_supported\n",
    "from torch.backends.mps import is_available as mps_available\n",
    "from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig, AutoConfig\n",
    "from transformers.trainer_utils import set_seed\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed\n",
    "set_seed(777)\n",
    "\n",
    "# Our tokenizer's configuration\n",
    "BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}\n",
    "TOKENIZER_PARAMS = {\n",
    "    \"pitch_range\": (21, 109),\n",
    "    \"beat_res\": BEAT_RES,\n",
    "    \"num_velocities\": 24,\n",
    "    \"special_tokens\": [\"PAD\", \"BOS\", \"EOS\"],\n",
    "    \"use_chords\": True,\n",
    "    \"use_rests\": True,\n",
    "    \"use_tempos\": True,\n",
    "    \"use_time_signatures\": True,\n",
    "    \"use_programs\": False,  # we want single-track token streams\n",
    "    \"one_token_stream_for_programs\": True,\n",
    "    \"programs\": list(range(0, 128)),  # program -1 is drums; we skip drums\n",
    "    \"num_tempos\": 32,\n",
    "    \"tempo_range\": (50, 200),  # (min_tempo, max_tempo)\n",
    "}\n",
    "config = TokenizerConfig(**TOKENIZER_PARAMS)\n",
    "\n",
    "# Creates the REMI tokenizer\n",
    "tokenizer = REMI(config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the MIDI files and train the tokenizer on them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "root_data_dir = Path('/home/wombat/Documents/projects/music/midiTok/data/')\n",
    "root_save = Path(root_data_dir / 'HuggingFace_Mistral_Transformer_Single_Instrument')\n",
    "\n",
    "tokenizer_name = \"HuggingFace_Mistral_Transformer_Single_Instrument.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dirs = [\"adl-piano-midi\", \"maestro-v3.0.0\", \"musicnet_midis\"]  # single-instrument sources\n",
    "midi_paths = []\n",
    "for data_dir in data_dirs:\n",
    "    path = Path(root_data_dir / 'Traning Data' / data_dir)\n",
    "    midi_paths.extend(list(path.resolve().glob(\"**/*.mid\")) + list(path.resolve().glob(\"**/*.midi\")))\n",
    "\n",
    "print(f\"Found {len(midi_paths)} MIDI files\")"
   ]
  },
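  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity-check sketch, assuming `midi_paths` is non-empty and the tokenizer has not yet been BPE-trained (so tokens are still human-readable strings): encode the first file, preview its REMI tokens, and decode back to a `symusic` score. With `use_programs=False`, the tokenizer should return one `TokSequence` per track.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: encode one file and preview the start of its token stream\n",
    "tok_seqs = tokenizer(midi_paths[0])   # one TokSequence per track here\n",
    "print(tok_seqs[0].tokens[:20])        # e.g. Bar_None, Position_0, Pitch_... tokens\n",
    "score = tokenizer.decode(tok_seqs)    # decode back to a symusic Score\n",
    "print(score)\n"
   ]
  },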
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 32k tokens.\n",
    "# Note: training needs a lot of memory on large datasets; 61,749 files took 64 GB.\n",
    "tokenizer.train(\n",
    "    vocab_size=32000,\n",
    "    files_paths=midi_paths,\n",
    ")\n",
    "tokenizer.save(root_save / tokenizer_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = REMI(params=Path(root_save / tokenizer_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare MIDIs for training\n",
    "\n",
    "Here we split the files into three subsets: train, validation and test.\n",
    "The MIDIs of each subset are split into smaller chunks of approximately the desired token sequence length for training, and data augmentation is then performed on the training subset.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split MIDI paths into train/valid/test sets\n",
    "total_num_files = len(midi_paths)\n",
    "num_files_valid = round(total_num_files * 0.15)\n",
    "num_files_test = round(total_num_files * 0.15)\n",
    "shuffle(midi_paths)\n",
    "midi_paths_valid = midi_paths[:num_files_valid]\n",
    "midi_paths_test = midi_paths[num_files_valid:num_files_valid + num_files_test]\n",
    "midi_paths_train = midi_paths[num_files_valid + num_files_test:]\n",
    "\n",
    "# Chunk the MIDIs, and perform data augmentation on the training subset\n",
    "for files_paths, subset_name in (\n",
    "    (midi_paths_train, \"train\"), (midi_paths_valid, \"valid\"), (midi_paths_test, \"test\")\n",
    "):\n",
    "    # Split the MIDIs into chunks of approximately 1024 tokens\n",
    "    subset_chunks_dir = root_save / f\"Maestro_{subset_name}\"\n",
    "    print(subset_chunks_dir)\n",
    "    split_files_for_training(\n",
    "        files_paths=files_paths,\n",
    "        tokenizer=tokenizer,\n",
    "        save_dir=subset_chunks_dir,\n",
    "        max_seq_len=1024,\n",
    "        num_overlap_bars=2,\n",
    "    )\n",
    "\n",
    "    if subset_name == 'train':\n",
    "        print(\"Augmentation\")\n",
    "        # Perform data augmentation on pitch, velocity and duration\n",
    "        augment_dataset(\n",
    "            subset_chunks_dir,\n",
    "            pitch_offsets=[-12, 12],\n",
    "            velocity_offsets=[-4, 4],\n",
    "            duration_offsets=[-0.5, 0.5],\n",
    "        )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create Datasets and Collator for training\n",
    "midi_paths_train = list(root_save.joinpath(Path(\"Maestro_train\")).glob(\"**/*.mid\")) + list(root_save.joinpath(Path(\"Maestro_train\")).glob(\"**/*.midi\"))\n",
    "midi_paths_valid = list(root_save.joinpath(Path(\"Maestro_valid\")).glob(\"**/*.mid\")) + list(root_save.joinpath(Path(\"Maestro_valid\")).glob(\"**/*.midi\"))\n",
    "midi_paths_test = list(root_save.joinpath(Path(\"Maestro_test\")).glob(\"**/*.mid\")) + list(root_save.joinpath(Path(\"Maestro_test\")).glob(\"**/*.midi\"))\n",
    "\n",
    "kwargs_dataset = {\"max_seq_len\": 1024, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"]}\n",
    "\n",
    "dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)\n",
    "dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)\n",
    "dataset_test = DatasetMIDI(midi_paths_test, **kwargs_dataset)\n",
    "print(len(midi_paths_train), len(midi_paths_valid), len(midi_paths_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preview files: data load and split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "Generate Preview Files"
    ]
   },
   "outputs": [],
   "source": [
    "testing_files = []  # fill in the paths of the MIDI files to preview\n",
    "preview_files_path = []\n",
    "for testing_file in testing_files:\n",
    "    preview_files_path.append(Path(testing_file))\n",
    "\n",
    "preview_dir = Path(root_save / \"preview\")\n",
    "split_files_for_training(\n",
    "    files_paths=preview_files_path,\n",
    "    tokenizer=tokenizer,\n",
    "    save_dir=preview_dir,\n",
    "    max_seq_len=1024,\n",
    "    num_overlap_bars=2,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_midi_path = root_save / \"Maestro_valid\"\n",
    "midi_split_preview = list(valid_midi_path.resolve().glob(\"**/*.mid\")) + list(valid_midi_path.resolve().glob(\"**/*.midi\"))\n",
    "\n",
    "print(len(midi_split_preview))\n",
    "file_name_lookup = []\n",
    "def func_to_get_labels(p1, p2, p3):\n",
    "    # p3 is the file path; use the index of its name as the label\n",
    "    if p3.name not in file_name_lookup:\n",
    "        file_name_lookup.append(p3.name)\n",
    "    return file_name_lookup.index(p3.name)\n",
    "\n",
    "kwargs_dataset = {\"max_seq_len\": 1024, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"], \"func_to_get_labels\": func_to_get_labels}\n",
    "dataset_preview = DatasetMIDI(midi_split_preview, **kwargs_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save and load datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_dir = root_save / \"data\"\n",
    "dataset_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "torch.save(dataset_train, Path(dataset_dir / \"dataset_train.pt\"))\n",
    "torch.save(dataset_valid, Path(dataset_dir / \"dataset_valid.pt\"))\n",
    "torch.save(dataset_test, Path(dataset_dir / \"dataset_test.pt\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "dataset_train = torch.load(Path(dataset_dir / \"dataset_train.pt\"))\n",
    "dataset_valid = torch.load(Path(dataset_dir / \"dataset_valid.pt\"))\n",
    "dataset_test = torch.load(Path(dataset_dir / \"dataset_test.pt\"))\n"
   ]
  },
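  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A caveat worth noting: these `.pt` files hold pickled `DatasetMIDI` objects, not plain tensors. Recent PyTorch versions (2.6+) default `torch.load` to `weights_only=True`, which rejects arbitrary pickled objects; if loading fails there, pass `weights_only=False` for files you created yourself and trust.\n"
   ]
  },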
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(dataset_train[0])\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model initialization\n",
    "\n",
    "We will use the [Mistral implementation of Hugging Face](https://huggingface.co/docs/transformers/model_doc/mistral).\n",
    "Feel free to explore the documentation and source code to dig deeper.\n",
    "\n",
    "**You may need to adjust the model's configuration, the training configuration and the maximum input sequence length (cell above) depending on your hardware.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creates model\n",
    "model_config = MistralConfig(\n",
    "    vocab_size=len(tokenizer),\n",
    "    hidden_size=512,\n",
    "    intermediate_size=2048,\n",
    "    num_hidden_layers=8,\n",
    "    num_attention_heads=8,\n",
    "    num_key_value_heads=4,\n",
    "    sliding_window=256,\n",
    "    max_position_embeddings=8192,\n",
    "    pad_token_id=tokenizer['PAD_None'],\n",
    "    bos_token_id=tokenizer['BOS_None'],\n",
    "    eos_token_id=tokenizer['EOS_None'],\n",
    ")\n",
    "model = AutoModelForCausalLM.from_config(model_config)"
   ]
  },
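  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small sanity-check sketch: print the parameter count of the freshly initialized model, to confirm the configuration above yields the expected small-scale Mistral.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# num_parameters() is provided by transformers' PreTrainedModel\n",
    "print(f\"{model.num_parameters():,} parameters\")\n"
   ]
  },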
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_dir = root_save / 'run'\n",
    "model_dir_str = str(model_dir)\n",
    "print(model_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "metrics = {metric: load_metric(metric) for metric in [\"accuracy\"]}\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    \"\"\"\n",
    "    Compute metrics for pretraining.\n",
    "\n",
    "    Must be used with a preprocess_logits function that converts logits to predictions (argmax or sampling).\n",
    "\n",
    "    :param eval_pred: EvalPrediction containing predictions and labels\n",
    "    :return: metrics\n",
    "    \"\"\"\n",
    "    predictions, labels = eval_pred\n",
    "    not_pad_mask = labels != -100\n",
    "    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]\n",
    "    return metrics[\"accuracy\"].compute(predictions=predictions.flatten(), references=labels.flatten())\n",
    "\n",
    "def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:\n",
    "    \"\"\"\n",
    "    Preprocess the logits before accumulating them during evaluation.\n",
    "\n",
    "    This allows us to significantly reduce memory usage and makes the training tractable.\n",
    "    \"\"\"\n",
    "    pred_ids = argmax(logits, dim=-1)  # long dtype\n",
    "    return pred_ids\n",
    "\n",
    "# Create config for the Trainer\n",
    "USE_CUDA = cuda_available()\n",
    "print(USE_CUDA)\n",
    "if not cuda_available():\n",
    "    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False\n",
    "elif is_bf16_supported():\n",
    "    BF16 = BF16_EVAL = True\n",
    "    FP16 = FP16_EVAL = False\n",
    "else:\n",
    "    BF16 = BF16_EVAL = False\n",
    "    FP16 = FP16_EVAL = True\n",
    "USE_MPS = not USE_CUDA and mps_available()\n",
    "training_config = TrainingArguments(\n",
    "    output_dir=model_dir_str,\n",
    "    overwrite_output_dir=False,\n",
    "    do_train=True,\n",
    "    do_eval=True,\n",
    "    do_predict=False,\n",
    "    evaluation_strategy=\"steps\",\n",
    "    per_device_train_batch_size=30,  # ~76% GPU memory at batch sizes 24 and 32; try 64 next time\n",
    "    per_device_eval_batch_size=30,\n",
    "    gradient_accumulation_steps=3,  # consider increasing to 4\n",
    "    eval_accumulation_steps=None,\n",
    "    eval_steps=1000,\n",
    "    learning_rate=1e-4,\n",
    "    weight_decay=0.01,\n",
    "    max_grad_norm=3.0,\n",
    "    max_steps=20000,\n",
    "    lr_scheduler_type=\"cosine_with_restarts\",\n",
    "    warmup_ratio=0.3,\n",
    "    log_level=\"debug\",\n",
    "    logging_strategy=\"steps\",\n",
    "    logging_steps=20,\n",
    "    save_strategy=\"steps\",\n",
    "    save_steps=1000,\n",
    "    save_total_limit=5,\n",
    "    no_cuda=not USE_CUDA,\n",
    "    seed=444,\n",
    "    fp16=FP16,\n",
    "    fp16_full_eval=FP16_EVAL,\n",
    "    bf16=BF16,\n",
    "    bf16_full_eval=BF16_EVAL,\n",
    "    load_best_model_at_end=True,\n",
    "    label_smoothing_factor=0.,\n",
    "    optim=\"adamw_torch\",\n",
    "    report_to=[\"tensorboard\"],\n",
    "    gradient_checkpointing=True,\n",
    "    dataloader_num_workers=8,  # added to fix thrashing: the GPU was starved of data\n",
    "    dataloader_pin_memory=True,  # pin host memory for faster host-to-GPU transfers\n",
    "    torch_compile=True,  # added to speed up training\n",
    ")\n",
    "\n",
    "collator = DataCollator(tokenizer[\"PAD_None\"], copy_inputs_as_labels=True)\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_config,\n",
    "    data_collator=collator,\n",
    "    train_dataset=dataset_train,\n",
    "    eval_dataset=dataset_valid,\n",
    "    compute_metrics=compute_metrics,\n",
    "    callbacks=None,\n",
    "    preprocess_logits_for_metrics=preprocess_logits,\n",
    ")\n"
   ]
  },
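  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick arithmetic note: with `per_device_train_batch_size=30` and `gradient_accumulation_steps=3`, each optimizer update sees 30 × 3 = 90 sequences per device (× the number of devices), each up to 1024 tokens, i.e. up to roughly 92k tokens per update per device.\n"
   ]
  },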
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training\n",
    "train_result = trainer.train()\n",
    "trainer.save_model()  # Saves the tokenizer too\n",
    "trainer.log_metrics(\"train\", train_result.metrics)\n",
    "trainer.save_metrics(\"train\", train_result.metrics)\n",
    "trainer.save_state()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create_model_card is a Trainer method, not a model method\n",
    "trainer.create_model_card(tags=[\"mistral\", \"midi\", \"miditok\", \"music\", \"instrument\"],\n",
    "                          model_name=\"Mistral_MidiTok_Transformer_Single_Instrument_Small\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.hub_model_id = \"adricl/midi_single_instrument_mistral_transformer\"\n",
    "\n",
    "model.push_to_hub(commit_message=\"Training Basic Model for Mistral MidiTok Transformer Single Instrument Small\",\n",
    "                  repo_id=\"adricl/midi_single_instrument_mistral_transformer\",\n",
    "                  token=\"\")  # supply your Hugging Face token here\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# To monitor training with TensorBoard: `tensorboard --logdir runs/`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the trained model; from_pretrained expects the checkpoint directory, not the .safetensors file\n",
    "config = AutoConfig.from_pretrained(str(model_dir / \"config.json\"))\n",
    "model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=str(model_dir), from_tf=False, config=config)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate music"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form",
    "id": "OaNkGcFo9UP_"
   },
   "outputs": [],
   "source": [
    "# Generate continuations for the single-track MIDI file splits\n",
    "\n",
    "gen_results_path = root_save / 'gen_res'\n",
    "gen_results_path.mkdir(parents=True, exist_ok=True)\n",
    "generation_config = GenerationConfig(\n",
    "    max_new_tokens=200,  # extends samples by 200 tokens\n",
    "    num_beams=1,         # no beam search\n",
    "    do_sample=True,      # but sample instead\n",
    "    temperature=0.9,\n",
    "    top_k=15,\n",
    "    top_p=0.95,\n",
    "    epsilon_cutoff=3e-4,\n",
    "    eta_cutoff=1e-3,\n",
    "    pad_token_id=tokenizer.pad_token_id,\n",
    ")\n",
    "\n",
    "# Here the sequences are padded on the left, so that the last token along the time dimension\n",
    "# is always the last token of each sequence, allowing efficient batched generation\n",
    "collator.pad_on_left = True\n",
    "collator.eos_token = None\n",
    "dataloader_test = DataLoader(dataset_preview, batch_size=24, collate_fn=collator)\n",
    "model.eval()\n",
    "count = 0\n",
    "for batch in tqdm(dataloader_test, desc='Testing model / Generating results'):  # (N,T)\n",
    "    print(batch)\n",
    "    res = model.generate(\n",
    "        inputs=batch[\"input_ids\"].to(model.device),\n",
    "        attention_mask=batch[\"attention_mask\"].to(model.device),\n",
    "        generation_config=generation_config)  # (N,T)\n",
    "\n",
    "    # Saves the generated music, as MIDI files and tokens (json)\n",
    "    for prompt, continuation in zip(batch[\"input_ids\"], res):\n",
    "        generated = continuation[len(prompt):]\n",
    "        midi = tokenizer.decode([deepcopy(generated.tolist())])\n",
    "        tokens = [generated, prompt, continuation]  # kept as a list, as the sequences have different lengths\n",
    "        tokens = [seq.tolist() for seq in tokens]\n",
    "        for tok_seq in tokens[1:]:\n",
    "            _midi = tokenizer.decode([deepcopy(tok_seq)])\n",
    "            midi.tracks.append(_midi.tracks[0])\n",
    "\n",
    "        file_name = file_name_lookup[count]\n",
    "        print(file_name)\n",
    "        midi.tracks[0].name = f'Continuation of original sample ({len(generated)} tokens), original file {file_name}'\n",
    "        midi.tracks[1].name = f'Original sample ({len(prompt)} tokens)'\n",
    "        if len(midi.tracks) > 2:\n",
    "            midi.tracks[2].name = 'Original sample and continuation'\n",
    "        midi.dump_midi(gen_results_path / f'{count}_{file_name}.mid')\n",
    "        tokenizer.save_tokens(tokens, gen_results_path / f'{count}_{file_name}.json')\n",
    "\n",
    "        count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(file_name_lookup)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "machine_shape": "hm",
   "name": "Optimus_VIRTUOSO_Multi_Instrumental_RGA_Edition.ipynb",
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}