adricl committed on
Commit
94d5306
·
1 Parent(s): 4c06149

Adding filter to work on more

Browse files
HuggingFace_Mistral_Transformer_Single_Instrument.ipynb CHANGED
@@ -185,6 +185,38 @@
185
  "kwargs_dataset = {\"max_seq_len\": sequence_length, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"]}"
186
  ]
187
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  {
189
  "cell_type": "code",
190
  "execution_count": null,
 
185
  "kwargs_dataset = {\"max_seq_len\": sequence_length, \"tokenizer\": tokenizer, \"bos_token_id\": tokenizer[\"BOS_None\"], \"eos_token_id\": tokenizer[\"EOS_None\"]}"
186
  ]
187
  },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": null,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "def remove_files_with_boring_data( file_paths: list[Path], rms_threshold: float = 0.01) -> list[Path]:\n",
195
+ " \"\"\"\n",
196
+ " Remove files with boring data, i.e. files with low RMS.\n",
197
+ " \"\"\"\n",
198
+ " from symusic import Score\n",
199
+ " from tqdm import tqdm\n",
200
+ " import numpy as np\n",
201
+ "\n",
202
+ " rms = lambda data: (sum(x * x for x in data) / len(data)) ** 0.5\n",
203
+ "\n",
204
+ " filtered_files = []\n",
205
+ " for file_path in tqdm(file_paths, desc=\"Filtering boring files\"):\n",
206
+ " try:\n",
207
+ " scores = [Score(file_path)]\n",
208
+ " except SCORE_LOADING_EXCEPTION:\n",
209
+ " continue\n",
210
+ "\n",
211
+ " for track in scores[0].tracks:\n",
212
+ " values = track.notes['pitch']\n",
213
+ " result = rms(values)\n",
214
+ "\n",
215
+ "\n",
216
+ " filtered_files.append(file_path)\n",
217
+ " return filtered_files"
218
+ ]
219
+ },
220
  {
221
  "cell_type": "code",
222
  "execution_count": null,