new README, tidy up main and add hparams

Files changed (3) hide show

README.md +35 -7
hparams.yaml +50 -0
main.py +25 -194

README.md CHANGED Viewed

@@ -8,12 +8,13 @@ Model trained in int8 with LoRA
 Usage:
-prepare pipeline, setting to default generate_opts will give you (deterministic) greedy decoding with up to 112 tokens generated, no repetition penalty:
 ```
 asr_model=prepare_pipeline(
         model_dir='.', # wherever you save the model
-        generate_opts={'max_new_tokens':112,
                 'num_beams':1,
                 'repetition_penalty':1,
                 'do_sample':False
@@ -25,8 +26,35 @@ run ASR:
 asr_model(audio_path)
 ```
-See also:
-https://github.com/rosyvs/isatasr
-Model is on Github at https://github.com/rosyvs/isatasr/tree/main/models/whisat-1.2
-Training script: https://github.com/rosyvs/isatasr/blob/main/train/whisat/tune_hf_whisper.py
-Training hyperparameters: https://github.com/rosyvs/isatasr/blob/main/train/whisat/hparams/redo_for_ICASSP/publicKS_ig_hf_LoRA_int8_largev2.yaml

 Usage:
+prepare pipeline, providing any custom generate_kwargs supported by https://huggingface.co/docs/transformers/v4.40.0/en/main_classes/text_generation#transformers.GenerationConfig
 ```
 asr_model=prepare_pipeline(
         model_dir='.', # wherever you save the model
+        generate_kwargs={
+                'max_new_tokens':112,
                 'num_beams':1,
                 'repetition_penalty':1,
                 'do_sample':False
 asr_model(audio_path)
 ```
+run ASR on full directory in `audio_dir`:
+If generate_kwargs are not specified, this will give you (deterministic) greedy decoding with up to 112 tokens generated and no repetition penalty
+```
+ASRdirWhisat(
+        audio_dir,
+        out_dir = '../whisat_results/',
+        model_dir=".",
+)
+```
+Training information:
+Training script: tune_hf_whisper.py
+Training hyperparameters: hparams.yaml
+Training data manifest: PUBLIC_KIDS_TRAIN_v4_deduped.csv
+Note: to recreate this training you will need to acquire the following public datasets:
+MyST (myst-v0.4.2)
+CuKids
+CSLU
+and ensure they are stored at paths consistent with those in the data manifest above.
+Reference:
+@inproceedings{southwell2024,
+  title={Automatic speech recognition tuned for child speech in the classroom},
+  author={Southwell, Rosy and Ward, Wayne and Trinh, Viet Anh and Clevenger, Charis and Clevenger, Clay and Watts, Emily and Reitman, Jason and D’Mello, Sidney and Whitehill, Jacob},
+booktitle={{IEEE} International Conference on Acoustics, Speech and Signal Processing
+                  {ICASSP} 2024, Seoul, South Korea, April 14-19, 2024},
+                  year={2024},
+}

hparams.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+# parameters to set
+model_cfg:
+  init_from_hub_path: openai/whisper-large-v2
+  # lang: None
+  # apply_spec_augment: True
+  # mask_time_prob: 0.05
+  # mask_feature_prob: 0.05
+  # mask_time_length: 40
+  # mask_feature_length: 30
+  # mask_time_min_masks: 2
+  # mask_feature_min_masks: 2
+data_cfg:
+  data_root: ~/corpora/
+  train_manif: ~/corpora/data_manifests/ASR/PUBLIC_KIDS_TRAIN_v4_deduped.csv
+  val_manif: # small private dataset of classroom speech, only affects training if load_best_model_at_end: True
+  test_manif: # small private dataset of classroom speech, doesn't affect training
+experiment_cfg:
+  OUT_DIR: train/whisat/save/publicKS_LoRA_int8
+  use_lora: True
+  use_int8: True
+train_cfg:
+  training_args:
+    output_dir: !ref <experiment_cfg[OUT_DIR]>
+    per_device_train_batch_size: 32 # 64
+    learning_rate: 0.0001 # 1e-5 orig, 1e-3 lora
+    warmup_steps: 50 # 500 orig 50 lora
+    num_train_epochs: 1
+    fp16: True # True
+    evaluation_strategy: steps # or epoch
+    per_device_eval_batch_size: 4
+    predict_with_generate: True
+    generation_max_length: 112
+    save_steps: 500
+    eval_steps: 500
+    eval_accumulation_steps: 2
+    logging_steps: 25
+    report_to:
+      - tensorboard
+    load_best_model_at_end: False
+    metric_for_best_model: wer
+    greater_is_better: False
+    push_to_hub: False
+    remove_unused_columns: False  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
+    label_names:
+      - labels

main.py CHANGED Viewed

@@ -14,110 +14,30 @@ import json
 import pandas as pd
 import csv
-def prepare_pipeline(model_type='large-v2',
-                 model_dir="../models/whisat-1.2/",
-                 use_stock_model=False,
-                 generate_opts={'max_new_tokens':112,
-                            'num_beams':1,
-                            'repetition_penalty':1,
-                            'do_sample':False}
-                            ):
-    #%% options (TODO make these CLI options)
-    lang='english'
-    USE_INT8 = False
-    import warnings
-    warnings.filterwarnings("ignore")
-    transformers.utils.logging.set_verbosity_error()
-    init_from_hub_path = f"openai/whisper-{model_type}" # TODO infer automatically from PEFT checkpoint
-    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-    print(device)
-    feature_extractor = WhisperFeatureExtractor.from_pretrained(init_from_hub_path)
-    # TODO: no need to specify lanf/task?
-    tokenizer = WhisperTokenizer.from_pretrained(init_from_hub_path, language=lang, task="transcribe")
-    processor = WhisperProcessor.from_pretrained(init_from_hub_path, language=lang, task="transcribe")
-    if use_stock_model:
-        model =WhisperForConditionalGeneration.from_pretrained(init_from_hub_path)
-    else:
-        checkpoint_dir = os.path.expanduser(model_dir)
-        # check if PEFT
-        if os.path.isdir(os.path.join(checkpoint_dir , "adapter_model")):
-            print('...it looks like this model was tuned using PEFT, because adapter_model/ is present in ckpt dir')
-            # checkpoint dir needs adapter model subdir with adapter_model.bin and adapter_confg.json
-            peft_config = PeftConfig.from_pretrained(os.path.join(checkpoint_dir , "adapter_model"))
-            # except ValueError as e: # if final checkpoint these are in the parent checkpoint direcory
-            #     peft_config = PeftConfig.from_pretrained(os.path.join(checkpoint_dir ), subfolder=None)
-            model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path,
-                load_in_8bit=USE_INT8,
-                device_map='auto',
-                use_cache=False,
-                )
-            model = PeftModel.from_pretrained(model, os.path.join(checkpoint_dir,"adapter_model"))
-        else:
-            model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir,
-                load_in_8bit=USE_INT8,
-                device_map='auto',
-                use_cache=False,
-                )
-    model.eval() # needed?
-    pipe = AutomaticSpeechRecognitionPipeline(
-        # task="automatic-speech-recognition",
-        model=model,
-        tokenizer=tokenizer,
-        feature_extractor=feature_extractor,
-        chunk_length_s=30,
-        device=device,
-        return_timestamps=False,
-        generate_kwargs=generate_opts,
-    )
-    return(pipe)
-def load_model(model_type='large-v2',
-                 model_dir="../models/whisat-1.2/"):
-    lang='english'
-    USE_INT8 = False
-    import warnings
-    warnings.filterwarnings("ignore")
-    transformers.utils.logging.set_verbosity_error()
-    init_from_hub_path = f"openai/whisper-{model_type}" # TODO infer automatically from PEFT checkpoint
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    print(device)
-    feature_extractor = WhisperFeatureExtractor.from_pretrained(init_from_hub_path)
-    # TODO: no need to specify lanf/task?
-    tokenizer = WhisperTokenizer.from_pretrained(init_from_hub_path, language=lang, task="transcribe")
-    processor = WhisperProcessor.from_pretrained(init_from_hub_path, language=lang, task="transcribe")
-    checkpoint_dir = os.path.expanduser(model_dir)
-    # checkpoint dir needs adapter model subdir with adapter_model.bin and adapter_confg.json
-    peft_config = PeftConfig.from_pretrained(os.path.join(checkpoint_dir , "adapter_model"))
-    # except ValueError as e: # if final checkpoint these are in the parent checkpoint direcory
-    #     peft_config = PeftConfig.from_pretrained(os.path.join(checkpoint_dir ), subfolder=None)
-    model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path,
-    load_in_8bit=USE_INT8,  # TODO: seemed slightly better without?
-    device_map='auto',
-    use_cache=False,
-    )
-    model = PeftModel.from_pretrained(model, os.path.join(checkpoint_dir,"adapter_model"))
-    model.eval() # needed?
-    return(model, tokenizer, processor)
 def ASRdirWhisat(
                 audio_dir,
-                files_to_include=None,
                 out_dir = '../whisat_results/',
-                model_type='large-v2',
-                model_name='whisat-1.2',
-                model_dir="../models/whisat-1.2",
-                use_stock_model=False,
                 max_new_tokens=112,
                 num_beams=1,
                 do_sample=False,
@@ -131,54 +51,36 @@ def ASRdirWhisat(
     # Save output in same directory structure as input in specified top-level folder
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    #TODO optional arg listing files to transcribe in a list or a text file
     asr_model=prepare_pipeline(
         model_type=model_type,
         model_dir=model_dir,
         use_stock_model=use_stock_model,
-        generate_opts={'max_new_tokens':max_new_tokens,
                 'num_beams':num_beams,
                 'repetition_penalty':repetition_penalty,
                 'do_sample':do_sample
                             }
                 )
-    if use_stock_model: # set some alternative defaults if using stock model
-        model_name='whisper_' + model_type + '_stock'
-    if files_to_include:
-        assert isinstance(files_to_include,list) ,'files_to_include should be a list of paths relative to audio_dir to transcribe'
-        audio_files=files_to_include
-        # audio_files=[]
-        # for f in [str(f) for f in Path(audio_dir).rglob("*") if (str(f).rsplit('.',maxsplit=1)[-1] in ['MOV', 'mov', 'WAV', 'wav', 'mp4', 'mp3', 'm4a', 'aac', 'flac', 'alac', 'ogg'] and f.is_file() )]:
-        #     print(f)
-        #     if os.path.join(audio_dir,f) in files_to_include:
-        #         audio_files.append(f)
-        # print(f'Including {len(audio_files)} hypotheses matching files_to_include...')
-    else:
-        audio_files = [str(f) for f in Path(audio_dir).rglob("*") if (str(f).rsplit('.',maxsplit=1)[-1] in ['MOV', 'mov', 'WAV', 'wav', 'mp4', 'mp3', 'm4a', 'aac', 'flac', 'alac', 'ogg'] and f.is_file() )]
     # audio_identifier = os.path.basename(audio_dir)
-    asrDir = os.path.join(out_dir,f'ASR_{model_name}') # Dir where full session asr result will be stored
-    jsonDir = os.path.join(out_dir,f'JSON_{model_name}')
-    os.makedirs(asrDir, exist_ok=True)
-    os.makedirs(jsonDir, exist_ok=True)
-    message = "This may take a while on CPU. Go make a cuppa" if asr_model.device.type=="cpu" else "Running on GPU"
     print(f'Running ASR for {len(audio_files)} files. {message} ...')
     compute_time=0
     total_audio_dur=0
     # get the start time
     st = time.time()
     for audiofile in tqdm(audio_files):
         sessname=Path(audiofile).stem
         sesspath=os.path.relpath(os.path.dirname(Path(audiofile).resolve()),Path(audio_dir).resolve())
         asrFullFile = os.path.join(asrDir,sesspath,f"{sessname}.asr.txt") # full session ASR results file
-        jsonFile = os.path.join(jsonDir,sesspath, f"{sessname}.json")
         os.makedirs(os.path.join(asrDir,sesspath),exist_ok=True)
-        os.makedirs(os.path.join(jsonDir,sesspath),exist_ok=True)
         with torch.no_grad():
             with autocast():
@@ -188,13 +90,6 @@ def ASRdirWhisat(
                     print(f'{e}: {audiofile}')
                     continue
-        # save full result JSON
-        with open(jsonFile, "w") as jf:
-            json.dump(result, jf, indent=4)
-        # save full result transcript
-        # if asr_model.return_timestamps:
-        #     asrtext = '\n'.join([r['text'].strip() for r in result['chunks']])
-        # else:
         asrtext = result['text']
         with open(asrFullFile,'w') as outfile:
@@ -204,67 +99,3 @@ def ASRdirWhisat(
     compute_time = (et-st)
     print(f'...transcription complete in {compute_time:.1f} sec')
-def ASRmanifestWhisat(
-                manifest_csv,
-                out_csv,
-                corpora_root,
-                model_type='large-v2',
-                model_dir="../models/whisat-1.2",
-                use_stock_model=False,
-                max_new_tokens=112,
-                num_beams=1,
-                do_sample=False,
-                repetition_penalty=1,
-                ):
-    ## ASR using fine-tuned Transformers Whisper
-    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    # Simply trancsribe each file in the specified folder separately
-    # Whisper takes 30-second input. Anything shorter than this will be 0 padded. Longer will be concatenated.
-    # Save output in same directory structure as input in specified top-level folder
-    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    df = pd.read_csv(manifest_csv,keep_default_na=False)
-    fieldnames = list(df.columns) + ['asr']
-    asr_model=prepare_pipeline(
-        model_type=model_type,
-        model_dir=model_dir,
-        use_stock_model=use_stock_model,
-        generate_opts={'max_new_tokens':max_new_tokens,
-                'num_beams':num_beams,
-                'repetition_penalty':repetition_penalty,
-                'do_sample':do_sample
-                            }
-                )
-    message = "This may take a while on CPU. Go make a cuppa " if asr_model.device.type=="cpu" else "Running on GPU"
-    print(f'Running ASR for {len(df)} files. {message} ...')
-    compute_time=0
-    total_audio_dur=0
-    # get the start time
-    st = time.time()
-    with open(out_csv, 'w', newline='') as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,delimiter=',')
-        writer.writeheader()
-        for i,row in tqdm(df.iterrows(), total=df.shape[0]):
-            audiofile=row['wav'].replace('$DATAROOT',corpora_root)
-            with torch.no_grad():
-                with autocast():
-                    try:
-                        result = asr_model(audiofile)
-                        asrtext = result['text']
-                    except ValueError as e:
-                        print(f'{e}: {audiofile}')
-                        asrtext=''
-            row['asr']=asrtext
-            writer.writerow( row.to_dict())
-    et = time.time()
-    compute_time = (et-st)
-    print(f'...transcription complete in {compute_time:.1f} sec')

 import pandas as pd
 import csv
+def prepare_pipeline(model_path, generate_kwargs):
+    """Prepare a pipeline for ASR inference
+    Args:
+        model_path (str): path to model directory / huggingface model name
+        generate_kwargs (dict): options to pass to pipeline
+    Returns:
+        pipeline: ASR pipeline
+    """
+    processor = WhisperProcessor.from_pretrained(model_path)
+    asr_pipeline = pipeline(
+        "automatic-speech-recognition",
+        model=model_path,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        generate_kwargs=generate_kwargs,
+        model_kwargs={"load_in_8bit": False},
+        device_map='auto')
+    return asr_pipeline
 def ASRdirWhisat(
                 audio_dir,
                 out_dir = '../whisat_results/',
+                model_dir=".",
                 max_new_tokens=112,
                 num_beams=1,
                 do_sample=False,
     # Save output in same directory structure as input in specified top-level folder
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     asr_model=prepare_pipeline(
         model_type=model_type,
         model_dir=model_dir,
         use_stock_model=use_stock_model,
+        generate_kwargs={'max_new_tokens':max_new_tokens,
                 'num_beams':num_beams,
                 'repetition_penalty':repetition_penalty,
                 'do_sample':do_sample
                             }
                 )
+    audio_files = [str(f) for f in Path(audio_dir).rglob("*") if (str(f).rsplit('.',maxsplit=1)[-1] in ['MOV', 'mov', 'WAV', 'wav', 'mp4', 'mp3', 'm4a', 'aac', 'flac', 'alac', 'ogg'] and f.is_file() )]
     # audio_identifier = os.path.basename(audio_dir)
+    os.makedirs(out_dir, exist_ok=True)
+    message = "This may take a while on CPU." if asr_model.device.type=="cpu" else "Running on GPU"
     print(f'Running ASR for {len(audio_files)} files. {message} ...')
     compute_time=0
     total_audio_dur=0
     # get the start time
     st = time.time()
+    asrDir = out_dir
     for audiofile in tqdm(audio_files):
         sessname=Path(audiofile).stem
         sesspath=os.path.relpath(os.path.dirname(Path(audiofile).resolve()),Path(audio_dir).resolve())
         asrFullFile = os.path.join(asrDir,sesspath,f"{sessname}.asr.txt") # full session ASR results file
         os.makedirs(os.path.join(asrDir,sesspath),exist_ok=True)
         with torch.no_grad():
             with autocast():
                     print(f'{e}: {audiofile}')
                     continue
         asrtext = result['text']
         with open(asrFullFile,'w') as outfile:
     compute_time = (et-st)
     print(f'...transcription complete in {compute_time:.1f} sec')