Update README.md
#3 by AntonV (HF Staff) - opened

README.md CHANGED
@@ -84,6 +84,64 @@ sf.write("simple.mp3", output, 44100)

A PyPI package and a working CLI tool will be available soon.

### As part of transformers

Install `transformers`:
```bash
# pip
pip install "transformers[torch]"

# uv
uv pip install "transformers[torch]"
```
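
If the import below fails, the installed `transformers` build is likely too old to include Dia; a quick sanity check before running the examples:

```python
# Sanity check: the generation examples below rely on these classes being
# available in the installed transformers build.
import transformers
from transformers import AutoProcessor, DiaForConditionalGeneration

print(transformers.__version__)
```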

#### Generation with Text

```python
from transformers import AutoProcessor, DiaForConditionalGeneration

torch_device = "cuda"
model_checkpoint = "nari-labs/Dia-1.6B-0626"

text = ["[S1] Dia is an open weights text to dialogue model."]
processor = AutoProcessor.from_pretrained(model_checkpoint)
inputs = processor(text=text, padding=True, return_tensors="pt").to(torch_device)

model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=256)  # corresponds to ~2s

# save audio to a file
outputs = processor.batch_decode(outputs)
processor.save_audio(outputs, "example.wav")
```
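
Because the processor call above already takes a list of prompts with `padding=True`, the same pattern should extend to small batches. A hedged sketch, continuing from the setup above and assuming `batch_decode` returns one NumPy-compatible waveform per prompt that `soundfile` (used earlier in this README) can write at 44.1 kHz:

```python
# Hedged batched variant of the example above; reuses processor, model and
# torch_device from the previous snippet. File names are illustrative.
import soundfile as sf

texts = [
    "[S1] Dia is an open weights text to dialogue model.",
    "[S1] Hello there. [S2] Hi, nice to meet you.",
]
inputs = processor(text=texts, padding=True, return_tensors="pt").to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=256)

# Assumes each decoded item is a mono waveform sampled at 44.1 kHz.
for i, waveform in enumerate(processor.batch_decode(outputs)):
    sf.write(f"batched_{i}.wav", waveform, 44100)
```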

#### Generation with Text and Audio (Voice Cloning)

```python
from datasets import load_dataset, Audio
from transformers import AutoProcessor, DiaForConditionalGeneration

torch_device = "cuda"
model_checkpoint = "nari-labs/Dia-1.6B-0626"

ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
ds = ds.cast_column("audio", Audio(sampling_rate=44100))
audio = ds[-1]["audio"]["array"]
# text is a transcript of the audio + additional text you want as new audio
text = ["[S1] I know. It's going to save me a lot of money, I hope. [S2] I sure hope so for you."]

processor = AutoProcessor.from_pretrained(model_checkpoint)
inputs = processor(text=text, audio=audio, padding=True, return_tensors="pt").to(torch_device)
prompt_len = processor.get_audio_prompt_len(inputs["decoder_attention_mask"])

model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=256)  # corresponds to ~2s

# retrieve only the newly generated audio and save it to a file
outputs = processor.batch_decode(outputs, audio_prompt_len=prompt_len)
processor.save_audio(outputs, "example_with_audio.wav")
```
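
To clone from your own recording instead of the dataset sample, the same processor call should accept a raw waveform. A hedged sketch, continuing from the setup above; the file name and transcript are placeholders, and the recording is assumed to already be mono at 44.1 kHz:

```python
# Hedged variant: use a local recording as the voice prompt. Reuses processor,
# model and torch_device from the previous snippet.
import soundfile as sf

audio, sampling_rate = sf.read("voice_prompt.wav")
assert sampling_rate == 44100, "resample the prompt to 44.1 kHz first"

# The text must contain a transcript of the prompt plus the new lines to generate.
text = ["[S1] Transcript of voice_prompt.wav. [S2] And the new dialogue to synthesize."]
inputs = processor(text=text, audio=audio, padding=True, return_tensors="pt").to(torch_device)
prompt_len = processor.get_audio_prompt_len(inputs["decoder_attention_mask"])

outputs = model.generate(**inputs, max_new_tokens=256)
outputs = processor.batch_decode(outputs, audio_prompt_len=prompt_len)
processor.save_audio(outputs, "cloned_from_local_prompt.wav")
```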

## 💻 Hardware and Inference Speed

Dia has been tested only on GPUs (PyTorch 2.0+, CUDA 12.6). CPU support will be added soon.