Paul Bird
committed on
Upload RunWhisper.cs
Browse files- RunWhisper.cs +18 -12
RunWhisper.cs
CHANGED
|
@@ -36,6 +36,7 @@ public class RunWhisper : MonoBehaviour
|
|
| 36 |
// Link your audioclip here. Format must be 16kHz mono non-compressed.
|
| 37 |
public AudioClip audioClip;
|
| 38 |
|
|
|
|
| 39 |
const int maxTokens = 100;
|
| 40 |
|
| 41 |
//Special tokens
|
|
@@ -56,19 +57,22 @@ public class RunWhisper : MonoBehaviour
|
|
| 56 |
int[] outputTokens = new int[maxTokens];
|
| 57 |
|
| 58 |
// Used for special character decoding
|
| 59 |
-
int[]
|
| 60 |
|
| 61 |
TensorFloat encodedAudio;
|
| 62 |
|
| 63 |
bool transcribe = false;
|
| 64 |
string outputString = "";
|
| 65 |
|
|
|
|
|
|
|
|
|
|
| 66 |
void Start()
|
| 67 |
{
|
| 68 |
allocator = new TensorCachingAllocator();
|
| 69 |
ops = WorkerFactory.CreateOps(backend, allocator);
|
| 70 |
|
| 71 |
-
|
| 72 |
|
| 73 |
GetTokens();
|
| 74 |
|
|
@@ -117,9 +121,7 @@ public class RunWhisper : MonoBehaviour
|
|
| 117 |
|
| 118 |
void EncodeAudio()
|
| 119 |
{
|
| 120 |
-
var input = new TensorFloat(new TensorShape(1, numSamples), data);
|
| 121 |
-
|
| 122 |
-
int maxSamples = 30 * 16000;
|
| 123 |
if (numSamples > maxSamples)
|
| 124 |
{
|
| 125 |
Debug.Log("The AudioClip is too long.");
|
|
@@ -127,7 +129,7 @@ public class RunWhisper : MonoBehaviour
|
|
| 127 |
}
|
| 128 |
|
| 129 |
// Pad out to 30 seconds at 16kHz if necessary
|
| 130 |
-
var input30seconds = ops.Pad(input, new int[] { 0, 0, 0,
|
| 131 |
|
| 132 |
spectroEngine.Execute(input30seconds);
|
| 133 |
var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
|
|
@@ -142,7 +144,7 @@ public class RunWhisper : MonoBehaviour
|
|
| 142 |
{
|
| 143 |
if (transcribe && currentToken < outputTokens.Length - 1)
|
| 144 |
{
|
| 145 |
-
var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens);
|
| 146 |
|
| 147 |
var inputs = new Dictionary<string, Tensor>
|
| 148 |
{
|
|
@@ -153,7 +155,7 @@ public class RunWhisper : MonoBehaviour
|
|
| 153 |
decoderEngine.Execute(inputs);
|
| 154 |
var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
|
| 155 |
|
| 156 |
-
var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
|
| 157 |
tokensPredictions.MakeReadable();
|
| 158 |
|
| 159 |
int ID = tokensPredictions[currentToken];
|
|
@@ -165,7 +167,10 @@ public class RunWhisper : MonoBehaviour
|
|
| 165 |
{
|
| 166 |
transcribe = false;
|
| 167 |
}
|
| 168 |
-
else if (ID >= tokens.Length)
|
|
|
|
|
|
|
|
|
|
| 169 |
else outputString += GetUnicodeText(tokens[ID]);
|
| 170 |
|
| 171 |
Debug.Log(outputString);
|
|
@@ -185,16 +190,16 @@ public class RunWhisper : MonoBehaviour
|
|
| 185 |
foreach (char letter in text)
|
| 186 |
{
|
| 187 |
outText += ((int)letter <= 256) ? letter :
|
| 188 |
-
(char)
|
| 189 |
}
|
| 190 |
return outText;
|
| 191 |
}
|
| 192 |
|
| 193 |
-
void
|
| 194 |
{
|
| 195 |
for (int i = 0, n = 0; i < 256; i++)
|
| 196 |
{
|
| 197 |
-
if (IsWhiteSpace((char)i))
|
| 198 |
}
|
| 199 |
}
|
| 200 |
|
|
@@ -209,5 +214,6 @@ public class RunWhisper : MonoBehaviour
|
|
| 209 |
encoderEngine?.Dispose();
|
| 210 |
spectroEngine?.Dispose();
|
| 211 |
ops?.Dispose();
|
|
|
|
| 212 |
}
|
| 213 |
}
|
|
|
|
| 36 |
// Link your audioclip here. Format must be 16kHz mono non-compressed.
|
| 37 |
public AudioClip audioClip;
|
| 38 |
|
| 39 |
+
// This is how many tokens you want. It can be adjusted.
|
| 40 |
const int maxTokens = 100;
|
| 41 |
|
| 42 |
//Special tokens
|
|
|
|
| 57 |
int[] outputTokens = new int[maxTokens];
|
| 58 |
|
| 59 |
// Used for special character decoding
|
| 60 |
+
int[] whiteSpaceCharacters = new int[256];
|
| 61 |
|
| 62 |
TensorFloat encodedAudio;
|
| 63 |
|
| 64 |
bool transcribe = false;
|
| 65 |
string outputString = "";
|
| 66 |
|
| 67 |
+
// Maximum size of audioClip (30s at 16kHz)
|
| 68 |
+
const int maxSamples = 30 * 16000;
|
| 69 |
+
|
| 70 |
void Start()
|
| 71 |
{
|
| 72 |
allocator = new TensorCachingAllocator();
|
| 73 |
ops = WorkerFactory.CreateOps(backend, allocator);
|
| 74 |
|
| 75 |
+
SetupWhiteSpaceShifts();
|
| 76 |
|
| 77 |
GetTokens();
|
| 78 |
|
|
|
|
| 121 |
|
| 122 |
void EncodeAudio()
|
| 123 |
{
|
| 124 |
+
using var input = new TensorFloat(new TensorShape(1, numSamples), data);
|
|
|
|
|
|
|
| 125 |
if (numSamples > maxSamples)
|
| 126 |
{
|
| 127 |
Debug.Log("The AudioClip is too long.");
|
|
|
|
| 129 |
}
|
| 130 |
|
| 131 |
// Pad out to 30 seconds at 16kHz if necessary
|
| 132 |
+
using var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });
|
| 133 |
|
| 134 |
spectroEngine.Execute(input30seconds);
|
| 135 |
var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
|
|
|
|
| 144 |
{
|
| 145 |
if (transcribe && currentToken < outputTokens.Length - 1)
|
| 146 |
{
|
| 147 |
+
using var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens);
|
| 148 |
|
| 149 |
var inputs = new Dictionary<string, Tensor>
|
| 150 |
{
|
|
|
|
| 155 |
decoderEngine.Execute(inputs);
|
| 156 |
var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
|
| 157 |
|
| 158 |
+
using var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
|
| 159 |
tokensPredictions.MakeReadable();
|
| 160 |
|
| 161 |
int ID = tokensPredictions[currentToken];
|
|
|
|
| 167 |
{
|
| 168 |
transcribe = false;
|
| 169 |
}
|
| 170 |
+
else if (ID >= tokens.Length)
|
| 171 |
+
{
|
| 172 |
+
outputString += $"(time={(ID - START_TIME) * 0.02f})";
|
| 173 |
+
}
|
| 174 |
else outputString += GetUnicodeText(tokens[ID]);
|
| 175 |
|
| 176 |
Debug.Log(outputString);
|
|
|
|
| 190 |
foreach (char letter in text)
|
| 191 |
{
|
| 192 |
outText += ((int)letter <= 256) ? letter :
|
| 193 |
+
(char)whiteSpaceCharacters[(int)(letter - 256)];
|
| 194 |
}
|
| 195 |
return outText;
|
| 196 |
}
|
| 197 |
|
| 198 |
+
void SetupWhiteSpaceShifts()
|
| 199 |
{
|
| 200 |
for (int i = 0, n = 0; i < 256; i++)
|
| 201 |
{
|
| 202 |
+
if (IsWhiteSpace((char)i)) whiteSpaceCharacters[n++] = i;
|
| 203 |
}
|
| 204 |
}
|
| 205 |
|
|
|
|
| 214 |
encoderEngine?.Dispose();
|
| 215 |
spectroEngine?.Dispose();
|
| 216 |
ops?.Dispose();
|
| 217 |
+
allocator?.Dispose();
|
| 218 |
}
|
| 219 |
}
|