luckyt committed on
Commit
d7b639a
·
verified ·
1 Parent(s): a3ef0e8

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model-q4k.gguf filter=lfs diff=lfs merge=lfs -text
37
+ model-q80.gguf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,14 @@
1
- ---
2
- license: cc-by-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - en
5
+ - fr
6
+ library_name: moshi
7
+ tags:
8
+ - audio
9
+ - automatic-speech-recognition
10
+ ---
11
+
12
+ # Moshi Streaming Speech-to-Text (Quantized)
13
+
14
+ This is a quantized version of Kyutai’s [stt-1b-en_fr](https://huggingface.co/kyutai/stt-1b-en_fr) model. The original model is a 1B parameter streaming speech-to-text model for English and French. This fork contains the same model, quantized to Q8_0 and Q4_K GGUF formats for reduced memory usage and faster inference.
config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "card": 2048,
3
+ "n_q": 32,
4
+ "dep_q": 0,
5
+ "delays": [
6
+ 0,
7
+ 0,
8
+ 0,
9
+ 0,
10
+ 0,
11
+ 0,
12
+ 0,
13
+ 0,
14
+ 0,
15
+ 0,
16
+ 0,
17
+ 0,
18
+ 0,
19
+ 0,
20
+ 0,
21
+ 0,
22
+ 0,
23
+ 0,
24
+ 0,
25
+ 0,
26
+ 0,
27
+ 0,
28
+ 0,
29
+ 0,
30
+ 0,
31
+ 0,
32
+ 0,
33
+ 0,
34
+ 0,
35
+ 0,
36
+ 0,
37
+ 0,
38
+ 0
39
+ ],
40
+ "dim": 2048,
41
+ "text_card": 8000,
42
+ "existing_text_padding_id": 3,
43
+ "num_heads": 16,
44
+ "num_layers": 16,
45
+ "hidden_scale": 4.125,
46
+ "causal": true,
47
+ "layer_scale": null,
48
+ "context": 750,
49
+ "max_period": 100000.0,
50
+ "gating": "silu",
51
+ "norm": "rms_norm_f32",
52
+ "positional_embedding": "rope",
53
+ "depformer_dim": 1024,
54
+ "depformer_num_heads": 16,
55
+ "depformer_num_layers": 6,
56
+ "depformer_dim_feedforward": null,
57
+ "depformer_multi_linear": true,
58
+ "depformer_pos_emb": "none",
59
+ "depformer_weights_per_step": true,
60
+ "conditioners": {},
61
+ "cross_attention": false,
62
+ "model_id": {
63
+ "sig": "70f8f0ea",
64
+ "epoch": 500
65
+ },
66
+ "lm_gen_config": {
67
+ "temp": 0.0,
68
+ "temp_text": 0.0,
69
+ "top_k": 250,
70
+ "top_k_text": 50
71
+ },
72
+ "stt_config": {
73
+ "audio_delay_seconds": 0.5,
74
+ "audio_silence_prefix_seconds": 0.0
75
+ },
76
+ "model_type": "stt",
77
+ "mimi_name": "mimi-pytorch-e351c8d8@125.safetensors",
78
+ "tokenizer_name": "tokenizer_en_fr_audio_8000.model"
79
+ }
mimi-pytorch-e351c8d8@125.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09b782f0629851a271227fb9d36db65c041790365f11bbe5d3d59369cf863f50
3
+ size 384644900
model-q4k.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d061f95eec3031e2e06a8590330d371bd04725ddecba497be83644aae15b49b
3
+ size 556669792
model-q80.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc1f1827defedf061fd08796867bb70cf02230ca920370c7e962621e5746cd6c
3
+ size 1051238240
tokenizer_en_fr_audio_8000.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd87dd5d17169151782ac700280ec057e5d658a9afbe238a048ea5ff318cce69
3
+ size 120378