AnthonyPa57 committed
Commit 306078e · verified · 1 Parent(s): 34ca956

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +1 -0
  2. tokenizer.json +129 -74
  3. tokenizer_config.json +10 -1
special_tokens_map.json CHANGED
@@ -1,4 +1,5 @@
  {
+ "bos_token": "<sos>",
  "eos_token": "<eos>",
  "mask_token": "<mask>",
  "pad_token": "<pad>",
tokenizer.json CHANGED
@@ -74,6 +74,15 @@
  "rstrip": false,
  "normalized": false,
  "special": true
+ },
+ {
+ "id": 8,
+ "content": "<cls>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
  }
  ],
  "normalizer": {
@@ -110,61 +119,71 @@
  "<|system|>": 5,
  "<|assistant|>": 6,
  "<|user|>": 7,
- "\n": 8,
- "?": 9,
- "F": 10,
- "W": 11,
- "a": 12,
- "c": 13,
- "e": 14,
- "f": 15,
- "h": 16,
- "i": 17,
- "l": 18,
- "m": 19,
- "n": 20,
- "o": 21,
- "p": 22,
- "r": 23,
- "s": 24,
- "t": 25,
- "x": 26,
- "▁": 27,
- "he": 28,
- "the": 29,
- "▁o": 30,
- "an": 31,
- "ex": 32,
- "me": 33,
+ "<cls>": 8,
+ "\n": 9,
+ "?": 10,
+ "F": 11,
+ "W": 12,
+ "a": 13,
+ "c": 14,
+ "e": 15,
+ "f": 16,
+ "h": 17,
+ "i": 18,
+ "l": 19,
+ "m": 20,
+ "n": 21,
+ "o": 22,
+ "p": 23,
+ "r": 24,
+ "s": 25,
+ "t": 26,
+ "x": 27,
+ "▁": 28,
+ "he": 29,
+ "the": 30,
+ "ex": 31,
+ "me": 32,
+ "othe": 33,
  "ome": 34,
  "some": 35,
  "tex": 36,
  "▁tex": 37,
- "ther": 38,
+ "other": 38,
  "▁text": 39,
- "Fr": 40,
- "Wh": 41,
- "al": 42,
- "ap": 43,
- "at": 44,
- "ce": 45,
- "cap": 46,
- "is": 47,
- "it": 48,
- "▁the": 49,
- "▁some": 50,
- "▁Fr": 51,
- "▁Wh": 52,
- "▁cap": 53,
- "▁is": 54,
- "▁of": 55,
- "▁other": 56,
- "ance": 57,
- "ital": 58,
- "▁France": 59,
- "▁What": 60,
- "▁capital": 61,
- "▁France?": 62
+ "an": 40,
+ "\nsome": 41,
+ "▁text\n": 42,
+ "▁other": 43,
+ "Fr": 44,
+ "Wh": 45,
+ "al": 46,
+ "ap": 47,
+ "at": 48,
+ "ce": 49,
+ "cap": 50,
+ "is": 51,
+ "it": 52,
+ "of": 53,
+ "▁the": 54,
+ "▁an": 55,
+ "▁Fr": 56,
+ "▁cap": 57,
+ "▁is": 58,
+ "▁of": 59,
+ "ance": 60,
+ "What": 61,
+ "ital": 62,
+ "▁another": 63,
+ "▁France": 64,
+ "▁capital": 65,
+ "▁France?": 66,
+ "▁some": 67,
+ "▁France?\nsome": 68,
+ "▁What": 69,
+ "▁text\n\nsome": 70,
+ "▁text\n\n": 71,
+ "▁text\nWhat": 72
  },
  "merges": [
  [
@@ -175,14 +194,6 @@
  "t",
  "he"
  ],
- [
- "▁",
- "o"
- ],
- [
- "a",
- "n"
- ],
  [
  "e",
  "x"
@@ -191,6 +202,10 @@
  "m",
  "e"
  ],
+ [
+ "o",
+ "the"
+ ],
  [
  "o",
  "me"
@@ -208,13 +223,29 @@
  "tex"
  ],
  [
- "the",
+ "othe",
  "r"
  ],
  [
  "▁tex",
  "t"
  ],
+ [
+ "a",
+ "n"
+ ],
+ [
+ "\n",
+ "some"
+ ],
+ [
+ "▁text",
+ "\n"
+ ],
+ [
+ "▁",
+ "other"
+ ],
  [
  "F",
  "r"
@@ -252,20 +283,20 @@
  "t"
  ],
  [
- "▁",
- "the"
+ "o",
+ "f"
  ],
  [
  "▁",
- "some"
+ "the"
  ],
  [
  "▁",
- "Fr"
+ "an"
  ],
  [
  "▁",
- "Wh"
+ "Fr"
  ],
  [
  "▁",
@@ -276,28 +307,28 @@
  "is"
  ],
  [
- "▁o",
- "f"
- ],
- [
- "▁o",
- "ther"
+ "▁",
+ "of"
  ],
  [
  "an",
  "ce"
  ],
+ [
+ "Wh",
+ "at"
+ ],
  [
  "it",
  "al"
  ],
  [
- "▁Fr",
- "ance"
+ "▁an",
+ "other"
  ],
  [
- "▁Wh",
- "at"
+ "▁Fr",
+ "ance"
  ],
  [
  "▁cap",
@@ -306,6 +337,30 @@
  [
  "▁France",
  "?"
+ ],
+ [
+ "▁",
+ "some"
+ ],
+ [
+ "▁France?",
+ "\nsome"
+ ],
+ [
+ "▁",
+ "What"
+ ],
+ [
+ "▁text\n",
+ "\nsome"
+ ],
+ [
+ "▁text\n",
+ "\n"
+ ],
+ [
+ "▁text\n",
+ "What"
  ]
  ]
  }
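A minimal sketch (not part of the commit) that inspects the regenerated tokenizer.json above with the tokenizers library; the file path assumes a local copy of the file from this commit:

# Assumes tokenizer.json from this commit is in the working directory.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

print(tok.token_to_id("<cls>"))     # 8, the newly registered special token
print(tok.token_to_id("▁another"))  # 63, one of the new vocab entries

enc = tok.encode("What is the capital of France?")
print(enc.tokens)  # segmentation under the updated merges
print(enc.ids)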
tokenizer_config.json CHANGED
@@ -63,16 +63,25 @@
  "rstrip": false,
  "single_word": false,
  "special": true
+ },
+ "8": {
+ "content": "<cls>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
  }
  },
  "assistant_token": "<|assistant|>",
+ "bos_token": "<sos>",
+ "class_token": "<cls>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<eos>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
- "sos_token": "<sos>",
  "system_token": "<|system|>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>",