lmmy committed on
Commit 6f3b1ec · verified
1 parent: 58912ba

Add files using upload-large-folder tool

README.md ADDED
@@ -0,0 +1,31 @@
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ pipeline_tag: text-generation
+ tags:
+ - ERNIE4.5
+ - mlx
+ library_name: transformers
+ base_model: baidu/ERNIE-4.5-0.3B-PT
+ ---
+ ## 💫 Community Model> ERNIE-4.5-0.3B-PT by baidu
+
+ *👾 [LM Studio](https://lmstudio.ai) Community models highlights program: highlighting new & noteworthy models from the community. Join the conversation on [Discord](https://discord.gg/aPQfnNkxGC).*
+
+ **Model creator:** [baidu](https://huggingface.co/baidu)<br>
+ **Original model:** [ERNIE-4.5-0.3B-PT](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT)<br>
+ **MLX quantization:** provided by the [LM Studio team](https://x.com/lmstudio) using [mlx_lm](https://github.com/ml-explore/mlx-lm)<br>
+
+ ## Technical Details
+
+ 8-bit quantized version of ERNIE-4.5-0.3B-PT, converted with MLX (group size 64) and optimized for Apple Silicon.
+
+ ## Special thanks
+
+ 🙏 Special thanks to the [Apple Machine Learning Research](https://github.com/ml-explore) team for creating [MLX](https://github.com/ml-explore/mlx).
+
+ ## Disclaimers
+
+ LM Studio is not the creator, originator, or owner of any Model featured in the Community Model Program. Each Community Model is created and provided by third parties. LM Studio does not endorse, support, represent or guarantee the completeness, truthfulness, accuracy, or reliability of any Community Model. You understand that Community Models can produce content that might be offensive, harmful, inaccurate, deceptive, or otherwise inappropriate. Each Community Model is the sole responsibility of the person or entity who originated it. LM Studio may not monitor or control the Community Models and cannot, and does not, take responsibility for any such Model. LM Studio disclaims all warranties or guarantees about the accuracy, reliability or benefits of the Community Models. LM Studio further disclaims any warranty that a Community Model will meet your requirements, be secure, uninterrupted or available at any time or location, or be error-free or virus-free, or that any errors will be corrected. You will be solely responsible for any damage resulting from your use of or access to the Community Models, your downloading of any Community Model, or your use of any other Community Model provided by or through LM Studio.
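A minimal usage sketch for the MLX 8-bit quant described in the README above, using the `mlx_lm` Python API on Apple Silicon. The path passed to `load()` is a placeholder for this repository's id or a local download; adjust it before running.

```python
# Sketch: load the MLX 8-bit quant and generate a reply through the bundled chat template.
# "path/to/ERNIE-4.5-0.3B-PT-mlx-8bit" is a placeholder; substitute the actual repo id or folder.
from mlx_lm import load, generate

model, tokenizer = load("path/to/ERNIE-4.5-0.3B-PT-mlx-8bit")

# Build the prompt via the chat template shipped with the repo (see chat_template.jinja below).
messages = [{"role": "user", "content": "Summarize ERNIE 4.5 in one sentence."}]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

print(generate(model, tokenizer, prompt=prompt, max_tokens=128))
```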
added_tokens.json ADDED
@@ -0,0 +1,1011 @@
1
+ {
2
+ "<|AUDIO_PLACEHOLDER|>": 100296,
3
+ "<|CROP_COL_SEP|>": 101301,
4
+ "<|CROP_ROW_SEP|>": 101302,
5
+ "<|IMAGE_PLACEHOLDER|>": 100295,
6
+ "<|IMAGE_SEP|>": 101303,
7
+ "<|LOC_0|>": 100297,
8
+ "<|LOC_1000|>": 101297,
9
+ "<|LOC_100|>": 100397,
10
+ "<|LOC_101|>": 100398,
11
+ "<|LOC_102|>": 100399,
12
+ "<|LOC_103|>": 100400,
13
+ "<|LOC_104|>": 100401,
14
+ "<|LOC_105|>": 100402,
15
+ "<|LOC_106|>": 100403,
16
+ "<|LOC_107|>": 100404,
17
+ "<|LOC_108|>": 100405,
18
+ "<|LOC_109|>": 100406,
19
+ "<|LOC_10|>": 100307,
20
+ "<|LOC_110|>": 100407,
21
+ "<|LOC_111|>": 100408,
22
+ "<|LOC_112|>": 100409,
23
+ "<|LOC_113|>": 100410,
24
+ "<|LOC_114|>": 100411,
25
+ "<|LOC_115|>": 100412,
26
+ "<|LOC_116|>": 100413,
27
+ "<|LOC_117|>": 100414,
28
+ "<|LOC_118|>": 100415,
29
+ "<|LOC_119|>": 100416,
30
+ "<|LOC_11|>": 100308,
31
+ "<|LOC_120|>": 100417,
32
+ "<|LOC_121|>": 100418,
33
+ "<|LOC_122|>": 100419,
34
+ "<|LOC_123|>": 100420,
35
+ "<|LOC_124|>": 100421,
36
+ "<|LOC_125|>": 100422,
37
+ "<|LOC_126|>": 100423,
38
+ "<|LOC_127|>": 100424,
39
+ "<|LOC_128|>": 100425,
40
+ "<|LOC_129|>": 100426,
41
+ "<|LOC_12|>": 100309,
42
+ "<|LOC_130|>": 100427,
43
+ "<|LOC_131|>": 100428,
44
+ "<|LOC_132|>": 100429,
45
+ "<|LOC_133|>": 100430,
46
+ "<|LOC_134|>": 100431,
47
+ "<|LOC_135|>": 100432,
48
+ "<|LOC_136|>": 100433,
49
+ "<|LOC_137|>": 100434,
50
+ "<|LOC_138|>": 100435,
51
+ "<|LOC_139|>": 100436,
52
+ "<|LOC_13|>": 100310,
53
+ "<|LOC_140|>": 100437,
54
+ "<|LOC_141|>": 100438,
55
+ "<|LOC_142|>": 100439,
56
+ "<|LOC_143|>": 100440,
57
+ "<|LOC_144|>": 100441,
58
+ "<|LOC_145|>": 100442,
59
+ "<|LOC_146|>": 100443,
60
+ "<|LOC_147|>": 100444,
61
+ "<|LOC_148|>": 100445,
62
+ "<|LOC_149|>": 100446,
63
+ "<|LOC_14|>": 100311,
64
+ "<|LOC_150|>": 100447,
65
+ "<|LOC_151|>": 100448,
66
+ "<|LOC_152|>": 100449,
67
+ "<|LOC_153|>": 100450,
68
+ "<|LOC_154|>": 100451,
69
+ "<|LOC_155|>": 100452,
70
+ "<|LOC_156|>": 100453,
71
+ "<|LOC_157|>": 100454,
72
+ "<|LOC_158|>": 100455,
73
+ "<|LOC_159|>": 100456,
74
+ "<|LOC_15|>": 100312,
75
+ "<|LOC_160|>": 100457,
76
+ "<|LOC_161|>": 100458,
77
+ "<|LOC_162|>": 100459,
78
+ "<|LOC_163|>": 100460,
79
+ "<|LOC_164|>": 100461,
80
+ "<|LOC_165|>": 100462,
81
+ "<|LOC_166|>": 100463,
82
+ "<|LOC_167|>": 100464,
83
+ "<|LOC_168|>": 100465,
84
+ "<|LOC_169|>": 100466,
85
+ "<|LOC_16|>": 100313,
86
+ "<|LOC_170|>": 100467,
87
+ "<|LOC_171|>": 100468,
88
+ "<|LOC_172|>": 100469,
89
+ "<|LOC_173|>": 100470,
90
+ "<|LOC_174|>": 100471,
91
+ "<|LOC_175|>": 100472,
92
+ "<|LOC_176|>": 100473,
93
+ "<|LOC_177|>": 100474,
94
+ "<|LOC_178|>": 100475,
95
+ "<|LOC_179|>": 100476,
96
+ "<|LOC_17|>": 100314,
97
+ "<|LOC_180|>": 100477,
98
+ "<|LOC_181|>": 100478,
99
+ "<|LOC_182|>": 100479,
100
+ "<|LOC_183|>": 100480,
101
+ "<|LOC_184|>": 100481,
102
+ "<|LOC_185|>": 100482,
103
+ "<|LOC_186|>": 100483,
104
+ "<|LOC_187|>": 100484,
105
+ "<|LOC_188|>": 100485,
106
+ "<|LOC_189|>": 100486,
107
+ "<|LOC_18|>": 100315,
108
+ "<|LOC_190|>": 100487,
109
+ "<|LOC_191|>": 100488,
110
+ "<|LOC_192|>": 100489,
111
+ "<|LOC_193|>": 100490,
112
+ "<|LOC_194|>": 100491,
113
+ "<|LOC_195|>": 100492,
114
+ "<|LOC_196|>": 100493,
115
+ "<|LOC_197|>": 100494,
116
+ "<|LOC_198|>": 100495,
117
+ "<|LOC_199|>": 100496,
118
+ "<|LOC_19|>": 100316,
119
+ "<|LOC_1|>": 100298,
120
+ "<|LOC_200|>": 100497,
121
+ "<|LOC_201|>": 100498,
122
+ "<|LOC_202|>": 100499,
123
+ "<|LOC_203|>": 100500,
124
+ "<|LOC_204|>": 100501,
125
+ "<|LOC_205|>": 100502,
126
+ "<|LOC_206|>": 100503,
127
+ "<|LOC_207|>": 100504,
128
+ "<|LOC_208|>": 100505,
129
+ "<|LOC_209|>": 100506,
130
+ "<|LOC_20|>": 100317,
131
+ "<|LOC_210|>": 100507,
132
+ "<|LOC_211|>": 100508,
133
+ "<|LOC_212|>": 100509,
134
+ "<|LOC_213|>": 100510,
135
+ "<|LOC_214|>": 100511,
136
+ "<|LOC_215|>": 100512,
137
+ "<|LOC_216|>": 100513,
138
+ "<|LOC_217|>": 100514,
139
+ "<|LOC_218|>": 100515,
140
+ "<|LOC_219|>": 100516,
141
+ "<|LOC_21|>": 100318,
142
+ "<|LOC_220|>": 100517,
143
+ "<|LOC_221|>": 100518,
144
+ "<|LOC_222|>": 100519,
145
+ "<|LOC_223|>": 100520,
146
+ "<|LOC_224|>": 100521,
147
+ "<|LOC_225|>": 100522,
148
+ "<|LOC_226|>": 100523,
149
+ "<|LOC_227|>": 100524,
150
+ "<|LOC_228|>": 100525,
151
+ "<|LOC_229|>": 100526,
152
+ "<|LOC_22|>": 100319,
153
+ "<|LOC_230|>": 100527,
154
+ "<|LOC_231|>": 100528,
155
+ "<|LOC_232|>": 100529,
156
+ "<|LOC_233|>": 100530,
157
+ "<|LOC_234|>": 100531,
158
+ "<|LOC_235|>": 100532,
159
+ "<|LOC_236|>": 100533,
160
+ "<|LOC_237|>": 100534,
161
+ "<|LOC_238|>": 100535,
162
+ "<|LOC_239|>": 100536,
163
+ "<|LOC_23|>": 100320,
164
+ "<|LOC_240|>": 100537,
165
+ "<|LOC_241|>": 100538,
166
+ "<|LOC_242|>": 100539,
167
+ "<|LOC_243|>": 100540,
168
+ "<|LOC_244|>": 100541,
169
+ "<|LOC_245|>": 100542,
170
+ "<|LOC_246|>": 100543,
171
+ "<|LOC_247|>": 100544,
172
+ "<|LOC_248|>": 100545,
173
+ "<|LOC_249|>": 100546,
174
+ "<|LOC_24|>": 100321,
175
+ "<|LOC_250|>": 100547,
176
+ "<|LOC_251|>": 100548,
177
+ "<|LOC_252|>": 100549,
178
+ "<|LOC_253|>": 100550,
179
+ "<|LOC_254|>": 100551,
180
+ "<|LOC_255|>": 100552,
181
+ "<|LOC_256|>": 100553,
182
+ "<|LOC_257|>": 100554,
183
+ "<|LOC_258|>": 100555,
184
+ "<|LOC_259|>": 100556,
185
+ "<|LOC_25|>": 100322,
186
+ "<|LOC_260|>": 100557,
187
+ "<|LOC_261|>": 100558,
188
+ "<|LOC_262|>": 100559,
189
+ "<|LOC_263|>": 100560,
190
+ "<|LOC_264|>": 100561,
191
+ "<|LOC_265|>": 100562,
192
+ "<|LOC_266|>": 100563,
193
+ "<|LOC_267|>": 100564,
194
+ "<|LOC_268|>": 100565,
195
+ "<|LOC_269|>": 100566,
196
+ "<|LOC_26|>": 100323,
197
+ "<|LOC_270|>": 100567,
198
+ "<|LOC_271|>": 100568,
199
+ "<|LOC_272|>": 100569,
200
+ "<|LOC_273|>": 100570,
201
+ "<|LOC_274|>": 100571,
202
+ "<|LOC_275|>": 100572,
203
+ "<|LOC_276|>": 100573,
204
+ "<|LOC_277|>": 100574,
205
+ "<|LOC_278|>": 100575,
206
+ "<|LOC_279|>": 100576,
207
+ "<|LOC_27|>": 100324,
208
+ "<|LOC_280|>": 100577,
209
+ "<|LOC_281|>": 100578,
210
+ "<|LOC_282|>": 100579,
211
+ "<|LOC_283|>": 100580,
212
+ "<|LOC_284|>": 100581,
213
+ "<|LOC_285|>": 100582,
214
+ "<|LOC_286|>": 100583,
215
+ "<|LOC_287|>": 100584,
216
+ "<|LOC_288|>": 100585,
217
+ "<|LOC_289|>": 100586,
218
+ "<|LOC_28|>": 100325,
219
+ "<|LOC_290|>": 100587,
220
+ "<|LOC_291|>": 100588,
221
+ "<|LOC_292|>": 100589,
222
+ "<|LOC_293|>": 100590,
223
+ "<|LOC_294|>": 100591,
224
+ "<|LOC_295|>": 100592,
225
+ "<|LOC_296|>": 100593,
226
+ "<|LOC_297|>": 100594,
227
+ "<|LOC_298|>": 100595,
228
+ "<|LOC_299|>": 100596,
229
+ "<|LOC_29|>": 100326,
230
+ "<|LOC_2|>": 100299,
231
+ "<|LOC_300|>": 100597,
232
+ "<|LOC_301|>": 100598,
233
+ "<|LOC_302|>": 100599,
234
+ "<|LOC_303|>": 100600,
235
+ "<|LOC_304|>": 100601,
236
+ "<|LOC_305|>": 100602,
237
+ "<|LOC_306|>": 100603,
238
+ "<|LOC_307|>": 100604,
239
+ "<|LOC_308|>": 100605,
240
+ "<|LOC_309|>": 100606,
241
+ "<|LOC_30|>": 100327,
242
+ "<|LOC_310|>": 100607,
243
+ "<|LOC_311|>": 100608,
244
+ "<|LOC_312|>": 100609,
245
+ "<|LOC_313|>": 100610,
246
+ "<|LOC_314|>": 100611,
247
+ "<|LOC_315|>": 100612,
248
+ "<|LOC_316|>": 100613,
249
+ "<|LOC_317|>": 100614,
250
+ "<|LOC_318|>": 100615,
251
+ "<|LOC_319|>": 100616,
252
+ "<|LOC_31|>": 100328,
253
+ "<|LOC_320|>": 100617,
254
+ "<|LOC_321|>": 100618,
255
+ "<|LOC_322|>": 100619,
256
+ "<|LOC_323|>": 100620,
257
+ "<|LOC_324|>": 100621,
258
+ "<|LOC_325|>": 100622,
259
+ "<|LOC_326|>": 100623,
260
+ "<|LOC_327|>": 100624,
261
+ "<|LOC_328|>": 100625,
262
+ "<|LOC_329|>": 100626,
263
+ "<|LOC_32|>": 100329,
264
+ "<|LOC_330|>": 100627,
265
+ "<|LOC_331|>": 100628,
266
+ "<|LOC_332|>": 100629,
267
+ "<|LOC_333|>": 100630,
268
+ "<|LOC_334|>": 100631,
269
+ "<|LOC_335|>": 100632,
270
+ "<|LOC_336|>": 100633,
271
+ "<|LOC_337|>": 100634,
272
+ "<|LOC_338|>": 100635,
273
+ "<|LOC_339|>": 100636,
274
+ "<|LOC_33|>": 100330,
275
+ "<|LOC_340|>": 100637,
276
+ "<|LOC_341|>": 100638,
277
+ "<|LOC_342|>": 100639,
278
+ "<|LOC_343|>": 100640,
279
+ "<|LOC_344|>": 100641,
280
+ "<|LOC_345|>": 100642,
281
+ "<|LOC_346|>": 100643,
282
+ "<|LOC_347|>": 100644,
283
+ "<|LOC_348|>": 100645,
284
+ "<|LOC_349|>": 100646,
285
+ "<|LOC_34|>": 100331,
286
+ "<|LOC_350|>": 100647,
287
+ "<|LOC_351|>": 100648,
288
+ "<|LOC_352|>": 100649,
289
+ "<|LOC_353|>": 100650,
290
+ "<|LOC_354|>": 100651,
291
+ "<|LOC_355|>": 100652,
292
+ "<|LOC_356|>": 100653,
293
+ "<|LOC_357|>": 100654,
294
+ "<|LOC_358|>": 100655,
295
+ "<|LOC_359|>": 100656,
296
+ "<|LOC_35|>": 100332,
297
+ "<|LOC_360|>": 100657,
298
+ "<|LOC_361|>": 100658,
299
+ "<|LOC_362|>": 100659,
300
+ "<|LOC_363|>": 100660,
301
+ "<|LOC_364|>": 100661,
302
+ "<|LOC_365|>": 100662,
303
+ "<|LOC_366|>": 100663,
304
+ "<|LOC_367|>": 100664,
305
+ "<|LOC_368|>": 100665,
306
+ "<|LOC_369|>": 100666,
307
+ "<|LOC_36|>": 100333,
308
+ "<|LOC_370|>": 100667,
309
+ "<|LOC_371|>": 100668,
310
+ "<|LOC_372|>": 100669,
311
+ "<|LOC_373|>": 100670,
312
+ "<|LOC_374|>": 100671,
313
+ "<|LOC_375|>": 100672,
314
+ "<|LOC_376|>": 100673,
315
+ "<|LOC_377|>": 100674,
316
+ "<|LOC_378|>": 100675,
317
+ "<|LOC_379|>": 100676,
318
+ "<|LOC_37|>": 100334,
319
+ "<|LOC_380|>": 100677,
320
+ "<|LOC_381|>": 100678,
321
+ "<|LOC_382|>": 100679,
322
+ "<|LOC_383|>": 100680,
323
+ "<|LOC_384|>": 100681,
324
+ "<|LOC_385|>": 100682,
325
+ "<|LOC_386|>": 100683,
326
+ "<|LOC_387|>": 100684,
327
+ "<|LOC_388|>": 100685,
328
+ "<|LOC_389|>": 100686,
329
+ "<|LOC_38|>": 100335,
330
+ "<|LOC_390|>": 100687,
331
+ "<|LOC_391|>": 100688,
332
+ "<|LOC_392|>": 100689,
333
+ "<|LOC_393|>": 100690,
334
+ "<|LOC_394|>": 100691,
335
+ "<|LOC_395|>": 100692,
336
+ "<|LOC_396|>": 100693,
337
+ "<|LOC_397|>": 100694,
338
+ "<|LOC_398|>": 100695,
339
+ "<|LOC_399|>": 100696,
340
+ "<|LOC_39|>": 100336,
341
+ "<|LOC_3|>": 100300,
342
+ "<|LOC_400|>": 100697,
343
+ "<|LOC_401|>": 100698,
344
+ "<|LOC_402|>": 100699,
345
+ "<|LOC_403|>": 100700,
346
+ "<|LOC_404|>": 100701,
347
+ "<|LOC_405|>": 100702,
348
+ "<|LOC_406|>": 100703,
349
+ "<|LOC_407|>": 100704,
350
+ "<|LOC_408|>": 100705,
351
+ "<|LOC_409|>": 100706,
352
+ "<|LOC_40|>": 100337,
353
+ "<|LOC_410|>": 100707,
354
+ "<|LOC_411|>": 100708,
355
+ "<|LOC_412|>": 100709,
356
+ "<|LOC_413|>": 100710,
357
+ "<|LOC_414|>": 100711,
358
+ "<|LOC_415|>": 100712,
359
+ "<|LOC_416|>": 100713,
360
+ "<|LOC_417|>": 100714,
361
+ "<|LOC_418|>": 100715,
362
+ "<|LOC_419|>": 100716,
363
+ "<|LOC_41|>": 100338,
364
+ "<|LOC_420|>": 100717,
365
+ "<|LOC_421|>": 100718,
366
+ "<|LOC_422|>": 100719,
367
+ "<|LOC_423|>": 100720,
368
+ "<|LOC_424|>": 100721,
369
+ "<|LOC_425|>": 100722,
370
+ "<|LOC_426|>": 100723,
371
+ "<|LOC_427|>": 100724,
372
+ "<|LOC_428|>": 100725,
373
+ "<|LOC_429|>": 100726,
374
+ "<|LOC_42|>": 100339,
375
+ "<|LOC_430|>": 100727,
376
+ "<|LOC_431|>": 100728,
377
+ "<|LOC_432|>": 100729,
378
+ "<|LOC_433|>": 100730,
379
+ "<|LOC_434|>": 100731,
380
+ "<|LOC_435|>": 100732,
381
+ "<|LOC_436|>": 100733,
382
+ "<|LOC_437|>": 100734,
383
+ "<|LOC_438|>": 100735,
384
+ "<|LOC_439|>": 100736,
385
+ "<|LOC_43|>": 100340,
386
+ "<|LOC_440|>": 100737,
387
+ "<|LOC_441|>": 100738,
388
+ "<|LOC_442|>": 100739,
389
+ "<|LOC_443|>": 100740,
390
+ "<|LOC_444|>": 100741,
391
+ "<|LOC_445|>": 100742,
392
+ "<|LOC_446|>": 100743,
393
+ "<|LOC_447|>": 100744,
394
+ "<|LOC_448|>": 100745,
395
+ "<|LOC_449|>": 100746,
396
+ "<|LOC_44|>": 100341,
397
+ "<|LOC_450|>": 100747,
398
+ "<|LOC_451|>": 100748,
399
+ "<|LOC_452|>": 100749,
400
+ "<|LOC_453|>": 100750,
401
+ "<|LOC_454|>": 100751,
402
+ "<|LOC_455|>": 100752,
403
+ "<|LOC_456|>": 100753,
404
+ "<|LOC_457|>": 100754,
405
+ "<|LOC_458|>": 100755,
406
+ "<|LOC_459|>": 100756,
407
+ "<|LOC_45|>": 100342,
408
+ "<|LOC_460|>": 100757,
409
+ "<|LOC_461|>": 100758,
410
+ "<|LOC_462|>": 100759,
411
+ "<|LOC_463|>": 100760,
412
+ "<|LOC_464|>": 100761,
413
+ "<|LOC_465|>": 100762,
414
+ "<|LOC_466|>": 100763,
415
+ "<|LOC_467|>": 100764,
416
+ "<|LOC_468|>": 100765,
417
+ "<|LOC_469|>": 100766,
418
+ "<|LOC_46|>": 100343,
419
+ "<|LOC_470|>": 100767,
420
+ "<|LOC_471|>": 100768,
421
+ "<|LOC_472|>": 100769,
422
+ "<|LOC_473|>": 100770,
423
+ "<|LOC_474|>": 100771,
424
+ "<|LOC_475|>": 100772,
425
+ "<|LOC_476|>": 100773,
426
+ "<|LOC_477|>": 100774,
427
+ "<|LOC_478|>": 100775,
428
+ "<|LOC_479|>": 100776,
429
+ "<|LOC_47|>": 100344,
430
+ "<|LOC_480|>": 100777,
431
+ "<|LOC_481|>": 100778,
432
+ "<|LOC_482|>": 100779,
433
+ "<|LOC_483|>": 100780,
434
+ "<|LOC_484|>": 100781,
435
+ "<|LOC_485|>": 100782,
436
+ "<|LOC_486|>": 100783,
437
+ "<|LOC_487|>": 100784,
438
+ "<|LOC_488|>": 100785,
439
+ "<|LOC_489|>": 100786,
440
+ "<|LOC_48|>": 100345,
441
+ "<|LOC_490|>": 100787,
442
+ "<|LOC_491|>": 100788,
443
+ "<|LOC_492|>": 100789,
444
+ "<|LOC_493|>": 100790,
445
+ "<|LOC_494|>": 100791,
446
+ "<|LOC_495|>": 100792,
447
+ "<|LOC_496|>": 100793,
448
+ "<|LOC_497|>": 100794,
449
+ "<|LOC_498|>": 100795,
450
+ "<|LOC_499|>": 100796,
451
+ "<|LOC_49|>": 100346,
452
+ "<|LOC_4|>": 100301,
453
+ "<|LOC_500|>": 100797,
454
+ "<|LOC_501|>": 100798,
455
+ "<|LOC_502|>": 100799,
456
+ "<|LOC_503|>": 100800,
457
+ "<|LOC_504|>": 100801,
458
+ "<|LOC_505|>": 100802,
459
+ "<|LOC_506|>": 100803,
460
+ "<|LOC_507|>": 100804,
461
+ "<|LOC_508|>": 100805,
462
+ "<|LOC_509|>": 100806,
463
+ "<|LOC_50|>": 100347,
464
+ "<|LOC_510|>": 100807,
465
+ "<|LOC_511|>": 100808,
466
+ "<|LOC_512|>": 100809,
467
+ "<|LOC_513|>": 100810,
468
+ "<|LOC_514|>": 100811,
469
+ "<|LOC_515|>": 100812,
470
+ "<|LOC_516|>": 100813,
471
+ "<|LOC_517|>": 100814,
472
+ "<|LOC_518|>": 100815,
473
+ "<|LOC_519|>": 100816,
474
+ "<|LOC_51|>": 100348,
475
+ "<|LOC_520|>": 100817,
476
+ "<|LOC_521|>": 100818,
477
+ "<|LOC_522|>": 100819,
478
+ "<|LOC_523|>": 100820,
479
+ "<|LOC_524|>": 100821,
480
+ "<|LOC_525|>": 100822,
481
+ "<|LOC_526|>": 100823,
482
+ "<|LOC_527|>": 100824,
483
+ "<|LOC_528|>": 100825,
484
+ "<|LOC_529|>": 100826,
485
+ "<|LOC_52|>": 100349,
486
+ "<|LOC_530|>": 100827,
487
+ "<|LOC_531|>": 100828,
488
+ "<|LOC_532|>": 100829,
489
+ "<|LOC_533|>": 100830,
490
+ "<|LOC_534|>": 100831,
491
+ "<|LOC_535|>": 100832,
492
+ "<|LOC_536|>": 100833,
493
+ "<|LOC_537|>": 100834,
494
+ "<|LOC_538|>": 100835,
495
+ "<|LOC_539|>": 100836,
496
+ "<|LOC_53|>": 100350,
497
+ "<|LOC_540|>": 100837,
498
+ "<|LOC_541|>": 100838,
499
+ "<|LOC_542|>": 100839,
500
+ "<|LOC_543|>": 100840,
501
+ "<|LOC_544|>": 100841,
502
+ "<|LOC_545|>": 100842,
503
+ "<|LOC_546|>": 100843,
504
+ "<|LOC_547|>": 100844,
505
+ "<|LOC_548|>": 100845,
506
+ "<|LOC_549|>": 100846,
507
+ "<|LOC_54|>": 100351,
508
+ "<|LOC_550|>": 100847,
509
+ "<|LOC_551|>": 100848,
510
+ "<|LOC_552|>": 100849,
511
+ "<|LOC_553|>": 100850,
512
+ "<|LOC_554|>": 100851,
513
+ "<|LOC_555|>": 100852,
514
+ "<|LOC_556|>": 100853,
515
+ "<|LOC_557|>": 100854,
516
+ "<|LOC_558|>": 100855,
517
+ "<|LOC_559|>": 100856,
518
+ "<|LOC_55|>": 100352,
519
+ "<|LOC_560|>": 100857,
520
+ "<|LOC_561|>": 100858,
521
+ "<|LOC_562|>": 100859,
522
+ "<|LOC_563|>": 100860,
523
+ "<|LOC_564|>": 100861,
524
+ "<|LOC_565|>": 100862,
525
+ "<|LOC_566|>": 100863,
526
+ "<|LOC_567|>": 100864,
527
+ "<|LOC_568|>": 100865,
528
+ "<|LOC_569|>": 100866,
529
+ "<|LOC_56|>": 100353,
530
+ "<|LOC_570|>": 100867,
531
+ "<|LOC_571|>": 100868,
532
+ "<|LOC_572|>": 100869,
533
+ "<|LOC_573|>": 100870,
534
+ "<|LOC_574|>": 100871,
535
+ "<|LOC_575|>": 100872,
536
+ "<|LOC_576|>": 100873,
537
+ "<|LOC_577|>": 100874,
538
+ "<|LOC_578|>": 100875,
539
+ "<|LOC_579|>": 100876,
540
+ "<|LOC_57|>": 100354,
541
+ "<|LOC_580|>": 100877,
542
+ "<|LOC_581|>": 100878,
543
+ "<|LOC_582|>": 100879,
544
+ "<|LOC_583|>": 100880,
545
+ "<|LOC_584|>": 100881,
546
+ "<|LOC_585|>": 100882,
547
+ "<|LOC_586|>": 100883,
548
+ "<|LOC_587|>": 100884,
549
+ "<|LOC_588|>": 100885,
550
+ "<|LOC_589|>": 100886,
551
+ "<|LOC_58|>": 100355,
552
+ "<|LOC_590|>": 100887,
553
+ "<|LOC_591|>": 100888,
554
+ "<|LOC_592|>": 100889,
555
+ "<|LOC_593|>": 100890,
556
+ "<|LOC_594|>": 100891,
557
+ "<|LOC_595|>": 100892,
558
+ "<|LOC_596|>": 100893,
559
+ "<|LOC_597|>": 100894,
560
+ "<|LOC_598|>": 100895,
561
+ "<|LOC_599|>": 100896,
562
+ "<|LOC_59|>": 100356,
563
+ "<|LOC_5|>": 100302,
564
+ "<|LOC_600|>": 100897,
565
+ "<|LOC_601|>": 100898,
566
+ "<|LOC_602|>": 100899,
567
+ "<|LOC_603|>": 100900,
568
+ "<|LOC_604|>": 100901,
569
+ "<|LOC_605|>": 100902,
570
+ "<|LOC_606|>": 100903,
571
+ "<|LOC_607|>": 100904,
572
+ "<|LOC_608|>": 100905,
573
+ "<|LOC_609|>": 100906,
574
+ "<|LOC_60|>": 100357,
575
+ "<|LOC_610|>": 100907,
576
+ "<|LOC_611|>": 100908,
577
+ "<|LOC_612|>": 100909,
578
+ "<|LOC_613|>": 100910,
579
+ "<|LOC_614|>": 100911,
580
+ "<|LOC_615|>": 100912,
581
+ "<|LOC_616|>": 100913,
582
+ "<|LOC_617|>": 100914,
583
+ "<|LOC_618|>": 100915,
584
+ "<|LOC_619|>": 100916,
585
+ "<|LOC_61|>": 100358,
586
+ "<|LOC_620|>": 100917,
587
+ "<|LOC_621|>": 100918,
588
+ "<|LOC_622|>": 100919,
589
+ "<|LOC_623|>": 100920,
590
+ "<|LOC_624|>": 100921,
591
+ "<|LOC_625|>": 100922,
592
+ "<|LOC_626|>": 100923,
593
+ "<|LOC_627|>": 100924,
594
+ "<|LOC_628|>": 100925,
595
+ "<|LOC_629|>": 100926,
596
+ "<|LOC_62|>": 100359,
597
+ "<|LOC_630|>": 100927,
598
+ "<|LOC_631|>": 100928,
599
+ "<|LOC_632|>": 100929,
600
+ "<|LOC_633|>": 100930,
601
+ "<|LOC_634|>": 100931,
602
+ "<|LOC_635|>": 100932,
603
+ "<|LOC_636|>": 100933,
604
+ "<|LOC_637|>": 100934,
605
+ "<|LOC_638|>": 100935,
606
+ "<|LOC_639|>": 100936,
607
+ "<|LOC_63|>": 100360,
608
+ "<|LOC_640|>": 100937,
609
+ "<|LOC_641|>": 100938,
610
+ "<|LOC_642|>": 100939,
611
+ "<|LOC_643|>": 100940,
612
+ "<|LOC_644|>": 100941,
613
+ "<|LOC_645|>": 100942,
614
+ "<|LOC_646|>": 100943,
615
+ "<|LOC_647|>": 100944,
616
+ "<|LOC_648|>": 100945,
617
+ "<|LOC_649|>": 100946,
618
+ "<|LOC_64|>": 100361,
619
+ "<|LOC_650|>": 100947,
620
+ "<|LOC_651|>": 100948,
621
+ "<|LOC_652|>": 100949,
622
+ "<|LOC_653|>": 100950,
623
+ "<|LOC_654|>": 100951,
624
+ "<|LOC_655|>": 100952,
625
+ "<|LOC_656|>": 100953,
626
+ "<|LOC_657|>": 100954,
627
+ "<|LOC_658|>": 100955,
628
+ "<|LOC_659|>": 100956,
629
+ "<|LOC_65|>": 100362,
630
+ "<|LOC_660|>": 100957,
631
+ "<|LOC_661|>": 100958,
632
+ "<|LOC_662|>": 100959,
633
+ "<|LOC_663|>": 100960,
634
+ "<|LOC_664|>": 100961,
635
+ "<|LOC_665|>": 100962,
636
+ "<|LOC_666|>": 100963,
637
+ "<|LOC_667|>": 100964,
638
+ "<|LOC_668|>": 100965,
639
+ "<|LOC_669|>": 100966,
640
+ "<|LOC_66|>": 100363,
641
+ "<|LOC_670|>": 100967,
642
+ "<|LOC_671|>": 100968,
643
+ "<|LOC_672|>": 100969,
644
+ "<|LOC_673|>": 100970,
645
+ "<|LOC_674|>": 100971,
646
+ "<|LOC_675|>": 100972,
647
+ "<|LOC_676|>": 100973,
648
+ "<|LOC_677|>": 100974,
649
+ "<|LOC_678|>": 100975,
650
+ "<|LOC_679|>": 100976,
651
+ "<|LOC_67|>": 100364,
652
+ "<|LOC_680|>": 100977,
653
+ "<|LOC_681|>": 100978,
654
+ "<|LOC_682|>": 100979,
655
+ "<|LOC_683|>": 100980,
656
+ "<|LOC_684|>": 100981,
657
+ "<|LOC_685|>": 100982,
658
+ "<|LOC_686|>": 100983,
659
+ "<|LOC_687|>": 100984,
660
+ "<|LOC_688|>": 100985,
661
+ "<|LOC_689|>": 100986,
662
+ "<|LOC_68|>": 100365,
663
+ "<|LOC_690|>": 100987,
664
+ "<|LOC_691|>": 100988,
665
+ "<|LOC_692|>": 100989,
666
+ "<|LOC_693|>": 100990,
667
+ "<|LOC_694|>": 100991,
668
+ "<|LOC_695|>": 100992,
669
+ "<|LOC_696|>": 100993,
670
+ "<|LOC_697|>": 100994,
671
+ "<|LOC_698|>": 100995,
672
+ "<|LOC_699|>": 100996,
673
+ "<|LOC_69|>": 100366,
674
+ "<|LOC_6|>": 100303,
675
+ "<|LOC_700|>": 100997,
676
+ "<|LOC_701|>": 100998,
677
+ "<|LOC_702|>": 100999,
678
+ "<|LOC_703|>": 101000,
679
+ "<|LOC_704|>": 101001,
680
+ "<|LOC_705|>": 101002,
681
+ "<|LOC_706|>": 101003,
682
+ "<|LOC_707|>": 101004,
683
+ "<|LOC_708|>": 101005,
684
+ "<|LOC_709|>": 101006,
685
+ "<|LOC_70|>": 100367,
686
+ "<|LOC_710|>": 101007,
687
+ "<|LOC_711|>": 101008,
688
+ "<|LOC_712|>": 101009,
689
+ "<|LOC_713|>": 101010,
690
+ "<|LOC_714|>": 101011,
691
+ "<|LOC_715|>": 101012,
692
+ "<|LOC_716|>": 101013,
693
+ "<|LOC_717|>": 101014,
694
+ "<|LOC_718|>": 101015,
695
+ "<|LOC_719|>": 101016,
696
+ "<|LOC_71|>": 100368,
697
+ "<|LOC_720|>": 101017,
698
+ "<|LOC_721|>": 101018,
699
+ "<|LOC_722|>": 101019,
700
+ "<|LOC_723|>": 101020,
701
+ "<|LOC_724|>": 101021,
702
+ "<|LOC_725|>": 101022,
703
+ "<|LOC_726|>": 101023,
704
+ "<|LOC_727|>": 101024,
705
+ "<|LOC_728|>": 101025,
706
+ "<|LOC_729|>": 101026,
707
+ "<|LOC_72|>": 100369,
708
+ "<|LOC_730|>": 101027,
709
+ "<|LOC_731|>": 101028,
710
+ "<|LOC_732|>": 101029,
711
+ "<|LOC_733|>": 101030,
712
+ "<|LOC_734|>": 101031,
713
+ "<|LOC_735|>": 101032,
714
+ "<|LOC_736|>": 101033,
715
+ "<|LOC_737|>": 101034,
716
+ "<|LOC_738|>": 101035,
717
+ "<|LOC_739|>": 101036,
718
+ "<|LOC_73|>": 100370,
719
+ "<|LOC_740|>": 101037,
720
+ "<|LOC_741|>": 101038,
721
+ "<|LOC_742|>": 101039,
722
+ "<|LOC_743|>": 101040,
723
+ "<|LOC_744|>": 101041,
724
+ "<|LOC_745|>": 101042,
725
+ "<|LOC_746|>": 101043,
726
+ "<|LOC_747|>": 101044,
727
+ "<|LOC_748|>": 101045,
728
+ "<|LOC_749|>": 101046,
729
+ "<|LOC_74|>": 100371,
730
+ "<|LOC_750|>": 101047,
731
+ "<|LOC_751|>": 101048,
732
+ "<|LOC_752|>": 101049,
733
+ "<|LOC_753|>": 101050,
734
+ "<|LOC_754|>": 101051,
735
+ "<|LOC_755|>": 101052,
736
+ "<|LOC_756|>": 101053,
737
+ "<|LOC_757|>": 101054,
738
+ "<|LOC_758|>": 101055,
739
+ "<|LOC_759|>": 101056,
740
+ "<|LOC_75|>": 100372,
741
+ "<|LOC_760|>": 101057,
742
+ "<|LOC_761|>": 101058,
743
+ "<|LOC_762|>": 101059,
744
+ "<|LOC_763|>": 101060,
745
+ "<|LOC_764|>": 101061,
746
+ "<|LOC_765|>": 101062,
747
+ "<|LOC_766|>": 101063,
748
+ "<|LOC_767|>": 101064,
749
+ "<|LOC_768|>": 101065,
750
+ "<|LOC_769|>": 101066,
751
+ "<|LOC_76|>": 100373,
752
+ "<|LOC_770|>": 101067,
753
+ "<|LOC_771|>": 101068,
754
+ "<|LOC_772|>": 101069,
755
+ "<|LOC_773|>": 101070,
756
+ "<|LOC_774|>": 101071,
757
+ "<|LOC_775|>": 101072,
758
+ "<|LOC_776|>": 101073,
759
+ "<|LOC_777|>": 101074,
760
+ "<|LOC_778|>": 101075,
761
+ "<|LOC_779|>": 101076,
762
+ "<|LOC_77|>": 100374,
763
+ "<|LOC_780|>": 101077,
764
+ "<|LOC_781|>": 101078,
765
+ "<|LOC_782|>": 101079,
766
+ "<|LOC_783|>": 101080,
767
+ "<|LOC_784|>": 101081,
768
+ "<|LOC_785|>": 101082,
769
+ "<|LOC_786|>": 101083,
770
+ "<|LOC_787|>": 101084,
771
+ "<|LOC_788|>": 101085,
772
+ "<|LOC_789|>": 101086,
773
+ "<|LOC_78|>": 100375,
774
+ "<|LOC_790|>": 101087,
775
+ "<|LOC_791|>": 101088,
776
+ "<|LOC_792|>": 101089,
777
+ "<|LOC_793|>": 101090,
778
+ "<|LOC_794|>": 101091,
779
+ "<|LOC_795|>": 101092,
780
+ "<|LOC_796|>": 101093,
781
+ "<|LOC_797|>": 101094,
782
+ "<|LOC_798|>": 101095,
783
+ "<|LOC_799|>": 101096,
784
+ "<|LOC_79|>": 100376,
785
+ "<|LOC_7|>": 100304,
786
+ "<|LOC_800|>": 101097,
787
+ "<|LOC_801|>": 101098,
788
+ "<|LOC_802|>": 101099,
789
+ "<|LOC_803|>": 101100,
790
+ "<|LOC_804|>": 101101,
791
+ "<|LOC_805|>": 101102,
792
+ "<|LOC_806|>": 101103,
793
+ "<|LOC_807|>": 101104,
794
+ "<|LOC_808|>": 101105,
795
+ "<|LOC_809|>": 101106,
796
+ "<|LOC_80|>": 100377,
797
+ "<|LOC_810|>": 101107,
798
+ "<|LOC_811|>": 101108,
799
+ "<|LOC_812|>": 101109,
800
+ "<|LOC_813|>": 101110,
801
+ "<|LOC_814|>": 101111,
802
+ "<|LOC_815|>": 101112,
803
+ "<|LOC_816|>": 101113,
804
+ "<|LOC_817|>": 101114,
805
+ "<|LOC_818|>": 101115,
806
+ "<|LOC_819|>": 101116,
807
+ "<|LOC_81|>": 100378,
808
+ "<|LOC_820|>": 101117,
809
+ "<|LOC_821|>": 101118,
810
+ "<|LOC_822|>": 101119,
811
+ "<|LOC_823|>": 101120,
812
+ "<|LOC_824|>": 101121,
813
+ "<|LOC_825|>": 101122,
814
+ "<|LOC_826|>": 101123,
815
+ "<|LOC_827|>": 101124,
816
+ "<|LOC_828|>": 101125,
817
+ "<|LOC_829|>": 101126,
818
+ "<|LOC_82|>": 100379,
819
+ "<|LOC_830|>": 101127,
820
+ "<|LOC_831|>": 101128,
821
+ "<|LOC_832|>": 101129,
822
+ "<|LOC_833|>": 101130,
823
+ "<|LOC_834|>": 101131,
824
+ "<|LOC_835|>": 101132,
825
+ "<|LOC_836|>": 101133,
826
+ "<|LOC_837|>": 101134,
827
+ "<|LOC_838|>": 101135,
828
+ "<|LOC_839|>": 101136,
829
+ "<|LOC_83|>": 100380,
830
+ "<|LOC_840|>": 101137,
831
+ "<|LOC_841|>": 101138,
832
+ "<|LOC_842|>": 101139,
833
+ "<|LOC_843|>": 101140,
834
+ "<|LOC_844|>": 101141,
835
+ "<|LOC_845|>": 101142,
836
+ "<|LOC_846|>": 101143,
837
+ "<|LOC_847|>": 101144,
838
+ "<|LOC_848|>": 101145,
839
+ "<|LOC_849|>": 101146,
840
+ "<|LOC_84|>": 100381,
841
+ "<|LOC_850|>": 101147,
842
+ "<|LOC_851|>": 101148,
843
+ "<|LOC_852|>": 101149,
844
+ "<|LOC_853|>": 101150,
845
+ "<|LOC_854|>": 101151,
846
+ "<|LOC_855|>": 101152,
847
+ "<|LOC_856|>": 101153,
848
+ "<|LOC_857|>": 101154,
849
+ "<|LOC_858|>": 101155,
850
+ "<|LOC_859|>": 101156,
851
+ "<|LOC_85|>": 100382,
852
+ "<|LOC_860|>": 101157,
853
+ "<|LOC_861|>": 101158,
854
+ "<|LOC_862|>": 101159,
855
+ "<|LOC_863|>": 101160,
856
+ "<|LOC_864|>": 101161,
857
+ "<|LOC_865|>": 101162,
858
+ "<|LOC_866|>": 101163,
859
+ "<|LOC_867|>": 101164,
860
+ "<|LOC_868|>": 101165,
861
+ "<|LOC_869|>": 101166,
862
+ "<|LOC_86|>": 100383,
863
+ "<|LOC_870|>": 101167,
864
+ "<|LOC_871|>": 101168,
865
+ "<|LOC_872|>": 101169,
866
+ "<|LOC_873|>": 101170,
867
+ "<|LOC_874|>": 101171,
868
+ "<|LOC_875|>": 101172,
869
+ "<|LOC_876|>": 101173,
870
+ "<|LOC_877|>": 101174,
871
+ "<|LOC_878|>": 101175,
872
+ "<|LOC_879|>": 101176,
873
+ "<|LOC_87|>": 100384,
874
+ "<|LOC_880|>": 101177,
875
+ "<|LOC_881|>": 101178,
876
+ "<|LOC_882|>": 101179,
877
+ "<|LOC_883|>": 101180,
878
+ "<|LOC_884|>": 101181,
879
+ "<|LOC_885|>": 101182,
880
+ "<|LOC_886|>": 101183,
881
+ "<|LOC_887|>": 101184,
882
+ "<|LOC_888|>": 101185,
883
+ "<|LOC_889|>": 101186,
884
+ "<|LOC_88|>": 100385,
885
+ "<|LOC_890|>": 101187,
886
+ "<|LOC_891|>": 101188,
887
+ "<|LOC_892|>": 101189,
888
+ "<|LOC_893|>": 101190,
889
+ "<|LOC_894|>": 101191,
890
+ "<|LOC_895|>": 101192,
891
+ "<|LOC_896|>": 101193,
892
+ "<|LOC_897|>": 101194,
893
+ "<|LOC_898|>": 101195,
894
+ "<|LOC_899|>": 101196,
895
+ "<|LOC_89|>": 100386,
896
+ "<|LOC_8|>": 100305,
897
+ "<|LOC_900|>": 101197,
898
+ "<|LOC_901|>": 101198,
899
+ "<|LOC_902|>": 101199,
900
+ "<|LOC_903|>": 101200,
901
+ "<|LOC_904|>": 101201,
902
+ "<|LOC_905|>": 101202,
903
+ "<|LOC_906|>": 101203,
904
+ "<|LOC_907|>": 101204,
905
+ "<|LOC_908|>": 101205,
906
+ "<|LOC_909|>": 101206,
907
+ "<|LOC_90|>": 100387,
908
+ "<|LOC_910|>": 101207,
909
+ "<|LOC_911|>": 101208,
910
+ "<|LOC_912|>": 101209,
911
+ "<|LOC_913|>": 101210,
912
+ "<|LOC_914|>": 101211,
913
+ "<|LOC_915|>": 101212,
914
+ "<|LOC_916|>": 101213,
915
+ "<|LOC_917|>": 101214,
916
+ "<|LOC_918|>": 101215,
917
+ "<|LOC_919|>": 101216,
918
+ "<|LOC_91|>": 100388,
919
+ "<|LOC_920|>": 101217,
920
+ "<|LOC_921|>": 101218,
921
+ "<|LOC_922|>": 101219,
922
+ "<|LOC_923|>": 101220,
923
+ "<|LOC_924|>": 101221,
924
+ "<|LOC_925|>": 101222,
925
+ "<|LOC_926|>": 101223,
926
+ "<|LOC_927|>": 101224,
927
+ "<|LOC_928|>": 101225,
928
+ "<|LOC_929|>": 101226,
929
+ "<|LOC_92|>": 100389,
930
+ "<|LOC_930|>": 101227,
931
+ "<|LOC_931|>": 101228,
932
+ "<|LOC_932|>": 101229,
933
+ "<|LOC_933|>": 101230,
934
+ "<|LOC_934|>": 101231,
935
+ "<|LOC_935|>": 101232,
936
+ "<|LOC_936|>": 101233,
937
+ "<|LOC_937|>": 101234,
938
+ "<|LOC_938|>": 101235,
939
+ "<|LOC_939|>": 101236,
940
+ "<|LOC_93|>": 100390,
941
+ "<|LOC_940|>": 101237,
942
+ "<|LOC_941|>": 101238,
943
+ "<|LOC_942|>": 101239,
944
+ "<|LOC_943|>": 101240,
945
+ "<|LOC_944|>": 101241,
946
+ "<|LOC_945|>": 101242,
947
+ "<|LOC_946|>": 101243,
948
+ "<|LOC_947|>": 101244,
949
+ "<|LOC_948|>": 101245,
950
+ "<|LOC_949|>": 101246,
951
+ "<|LOC_94|>": 100391,
952
+ "<|LOC_950|>": 101247,
953
+ "<|LOC_951|>": 101248,
954
+ "<|LOC_952|>": 101249,
955
+ "<|LOC_953|>": 101250,
956
+ "<|LOC_954|>": 101251,
957
+ "<|LOC_955|>": 101252,
958
+ "<|LOC_956|>": 101253,
959
+ "<|LOC_957|>": 101254,
960
+ "<|LOC_958|>": 101255,
961
+ "<|LOC_959|>": 101256,
962
+ "<|LOC_95|>": 100392,
963
+ "<|LOC_960|>": 101257,
964
+ "<|LOC_961|>": 101258,
965
+ "<|LOC_962|>": 101259,
966
+ "<|LOC_963|>": 101260,
967
+ "<|LOC_964|>": 101261,
968
+ "<|LOC_965|>": 101262,
969
+ "<|LOC_966|>": 101263,
970
+ "<|LOC_967|>": 101264,
971
+ "<|LOC_968|>": 101265,
972
+ "<|LOC_969|>": 101266,
973
+ "<|LOC_96|>": 100393,
974
+ "<|LOC_970|>": 101267,
975
+ "<|LOC_971|>": 101268,
976
+ "<|LOC_972|>": 101269,
977
+ "<|LOC_973|>": 101270,
978
+ "<|LOC_974|>": 101271,
979
+ "<|LOC_975|>": 101272,
980
+ "<|LOC_976|>": 101273,
981
+ "<|LOC_977|>": 101274,
982
+ "<|LOC_978|>": 101275,
983
+ "<|LOC_979|>": 101276,
984
+ "<|LOC_97|>": 100394,
985
+ "<|LOC_980|>": 101277,
986
+ "<|LOC_981|>": 101278,
987
+ "<|LOC_982|>": 101279,
988
+ "<|LOC_983|>": 101280,
989
+ "<|LOC_984|>": 101281,
990
+ "<|LOC_985|>": 101282,
991
+ "<|LOC_986|>": 101283,
992
+ "<|LOC_987|>": 101284,
993
+ "<|LOC_988|>": 101285,
994
+ "<|LOC_989|>": 101286,
995
+ "<|LOC_98|>": 100395,
996
+ "<|LOC_990|>": 101287,
997
+ "<|LOC_991|>": 101288,
998
+ "<|LOC_992|>": 101289,
999
+ "<|LOC_993|>": 101290,
1000
+ "<|LOC_994|>": 101291,
1001
+ "<|LOC_995|>": 101292,
1002
+ "<|LOC_996|>": 101293,
1003
+ "<|LOC_997|>": 101294,
1004
+ "<|LOC_998|>": 101295,
1005
+ "<|LOC_999|>": 101296,
1006
+ "<|LOC_99|>": 100396,
1007
+ "<|LOC_9|>": 100306,
1008
+ "<|LOC_BEGIN|>": 101298,
1009
+ "<|LOC_END|>": 101299,
1010
+ "<|LOC_SEP|>": 101300
1011
+ }
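`added_tokens.json` is a flat JSON object mapping special-token strings to token ids (image/audio placeholders, crop separators, and a `<|LOC_0|>`…`<|LOC_1000|>` series). A quick sketch, standard library only and assuming the file has been downloaded locally, that summarizes the id range and counts the `<|LOC_N|>` tokens:

```python
import json

with open("added_tokens.json") as f:
    added = json.load(f)  # {"<|token|>": id, ...}

ids = sorted(added.values())
print(f"{len(added)} added tokens, ids {ids[0]}..{ids[-1]}")

# Count the <|LOC_N|> tokens, which run from <|LOC_0|> to <|LOC_1000|>.
loc = [name for name in added if name.startswith("<|LOC_") and name[6:-2].isdigit()]
print(f"{len(loc)} <|LOC_N|> tokens")
```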
chat_template.jinja ADDED
@@ -0,0 +1,24 @@
+ {%- if not add_generation_prompt is defined -%}
+ {%- set add_generation_prompt = true -%}
+ {%- endif -%}
+ {%- if not cls_token is defined -%}
+ {%- set cls_token = "<|begin_of_sentence|>" -%}
+ {%- endif -%}
+ {%- if not sep_token is defined -%}
+ {%- set sep_token = "<|end_of_sentence|>" -%}
+ {%- endif -%}
+ {{- cls_token -}}
+ {%- for message in messages -%}
+ {%- if message["role"] == "user" -%}
+ {{- "User: " + message["content"] + "
+ " -}}
+ {%- elif message["role"] == "assistant" -%}
+ {{- "Assistant: " + message["content"] + sep_token -}}
+ {%- elif message["role"] == "system" -%}
+ {{- message["content"] + "
+ " -}}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ {{- "Assistant: " -}}
+ {%- endif -%}
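For illustration, a sketch of how this template expands a short conversation. At runtime the template is normally applied through the tokenizer's `apply_chat_template`; rendering it directly with `jinja2` here is only an assumption for the example.

```python
# Sketch: render chat_template.jinja for a system + user turn and print the prompt.
from jinja2 import Template

with open("chat_template.jinja") as f:
    template = Template(f.read())

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
print(template.render(messages=messages, add_generation_prompt=True))
# Expected shape (cls/sep tokens come from the template defaults):
# <|begin_of_sentence|>You are a helpful assistant.
# User: Hello!
# Assistant:
```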
config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "architectures": [
+     "Ernie4_5_ForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_ernie4_5.Ernie4_5_Config",
+     "AutoModel": "modeling_ernie4_5.Ernie4_5_Model",
+     "AutoModelForCausalLM": "modeling_ernie4_5.Ernie4_5_ForCausalLM"
+   },
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 131072,
+   "model_type": "ernie4_5",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 18,
+   "num_key_value_heads": 2,
+   "pad_token_id": 0,
+   "quantization": {
+     "group_size": 64,
+     "bits": 8
+   },
+   "quantization_config": {
+     "group_size": 64,
+     "bits": 8
+   },
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 500000,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "use_bias": false,
+   "use_cache": false,
+   "vocab_size": 103424
+ }
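A small sketch, plain standard-library Python and assuming `config.json` sits next to the script, that reads the config above and prints the architecture it describes (18 layers, 16 query heads sharing 2 key/value heads, 8-bit weights with group size 64):

```python
import json

# Read the quantized model's config and summarize the architecture it describes.
with open("config.json") as f:
    cfg = json.load(f)

heads = cfg["num_attention_heads"]      # 16 query heads
kv_heads = cfg["num_key_value_heads"]   # 2 key/value heads -> grouped-query attention
print(f"layers={cfg['num_hidden_layers']}, hidden={cfg['hidden_size']}, "
      f"heads={heads}, kv_heads={kv_heads}, queries per KV head={heads // kv_heads}")
print(f"context length={cfg['max_position_embeddings']}, "
      f"quantization={cfg['quantization']['bits']}-bit, "
      f"group_size={cfg['quantization']['group_size']}")
```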
configuration_ernie4_5.py ADDED
@@ -0,0 +1,127 @@
+ # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from transformers import PretrainedConfig
+
+
+ class Ernie4_5_Config(PretrainedConfig):
+     """
+     Configuration class.
+
+     This class stores the configuration of an Ernie model, defining the model architecture.
+     It inherits from PretrainedConfig and can be used to control model outputs.
+     """
+
+     model_type = "ernie4_5"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     # Default tensor parallel plan for the base model.
+     base_model_tp_plan = {
+         "layers.*.self_attn.q_proj": "colwise",
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.mlp.gate_proj": "colwise",
+         "layers.*.mlp.up_proj": "colwise",
+         "layers.*.mlp.down_proj": "rowwise",
+     }
+     base_model_pp_plan = {
+         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+         "norm": (["hidden_states"], ["hidden_states"]),
+     }
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=768,
+         intermediate_size=11008,
+         max_position_embeddings=32768,
+         num_hidden_layers=2,
+         num_attention_heads=2,
+         rms_norm_eps=1e-6,
+         use_cache=False,
+         use_flash_attention=False,
+         pad_token_id=0,
+         bos_token_id=1,
+         eos_token_id=2,
+         use_bias=False,
+         rope_theta=10000,
+         weight_share_add_bias=True,
+         ignored_index=-100,
+         attention_probs_dropout_prob=0.0,
+         hidden_dropout_prob=0.0,
+         compression_ratio: float = 1.0,
+         num_key_value_heads=None,
+         max_sequence_length=None,
+         **kwargs,
+     ):
+         """
+         Initialize configuration with default or specified parameters.
+
+         Args:
+             vocab_size (int): Size of the vocabulary (number of unique tokens)
+             hidden_size (int): Dimensionality of the encoder layers and the pooler layer
+             intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
+             max_position_embeddings (int): Maximum sequence length the model can handle
+             num_hidden_layers (int): Number of hidden layers in the Transformer encoder
+             num_attention_heads (int): Number of attention heads for each attention layer
+             rms_norm_eps (float): The epsilon used by the RMS normalization layers
+             use_cache (bool): Whether to use caching for faster generation (decoding)
+             use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
+             pad_token_id (int): Token ID used for padding sequences
+             bos_token_id (int): Token ID used for beginning-of-sequence
+             eos_token_id (int): Token ID used for end-of-sequence
+             use_bias (bool): Whether to use bias terms in linear layers
+             rope_theta (float): The base period of the RoPE embeddings
+             weight_share_add_bias (bool): Whether to share bias weights in certain layers
+             ignored_index (int): Target value that is ignored during loss computation
+             attention_probs_dropout_prob (float): Dropout probability for attention weights
+             hidden_dropout_prob (float): Dropout probability for hidden layers
+             compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
+             num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
+             max_sequence_length (int): Maximum sequence length for positional embeddings
+             **kwargs: Additional keyword arguments passed to parent class
+         """
+
+         # Set default for tied embeddings if not specified.
+         if "tie_word_embeddings" not in kwargs:
+             kwargs["tie_word_embeddings"] = False
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.use_flash_attention = use_flash_attention
+         self.pad_token_id = pad_token_id
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.use_bias = use_bias
+         self.weight_share_add_bias = weight_share_add_bias
+         self.rope_theta = rope_theta
+         self.ignored_index = ignored_index
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.compression_ratio = compression_ratio
+         self.num_key_value_heads = num_key_value_heads
+         self.max_sequence_length = max_sequence_length
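For illustration, a short sketch showing how the values from this repo's `config.json` map onto the configuration class above when instantiated directly (this is an example instantiation, not something the repo itself does; `transformers` must be installed and the file importable from the working directory):

```python
# Sketch: build an Ernie4_5_Config with the values from config.json,
# overriding the small placeholder defaults defined in __init__ above.
from configuration_ernie4_5 import Ernie4_5_Config

config = Ernie4_5_Config(
    vocab_size=103424,
    hidden_size=1024,
    intermediate_size=3072,
    max_position_embeddings=131072,
    num_hidden_layers=18,
    num_attention_heads=16,
    num_key_value_heads=2,
    rms_norm_eps=1e-05,
    rope_theta=500000,
    use_bias=False,
    tie_word_embeddings=True,  # forwarded through **kwargs to PretrainedConfig
)
print(config.num_attention_heads // config.num_key_value_heads)  # 8 query heads per KV head
```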
generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "do_sample": true,
+   "top_p": 0.8,
+   "temperature": 0.8,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "repetition_penalty": 1.0,
+   "frequency_penalty": 0.0,
+   "presence_penalty": 0.0
+ }
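The defaults above mean sampling with temperature 0.8 and nucleus (top-p) 0.8, and no repetition/frequency/presence penalties. A minimal sketch of what that combination does, written as plain NumPy over a logits vector; this is illustrative only, not the sampler any particular runtime uses:

```python
import numpy as np

def sample_top_p(logits: np.ndarray, temperature: float = 0.8, top_p: float = 0.8) -> int:
    """Temperature + nucleus sampling, mirroring the defaults in generation_config.json."""
    scaled = logits / temperature
    probs = np.exp(scaled - np.max(scaled))
    probs /= probs.sum()
    order = np.argsort(-probs)                        # most likely tokens first
    cumulative = np.cumsum(probs[order])
    cutoff = np.searchsorted(cumulative, top_p) + 1   # smallest prefix with mass >= top_p
    kept = order[:cutoff]
    kept_probs = probs[kept] / probs[kept].sum()
    return int(np.random.choice(kept, p=kept_probs))

# Toy usage with a fake 5-token vocabulary:
print(sample_top_p(np.array([2.0, 1.5, 0.3, -1.0, -2.0])))
```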
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3daf8216c9dc3d79102e541fbd8c4a61f3c82f815b7b499e64af9bab2cd83e3a
+ size 383376660
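The entry above is a Git LFS pointer, not the weights themselves; the actual `model.safetensors` is roughly 383 MB. A small sketch, standard library only, that checks a downloaded copy against the pointer's size and SHA-256:

```python
import hashlib
import os

EXPECTED_SHA256 = "3daf8216c9dc3d79102e541fbd8c4a61f3c82f815b7b499e64af9bab2cd83e3a"
EXPECTED_SIZE = 383376660  # bytes, from the LFS pointer above

def verify(path: str = "model.safetensors") -> bool:
    """Return True if the local file matches the LFS pointer's size and hash."""
    if os.path.getsize(path) != EXPECTED_SIZE:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == EXPECTED_SHA256

print(verify())
```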
model.safetensors.index.json ADDED
@@ -0,0 +1,426 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 383330304,
4
+ "total_parameters": 360748032
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.biases": "model.safetensors",
8
+ "model.embed_tokens.scales": "model.safetensors",
9
+ "model.embed_tokens.weight": "model.safetensors",
10
+ "model.layers.0.input_layernorm.weight": "model.safetensors",
11
+ "model.layers.0.mlp.down_proj.biases": "model.safetensors",
12
+ "model.layers.0.mlp.down_proj.scales": "model.safetensors",
13
+ "model.layers.0.mlp.down_proj.weight": "model.safetensors",
14
+ "model.layers.0.mlp.gate_proj.biases": "model.safetensors",
15
+ "model.layers.0.mlp.gate_proj.scales": "model.safetensors",
16
+ "model.layers.0.mlp.gate_proj.weight": "model.safetensors",
17
+ "model.layers.0.mlp.up_proj.biases": "model.safetensors",
18
+ "model.layers.0.mlp.up_proj.scales": "model.safetensors",
19
+ "model.layers.0.mlp.up_proj.weight": "model.safetensors",
20
+ "model.layers.0.post_attention_layernorm.weight": "model.safetensors",
21
+ "model.layers.0.self_attn.k_proj.biases": "model.safetensors",
22
+ "model.layers.0.self_attn.k_proj.scales": "model.safetensors",
23
+ "model.layers.0.self_attn.k_proj.weight": "model.safetensors",
24
+ "model.layers.0.self_attn.o_proj.biases": "model.safetensors",
25
+ "model.layers.0.self_attn.o_proj.scales": "model.safetensors",
26
+ "model.layers.0.self_attn.o_proj.weight": "model.safetensors",
27
+ "model.layers.0.self_attn.q_proj.biases": "model.safetensors",
28
+ "model.layers.0.self_attn.q_proj.scales": "model.safetensors",
29
+ "model.layers.0.self_attn.q_proj.weight": "model.safetensors",
30
+ "model.layers.0.self_attn.v_proj.biases": "model.safetensors",
31
+ "model.layers.0.self_attn.v_proj.scales": "model.safetensors",
32
+ "model.layers.0.self_attn.v_proj.weight": "model.safetensors",
33
+ "model.layers.1.input_layernorm.weight": "model.safetensors",
34
+ "model.layers.1.mlp.down_proj.biases": "model.safetensors",
35
+ "model.layers.1.mlp.down_proj.scales": "model.safetensors",
36
+ "model.layers.1.mlp.down_proj.weight": "model.safetensors",
37
+ "model.layers.1.mlp.gate_proj.biases": "model.safetensors",
38
+ "model.layers.1.mlp.gate_proj.scales": "model.safetensors",
39
+ "model.layers.1.mlp.gate_proj.weight": "model.safetensors",
40
+ "model.layers.1.mlp.up_proj.biases": "model.safetensors",
41
+ "model.layers.1.mlp.up_proj.scales": "model.safetensors",
42
+ "model.layers.1.mlp.up_proj.weight": "model.safetensors",
43
+ "model.layers.1.post_attention_layernorm.weight": "model.safetensors",
44
+ "model.layers.1.self_attn.k_proj.biases": "model.safetensors",
45
+ "model.layers.1.self_attn.k_proj.scales": "model.safetensors",
46
+ "model.layers.1.self_attn.k_proj.weight": "model.safetensors",
47
+ "model.layers.1.self_attn.o_proj.biases": "model.safetensors",
48
+ "model.layers.1.self_attn.o_proj.scales": "model.safetensors",
49
+ "model.layers.1.self_attn.o_proj.weight": "model.safetensors",
50
+ "model.layers.1.self_attn.q_proj.biases": "model.safetensors",
51
+ "model.layers.1.self_attn.q_proj.scales": "model.safetensors",
52
+ "model.layers.1.self_attn.q_proj.weight": "model.safetensors",
53
+ "model.layers.1.self_attn.v_proj.biases": "model.safetensors",
54
+ "model.layers.1.self_attn.v_proj.scales": "model.safetensors",
55
+ "model.layers.1.self_attn.v_proj.weight": "model.safetensors",
56
+ "model.layers.10.input_layernorm.weight": "model.safetensors",
57
+ "model.layers.10.mlp.down_proj.biases": "model.safetensors",
58
+ "model.layers.10.mlp.down_proj.scales": "model.safetensors",
59
+ "model.layers.10.mlp.down_proj.weight": "model.safetensors",
60
+ "model.layers.10.mlp.gate_proj.biases": "model.safetensors",
61
+ "model.layers.10.mlp.gate_proj.scales": "model.safetensors",
62
+ "model.layers.10.mlp.gate_proj.weight": "model.safetensors",
63
+ "model.layers.10.mlp.up_proj.biases": "model.safetensors",
64
+ "model.layers.10.mlp.up_proj.scales": "model.safetensors",
65
+ "model.layers.10.mlp.up_proj.weight": "model.safetensors",
66
+ "model.layers.10.post_attention_layernorm.weight": "model.safetensors",
67
+ "model.layers.10.self_attn.k_proj.biases": "model.safetensors",
68
+ "model.layers.10.self_attn.k_proj.scales": "model.safetensors",
69
+ "model.layers.10.self_attn.k_proj.weight": "model.safetensors",
70
+ "model.layers.10.self_attn.o_proj.biases": "model.safetensors",
71
+ "model.layers.10.self_attn.o_proj.scales": "model.safetensors",
72
+ "model.layers.10.self_attn.o_proj.weight": "model.safetensors",
73
+ "model.layers.10.self_attn.q_proj.biases": "model.safetensors",
74
+ "model.layers.10.self_attn.q_proj.scales": "model.safetensors",
75
+ "model.layers.10.self_attn.q_proj.weight": "model.safetensors",
76
+ "model.layers.10.self_attn.v_proj.biases": "model.safetensors",
77
+ "model.layers.10.self_attn.v_proj.scales": "model.safetensors",
78
+ "model.layers.10.self_attn.v_proj.weight": "model.safetensors",
79
+ "model.layers.11.input_layernorm.weight": "model.safetensors",
80
+ "model.layers.11.mlp.down_proj.biases": "model.safetensors",
81
+ "model.layers.11.mlp.down_proj.scales": "model.safetensors",
82
+ "model.layers.11.mlp.down_proj.weight": "model.safetensors",
83
+ "model.layers.11.mlp.gate_proj.biases": "model.safetensors",
84
+ "model.layers.11.mlp.gate_proj.scales": "model.safetensors",
85
+ "model.layers.11.mlp.gate_proj.weight": "model.safetensors",
86
+ "model.layers.11.mlp.up_proj.biases": "model.safetensors",
87
+ "model.layers.11.mlp.up_proj.scales": "model.safetensors",
88
+ "model.layers.11.mlp.up_proj.weight": "model.safetensors",
89
+ "model.layers.11.post_attention_layernorm.weight": "model.safetensors",
90
+ "model.layers.11.self_attn.k_proj.biases": "model.safetensors",
91
+ "model.layers.11.self_attn.k_proj.scales": "model.safetensors",
92
+ "model.layers.11.self_attn.k_proj.weight": "model.safetensors",
93
+ "model.layers.11.self_attn.o_proj.biases": "model.safetensors",
94
+ "model.layers.11.self_attn.o_proj.scales": "model.safetensors",
95
+ "model.layers.11.self_attn.o_proj.weight": "model.safetensors",
96
+ "model.layers.11.self_attn.q_proj.biases": "model.safetensors",
97
+ "model.layers.11.self_attn.q_proj.scales": "model.safetensors",
98
+ "model.layers.11.self_attn.q_proj.weight": "model.safetensors",
99
+ "model.layers.11.self_attn.v_proj.biases": "model.safetensors",
100
+ "model.layers.11.self_attn.v_proj.scales": "model.safetensors",
101
+ "model.layers.11.self_attn.v_proj.weight": "model.safetensors",
102
+ "model.layers.12.input_layernorm.weight": "model.safetensors",
103
+ "model.layers.12.mlp.down_proj.biases": "model.safetensors",
104
+ "model.layers.12.mlp.down_proj.scales": "model.safetensors",
105
+ "model.layers.12.mlp.down_proj.weight": "model.safetensors",
106
+ "model.layers.12.mlp.gate_proj.biases": "model.safetensors",
107
+ "model.layers.12.mlp.gate_proj.scales": "model.safetensors",
108
+ "model.layers.12.mlp.gate_proj.weight": "model.safetensors",
109
+ "model.layers.12.mlp.up_proj.biases": "model.safetensors",
110
+ "model.layers.12.mlp.up_proj.scales": "model.safetensors",
111
+ "model.layers.12.mlp.up_proj.weight": "model.safetensors",
112
+ "model.layers.12.post_attention_layernorm.weight": "model.safetensors",
113
+ "model.layers.12.self_attn.k_proj.biases": "model.safetensors",
114
+ "model.layers.12.self_attn.k_proj.scales": "model.safetensors",
115
+ "model.layers.12.self_attn.k_proj.weight": "model.safetensors",
116
+ "model.layers.12.self_attn.o_proj.biases": "model.safetensors",
117
+ "model.layers.12.self_attn.o_proj.scales": "model.safetensors",
118
+ "model.layers.12.self_attn.o_proj.weight": "model.safetensors",
119
+ "model.layers.12.self_attn.q_proj.biases": "model.safetensors",
120
+ "model.layers.12.self_attn.q_proj.scales": "model.safetensors",
121
+ "model.layers.12.self_attn.q_proj.weight": "model.safetensors",
122
+ "model.layers.12.self_attn.v_proj.biases": "model.safetensors",
123
+ "model.layers.12.self_attn.v_proj.scales": "model.safetensors",
124
+ "model.layers.12.self_attn.v_proj.weight": "model.safetensors",
125
+ "model.layers.13.input_layernorm.weight": "model.safetensors",
126
+ "model.layers.13.mlp.down_proj.biases": "model.safetensors",
127
+ "model.layers.13.mlp.down_proj.scales": "model.safetensors",
128
+ "model.layers.13.mlp.down_proj.weight": "model.safetensors",
129
+ "model.layers.13.mlp.gate_proj.biases": "model.safetensors",
130
+ "model.layers.13.mlp.gate_proj.scales": "model.safetensors",
131
+ "model.layers.13.mlp.gate_proj.weight": "model.safetensors",
132
+ "model.layers.13.mlp.up_proj.biases": "model.safetensors",
133
+ "model.layers.13.mlp.up_proj.scales": "model.safetensors",
134
+ "model.layers.13.mlp.up_proj.weight": "model.safetensors",
135
+ "model.layers.13.post_attention_layernorm.weight": "model.safetensors",
136
+ "model.layers.13.self_attn.k_proj.biases": "model.safetensors",
137
+ "model.layers.13.self_attn.k_proj.scales": "model.safetensors",
138
+ "model.layers.13.self_attn.k_proj.weight": "model.safetensors",
139
+ "model.layers.13.self_attn.o_proj.biases": "model.safetensors",
140
+ "model.layers.13.self_attn.o_proj.scales": "model.safetensors",
141
+ "model.layers.13.self_attn.o_proj.weight": "model.safetensors",
142
+ "model.layers.13.self_attn.q_proj.biases": "model.safetensors",
143
+ "model.layers.13.self_attn.q_proj.scales": "model.safetensors",
144
+ "model.layers.13.self_attn.q_proj.weight": "model.safetensors",
145
+ "model.layers.13.self_attn.v_proj.biases": "model.safetensors",
146
+ "model.layers.13.self_attn.v_proj.scales": "model.safetensors",
147
+ "model.layers.13.self_attn.v_proj.weight": "model.safetensors",
148
+ "model.layers.14.input_layernorm.weight": "model.safetensors",
149
+ "model.layers.14.mlp.down_proj.biases": "model.safetensors",
150
+ "model.layers.14.mlp.down_proj.scales": "model.safetensors",
151
+ "model.layers.14.mlp.down_proj.weight": "model.safetensors",
152
+ "model.layers.14.mlp.gate_proj.biases": "model.safetensors",
153
+ "model.layers.14.mlp.gate_proj.scales": "model.safetensors",
154
+ "model.layers.14.mlp.gate_proj.weight": "model.safetensors",
155
+ "model.layers.14.mlp.up_proj.biases": "model.safetensors",
156
+ "model.layers.14.mlp.up_proj.scales": "model.safetensors",
157
+ "model.layers.14.mlp.up_proj.weight": "model.safetensors",
158
+ "model.layers.14.post_attention_layernorm.weight": "model.safetensors",
159
+ "model.layers.14.self_attn.k_proj.biases": "model.safetensors",
160
+ "model.layers.14.self_attn.k_proj.scales": "model.safetensors",
161
+ "model.layers.14.self_attn.k_proj.weight": "model.safetensors",
162
+ "model.layers.14.self_attn.o_proj.biases": "model.safetensors",
163
+ "model.layers.14.self_attn.o_proj.scales": "model.safetensors",
164
+ "model.layers.14.self_attn.o_proj.weight": "model.safetensors",
165
+ "model.layers.14.self_attn.q_proj.biases": "model.safetensors",
166
+ "model.layers.14.self_attn.q_proj.scales": "model.safetensors",
167
+ "model.layers.14.self_attn.q_proj.weight": "model.safetensors",
168
+ "model.layers.14.self_attn.v_proj.biases": "model.safetensors",
169
+ "model.layers.14.self_attn.v_proj.scales": "model.safetensors",
170
+ "model.layers.14.self_attn.v_proj.weight": "model.safetensors",
171
+ "model.layers.15.input_layernorm.weight": "model.safetensors",
172
+ "model.layers.15.mlp.down_proj.biases": "model.safetensors",
173
+ "model.layers.15.mlp.down_proj.scales": "model.safetensors",
174
+ "model.layers.15.mlp.down_proj.weight": "model.safetensors",
175
+ "model.layers.15.mlp.gate_proj.biases": "model.safetensors",
176
+ "model.layers.15.mlp.gate_proj.scales": "model.safetensors",
177
+ "model.layers.15.mlp.gate_proj.weight": "model.safetensors",
178
+ "model.layers.15.mlp.up_proj.biases": "model.safetensors",
179
+ "model.layers.15.mlp.up_proj.scales": "model.safetensors",
180
+ "model.layers.15.mlp.up_proj.weight": "model.safetensors",
181
+ "model.layers.15.post_attention_layernorm.weight": "model.safetensors",
182
+ "model.layers.15.self_attn.k_proj.biases": "model.safetensors",
183
+ "model.layers.15.self_attn.k_proj.scales": "model.safetensors",
184
+ "model.layers.15.self_attn.k_proj.weight": "model.safetensors",
185
+ "model.layers.15.self_attn.o_proj.biases": "model.safetensors",
186
+ "model.layers.15.self_attn.o_proj.scales": "model.safetensors",
187
+ "model.layers.15.self_attn.o_proj.weight": "model.safetensors",
188
+ "model.layers.15.self_attn.q_proj.biases": "model.safetensors",
189
+ "model.layers.15.self_attn.q_proj.scales": "model.safetensors",
190
+ "model.layers.15.self_attn.q_proj.weight": "model.safetensors",
191
+ "model.layers.15.self_attn.v_proj.biases": "model.safetensors",
192
+ "model.layers.15.self_attn.v_proj.scales": "model.safetensors",
193
+ "model.layers.15.self_attn.v_proj.weight": "model.safetensors",
194
+ "model.layers.16.input_layernorm.weight": "model.safetensors",
195
+ "model.layers.16.mlp.down_proj.biases": "model.safetensors",
196
+ "model.layers.16.mlp.down_proj.scales": "model.safetensors",
197
+ "model.layers.16.mlp.down_proj.weight": "model.safetensors",
198
+ "model.layers.16.mlp.gate_proj.biases": "model.safetensors",
199
+ "model.layers.16.mlp.gate_proj.scales": "model.safetensors",
200
+ "model.layers.16.mlp.gate_proj.weight": "model.safetensors",
201
+ "model.layers.16.mlp.up_proj.biases": "model.safetensors",
202
+ "model.layers.16.mlp.up_proj.scales": "model.safetensors",
203
+ "model.layers.16.mlp.up_proj.weight": "model.safetensors",
204
+ "model.layers.16.post_attention_layernorm.weight": "model.safetensors",
205
+ "model.layers.16.self_attn.k_proj.biases": "model.safetensors",
206
+ "model.layers.16.self_attn.k_proj.scales": "model.safetensors",
207
+ "model.layers.16.self_attn.k_proj.weight": "model.safetensors",
208
+ "model.layers.16.self_attn.o_proj.biases": "model.safetensors",
209
+ "model.layers.16.self_attn.o_proj.scales": "model.safetensors",
210
+ "model.layers.16.self_attn.o_proj.weight": "model.safetensors",
211
+ "model.layers.16.self_attn.q_proj.biases": "model.safetensors",
212
+ "model.layers.16.self_attn.q_proj.scales": "model.safetensors",
213
+ "model.layers.16.self_attn.q_proj.weight": "model.safetensors",
214
+ "model.layers.16.self_attn.v_proj.biases": "model.safetensors",
215
+ "model.layers.16.self_attn.v_proj.scales": "model.safetensors",
216
+ "model.layers.16.self_attn.v_proj.weight": "model.safetensors",
217
+ "model.layers.17.input_layernorm.weight": "model.safetensors",
218
+ "model.layers.17.mlp.down_proj.biases": "model.safetensors",
219
+ "model.layers.17.mlp.down_proj.scales": "model.safetensors",
220
+ "model.layers.17.mlp.down_proj.weight": "model.safetensors",
221
+ "model.layers.17.mlp.gate_proj.biases": "model.safetensors",
222
+ "model.layers.17.mlp.gate_proj.scales": "model.safetensors",
223
+ "model.layers.17.mlp.gate_proj.weight": "model.safetensors",
224
+ "model.layers.17.mlp.up_proj.biases": "model.safetensors",
225
+ "model.layers.17.mlp.up_proj.scales": "model.safetensors",
226
+ "model.layers.17.mlp.up_proj.weight": "model.safetensors",
227
+ "model.layers.17.post_attention_layernorm.weight": "model.safetensors",
228
+ "model.layers.17.self_attn.k_proj.biases": "model.safetensors",
229
+ "model.layers.17.self_attn.k_proj.scales": "model.safetensors",
230
+ "model.layers.17.self_attn.k_proj.weight": "model.safetensors",
231
+ "model.layers.17.self_attn.o_proj.biases": "model.safetensors",
232
+ "model.layers.17.self_attn.o_proj.scales": "model.safetensors",
233
+ "model.layers.17.self_attn.o_proj.weight": "model.safetensors",
234
+ "model.layers.17.self_attn.q_proj.biases": "model.safetensors",
235
+ "model.layers.17.self_attn.q_proj.scales": "model.safetensors",
236
+ "model.layers.17.self_attn.q_proj.weight": "model.safetensors",
237
+ "model.layers.17.self_attn.v_proj.biases": "model.safetensors",
238
+ "model.layers.17.self_attn.v_proj.scales": "model.safetensors",
239
+ "model.layers.17.self_attn.v_proj.weight": "model.safetensors",
240
+ "model.layers.2.input_layernorm.weight": "model.safetensors",
241
+ "model.layers.2.mlp.down_proj.biases": "model.safetensors",
242
+ "model.layers.2.mlp.down_proj.scales": "model.safetensors",
243
+ "model.layers.2.mlp.down_proj.weight": "model.safetensors",
244
+ "model.layers.2.mlp.gate_proj.biases": "model.safetensors",
245
+ "model.layers.2.mlp.gate_proj.scales": "model.safetensors",
246
+ "model.layers.2.mlp.gate_proj.weight": "model.safetensors",
247
+ "model.layers.2.mlp.up_proj.biases": "model.safetensors",
248
+ "model.layers.2.mlp.up_proj.scales": "model.safetensors",
249
+ "model.layers.2.mlp.up_proj.weight": "model.safetensors",
250
+ "model.layers.2.post_attention_layernorm.weight": "model.safetensors",
251
+ "model.layers.2.self_attn.k_proj.biases": "model.safetensors",
252
+ "model.layers.2.self_attn.k_proj.scales": "model.safetensors",
253
+ "model.layers.2.self_attn.k_proj.weight": "model.safetensors",
254
+ "model.layers.2.self_attn.o_proj.biases": "model.safetensors",
255
+ "model.layers.2.self_attn.o_proj.scales": "model.safetensors",
256
+ "model.layers.2.self_attn.o_proj.weight": "model.safetensors",
257
+ "model.layers.2.self_attn.q_proj.biases": "model.safetensors",
258
+ "model.layers.2.self_attn.q_proj.scales": "model.safetensors",
259
+ "model.layers.2.self_attn.q_proj.weight": "model.safetensors",
260
+ "model.layers.2.self_attn.v_proj.biases": "model.safetensors",
261
+ "model.layers.2.self_attn.v_proj.scales": "model.safetensors",
262
+ "model.layers.2.self_attn.v_proj.weight": "model.safetensors",
263
+ "model.layers.3.input_layernorm.weight": "model.safetensors",
264
+ "model.layers.3.mlp.down_proj.biases": "model.safetensors",
265
+ "model.layers.3.mlp.down_proj.scales": "model.safetensors",
266
+ "model.layers.3.mlp.down_proj.weight": "model.safetensors",
267
+ "model.layers.3.mlp.gate_proj.biases": "model.safetensors",
268
+ "model.layers.3.mlp.gate_proj.scales": "model.safetensors",
269
+ "model.layers.3.mlp.gate_proj.weight": "model.safetensors",
270
+ "model.layers.3.mlp.up_proj.biases": "model.safetensors",
271
+ "model.layers.3.mlp.up_proj.scales": "model.safetensors",
272
+ "model.layers.3.mlp.up_proj.weight": "model.safetensors",
273
+ "model.layers.3.post_attention_layernorm.weight": "model.safetensors",
274
+ "model.layers.3.self_attn.k_proj.biases": "model.safetensors",
275
+ "model.layers.3.self_attn.k_proj.scales": "model.safetensors",
276
+ "model.layers.3.self_attn.k_proj.weight": "model.safetensors",
277
+ "model.layers.3.self_attn.o_proj.biases": "model.safetensors",
278
+ "model.layers.3.self_attn.o_proj.scales": "model.safetensors",
279
+ "model.layers.3.self_attn.o_proj.weight": "model.safetensors",
280
+ "model.layers.3.self_attn.q_proj.biases": "model.safetensors",
281
+ "model.layers.3.self_attn.q_proj.scales": "model.safetensors",
282
+ "model.layers.3.self_attn.q_proj.weight": "model.safetensors",
283
+ "model.layers.3.self_attn.v_proj.biases": "model.safetensors",
284
+ "model.layers.3.self_attn.v_proj.scales": "model.safetensors",
285
+ "model.layers.3.self_attn.v_proj.weight": "model.safetensors",
286
+ "model.layers.4.input_layernorm.weight": "model.safetensors",
287
+ "model.layers.4.mlp.down_proj.biases": "model.safetensors",
288
+ "model.layers.4.mlp.down_proj.scales": "model.safetensors",
289
+ "model.layers.4.mlp.down_proj.weight": "model.safetensors",
290
+ "model.layers.4.mlp.gate_proj.biases": "model.safetensors",
291
+ "model.layers.4.mlp.gate_proj.scales": "model.safetensors",
292
+ "model.layers.4.mlp.gate_proj.weight": "model.safetensors",
293
+ "model.layers.4.mlp.up_proj.biases": "model.safetensors",
294
+ "model.layers.4.mlp.up_proj.scales": "model.safetensors",
295
+ "model.layers.4.mlp.up_proj.weight": "model.safetensors",
296
+ "model.layers.4.post_attention_layernorm.weight": "model.safetensors",
297
+ "model.layers.4.self_attn.k_proj.biases": "model.safetensors",
298
+ "model.layers.4.self_attn.k_proj.scales": "model.safetensors",
299
+ "model.layers.4.self_attn.k_proj.weight": "model.safetensors",
300
+ "model.layers.4.self_attn.o_proj.biases": "model.safetensors",
301
+ "model.layers.4.self_attn.o_proj.scales": "model.safetensors",
302
+ "model.layers.4.self_attn.o_proj.weight": "model.safetensors",
303
+ "model.layers.4.self_attn.q_proj.biases": "model.safetensors",
304
+ "model.layers.4.self_attn.q_proj.scales": "model.safetensors",
305
+ "model.layers.4.self_attn.q_proj.weight": "model.safetensors",
306
+ "model.layers.4.self_attn.v_proj.biases": "model.safetensors",
307
+ "model.layers.4.self_attn.v_proj.scales": "model.safetensors",
308
+ "model.layers.4.self_attn.v_proj.weight": "model.safetensors",
309
+ "model.layers.5.input_layernorm.weight": "model.safetensors",
310
+ "model.layers.5.mlp.down_proj.biases": "model.safetensors",
311
+ "model.layers.5.mlp.down_proj.scales": "model.safetensors",
312
+ "model.layers.5.mlp.down_proj.weight": "model.safetensors",
313
+ "model.layers.5.mlp.gate_proj.biases": "model.safetensors",
314
+ "model.layers.5.mlp.gate_proj.scales": "model.safetensors",
315
+ "model.layers.5.mlp.gate_proj.weight": "model.safetensors",
316
+ "model.layers.5.mlp.up_proj.biases": "model.safetensors",
317
+ "model.layers.5.mlp.up_proj.scales": "model.safetensors",
318
+ "model.layers.5.mlp.up_proj.weight": "model.safetensors",
319
+ "model.layers.5.post_attention_layernorm.weight": "model.safetensors",
320
+ "model.layers.5.self_attn.k_proj.biases": "model.safetensors",
321
+ "model.layers.5.self_attn.k_proj.scales": "model.safetensors",
322
+ "model.layers.5.self_attn.k_proj.weight": "model.safetensors",
323
+ "model.layers.5.self_attn.o_proj.biases": "model.safetensors",
324
+ "model.layers.5.self_attn.o_proj.scales": "model.safetensors",
325
+ "model.layers.5.self_attn.o_proj.weight": "model.safetensors",
326
+ "model.layers.5.self_attn.q_proj.biases": "model.safetensors",
327
+ "model.layers.5.self_attn.q_proj.scales": "model.safetensors",
328
+ "model.layers.5.self_attn.q_proj.weight": "model.safetensors",
329
+ "model.layers.5.self_attn.v_proj.biases": "model.safetensors",
330
+ "model.layers.5.self_attn.v_proj.scales": "model.safetensors",
331
+ "model.layers.5.self_attn.v_proj.weight": "model.safetensors",
332
+ "model.layers.6.input_layernorm.weight": "model.safetensors",
333
+ "model.layers.6.mlp.down_proj.biases": "model.safetensors",
334
+ "model.layers.6.mlp.down_proj.scales": "model.safetensors",
335
+ "model.layers.6.mlp.down_proj.weight": "model.safetensors",
336
+ "model.layers.6.mlp.gate_proj.biases": "model.safetensors",
337
+ "model.layers.6.mlp.gate_proj.scales": "model.safetensors",
338
+ "model.layers.6.mlp.gate_proj.weight": "model.safetensors",
339
+ "model.layers.6.mlp.up_proj.biases": "model.safetensors",
340
+ "model.layers.6.mlp.up_proj.scales": "model.safetensors",
341
+ "model.layers.6.mlp.up_proj.weight": "model.safetensors",
342
+ "model.layers.6.post_attention_layernorm.weight": "model.safetensors",
343
+ "model.layers.6.self_attn.k_proj.biases": "model.safetensors",
344
+ "model.layers.6.self_attn.k_proj.scales": "model.safetensors",
345
+ "model.layers.6.self_attn.k_proj.weight": "model.safetensors",
346
+ "model.layers.6.self_attn.o_proj.biases": "model.safetensors",
347
+ "model.layers.6.self_attn.o_proj.scales": "model.safetensors",
348
+ "model.layers.6.self_attn.o_proj.weight": "model.safetensors",
349
+ "model.layers.6.self_attn.q_proj.biases": "model.safetensors",
350
+ "model.layers.6.self_attn.q_proj.scales": "model.safetensors",
351
+ "model.layers.6.self_attn.q_proj.weight": "model.safetensors",
352
+ "model.layers.6.self_attn.v_proj.biases": "model.safetensors",
353
+ "model.layers.6.self_attn.v_proj.scales": "model.safetensors",
354
+ "model.layers.6.self_attn.v_proj.weight": "model.safetensors",
355
+ "model.layers.7.input_layernorm.weight": "model.safetensors",
356
+ "model.layers.7.mlp.down_proj.biases": "model.safetensors",
357
+ "model.layers.7.mlp.down_proj.scales": "model.safetensors",
358
+ "model.layers.7.mlp.down_proj.weight": "model.safetensors",
359
+ "model.layers.7.mlp.gate_proj.biases": "model.safetensors",
360
+ "model.layers.7.mlp.gate_proj.scales": "model.safetensors",
361
+ "model.layers.7.mlp.gate_proj.weight": "model.safetensors",
362
+ "model.layers.7.mlp.up_proj.biases": "model.safetensors",
363
+ "model.layers.7.mlp.up_proj.scales": "model.safetensors",
364
+ "model.layers.7.mlp.up_proj.weight": "model.safetensors",
365
+ "model.layers.7.post_attention_layernorm.weight": "model.safetensors",
366
+ "model.layers.7.self_attn.k_proj.biases": "model.safetensors",
367
+ "model.layers.7.self_attn.k_proj.scales": "model.safetensors",
368
+ "model.layers.7.self_attn.k_proj.weight": "model.safetensors",
369
+ "model.layers.7.self_attn.o_proj.biases": "model.safetensors",
370
+ "model.layers.7.self_attn.o_proj.scales": "model.safetensors",
371
+ "model.layers.7.self_attn.o_proj.weight": "model.safetensors",
372
+ "model.layers.7.self_attn.q_proj.biases": "model.safetensors",
373
+ "model.layers.7.self_attn.q_proj.scales": "model.safetensors",
374
+ "model.layers.7.self_attn.q_proj.weight": "model.safetensors",
375
+ "model.layers.7.self_attn.v_proj.biases": "model.safetensors",
376
+ "model.layers.7.self_attn.v_proj.scales": "model.safetensors",
377
+ "model.layers.7.self_attn.v_proj.weight": "model.safetensors",
378
+ "model.layers.8.input_layernorm.weight": "model.safetensors",
379
+ "model.layers.8.mlp.down_proj.biases": "model.safetensors",
380
+ "model.layers.8.mlp.down_proj.scales": "model.safetensors",
381
+ "model.layers.8.mlp.down_proj.weight": "model.safetensors",
382
+ "model.layers.8.mlp.gate_proj.biases": "model.safetensors",
383
+ "model.layers.8.mlp.gate_proj.scales": "model.safetensors",
384
+ "model.layers.8.mlp.gate_proj.weight": "model.safetensors",
385
+ "model.layers.8.mlp.up_proj.biases": "model.safetensors",
386
+ "model.layers.8.mlp.up_proj.scales": "model.safetensors",
387
+ "model.layers.8.mlp.up_proj.weight": "model.safetensors",
388
+ "model.layers.8.post_attention_layernorm.weight": "model.safetensors",
389
+ "model.layers.8.self_attn.k_proj.biases": "model.safetensors",
390
+ "model.layers.8.self_attn.k_proj.scales": "model.safetensors",
391
+ "model.layers.8.self_attn.k_proj.weight": "model.safetensors",
392
+ "model.layers.8.self_attn.o_proj.biases": "model.safetensors",
393
+ "model.layers.8.self_attn.o_proj.scales": "model.safetensors",
394
+ "model.layers.8.self_attn.o_proj.weight": "model.safetensors",
395
+ "model.layers.8.self_attn.q_proj.biases": "model.safetensors",
396
+ "model.layers.8.self_attn.q_proj.scales": "model.safetensors",
397
+ "model.layers.8.self_attn.q_proj.weight": "model.safetensors",
398
+ "model.layers.8.self_attn.v_proj.biases": "model.safetensors",
399
+ "model.layers.8.self_attn.v_proj.scales": "model.safetensors",
400
+ "model.layers.8.self_attn.v_proj.weight": "model.safetensors",
401
+ "model.layers.9.input_layernorm.weight": "model.safetensors",
402
+ "model.layers.9.mlp.down_proj.biases": "model.safetensors",
403
+ "model.layers.9.mlp.down_proj.scales": "model.safetensors",
404
+ "model.layers.9.mlp.down_proj.weight": "model.safetensors",
405
+ "model.layers.9.mlp.gate_proj.biases": "model.safetensors",
406
+ "model.layers.9.mlp.gate_proj.scales": "model.safetensors",
407
+ "model.layers.9.mlp.gate_proj.weight": "model.safetensors",
408
+ "model.layers.9.mlp.up_proj.biases": "model.safetensors",
409
+ "model.layers.9.mlp.up_proj.scales": "model.safetensors",
410
+ "model.layers.9.mlp.up_proj.weight": "model.safetensors",
411
+ "model.layers.9.post_attention_layernorm.weight": "model.safetensors",
412
+ "model.layers.9.self_attn.k_proj.biases": "model.safetensors",
413
+ "model.layers.9.self_attn.k_proj.scales": "model.safetensors",
414
+ "model.layers.9.self_attn.k_proj.weight": "model.safetensors",
415
+ "model.layers.9.self_attn.o_proj.biases": "model.safetensors",
416
+ "model.layers.9.self_attn.o_proj.scales": "model.safetensors",
417
+ "model.layers.9.self_attn.o_proj.weight": "model.safetensors",
418
+ "model.layers.9.self_attn.q_proj.biases": "model.safetensors",
419
+ "model.layers.9.self_attn.q_proj.scales": "model.safetensors",
420
+ "model.layers.9.self_attn.q_proj.weight": "model.safetensors",
421
+ "model.layers.9.self_attn.v_proj.biases": "model.safetensors",
422
+ "model.layers.9.self_attn.v_proj.scales": "model.safetensors",
423
+ "model.layers.9.self_attn.v_proj.weight": "model.safetensors",
424
+ "model.norm.weight": "model.safetensors"
425
+ }
426
+ }
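
The `weight_map` above routes every parameter, including the per-projection `scales` and `biases` tensors produced by the quantization step, to the single shard `model.safetensors`. Below is a minimal sketch of inspecting such an index with `json` and the `safetensors` lazy reader; the local directory name and the tensor key it picks out are illustrative assumptions, not part of this upload.

```python
# Minimal sketch: inspect the weight_map of a single-shard checkpoint.
# Assumes a local download of this repository at `ckpt_dir` (hypothetical path).
import json
from collections import Counter
from pathlib import Path

from safetensors import safe_open

ckpt_dir = Path("ERNIE-4.5-0.3B-PT-8bit")  # assumption: local folder name

index = json.loads((ckpt_dir / "model.safetensors.index.json").read_text())
# Every entry maps to the single shard "model.safetensors".
print(Counter(index["weight_map"].values()))

# Lazily read one quantized projection (packed weight plus its scales)
# without loading the whole shard into memory.
with safe_open(str(ckpt_dir / "model.safetensors"), framework="pt") as f:
    weight = f.get_tensor("model.layers.14.self_attn.q_proj.weight")
    scales = f.get_tensor("model.layers.14.self_attn.q_proj.scales")
    print(weight.shape, weight.dtype, scales.shape)
```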
modeling_ernie4_5.py ADDED
@@ -0,0 +1,1068 @@
1
+ # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from torch.nn.attention import SDPBackend, sdpa_kernel
21
+
22
+ from transformers.activations import ACT2FN
23
+ from transformers.modeling_utils import PreTrainedModel
24
+ from transformers.generation import GenerationMixin
25
+ from transformers.modeling_outputs import (
26
+ BaseModelOutputWithPast,
27
+ CausalLMOutputWithPast,
28
+ )
29
+ from transformers.utils import logging
30
+
31
+ from .configuration_ernie4_5 import Ernie4_5_Config
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ class Ernie4_5_RMSNorm(nn.Module):
38
+ """
39
+ Root Mean Square Layer Normalization (Ernie4_5_RMSNorm) implementation.
40
+
41
+ Ernie4_5_RMSNorm is a simplified version of LayerNorm that focuses on the root mean square of inputs,
42
+ omitting the mean-centering operation. This provides computational efficiency while maintaining
43
+ good performance.
44
+ """
45
+
46
+ def __init__(self, config):
47
+ """
48
+ Initialize Ernie4_5_RMSNorm layer.
49
+
50
+ Args:
51
+ config: Model configuration.
52
+ """
53
+ super().__init__()
54
+ self.hidden_size = config.hidden_size
55
+ self.weight = nn.Parameter(
56
+ torch.ones(self.hidden_size, dtype=torch.get_default_dtype())
57
+ )
58
+ self.variance_epsilon = config.rms_norm_eps
59
+
60
+ def forward(self, hidden_states):
61
+ """
62
+ Apply RMS normalization to input hidden states.
63
+
64
+ Args:
65
+ hidden_states (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
66
+
67
+ Returns:
68
+ Tensor: Normalized output tensor of same shape as input
69
+
70
+ Note:
71
+ - computes Ernie4_5_RMSNorm manually:
72
+ 1. Compute variance of features
73
+ 2. Apply reciprocal square root normalization
74
+ 3. Scale by learned weight parameter
75
+ - Maintains original dtype for numerical stability during computation
76
+ """
77
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
78
+ hidden_states = torch.rsqrt(variance + self.variance_epsilon) * hidden_states
79
+ return hidden_states.to(self.weight.dtype) * self.weight
80
+
81
+
82
+ class Ernie4_5_RopeEmbedding(nn.Module):
83
+ """
84
+ Rotary Position Embedding (RoPE) implementation for transformer models.
85
+
86
+ RoPE encodes absolute positional information with rotation matrices and
87
+ naturally incorporates relative position information in self-attention.
88
+
89
+ Args:
90
+ head_dim (int): Dimension size of each attention head
91
+ compression_ratio (float, optional): Sequence length compression ratio. Defaults to 1.0.
92
+ base (int, optional): Base value for frequency calculation. Defaults to 10000.
93
+
94
+ Attributes:
95
+ head_dim (int): Dimension size of each attention head
96
+ compression_ratio (float): Sequence length compression factor
97
+ base (int): Base value for frequency calculation
98
+ """
99
+
100
+ def __init__(self, head_dim, compression_ratio=1.0, base=10000):
101
+ """
102
+ Initialize RoPE embedding layer.
103
+
104
+ Args:
105
+ head_dim: Dimension of each attention head
106
+ compression_ratio: Scaling factor for position indices
107
+ base: Base value for frequency calculation
108
+ """
109
+ super().__init__()
110
+ self.head_dim = head_dim
111
+ self.compression_ratio = compression_ratio
112
+ self.base = base
113
+
114
+ def forward(self, seq_length, position_ids=None):
115
+ """
116
+ Compute rotary position embeddings for given sequence length.
117
+
118
+ Args:
119
+ seq_length (int): Maximum sequence length
120
+ position_ids (Tensor, optional): Custom position indices. Defaults to None.
121
+
122
+ Returns:
123
+ Tensor: Rotary position embeddings of shape [1, 1, seq_length, head_dim]
124
+ """
125
+ indices = torch.arange(0, self.head_dim, 2, dtype=torch.float32)
126
+ indices = 1 / self.base ** (indices / self.head_dim)
127
+ if position_ids is None:
128
+ position_ids = torch.arange(
129
+ 0, seq_length, 1, dtype=torch.float32
130
+ ).unsqueeze(1)
131
+ position_ids = position_ids / self.compression_ratio
132
+ sinusoid_inp = position_ids * indices.unsqueeze(0)
133
+ else:
134
+ position_ids = position_ids / self.compression_ratio
135
+ seq_length = position_ids.shape[-1]
136
+ sinusoid_inp = position_ids.unsqueeze(-1).to(
137
+ torch.float32
138
+ ) * indices.unsqueeze(0)
139
+ pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
140
+ pos_emb = pos_emb.view(-1, 1, seq_length, self.head_dim)
141
+ pos_emb = pos_emb.detach()
142
+ return pos_emb
143
+
144
+ def apply_rotary(self, rp, q, k):
145
+ """
146
+ Apply rotary position embeddings to queries and keys.
147
+
148
+ Args:
149
+ rp (Tensor): Rotary position embeddings
150
+ q (Tensor): Query tensor [batch, heads, seq_len, dim]
151
+ k (Tensor): Key tensor [batch, heads, seq_len, dim]
152
+
153
+ Returns:
154
+ Tuple[Tensor, Tensor]: Rotated queries and keys
155
+ """
156
+ sin, cos = torch.chunk(rp.to(q.device), 2, dim=-1)
157
+ # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
158
+ sin_pos = torch.stack([sin, sin], dim=-1).reshape(rp.shape)
159
+ # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
160
+ cos_pos = torch.stack([cos, cos], dim=-1).reshape(rp.shape)
161
+ # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2]
162
+ rotate_half_q = torch.stack(
163
+ [-q[:, :, :, 1::2], q[:, :, :, 0::2]], dim=-1
164
+ ).reshape(q.shape)
165
+ query = (q.to(torch.float32) * cos_pos) + (
166
+ rotate_half_q.to(torch.float32) * sin_pos
167
+ )
168
+ # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2]
169
+ rotate_half_k = torch.stack(
170
+ [-k[:, :, :, 1::2], k[:, :, :, 0::2]], dim=-1
171
+ ).reshape(k.shape)
172
+ key = (k.to(torch.float32) * cos_pos) + (
173
+ rotate_half_k.to(torch.float32) * sin_pos
174
+ )
175
+ return query, key
176
+
177
+
178
+ class Ernie4_5_FusedDropoutImpl(nn.Module):
179
+ """
180
+ Fused dropout implementation with residual connection support.
181
+
182
+ This layer combines dropout and residual addition in a single operation for better performance,
183
+ particularly on GPU devices. The dropout is conditionally applied based on the probability.
184
+
185
+ Args:
186
+ prob (float): Dropout probability (between 0 and 1)
187
+
188
+ Attributes:
189
+ prob (float): Stores the dropout probability
190
+ dropout (nn.Dropout): The actual dropout layer instance
191
+ """
192
+
193
+ def __init__(self, prob):
194
+ """
195
+ Initialize the fused dropout layer.
196
+
197
+ Args:
198
+ prob (float): Dropout probability (0 means no dropout)
199
+ """
200
+ super().__init__()
201
+ self.prob = prob
202
+ self.dropout = nn.Dropout(p=prob)
203
+
204
+ def forward(self, x, y):
205
+ """
206
+ Forward pass of the fused dropout layer.
207
+
208
+ Args:
209
+ x (Tensor): Input tensor to potentially apply dropout
210
+ y (Tensor): Residual tensor to add to the (possibly dropped out) x
211
+
212
+ Returns:
213
+ Tensor: Result of x (with optional dropout) + y
214
+ """
215
+ if self.prob > 0:
216
+ x = self.dropout(x)
217
+ output = x + y
218
+
219
+ return output
220
+
221
+
222
+ class Ernie4_5_MLP(nn.Module):
223
+ """
224
+ Ernie4_5_MLP - Gated Multi-Layer Perceptron module used in Ernie model.
225
+ """
226
+
227
+ def __init__(self, config, layer_idx=0):
228
+ """
229
+ Initialize the MLP module with configuration options.
230
+
231
+ Args:
232
+ config: Model configurations.
233
+ layer_idx (int): Index of current layer (default: 0)
234
+ """
235
+ super().__init__()
236
+ self.config = config
237
+ self.layer_idx = layer_idx
238
+ self.hidden_size = config.hidden_size
239
+ self.intermediate_size = config.intermediate_size
240
+
241
+ self.gate_proj = nn.Linear(
242
+ self.hidden_size, self.intermediate_size, bias=config.use_bias
243
+ )
244
+ self.up_proj = nn.Linear(
245
+ self.hidden_size, self.intermediate_size, bias=config.use_bias
246
+ )
247
+ self.down_proj = nn.Linear(
248
+ self.intermediate_size, self.hidden_size, bias=config.use_bias
249
+ )
250
+ self.act_fn = ACT2FN[config.hidden_act]
251
+
252
+ def forward(self, x):
253
+ """
254
+ Args:
255
+ x (Tensor): shape [batch_size, seq_len, hidden_size]
256
+
257
+ Returns:
258
+ Tensor: shape [batch_size, seq_len, hidden_size]
259
+ """
260
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
261
+ return down_proj
262
+
263
+
264
+ class Ernie4_5_Attention(nn.Module):
265
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
266
+
267
+ def __init__(self, config, layer_idx=0):
268
+ """Initialize the attention layer.
269
+
270
+ Args:
271
+ config: Model configuration.
272
+ layer_idx (int, optional): Index in transformer stack. Defaults to 0.
273
+ """
274
+ super().__init__()
275
+ self.layer_idx = layer_idx
276
+ self.hidden_size = config.hidden_size
277
+ self.num_heads = config.num_attention_heads
278
+ self.num_key_value_heads = config.num_key_value_heads
279
+
280
+ if config.head_dim is None:
281
+ self.head_dim = self.hidden_size // self.num_heads
282
+ else:
283
+ self.head_dim = config.head_dim
284
+
285
+ self.is_gqa = (
286
+ self.num_key_value_heads is not None
287
+ and self.num_key_value_heads != self.num_heads
288
+ )
289
+
290
+ if self.is_gqa:
291
+ logger.info(
292
+ f"use GQA - num_heads: {self.num_heads}- num_key_value_heads: {self.num_key_value_heads}"
293
+ )
294
+ assert (
295
+ self.num_heads % self.num_key_value_heads == 0
296
+ ), f"num_heads: {self.num_heads}, num_key_value_heads: {self.num_key_value_heads}"
297
+ kv_hidden_size = self.head_dim * self.num_key_value_heads
298
+ q_hidden_size = self.head_dim * self.num_heads
299
+ else:
300
+ q_hidden_size = kv_hidden_size = self.head_dim * self.num_heads
301
+
302
+ self.q_proj = nn.Linear(self.hidden_size, q_hidden_size, bias=config.use_bias)
303
+ self.k_proj = nn.Linear(self.hidden_size, kv_hidden_size, bias=config.use_bias)
304
+ self.v_proj = nn.Linear(self.hidden_size, kv_hidden_size, bias=config.use_bias)
305
+ self.o_proj = nn.Linear(q_hidden_size, self.hidden_size, bias=config.use_bias)
306
+
307
+ self.rotary_emb = Ernie4_5_RopeEmbedding(
308
+ self.head_dim,
309
+ compression_ratio=config.compression_ratio,
310
+ base=config.rope_theta,
311
+ )
312
+ self.config = config
313
+
314
+ self.set_attn_func()
315
+
316
+ def set_attn_func(self):
317
+ """Configure attention function based on settings.
318
+
319
+ Selects between flash/core attention.
320
+ """
321
+ config = self.config
322
+ if config.use_flash_attention:
323
+ self.attn_func = self._flash_attention_wrapper
324
+ else:
325
+ self.attn_func = self.core_attn
326
+
327
+ def forward(
328
+ self,
329
+ hidden_states,
330
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
331
+ attention_mask: Optional[torch.Tensor] = None,
332
+ attn_mask_start_row_indices: Optional[torch.Tensor] = None,
333
+ position_ids: Optional[Tuple[torch.Tensor]] = None,
334
+ output_attentions: bool = False,
335
+ use_cache: bool = False,
336
+ token_type_ids: Optional[Tuple[torch.Tensor]] = None,
337
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
338
+ """Compute attention outputs.
339
+
340
+ Args:
341
+ hidden_states (torch.Tensor): Input tensor [bsz, seq_len, hidden_size]
342
+ past_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): Cached key/value states
343
+ attention_mask (Optional[torch.Tensor]): Attention mask tensor
344
+ attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length attention indices
345
+ position_ids (Optional[torch.Tensor]): Position indices for RoPE
346
+ output_attentions (bool): Return attention weights if True
347
+ use_cache (bool): Cache key/value states if True
348
+
349
+ Returns:
350
+ Tuple containing:
351
+ - attention_output: [bsz, seq_len, hidden_size]
352
+ - attention_weights: Optional attention probabilities
353
+ - updated_key_value_cache: Optional updated cache
354
+ """
355
+ if token_type_ids is not None:
356
+ token_type_ids = token_type_ids[:, :-1]
357
+
358
+ bsz, q_len, _ = hidden_states.shape
359
+
360
+ query_states = self.q_proj(hidden_states).reshape(
361
+ [bsz, q_len, -1, self.head_dim]
362
+ )
363
+ key_states = self.k_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim])
364
+ value_states = self.v_proj(hidden_states).reshape(
365
+ [bsz, q_len, -1, self.head_dim]
366
+ )
367
+
368
+ attn_output, attn_weights, past_key_value = self.rope_attn(
369
+ query_states=query_states,
370
+ key_states=key_states,
371
+ value_states=value_states,
372
+ attention_mask=attention_mask,
373
+ position_ids=position_ids,
374
+ output_attentions=output_attentions,
375
+ past_key_value=past_key_value,
376
+ use_cache=use_cache,
377
+ attn_mask_start_row_indices=attn_mask_start_row_indices,
378
+ )
379
+
380
+ attn_output = self.o_proj(attn_output)
381
+
382
+ if not output_attentions:
383
+ attn_weights = None
384
+
385
+ return attn_output, attn_weights, past_key_value
386
+
387
+ def repeat_kv(self, hidden_states, n_rep):
388
+ """
389
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
390
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
391
+ """
392
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
393
+ if n_rep == 1:
394
+ return hidden_states
395
+ hidden_states = hidden_states[:, :, None, :, :].expand(
396
+ batch, num_key_value_heads, n_rep, slen, head_dim
397
+ )
398
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
399
+
400
+ def _flash_attention_wrapper(
401
+ self,
402
+ q,
403
+ k,
404
+ v,
405
+ attention_mask=None,
406
+ attn_mask_start_row_indices=None,
407
+ seq_length=None,
408
+ ):
409
+ """Wrapper for flash attention implementation.
410
+
411
+ Args:
412
+ q (torch.Tensor): Query tensor
413
+ k (torch.Tensor): Key tensor
414
+ v (torch.Tensor): Value tensor
415
+ attention_mask (Optional[torch.Tensor]): Attention mask
416
+ attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
417
+ seq_length (Optional[int]): Sequence length
418
+
419
+ Returns:
420
+ Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
421
+ """
422
+ q = q.transpose(1, 2)
423
+ k = k.transpose(1, 2)
424
+ v = v.transpose(1, 2)
425
+
426
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
427
+ out = F.scaled_dot_product_attention(
428
+ q,
429
+ k,
430
+ v,
431
+ attn_mask=None,
432
+ dropout_p=self.config.attention_probs_dropout_prob,
433
+ is_causal=q.shape[2] != 1,
434
+ scale=1
435
+ / (getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5),
436
+ enable_gqa=self.is_gqa,
437
+ )
438
+ out = out.transpose(1, 2)
439
+ out = out.contiguous().view(out.size(0), out.size(1), -1)
440
+
441
+ return out, None
442
+
443
+ def core_attn(
444
+ self,
445
+ q,
446
+ k,
447
+ v,
448
+ attention_mask=None,
449
+ attn_mask_start_row_indices=None,
450
+ seq_length=None,
451
+ ):
452
+ """Standard self-attention implementation.
453
+
454
+ Args:
455
+ q (torch.Tensor): Query tensor
456
+ k (torch.Tensor): Key tensor
457
+ v (torch.Tensor): Value tensor
458
+ attention_mask (Optional[torch.Tensor]): Attention mask
459
+ attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
460
+ seq_length (Optional[int]): Sequence length
461
+
462
+ Returns:
463
+ Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
464
+ """
465
+ origin_dtype = q.dtype
466
+
467
+ q = q.permute(0, 2, 1, 3)
468
+ k = k.permute(0, 2, 1, 3)
469
+ v = v.permute(0, 2, 1, 3)
470
+
471
+ scale_qk_coeff = (
472
+ getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5
473
+ )
474
+
475
+ q = q / scale_qk_coeff
476
+
477
+ # Handle GQA case - repeat k and v heads to match q heads
478
+ if self.is_gqa:
479
+ # [batch, num_key_value_heads, seq_len, head_dim] -> [batch, num_heads, seq_len, head_dim]
480
+ repeat_factor = self.num_heads // self.num_key_value_heads
481
+ k = self.repeat_kv(k, repeat_factor)
482
+ v = self.repeat_kv(v, repeat_factor)
483
+
484
+ attn_scores = torch.matmul(q, k.transpose(-2, -1))
485
+
486
+ if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
487
+ attn_scores = attn_scores * getattr(self.config, "scale_qk_coeff", 1.0)
488
+
489
+ # Causal mask
490
+ seq_len = attn_scores.size(-1)
491
+ mask = torch.triu(
492
+ torch.ones((seq_len, seq_len), dtype=torch.bool, device=attn_scores.device),
493
+ diagonal=1,
494
+ )
495
+ attn_scores = attn_scores.masked_fill(mask, float("-inf"))
496
+ attn_weights = F.softmax(attn_scores, dim=-1)
497
+
498
+ attn_weights = attn_weights.to(origin_dtype)
499
+
500
+ # attention_probs_dropout_prob default 0.0
501
+ if getattr(self.config, "attention_probs_dropout_prob", 0.0) > 0:
502
+ attn_weights = F.dropout(
503
+ attn_weights,
504
+ p=self.config.attention_probs_dropout_prob,
505
+ training=self.training,
506
+ )
507
+
508
+ # [batch, num_heads, q_len, k_len] @ [batch, num_heads, k_len, head_dim] -> [batch, num_heads, q_len, head_dim]
509
+ out = torch.matmul(attn_weights, v)
510
+
511
+ # [batch, num_heads, seq_len, head_dim] -> [batch, seq_len, num_heads, head_dim]
512
+ out = out.permute(0, 2, 1, 3)
513
+ # [batch, seq_len, hidden_size]
514
+ out = out.contiguous().view(out.size(0), out.size(1), -1)
515
+
516
+ return out, attn_weights
517
+
518
+ def rope_attn(
519
+ self,
520
+ query_states,
521
+ key_states,
522
+ value_states,
523
+ attention_mask,
524
+ position_ids,
525
+ output_attentions=False,
526
+ past_key_value=None,
527
+ use_cache=False,
528
+ attn_mask_start_row_indices=None,
529
+ ):
530
+ """Attention computation with rotary embeddings.
531
+
532
+ Args:
533
+ query_states (torch.Tensor): Query states
534
+ key_states (torch.Tensor): Key states
535
+ value_states (torch.Tensor): Value states
536
+ attention_mask (Optional[torch.Tensor]): Attention mask
537
+ position_ids (Optional[torch.Tensor]): Position indices
538
+ output_attentions (bool): Return attention weights
539
+ past_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): Cached states
540
+ use_cache (bool): Cache new states
541
+ attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
542
+
543
+ Returns:
544
+ Tuple containing:
545
+ - attention_output: Result tensor
546
+ - attention_weights: Optional weights
547
+ - updated_key_value_cache: Optional cache
548
+ """
549
+
550
+ query_states_dtype = query_states.dtype
551
+
552
+ kv_seq_len = key_states.shape[-3]
553
+ offset = 0
554
+ if past_key_value is not None:
555
+ offset = past_key_value[0].shape[-3]
556
+ kv_seq_len += offset
557
+
558
+ cos_sin = self.rotary_emb(kv_seq_len).permute(
559
+ [0, 2, 1, 3]
560
+ ) # [b,h,s,d]->[b,s,h,d]
561
+ if offset > 0:
562
+ cos_sin = cos_sin[:, offset:]
563
+ query_states, key_states = self.rotary_emb.apply_rotary(
564
+ cos_sin, query_states, key_states
565
+ )
566
+
567
+ query_states = query_states.to(query_states_dtype)
568
+ key_states = key_states.to(query_states_dtype)
569
+ if past_key_value is not None:
570
+ # reuse k, v, self_attention
571
+ key_states = torch.cat([past_key_value[0], key_states], dim=1)
572
+ value_states = torch.cat([past_key_value[1], value_states], dim=1)
573
+
574
+ # shape: [2, b, s, kvh, d]
575
+ past_key_value = [key_states, value_states] if use_cache else None
576
+ seq_length = query_states.shape[1]
577
+ attn_output, attn_weights = self.attn_func(
578
+ query_states,
579
+ key_states,
580
+ value_states,
581
+ attention_mask,
582
+ attn_mask_start_row_indices,
583
+ seq_length,
584
+ )
585
+ return attn_output, attn_weights, past_key_value
586
+
587
+
588
+ class Ernie4_5_DecoderLayer(nn.Module):
589
+ """
590
+ A single transformer decoder layer in ERNIE model.
591
+ """
592
+
593
+ def __init__(self, config, layer_idx):
594
+ """Initialize the decoder layer.
595
+
596
+ Args:
597
+ config: Model configuration.
598
+ layer_idx (int): Index of this layer in the transformer stack
599
+ """
600
+ super().__init__()
601
+ self.hidden_size = config.hidden_size
602
+ self.layer_idx = layer_idx
603
+ self.config = config
604
+
605
+ self.self_attn = Ernie4_5_Attention(config, layer_idx)
606
+ self.mlp = Ernie4_5_MLP(config)
607
+
608
+ self.input_layernorm = Ernie4_5_RMSNorm(config)
609
+ self.post_attention_layernorm = Ernie4_5_RMSNorm(config)
610
+
611
+ self.residual_add1 = Ernie4_5_FusedDropoutImpl(config.hidden_dropout_prob)
612
+ self.residual_add2 = Ernie4_5_FusedDropoutImpl(config.hidden_dropout_prob)
613
+
614
+ def forward(
615
+ self,
616
+ hidden_states: torch.Tensor,
617
+ attention_mask: Optional[torch.Tensor] = None,
618
+ attn_mask_start_row_indices: Optional[torch.Tensor] = None,
619
+ position_ids: Optional[torch.Tensor] = None,
620
+ token_type_ids: Optional[torch.Tensor] = None,
621
+ output_attentions: Optional[bool] = False,
622
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
623
+ use_cache: Optional[bool] = False,
624
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
625
+ """Forward pass through the decoder layer.
626
+
627
+ Args:
628
+ hidden_states (torch.Tensor): Input tensor [batch_size, seq_len, hidden_size]
629
+ attention_mask (Optional[torch.Tensor]): Attention mask tensor
630
+ attn_mask_start_row_indices (Optional[torch.Tensor]): Indices for variable length attention
631
+ position_ids (Optional[torch.Tensor]): Position indices for rotary embeddings
632
+ output_attentions (Optional[bool]): Whether to return attention weights
633
+ past_key_value (Optional[Tuple[torch.Tensor]]): Cached key/value states
634
+ use_cache (Optional[bool]): Whether to cache key/value states
635
+
636
+ Returns:
637
+ Union: Various output combinations depending on arguments:
638
+ - Base case: Hidden states tensor
639
+ - With attention: Tuple of (hidden_states, attention_weights)
640
+ - With cache: Tuple of (hidden_states, cached_key_value)
641
+ """
642
+ residual = hidden_states
643
+
644
+ hidden_states = self.input_layernorm(hidden_states)
645
+
646
+ # Self Attention
647
+ (hidden_states, self_attn_weights, present_key_value) = self.self_attn(
648
+ hidden_states=hidden_states,
649
+ past_key_value=past_key_value,
650
+ attention_mask=attention_mask,
651
+ attn_mask_start_row_indices=attn_mask_start_row_indices,
652
+ position_ids=position_ids,
653
+ output_attentions=output_attentions,
654
+ use_cache=use_cache,
655
+ token_type_ids=token_type_ids,
656
+ )
657
+ hidden_states = self.residual_add1(hidden_states, residual)
658
+
659
+ # Fully Connected
660
+ residual = hidden_states
661
+ hidden_states = self.post_attention_layernorm(hidden_states)
662
+ hidden_states = self.mlp(hidden_states)
663
+
664
+ hidden_states = self.residual_add2(hidden_states, residual)
665
+ outputs = (hidden_states,)
666
+
667
+ if output_attentions:
668
+ outputs += (self_attn_weights,)
669
+
670
+ if use_cache:
671
+ outputs += (present_key_value,)
672
+
673
+ if type(outputs) is tuple and len(outputs) == 1:
674
+ outputs = outputs[0]
675
+
676
+ return outputs
677
+
678
+
679
+ class Ernie4_5_PretrainedModel(PreTrainedModel):
680
+ """Base class for ERNIE pretrained models."""
681
+
682
+ config_class = Ernie4_5_Config
683
+ base_model_prefix = "ernie"
684
+
685
+
686
+ class Ernie4_5_Model(Ernie4_5_PretrainedModel):
687
+
688
+ def __init__(self, config):
689
+ """Initialize the ERNIE model architecture.
690
+
691
+ Args:
692
+ config: Model configuration.
693
+ """
694
+ super().__init__(config)
695
+ self.padding_idx = config.pad_token_id
696
+ self.vocab_size = config.vocab_size
697
+ self.hidden_size = config.hidden_size
698
+ self.config = config
699
+
700
+ self.embed_tokens = nn.Embedding(
701
+ self.vocab_size,
702
+ self.hidden_size,
703
+ )
704
+
705
+ self.layers = nn.ModuleList(
706
+ [Ernie4_5_DecoderLayer(config, i) for i in range(config.num_hidden_layers)]
707
+ )
708
+
709
+ self.norm = Ernie4_5_RMSNorm(config)
710
+
711
+ self.gradient_checkpointing = False
712
+
713
+ def get_input_embeddings(self):
714
+ """Get the input embedding layer.
715
+
716
+ Returns:
717
+ nn.Embedding: The embedding layer for input tokens
718
+ """
719
+ return self.embed_tokens
720
+
721
+ def set_input_embeddings(self, value):
722
+ """Set new input embeddings.
723
+
724
+ Args:
725
+ value (nn.Embedding): New embedding layer to use
726
+ """
727
+ self.embed_tokens = value
728
+
729
+ def forward(
730
+ self,
731
+ input_ids=None,
732
+ position_ids=None,
733
+ token_type_ids=None,
734
+ attention_mask=None,
735
+ attn_mask_start_row_indices=None,
736
+ inputs_embeds=None,
737
+ use_cache=None,
738
+ past_key_values=None,
739
+ output_attentions=False,
740
+ output_hidden_states=None,
741
+ return_dict=False,
742
+ ):
743
+ """Forward pass through the ERNIE model.
744
+
745
+ Args:
746
+ input_ids (Optional[torch.Tensor]): Input token IDs
747
+ position_ids (Optional[torch.Tensor]): Position indices
748
+ attention_mask (Optional[torch.Tensor]): Attention mask
749
+ attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length attention indices
750
+ inputs_embeds (Optional[torch.Tensor]): Precomputed embeddings
751
+ use_cache (Optional[bool]): Whether to cache key/value states
752
+ past_key_values (Optional[Tuple[Tuple[torch.Tensor]]]): Cached key/value states
753
+ output_attentions (Optional[bool]): Whether to output attention weights
754
+ output_hidden_states (Optional[bool]): Whether to output all hidden states
755
+ return_dict (Optional[bool]): Whether to return dict or tuple
756
+
757
+ Returns:
758
+ Union[Tuple, BaseModelOutputWithPast]:
759
+ Various outputs depending on configuration, including:
760
+ - last_hidden_state: Final layer hidden states
761
+ - past_key_values: Cached key/value states if use_cache=True
762
+ - hidden_states: All hidden states if output_hidden_states=True
763
+ - attentions: Attention weights if output_attentions=True
764
+ """
765
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
766
+
767
+ # retrieve input_ids and inputs_embeds
768
+ if input_ids is not None and inputs_embeds is not None:
769
+ raise ValueError(
770
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
771
+ )
772
+ elif input_ids is not None:
773
+ _, seq_length = input_ids.shape
774
+ elif inputs_embeds is not None:
775
+ _, seq_length, _ = inputs_embeds.shape
776
+ else:
777
+ raise ValueError(
778
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
779
+ )
780
+
781
+ if past_key_values is None:
782
+ past_key_values = tuple([None] * len(self.layers))
783
+
784
+ if inputs_embeds is None:
785
+ inputs_embeds = self.embed_tokens(input_ids)
786
+ inputs_embeds = inputs_embeds.to(self.embed_tokens.weight.dtype)
787
+
788
+ hidden_states = inputs_embeds
789
+
790
+ # decoder layers
791
+ all_hidden_states = () if output_hidden_states else None
792
+ all_self_attns = () if output_attentions else None
793
+ next_decoder_cache = () if use_cache else None
794
+
795
+ for idx, (decoder_layer) in enumerate(self.layers):
796
+
797
+ if output_hidden_states:
798
+ all_hidden_states += (hidden_states,)
799
+
800
+ past_key_value = (
801
+ past_key_values[idx] if past_key_values is not None else None
802
+ )
803
+
804
+ layer_outputs = decoder_layer(
805
+ hidden_states,
806
+ attention_mask,
807
+ attn_mask_start_row_indices,
808
+ position_ids,
809
+ token_type_ids,
810
+ output_attentions,
811
+ past_key_value,
812
+ use_cache,
813
+ )
814
+
815
+ if isinstance(layer_outputs, (tuple, list)):
816
+ hidden_states = layer_outputs[0]
817
+ else:
818
+ hidden_states = layer_outputs
819
+
820
+ if use_cache:
821
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
822
+
823
+ if output_attentions:
824
+ all_self_attns += (layer_outputs[1],)
825
+
826
+ # apply kv cache
827
+ if past_key_value is not None:
828
+ hidden_states = hidden_states[:, -1:, :]
829
+
830
+ hidden_states = self.norm(hidden_states)
831
+
832
+ # add hidden states from the last decoder layer
833
+ if output_hidden_states:
834
+ all_hidden_states += (hidden_states,)
835
+
836
+ next_cache = next_decoder_cache if use_cache else None
837
+
838
+ if not return_dict:
839
+ return tuple(
840
+ v
841
+ for v in [
842
+ hidden_states,
843
+ next_cache,
844
+ all_hidden_states,
845
+ all_self_attns,
846
+ ]
847
+ if v is not None
848
+ )
849
+
850
+ return BaseModelOutputWithPast(
851
+ last_hidden_state=hidden_states,
852
+ past_key_values=next_cache,
853
+ hidden_states=all_hidden_states,
854
+ attentions=all_self_attns,
855
+ )
856
+
857
+
858
+ class Ernie4_5_LMHead(nn.Module):
859
+ """Language model head for ERNIE"""
860
+
861
+ def __init__(self, config):
862
+ """Initialize the language model head.
863
+
864
+ Args:
865
+ config: Model configuration containing:
866
+ - vocab_size: Size of vocabulary
867
+ - hidden_size: Dimension of hidden states
868
+ - tie_word_embeddings: Whether to tie input/output embeddings
869
+ - weight_share_add_bias: Whether to add bias when weight sharing
870
+ - use_bias: Whether to use bias term
871
+ """
872
+
873
+ super(Ernie4_5_LMHead, self).__init__()
874
+ self.config = config
875
+ vocab_size = config.vocab_size
876
+
877
+ if config.tie_word_embeddings:
878
+ # Weight of shape [vocab_size, hidden_size]
879
+ self.weight = nn.Parameter(
880
+ torch.empty(
881
+ vocab_size, config.hidden_size, dtype=torch.get_default_dtype()
882
+ )
883
+ )
884
+ else:
885
+ # Weight of shape [hidden_size, vocab_size]
886
+ self.weight = nn.Parameter(
887
+ torch.empty(
888
+ config.hidden_size, vocab_size, dtype=torch.get_default_dtype()
889
+ )
890
+ )
891
+ nn.init.xavier_uniform_(self.weight)
892
+
893
+ logger.info(
894
+ f"output-weight: {self.weight.shape}, tie_word_embeddings: {config.tie_word_embeddings}"
895
+ )
896
+
897
+ if config.weight_share_add_bias and config.use_bias:
898
+ self.bias = nn.Parameter(
899
+ torch.zeros(vocab_size, dtype=torch.get_default_dtype())
900
+ )
901
+ else:
902
+ self.bias = None
903
+
904
+ def forward(self, hidden_states):
905
+ """Project hidden states to vocabulary logits.
906
+
907
+ Args:
908
+ hidden_states (torch.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
909
+
910
+ Returns:
911
+ Logits tensor of shape [batch_size, seq_len, vocab_size]
912
+ """
913
+ return self.calc_lm_head_logits(
914
+ self.config, hidden_states, self.weight, self.bias
915
+ )
916
+
917
+ def calc_lm_head_logits(self, config, hidden_states, weight, bias):
918
+ """
919
+ Calculate language model head logits.
920
+
921
+ This is the core function that computes the final output logits for a language model.
922
+
923
+ Args:
924
+ config: Model configuration.
925
+ hidden_states (Tensor): Hidden states from the transformer layers
926
+ weight (Tensor): Weight matrix for the language model head
927
+ bias (Tensor): Bias vector for the language model head
928
+
929
+ Returns:
930
+ Tensor: The computed logits for language modeling.
931
+ """
932
+
933
+ if config.tie_word_embeddings:
934
+ logits = torch.matmul(hidden_states, weight.T)
935
+ else:
936
+ logits = torch.matmul(hidden_states, weight)
937
+
938
+ if bias is not None:
939
+ logits = logits + bias
940
+
941
+ return logits
942
+
943
+
944
+ class Ernie4_5_ForCausalLM(Ernie4_5_PretrainedModel, GenerationMixin):
945
+ """ERNIE model for causal language modeling."""
946
+
947
+ _tied_weights_keys = ["lm_head.weight"]
948
+ _tp_plan = {"lm_head": "colwise_rep"}
949
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
950
+
951
+ def __init__(self, config):
952
+ """
953
+ Initializes the ERNIE model for causal language modeling.
954
+
955
+ Args:
956
+ config: Model configuration.
957
+ """
958
+ super().__init__(config)
959
+
960
+ self.config = config
961
+ self.model = Ernie4_5_Model(config)
962
+ self.lm_head = Ernie4_5_LMHead(config)
963
+
964
+ # Initialize weights and apply final processing
965
+ self.post_init()
966
+
967
+ @torch.no_grad()
968
+ def set_state_dict(self, state_dict, *args, **kwargs):
969
+ """
970
+ Loads the model state dictionary.
971
+ """
972
+ ret = super().set_state_dict(state_dict)
973
+ return ret
974
+
975
+ def get_input_embeddings(self):
976
+ """Returns the input embeddings layer."""
977
+ return self.model.embed_tokens
978
+
979
+ def set_input_embeddings(self, value):
980
+ """Sets the input embeddings layer."""
981
+ self.model.embed_tokens = value
982
+
983
+ def get_output_embeddings(self):
984
+ """Returns the output embeddings (LM head)."""
985
+ return self.lm_head
986
+
987
+ def set_output_embeddings(self, new_embeddings):
988
+ """Sets the output embeddings layer."""
989
+ self.lm_head = new_embeddings
990
+
991
+ def set_decoder(self, decoder):
992
+ """Sets the ERNIE decoder model."""
993
+ self.model = decoder
994
+
995
+ def get_decoder(self):
996
+ """Gets the ERNIE decoder model."""
997
+ return self.model
998
+
999
+ def forward(
1000
+ self,
1001
+ input_ids,
1002
+ position_ids=None,
1003
+ attention_mask=None,
1004
+ attn_mask_start_row_indices=None,
1005
+ token_type_ids=None,
1006
+ inputs_embeds=None,
1007
+ labels=None,
1008
+ use_cache=False,
1009
+ past_key_values=None,
1010
+ output_attentions=None,
1011
+ output_hidden_states=None,
1012
+ **kwargs,
1013
+ ):
1014
+ """
1015
+ Forward pass for causal language modeling.
1016
+
1017
+ Args:
1018
+ input_ids (torch.Tensor): Input token IDs.
1019
+ position_ids (torch.Tensor): Position IDs.
1020
+ attention_mask (torch.Tensor): Attention mask.
1021
+ attn_mask_start_row_indices (torch.Tensor): Attention mask start indices.
1022
+ inputs_embeds (torch.Tensor): Optional embedded inputs.
1023
+ labels (torch.Tensor): Target labels.
1024
+ use_cache (bool): Whether to use cached hidden states.
1025
+ past_key_values (dict): Pre-computed hidden states.
1026
+ output_attentions (bool): Whether to output attentions.
1027
+ output_hidden_states (bool): Whether to output hidden states.
1028
+
1029
+ Returns:
1030
+ CausalLMOutputWithPast: Model outputs.
1031
+ """
1032
+
1033
+ if past_key_values is not None:
1034
+ input_ids = input_ids[:, -1:]
1035
+
1036
+ outputs = self.model(
1037
+ input_ids,
1038
+ position_ids=position_ids,
1039
+ attention_mask=attention_mask,
1040
+ token_type_ids=token_type_ids,
1041
+ attn_mask_start_row_indices=attn_mask_start_row_indices,
1042
+ inputs_embeds=inputs_embeds,
1043
+ use_cache=use_cache,
1044
+ past_key_values=past_key_values,
1045
+ output_attentions=output_attentions,
1046
+ output_hidden_states=output_hidden_states,
1047
+ return_dict=True,
1048
+ )
1049
+
1050
+ hidden_states = outputs.last_hidden_state
1051
+ logits = self.lm_head(hidden_states)
1052
+
1053
+ loss = None
1054
+ if labels is not None:
1055
+ loss = self.loss_function(
1056
+ logits=logits,
1057
+ labels=labels,
1058
+ vocab_size=self.config.vocab_size,
1059
+ **kwargs,
1060
+ )
1061
+
1062
+ return CausalLMOutputWithPast(
1063
+ loss=loss,
1064
+ logits=logits,
1065
+ past_key_values=outputs.past_key_values,
1066
+ hidden_states=outputs.hidden_states,
1067
+ attentions=outputs.attentions,
1068
+ )
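
`Ernie4_5_ForCausalLM` above is a `transformers` remote-code implementation (note the `GenerationMixin` base and the relative import of `configuration_ernie4_5`), so it is normally driven through the auto classes with `trust_remote_code=True`. The snippet below is a minimal sketch of that path; it assumes the full-precision base checkpoint `baidu/ERNIE-4.5-0.3B-PT` rather than the 8-bit MLX weights in this repository, and the chat prompt is purely illustrative.

```python
# Minimal sketch: run the custom Ernie4_5_ForCausalLM via transformers' auto
# classes. Assumes the full-precision base checkpoint; the 8-bit MLX weights
# in this repository target MLX tooling rather than this PyTorch path.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "baidu/ERNIE-4.5-0.3B-PT"  # base checkpoint (assumed available)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,  # loads modeling_ernie4_5.py / configuration_ernie4_5.py
    torch_dtype=torch.bfloat16,
)
model.eval()

messages = [{"role": "user", "content": "Summarize ERNIE 4.5 in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=64)

print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```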
special_tokens_map.json ADDED
@@ -0,0 +1,1020 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|IMAGE_PLACEHOLDER|>",
4
+ "<|AUDIO_PLACEHOLDER|>",
5
+ "<|LOC_0|>",
6
+ "<|LOC_1|>",
7
+ "<|LOC_2|>",
8
+ "<|LOC_3|>",
9
+ "<|LOC_4|>",
10
+ "<|LOC_5|>",
11
+ "<|LOC_6|>",
12
+ "<|LOC_7|>",
13
+ "<|LOC_8|>",
14
+ "<|LOC_9|>",
15
+ "<|LOC_10|>",
16
+ "<|LOC_11|>",
17
+ "<|LOC_12|>",
18
+ "<|LOC_13|>",
19
+ "<|LOC_14|>",
20
+ "<|LOC_15|>",
21
+ "<|LOC_16|>",
22
+ "<|LOC_17|>",
23
+ "<|LOC_18|>",
24
+ "<|LOC_19|>",
25
+ "<|LOC_20|>",
26
+ "<|LOC_21|>",
27
+ "<|LOC_22|>",
28
+ "<|LOC_23|>",
29
+ "<|LOC_24|>",
30
+ "<|LOC_25|>",
31
+ "<|LOC_26|>",
32
+ "<|LOC_27|>",
33
+ "<|LOC_28|>",
34
+ "<|LOC_29|>",
35
+ "<|LOC_30|>",
36
+ "<|LOC_31|>",
37
+ "<|LOC_32|>",
38
+ "<|LOC_33|>",
39
+ "<|LOC_34|>",
40
+ "<|LOC_35|>",
41
+ "<|LOC_36|>",
42
+ "<|LOC_37|>",
43
+ "<|LOC_38|>",
44
+ "<|LOC_39|>",
45
+ "<|LOC_40|>",
46
+ "<|LOC_41|>",
47
+ "<|LOC_42|>",
48
+ "<|LOC_43|>",
49
+ "<|LOC_44|>",
50
+ "<|LOC_45|>",
51
+ "<|LOC_46|>",
52
+ "<|LOC_47|>",
53
+ "<|LOC_48|>",
54
+ "<|LOC_49|>",
55
+ "<|LOC_50|>",
56
+ "<|LOC_51|>",
57
+ "<|LOC_52|>",
58
+ "<|LOC_53|>",
59
+ "<|LOC_54|>",
60
+ "<|LOC_55|>",
61
+ "<|LOC_56|>",
62
+ "<|LOC_57|>",
63
+ "<|LOC_58|>",
64
+ "<|LOC_59|>",
65
+ "<|LOC_60|>",
66
+ "<|LOC_61|>",
67
+ "<|LOC_62|>",
68
+ "<|LOC_63|>",
69
+ "<|LOC_64|>",
70
+ "<|LOC_65|>",
71
+ "<|LOC_66|>",
72
+ "<|LOC_67|>",
73
+ "<|LOC_68|>",
74
+ "<|LOC_69|>",
75
+ "<|LOC_70|>",
76
+ "<|LOC_71|>",
77
+ "<|LOC_72|>",
78
+ "<|LOC_73|>",
79
+ "<|LOC_74|>",
80
+ "<|LOC_75|>",
81
+ "<|LOC_76|>",
82
+ "<|LOC_77|>",
83
+ "<|LOC_78|>",
84
+ "<|LOC_79|>",
85
+ "<|LOC_80|>",
86
+ "<|LOC_81|>",
87
+ "<|LOC_82|>",
88
+ "<|LOC_83|>",
89
+ "<|LOC_84|>",
90
+ "<|LOC_85|>",
91
+ "<|LOC_86|>",
92
+ "<|LOC_87|>",
93
+ "<|LOC_88|>",
94
+ "<|LOC_89|>",
95
+ "<|LOC_90|>",
96
+ "<|LOC_91|>",
97
+ "<|LOC_92|>",
98
+ "<|LOC_93|>",
99
+ "<|LOC_94|>",
100
+ "<|LOC_95|>",
101
+ "<|LOC_96|>",
102
+ "<|LOC_97|>",
103
+ "<|LOC_98|>",
104
+ "<|LOC_99|>",
105
+ "<|LOC_100|>",
106
+ "<|LOC_101|>",
107
+ "<|LOC_102|>",
108
+ "<|LOC_103|>",
109
+ "<|LOC_104|>",
110
+ "<|LOC_105|>",
111
+ "<|LOC_106|>",
112
+ "<|LOC_107|>",
113
+ "<|LOC_108|>",
114
+ "<|LOC_109|>",
115
+ "<|LOC_110|>",
116
+ "<|LOC_111|>",
117
+ "<|LOC_112|>",
118
+ "<|LOC_113|>",
119
+ "<|LOC_114|>",
120
+ "<|LOC_115|>",
121
+ "<|LOC_116|>",
122
+ "<|LOC_117|>",
123
+ "<|LOC_118|>",
124
+ "<|LOC_119|>",
125
+ "<|LOC_120|>",
126
+ "<|LOC_121|>",
127
+ "<|LOC_122|>",
128
+ "<|LOC_123|>",
129
+ "<|LOC_124|>",
130
+ "<|LOC_125|>",
131
+ "<|LOC_126|>",
132
+ "<|LOC_127|>",
133
+ "<|LOC_128|>",
134
+ "<|LOC_129|>",
135
+ "<|LOC_130|>",
136
+ "<|LOC_131|>",
137
+ "<|LOC_132|>",
138
+ "<|LOC_133|>",
139
+ "<|LOC_134|>",
140
+ "<|LOC_135|>",
141
+ "<|LOC_136|>",
142
+ "<|LOC_137|>",
143
+ "<|LOC_138|>",
144
+ "<|LOC_139|>",
145
+ "<|LOC_140|>",
146
+ "<|LOC_141|>",
147
+ "<|LOC_142|>",
148
+ "<|LOC_143|>",
149
+ "<|LOC_144|>",
150
+ "<|LOC_145|>",
151
+ "<|LOC_146|>",
152
+ "<|LOC_147|>",
153
+ "<|LOC_148|>",
154
+ "<|LOC_149|>",
155
+ "<|LOC_150|>",
156
+ "<|LOC_151|>",
157
+ "<|LOC_152|>",
158
+ "<|LOC_153|>",
159
+ "<|LOC_154|>",
160
+ "<|LOC_155|>",
161
+ "<|LOC_156|>",
162
+ "<|LOC_157|>",
163
+ "<|LOC_158|>",
164
+ "<|LOC_159|>",
165
+ "<|LOC_160|>",
166
+ "<|LOC_161|>",
167
+ "<|LOC_162|>",
168
+ "<|LOC_163|>",
169
+ "<|LOC_164|>",
170
+ "<|LOC_165|>",
171
+ "<|LOC_166|>",
172
+ "<|LOC_167|>",
173
+ "<|LOC_168|>",
174
+ "<|LOC_169|>",
175
+ "<|LOC_170|>",
176
+ "<|LOC_171|>",
177
+ "<|LOC_172|>",
178
+ "<|LOC_173|>",
179
+ "<|LOC_174|>",
180
+ "<|LOC_175|>",
181
+ "<|LOC_176|>",
182
+ "<|LOC_177|>",
183
+ "<|LOC_178|>",
184
+ "<|LOC_179|>",
185
+ "<|LOC_180|>",
186
+ "<|LOC_181|>",
187
+ "<|LOC_182|>",
188
+ "<|LOC_183|>",
189
+ "<|LOC_184|>",
190
+ "<|LOC_185|>",
191
+ "<|LOC_186|>",
192
+ "<|LOC_187|>",
193
+ "<|LOC_188|>",
194
+ "<|LOC_189|>",
195
+ "<|LOC_190|>",
196
+ "<|LOC_191|>",
197
+ "<|LOC_192|>",
198
+ "<|LOC_193|>",
199
+ "<|LOC_194|>",
200
+ "<|LOC_195|>",
201
+ "<|LOC_196|>",
202
+ "<|LOC_197|>",
203
+ "<|LOC_198|>",
204
+ "<|LOC_199|>",
205
+ "<|LOC_200|>",
206
+ "<|LOC_201|>",
207
+ "<|LOC_202|>",
208
+ "<|LOC_203|>",
209
+ "<|LOC_204|>",
210
+ "<|LOC_205|>",
211
+ "<|LOC_206|>",
212
+ "<|LOC_207|>",
213
+ "<|LOC_208|>",
214
+ "<|LOC_209|>",
215
+ "<|LOC_210|>",
216
+ "<|LOC_211|>",
217
+ "<|LOC_212|>",
218
+ "<|LOC_213|>",
219
+ "<|LOC_214|>",
220
+ "<|LOC_215|>",
221
+ "<|LOC_216|>",
222
+ "<|LOC_217|>",
223
+ "<|LOC_218|>",
224
+ "<|LOC_219|>",
225
+ "<|LOC_220|>",
226
+ "<|LOC_221|>",
227
+ "<|LOC_222|>",
228
+ "<|LOC_223|>",
229
+ "<|LOC_224|>",
230
+ "<|LOC_225|>",
231
+ "<|LOC_226|>",
232
+ "<|LOC_227|>",
233
+ "<|LOC_228|>",
234
+ "<|LOC_229|>",
235
+ "<|LOC_230|>",
236
+ "<|LOC_231|>",
237
+ "<|LOC_232|>",
238
+ "<|LOC_233|>",
239
+ "<|LOC_234|>",
240
+ "<|LOC_235|>",
241
+ "<|LOC_236|>",
242
+ "<|LOC_237|>",
243
+ "<|LOC_238|>",
244
+ "<|LOC_239|>",
245
+ "<|LOC_240|>",
246
+ "<|LOC_241|>",
247
+ "<|LOC_242|>",
248
+ "<|LOC_243|>",
249
+ "<|LOC_244|>",
250
+ "<|LOC_245|>",
251
+ "<|LOC_246|>",
252
+ "<|LOC_247|>",
253
+ "<|LOC_248|>",
254
+ "<|LOC_249|>",
255
+ "<|LOC_250|>",
256
+ "<|LOC_251|>",
257
+ "<|LOC_252|>",
258
+ "<|LOC_253|>",
259
+ "<|LOC_254|>",
260
+ "<|LOC_255|>",
261
+ "<|LOC_256|>",
262
+ "<|LOC_257|>",
263
+ "<|LOC_258|>",
264
+ "<|LOC_259|>",
265
+ "<|LOC_260|>",
266
+ "<|LOC_261|>",
267
+ "<|LOC_262|>",
268
+ "<|LOC_263|>",
269
+ "<|LOC_264|>",
270
+ "<|LOC_265|>",
271
+ "<|LOC_266|>",
272
+ "<|LOC_267|>",
273
+ "<|LOC_268|>",
274
+ "<|LOC_269|>",
275
+ "<|LOC_270|>",
276
+ "<|LOC_271|>",
277
+ "<|LOC_272|>",
278
+ "<|LOC_273|>",
279
+ "<|LOC_274|>",
280
+ "<|LOC_275|>",
281
+ "<|LOC_276|>",
282
+ "<|LOC_277|>",
283
+ "<|LOC_278|>",
284
+ "<|LOC_279|>",
285
+ "<|LOC_280|>",
286
+ "<|LOC_281|>",
287
+ "<|LOC_282|>",
288
+ "<|LOC_283|>",
289
+ "<|LOC_284|>",
290
+ "<|LOC_285|>",
291
+ "<|LOC_286|>",
292
+ "<|LOC_287|>",
293
+ "<|LOC_288|>",
294
+ "<|LOC_289|>",
295
+ "<|LOC_290|>",
296
+ "<|LOC_291|>",
297
+ "<|LOC_292|>",
298
+ "<|LOC_293|>",
299
+ "<|LOC_294|>",
300
+ "<|LOC_295|>",
301
+ "<|LOC_296|>",
302
+ "<|LOC_297|>",
303
+ "<|LOC_298|>",
304
+ "<|LOC_299|>",
305
+ "<|LOC_300|>",
306
+ "<|LOC_301|>",
307
+ "<|LOC_302|>",
308
+ "<|LOC_303|>",
309
+ "<|LOC_304|>",
310
+ "<|LOC_305|>",
311
+ "<|LOC_306|>",
312
+ "<|LOC_307|>",
313
+ "<|LOC_308|>",
314
+ "<|LOC_309|>",
315
+ "<|LOC_310|>",
316
+ "<|LOC_311|>",
317
+ "<|LOC_312|>",
318
+ "<|LOC_313|>",
319
+ "<|LOC_314|>",
320
+ "<|LOC_315|>",
321
+ "<|LOC_316|>",
322
+ "<|LOC_317|>",
323
+ "<|LOC_318|>",
324
+ "<|LOC_319|>",
325
+ "<|LOC_320|>",
326
+ "<|LOC_321|>",
327
+ "<|LOC_322|>",
328
+ "<|LOC_323|>",
329
+ "<|LOC_324|>",
330
+ "<|LOC_325|>",
331
+ "<|LOC_326|>",
332
+ "<|LOC_327|>",
333
+ "<|LOC_328|>",
334
+ "<|LOC_329|>",
335
+ "<|LOC_330|>",
336
+ "<|LOC_331|>",
337
+ "<|LOC_332|>",
338
+ "<|LOC_333|>",
339
+ "<|LOC_334|>",
340
+ "<|LOC_335|>",
341
+ "<|LOC_336|>",
342
+ "<|LOC_337|>",
343
+ "<|LOC_338|>",
344
+ "<|LOC_339|>",
345
+ "<|LOC_340|>",
346
+ "<|LOC_341|>",
347
+ "<|LOC_342|>",
348
+ "<|LOC_343|>",
349
+ "<|LOC_344|>",
350
+ "<|LOC_345|>",
351
+ "<|LOC_346|>",
352
+ "<|LOC_347|>",
353
+ "<|LOC_348|>",
354
+ "<|LOC_349|>",
355
+ "<|LOC_350|>",
356
+ "<|LOC_351|>",
357
+ "<|LOC_352|>",
358
+ "<|LOC_353|>",
359
+ "<|LOC_354|>",
360
+ "<|LOC_355|>",
361
+ "<|LOC_356|>",
362
+ "<|LOC_357|>",
363
+ "<|LOC_358|>",
364
+ "<|LOC_359|>",
365
+ "<|LOC_360|>",
366
+ "<|LOC_361|>",
367
+ "<|LOC_362|>",
368
+ "<|LOC_363|>",
369
+ "<|LOC_364|>",
370
+ "<|LOC_365|>",
371
+ "<|LOC_366|>",
372
+ "<|LOC_367|>",
373
+ "<|LOC_368|>",
374
+ "<|LOC_369|>",
375
+ "<|LOC_370|>",
376
+ "<|LOC_371|>",
377
+ "<|LOC_372|>",
378
+ "<|LOC_373|>",
379
+ "<|LOC_374|>",
380
+ "<|LOC_375|>",
381
+ "<|LOC_376|>",
382
+ "<|LOC_377|>",
383
+ "<|LOC_378|>",
384
+ "<|LOC_379|>",
385
+ "<|LOC_380|>",
386
+ "<|LOC_381|>",
387
+ "<|LOC_382|>",
388
+ "<|LOC_383|>",
389
+ "<|LOC_384|>",
390
+ "<|LOC_385|>",
391
+ "<|LOC_386|>",
392
+ "<|LOC_387|>",
393
+ "<|LOC_388|>",
394
+ "<|LOC_389|>",
395
+ "<|LOC_390|>",
396
+ "<|LOC_391|>",
397
+ "<|LOC_392|>",
398
+ "<|LOC_393|>",
399
+ "<|LOC_394|>",
400
+ "<|LOC_395|>",
401
+ "<|LOC_396|>",
402
+ "<|LOC_397|>",
403
+ "<|LOC_398|>",
404
+ "<|LOC_399|>",
405
+ "<|LOC_400|>",
406
+ "<|LOC_401|>",
407
+ "<|LOC_402|>",
408
+ "<|LOC_403|>",
409
+ "<|LOC_404|>",
410
+ "<|LOC_405|>",
411
+ "<|LOC_406|>",
412
+ "<|LOC_407|>",
413
+ "<|LOC_408|>",
414
+ "<|LOC_409|>",
415
+ "<|LOC_410|>",
416
+ "<|LOC_411|>",
417
+ "<|LOC_412|>",
418
+ "<|LOC_413|>",
419
+ "<|LOC_414|>",
420
+ "<|LOC_415|>",
421
+ "<|LOC_416|>",
422
+ "<|LOC_417|>",
423
+ "<|LOC_418|>",
424
+ "<|LOC_419|>",
425
+ "<|LOC_420|>",
426
+ "<|LOC_421|>",
427
+ "<|LOC_422|>",
428
+ "<|LOC_423|>",
429
+ "<|LOC_424|>",
430
+ "<|LOC_425|>",
431
+ "<|LOC_426|>",
432
+ "<|LOC_427|>",
433
+ "<|LOC_428|>",
434
+ "<|LOC_429|>",
435
+ "<|LOC_430|>",
436
+ "<|LOC_431|>",
437
+ "<|LOC_432|>",
438
+ "<|LOC_433|>",
439
+ "<|LOC_434|>",
440
+ "<|LOC_435|>",
441
+ "<|LOC_436|>",
442
+ "<|LOC_437|>",
443
+ "<|LOC_438|>",
444
+ "<|LOC_439|>",
445
+ "<|LOC_440|>",
446
+ "<|LOC_441|>",
447
+ "<|LOC_442|>",
448
+ "<|LOC_443|>",
449
+ "<|LOC_444|>",
450
+ "<|LOC_445|>",
451
+ "<|LOC_446|>",
452
+ "<|LOC_447|>",
453
+ "<|LOC_448|>",
454
+ "<|LOC_449|>",
455
+ "<|LOC_450|>",
456
+ "<|LOC_451|>",
457
+ "<|LOC_452|>",
458
+ "<|LOC_453|>",
459
+ "<|LOC_454|>",
460
+ "<|LOC_455|>",
461
+ "<|LOC_456|>",
462
+ "<|LOC_457|>",
463
+ "<|LOC_458|>",
464
+ "<|LOC_459|>",
465
+ "<|LOC_460|>",
466
+ "<|LOC_461|>",
467
+ "<|LOC_462|>",
468
+ "<|LOC_463|>",
469
+ "<|LOC_464|>",
470
+ "<|LOC_465|>",
471
+ "<|LOC_466|>",
472
+ "<|LOC_467|>",
473
+ "<|LOC_468|>",
474
+ "<|LOC_469|>",
475
+ "<|LOC_470|>",
476
+ "<|LOC_471|>",
477
+ "<|LOC_472|>",
478
+ "<|LOC_473|>",
479
+ "<|LOC_474|>",
480
+ "<|LOC_475|>",
481
+ "<|LOC_476|>",
482
+ "<|LOC_477|>",
483
+ "<|LOC_478|>",
484
+ "<|LOC_479|>",
485
+ "<|LOC_480|>",
486
+ "<|LOC_481|>",
487
+ "<|LOC_482|>",
488
+ "<|LOC_483|>",
489
+ "<|LOC_484|>",
490
+ "<|LOC_485|>",
491
+ "<|LOC_486|>",
492
+ "<|LOC_487|>",
493
+ "<|LOC_488|>",
494
+ "<|LOC_489|>",
495
+ "<|LOC_490|>",
496
+ "<|LOC_491|>",
497
+ "<|LOC_492|>",
498
+ "<|LOC_493|>",
499
+ "<|LOC_494|>",
500
+ "<|LOC_495|>",
501
+ "<|LOC_496|>",
502
+ "<|LOC_497|>",
503
+ "<|LOC_498|>",
504
+ "<|LOC_499|>",
505
+ "<|LOC_500|>",
506
+ "<|LOC_501|>",
507
+ "<|LOC_502|>",
508
+ "<|LOC_503|>",
509
+ "<|LOC_504|>",
510
+ "<|LOC_505|>",
511
+ "<|LOC_506|>",
512
+ "<|LOC_507|>",
513
+ "<|LOC_508|>",
514
+ "<|LOC_509|>",
515
+ "<|LOC_510|>",
516
+ "<|LOC_511|>",
517
+ "<|LOC_512|>",
518
+ "<|LOC_513|>",
519
+ "<|LOC_514|>",
520
+ "<|LOC_515|>",
521
+ "<|LOC_516|>",
522
+ "<|LOC_517|>",
523
+ "<|LOC_518|>",
524
+ "<|LOC_519|>",
525
+ "<|LOC_520|>",
526
+ "<|LOC_521|>",
527
+ "<|LOC_522|>",
528
+ "<|LOC_523|>",
529
+ "<|LOC_524|>",
530
+ "<|LOC_525|>",
531
+ "<|LOC_526|>",
532
+ "<|LOC_527|>",
533
+ "<|LOC_528|>",
534
+ "<|LOC_529|>",
535
+ "<|LOC_530|>",
536
+ "<|LOC_531|>",
537
+ "<|LOC_532|>",
538
+ "<|LOC_533|>",
539
+ "<|LOC_534|>",
540
+ "<|LOC_535|>",
541
+ "<|LOC_536|>",
542
+ "<|LOC_537|>",
543
+ "<|LOC_538|>",
544
+ "<|LOC_539|>",
545
+ "<|LOC_540|>",
546
+ "<|LOC_541|>",
547
+ "<|LOC_542|>",
548
+ "<|LOC_543|>",
549
+ "<|LOC_544|>",
550
+ "<|LOC_545|>",
551
+ "<|LOC_546|>",
552
+ "<|LOC_547|>",
553
+ "<|LOC_548|>",
554
+ "<|LOC_549|>",
555
+ "<|LOC_550|>",
556
+ "<|LOC_551|>",
557
+ "<|LOC_552|>",
558
+ "<|LOC_553|>",
559
+ "<|LOC_554|>",
560
+ "<|LOC_555|>",
561
+ "<|LOC_556|>",
562
+ "<|LOC_557|>",
563
+ "<|LOC_558|>",
564
+ "<|LOC_559|>",
565
+ "<|LOC_560|>",
566
+ "<|LOC_561|>",
567
+ "<|LOC_562|>",
568
+ "<|LOC_563|>",
569
+ "<|LOC_564|>",
570
+ "<|LOC_565|>",
571
+ "<|LOC_566|>",
572
+ "<|LOC_567|>",
573
+ "<|LOC_568|>",
574
+ "<|LOC_569|>",
575
+ "<|LOC_570|>",
576
+ "<|LOC_571|>",
577
+ "<|LOC_572|>",
578
+ "<|LOC_573|>",
579
+ "<|LOC_574|>",
580
+ "<|LOC_575|>",
581
+ "<|LOC_576|>",
582
+ "<|LOC_577|>",
583
+ "<|LOC_578|>",
584
+ "<|LOC_579|>",
585
+ "<|LOC_580|>",
586
+ "<|LOC_581|>",
587
+ "<|LOC_582|>",
588
+ "<|LOC_583|>",
589
+ "<|LOC_584|>",
590
+ "<|LOC_585|>",
591
+ "<|LOC_586|>",
592
+ "<|LOC_587|>",
593
+ "<|LOC_588|>",
594
+ "<|LOC_589|>",
595
+ "<|LOC_590|>",
596
+ "<|LOC_591|>",
597
+ "<|LOC_592|>",
598
+ "<|LOC_593|>",
599
+ "<|LOC_594|>",
600
+ "<|LOC_595|>",
601
+ "<|LOC_596|>",
602
+ "<|LOC_597|>",
603
+ "<|LOC_598|>",
604
+ "<|LOC_599|>",
605
+ "<|LOC_600|>",
606
+ "<|LOC_601|>",
607
+ "<|LOC_602|>",
608
+ "<|LOC_603|>",
609
+ "<|LOC_604|>",
610
+ "<|LOC_605|>",
611
+ "<|LOC_606|>",
612
+ "<|LOC_607|>",
613
+ "<|LOC_608|>",
614
+ "<|LOC_609|>",
615
+ "<|LOC_610|>",
616
+ "<|LOC_611|>",
617
+ "<|LOC_612|>",
618
+ "<|LOC_613|>",
619
+ "<|LOC_614|>",
620
+ "<|LOC_615|>",
621
+ "<|LOC_616|>",
622
+ "<|LOC_617|>",
623
+ "<|LOC_618|>",
624
+ "<|LOC_619|>",
625
+ "<|LOC_620|>",
626
+ "<|LOC_621|>",
627
+ "<|LOC_622|>",
628
+ "<|LOC_623|>",
629
+ "<|LOC_624|>",
630
+ "<|LOC_625|>",
631
+ "<|LOC_626|>",
632
+ "<|LOC_627|>",
633
+ "<|LOC_628|>",
634
+ "<|LOC_629|>",
635
+ "<|LOC_630|>",
636
+ "<|LOC_631|>",
637
+ "<|LOC_632|>",
638
+ "<|LOC_633|>",
639
+ "<|LOC_634|>",
640
+ "<|LOC_635|>",
641
+ "<|LOC_636|>",
642
+ "<|LOC_637|>",
643
+ "<|LOC_638|>",
644
+ "<|LOC_639|>",
645
+ "<|LOC_640|>",
646
+ "<|LOC_641|>",
647
+ "<|LOC_642|>",
648
+ "<|LOC_643|>",
649
+ "<|LOC_644|>",
650
+ "<|LOC_645|>",
651
+ "<|LOC_646|>",
652
+ "<|LOC_647|>",
653
+ "<|LOC_648|>",
654
+ "<|LOC_649|>",
655
+ "<|LOC_650|>",
656
+ "<|LOC_651|>",
657
+ "<|LOC_652|>",
658
+ "<|LOC_653|>",
659
+ "<|LOC_654|>",
660
+ "<|LOC_655|>",
661
+ "<|LOC_656|>",
662
+ "<|LOC_657|>",
663
+ "<|LOC_658|>",
664
+ "<|LOC_659|>",
665
+ "<|LOC_660|>",
666
+ "<|LOC_661|>",
667
+ "<|LOC_662|>",
668
+ "<|LOC_663|>",
669
+ "<|LOC_664|>",
670
+ "<|LOC_665|>",
671
+ "<|LOC_666|>",
672
+ "<|LOC_667|>",
673
+ "<|LOC_668|>",
674
+ "<|LOC_669|>",
675
+ "<|LOC_670|>",
676
+ "<|LOC_671|>",
677
+ "<|LOC_672|>",
678
+ "<|LOC_673|>",
679
+ "<|LOC_674|>",
680
+ "<|LOC_675|>",
681
+ "<|LOC_676|>",
682
+ "<|LOC_677|>",
683
+ "<|LOC_678|>",
684
+ "<|LOC_679|>",
685
+ "<|LOC_680|>",
686
+ "<|LOC_681|>",
687
+ "<|LOC_682|>",
688
+ "<|LOC_683|>",
689
+ "<|LOC_684|>",
690
+ "<|LOC_685|>",
691
+ "<|LOC_686|>",
692
+ "<|LOC_687|>",
693
+ "<|LOC_688|>",
694
+ "<|LOC_689|>",
695
+ "<|LOC_690|>",
696
+ "<|LOC_691|>",
697
+ "<|LOC_692|>",
698
+ "<|LOC_693|>",
699
+ "<|LOC_694|>",
700
+ "<|LOC_695|>",
701
+ "<|LOC_696|>",
702
+ "<|LOC_697|>",
703
+ "<|LOC_698|>",
704
+ "<|LOC_699|>",
705
+ "<|LOC_700|>",
706
+ "<|LOC_701|>",
707
+ "<|LOC_702|>",
708
+ "<|LOC_703|>",
709
+ "<|LOC_704|>",
710
+ "<|LOC_705|>",
711
+ "<|LOC_706|>",
712
+ "<|LOC_707|>",
713
+ "<|LOC_708|>",
714
+ "<|LOC_709|>",
715
+ "<|LOC_710|>",
716
+ "<|LOC_711|>",
717
+ "<|LOC_712|>",
718
+ "<|LOC_713|>",
719
+ "<|LOC_714|>",
720
+ "<|LOC_715|>",
721
+ "<|LOC_716|>",
722
+ "<|LOC_717|>",
723
+ "<|LOC_718|>",
724
+ "<|LOC_719|>",
725
+ "<|LOC_720|>",
726
+ "<|LOC_721|>",
727
+ "<|LOC_722|>",
728
+ "<|LOC_723|>",
729
+ "<|LOC_724|>",
730
+ "<|LOC_725|>",
731
+ "<|LOC_726|>",
732
+ "<|LOC_727|>",
733
+ "<|LOC_728|>",
734
+ "<|LOC_729|>",
735
+ "<|LOC_730|>",
736
+ "<|LOC_731|>",
737
+ "<|LOC_732|>",
738
+ "<|LOC_733|>",
739
+ "<|LOC_734|>",
740
+ "<|LOC_735|>",
741
+ "<|LOC_736|>",
742
+ "<|LOC_737|>",
743
+ "<|LOC_738|>",
744
+ "<|LOC_739|>",
745
+ "<|LOC_740|>",
746
+ "<|LOC_741|>",
747
+ "<|LOC_742|>",
748
+ "<|LOC_743|>",
749
+ "<|LOC_744|>",
750
+ "<|LOC_745|>",
751
+ "<|LOC_746|>",
752
+ "<|LOC_747|>",
753
+ "<|LOC_748|>",
754
+ "<|LOC_749|>",
755
+ "<|LOC_750|>",
756
+ "<|LOC_751|>",
757
+ "<|LOC_752|>",
758
+ "<|LOC_753|>",
759
+ "<|LOC_754|>",
760
+ "<|LOC_755|>",
761
+ "<|LOC_756|>",
762
+ "<|LOC_757|>",
763
+ "<|LOC_758|>",
764
+ "<|LOC_759|>",
765
+ "<|LOC_760|>",
766
+ "<|LOC_761|>",
767
+ "<|LOC_762|>",
768
+ "<|LOC_763|>",
769
+ "<|LOC_764|>",
770
+ "<|LOC_765|>",
771
+ "<|LOC_766|>",
772
+ "<|LOC_767|>",
773
+ "<|LOC_768|>",
774
+ "<|LOC_769|>",
775
+ "<|LOC_770|>",
776
+ "<|LOC_771|>",
777
+ "<|LOC_772|>",
778
+ "<|LOC_773|>",
779
+ "<|LOC_774|>",
780
+ "<|LOC_775|>",
781
+ "<|LOC_776|>",
782
+ "<|LOC_777|>",
783
+ "<|LOC_778|>",
784
+ "<|LOC_779|>",
785
+ "<|LOC_780|>",
786
+ "<|LOC_781|>",
787
+ "<|LOC_782|>",
788
+ "<|LOC_783|>",
789
+ "<|LOC_784|>",
790
+ "<|LOC_785|>",
791
+ "<|LOC_786|>",
792
+ "<|LOC_787|>",
793
+ "<|LOC_788|>",
794
+ "<|LOC_789|>",
795
+ "<|LOC_790|>",
796
+ "<|LOC_791|>",
797
+ "<|LOC_792|>",
798
+ "<|LOC_793|>",
799
+ "<|LOC_794|>",
800
+ "<|LOC_795|>",
801
+ "<|LOC_796|>",
802
+ "<|LOC_797|>",
803
+ "<|LOC_798|>",
804
+ "<|LOC_799|>",
805
+ "<|LOC_800|>",
806
+ "<|LOC_801|>",
807
+ "<|LOC_802|>",
808
+ "<|LOC_803|>",
809
+ "<|LOC_804|>",
810
+ "<|LOC_805|>",
811
+ "<|LOC_806|>",
812
+ "<|LOC_807|>",
813
+ "<|LOC_808|>",
814
+ "<|LOC_809|>",
815
+ "<|LOC_810|>",
816
+ "<|LOC_811|>",
817
+ "<|LOC_812|>",
818
+ "<|LOC_813|>",
819
+ "<|LOC_814|>",
820
+ "<|LOC_815|>",
821
+ "<|LOC_816|>",
822
+ "<|LOC_817|>",
823
+ "<|LOC_818|>",
824
+ "<|LOC_819|>",
825
+ "<|LOC_820|>",
826
+ "<|LOC_821|>",
827
+ "<|LOC_822|>",
828
+ "<|LOC_823|>",
829
+ "<|LOC_824|>",
830
+ "<|LOC_825|>",
831
+ "<|LOC_826|>",
832
+ "<|LOC_827|>",
833
+ "<|LOC_828|>",
834
+ "<|LOC_829|>",
835
+ "<|LOC_830|>",
836
+ "<|LOC_831|>",
837
+ "<|LOC_832|>",
838
+ "<|LOC_833|>",
839
+ "<|LOC_834|>",
840
+ "<|LOC_835|>",
841
+ "<|LOC_836|>",
842
+ "<|LOC_837|>",
843
+ "<|LOC_838|>",
844
+ "<|LOC_839|>",
845
+ "<|LOC_840|>",
846
+ "<|LOC_841|>",
847
+ "<|LOC_842|>",
848
+ "<|LOC_843|>",
849
+ "<|LOC_844|>",
850
+ "<|LOC_845|>",
851
+ "<|LOC_846|>",
852
+ "<|LOC_847|>",
853
+ "<|LOC_848|>",
854
+ "<|LOC_849|>",
855
+ "<|LOC_850|>",
856
+ "<|LOC_851|>",
857
+ "<|LOC_852|>",
858
+ "<|LOC_853|>",
859
+ "<|LOC_854|>",
860
+ "<|LOC_855|>",
861
+ "<|LOC_856|>",
862
+ "<|LOC_857|>",
863
+ "<|LOC_858|>",
864
+ "<|LOC_859|>",
865
+ "<|LOC_860|>",
866
+ "<|LOC_861|>",
867
+ "<|LOC_862|>",
868
+ "<|LOC_863|>",
869
+ "<|LOC_864|>",
870
+ "<|LOC_865|>",
871
+ "<|LOC_866|>",
872
+ "<|LOC_867|>",
873
+ "<|LOC_868|>",
874
+ "<|LOC_869|>",
875
+ "<|LOC_870|>",
876
+ "<|LOC_871|>",
877
+ "<|LOC_872|>",
878
+ "<|LOC_873|>",
879
+ "<|LOC_874|>",
880
+ "<|LOC_875|>",
881
+ "<|LOC_876|>",
882
+ "<|LOC_877|>",
883
+ "<|LOC_878|>",
884
+ "<|LOC_879|>",
885
+ "<|LOC_880|>",
886
+ "<|LOC_881|>",
887
+ "<|LOC_882|>",
888
+ "<|LOC_883|>",
889
+ "<|LOC_884|>",
890
+ "<|LOC_885|>",
891
+ "<|LOC_886|>",
892
+ "<|LOC_887|>",
893
+ "<|LOC_888|>",
894
+ "<|LOC_889|>",
895
+ "<|LOC_890|>",
896
+ "<|LOC_891|>",
897
+ "<|LOC_892|>",
898
+ "<|LOC_893|>",
899
+ "<|LOC_894|>",
900
+ "<|LOC_895|>",
901
+ "<|LOC_896|>",
902
+ "<|LOC_897|>",
903
+ "<|LOC_898|>",
904
+ "<|LOC_899|>",
905
+ "<|LOC_900|>",
906
+ "<|LOC_901|>",
907
+ "<|LOC_902|>",
908
+ "<|LOC_903|>",
909
+ "<|LOC_904|>",
910
+ "<|LOC_905|>",
911
+ "<|LOC_906|>",
912
+ "<|LOC_907|>",
913
+ "<|LOC_908|>",
914
+ "<|LOC_909|>",
915
+ "<|LOC_910|>",
916
+ "<|LOC_911|>",
917
+ "<|LOC_912|>",
918
+ "<|LOC_913|>",
919
+ "<|LOC_914|>",
920
+ "<|LOC_915|>",
921
+ "<|LOC_916|>",
922
+ "<|LOC_917|>",
923
+ "<|LOC_918|>",
924
+ "<|LOC_919|>",
925
+ "<|LOC_920|>",
926
+ "<|LOC_921|>",
927
+ "<|LOC_922|>",
928
+ "<|LOC_923|>",
929
+ "<|LOC_924|>",
930
+ "<|LOC_925|>",
931
+ "<|LOC_926|>",
932
+ "<|LOC_927|>",
933
+ "<|LOC_928|>",
934
+ "<|LOC_929|>",
935
+ "<|LOC_930|>",
936
+ "<|LOC_931|>",
937
+ "<|LOC_932|>",
938
+ "<|LOC_933|>",
939
+ "<|LOC_934|>",
940
+ "<|LOC_935|>",
941
+ "<|LOC_936|>",
942
+ "<|LOC_937|>",
943
+ "<|LOC_938|>",
944
+ "<|LOC_939|>",
945
+ "<|LOC_940|>",
946
+ "<|LOC_941|>",
947
+ "<|LOC_942|>",
948
+ "<|LOC_943|>",
949
+ "<|LOC_944|>",
950
+ "<|LOC_945|>",
951
+ "<|LOC_946|>",
952
+ "<|LOC_947|>",
953
+ "<|LOC_948|>",
954
+ "<|LOC_949|>",
955
+ "<|LOC_950|>",
956
+ "<|LOC_951|>",
957
+ "<|LOC_952|>",
958
+ "<|LOC_953|>",
959
+ "<|LOC_954|>",
960
+ "<|LOC_955|>",
961
+ "<|LOC_956|>",
962
+ "<|LOC_957|>",
963
+ "<|LOC_958|>",
964
+ "<|LOC_959|>",
965
+ "<|LOC_960|>",
966
+ "<|LOC_961|>",
967
+ "<|LOC_962|>",
968
+ "<|LOC_963|>",
969
+ "<|LOC_964|>",
970
+ "<|LOC_965|>",
971
+ "<|LOC_966|>",
972
+ "<|LOC_967|>",
973
+ "<|LOC_968|>",
974
+ "<|LOC_969|>",
975
+ "<|LOC_970|>",
976
+ "<|LOC_971|>",
977
+ "<|LOC_972|>",
978
+ "<|LOC_973|>",
979
+ "<|LOC_974|>",
980
+ "<|LOC_975|>",
981
+ "<|LOC_976|>",
982
+ "<|LOC_977|>",
983
+ "<|LOC_978|>",
984
+ "<|LOC_979|>",
985
+ "<|LOC_980|>",
986
+ "<|LOC_981|>",
987
+ "<|LOC_982|>",
988
+ "<|LOC_983|>",
989
+ "<|LOC_984|>",
990
+ "<|LOC_985|>",
991
+ "<|LOC_986|>",
992
+ "<|LOC_987|>",
993
+ "<|LOC_988|>",
994
+ "<|LOC_989|>",
995
+ "<|LOC_990|>",
996
+ "<|LOC_991|>",
997
+ "<|LOC_992|>",
998
+ "<|LOC_993|>",
999
+ "<|LOC_994|>",
1000
+ "<|LOC_995|>",
1001
+ "<|LOC_996|>",
1002
+ "<|LOC_997|>",
1003
+ "<|LOC_998|>",
1004
+ "<|LOC_999|>",
1005
+ "<|LOC_1000|>",
1006
+ "<|LOC_BEGIN|>",
1007
+ "<|LOC_END|>",
1008
+ "<|LOC_SEP|>",
1009
+ "<|CROP_COL_SEP|>",
1010
+ "<|CROP_ROW_SEP|>",
1011
+ "<|IMAGE_SEP|>"
1012
+ ],
1013
+ "bos_token": "<s>",
1014
+ "cls_token": "<|begin_of_sentence|>",
1015
+ "eos_token": "</s>",
1016
+ "mask_token": "<mask:1>",
1017
+ "pad_token": "<unk>",
1018
+ "sep_token": "<|end_of_sentence|>",
1019
+ "unk_token": "<unk>"
1020
+ }
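
The map above registers the multimodal placeholder tokens, the <|LOC_0|> … <|LOC_1000|> grid-location tokens with their BEGIN/END/SEP markers, and the crop/image separators as additional special tokens, alongside the core bos/eos/pad/unk entries. A minimal sketch of how to confirm that these entries behave as atomic tokens is shown below; it is not part of the uploaded files, the local path is a placeholder, and it assumes `trust_remote_code=True` so that the custom tokenizer class shipped in this commit is picked up.

```python
# Minimal sketch (not part of the upload). The path is a placeholder for a
# local checkout of this repository.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "path/to/local-checkout",   # hypothetical location of this repo
    trust_remote_code=True,     # loads tokenization_ernie4_5.py below
)

# Each special token should map to a single ID instead of being split into
# SentencePiece sub-pieces.
for marker in ("<|IMAGE_PLACEHOLDER|>", "<|LOC_0|>", "<|LOC_1000|>"):
    ids = tok.encode(marker, add_special_tokens=False)
    print(marker, ids, tok.convert_ids_to_tokens(ids))
```
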
tokenization_ernie4_5.py ADDED
@@ -0,0 +1,214 @@
+ # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from shutil import copyfile
+ from typing import List, Optional, Tuple
+ import sentencepiece as spm
+
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class Ernie4_5_Tokenizer(PreTrainedTokenizer):
+
+     vocab_files_names = {
+         "vocab_file": "tokenizer.model",
+     }
+     # Model input names expected by the tokenizer
+     model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+     # Padding side (where to add padding tokens)
+     padding_side = "right"
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token="<s>",
+         cls_token="<cls>",
+         eos_token="</s>",
+         mask_token="<mask:0>",
+         pad_token="<pad>",
+         sep_token="<sep>",
+         unk_token="<unk>",
+         additional_special_tokens=None,
+         verbose=False,
+         **kwargs,
+     ):
+         """
+         Initialize the ERNIE tokenizer.
+
+         Args:
+             vocab_file (str): Path to the SentencePiece model file.
+             bos_token (str, optional): Beginning of sentence token. Defaults to "<s>".
+             cls_token (str, optional): Classification token. Defaults to "<cls>".
+             eos_token (str, optional): End of sentence token. Defaults to "</s>".
+             mask_token (str, optional): Mask token. Defaults to "<mask:0>".
+             pad_token (str, optional): Padding token. Defaults to "<pad>".
+             sep_token (str, optional): Separator token. Defaults to "<sep>".
+             unk_token (str, optional): Unknown token. Defaults to "<unk>".
+             additional_special_tokens (List[str], optional): Additional special tokens.
+                 Defaults to ["<mask:1>", "<mask:7>"].
+             verbose (bool, optional): Whether to print detailed logs or progress information during execution.
+             **kwargs: Additional keyword arguments passed to the parent class.
+         """
+
+         self.vocab_file = vocab_file
+         self.sp_model = spm.SentencePieceProcessor()
+         self.sp_model.Load(vocab_file)
+
+         if additional_special_tokens is None:
+             additional_special_tokens = ["<mask:1>", "<mask:7>"]
+         super().__init__(
+             bos_token=bos_token,
+             cls_token=cls_token,
+             eos_token=eos_token,
+             mask_token=mask_token,
+             pad_token=pad_token,
+             sep_token=sep_token,
+             unk_token=unk_token,
+             additional_special_tokens=additional_special_tokens,
+             verbose=verbose,
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self):
+         """Returns the size of the vocabulary.
+
+         Returns:
+             int: The number of tokens in the vocabulary.
+         """
+         return self.sp_model.vocab_size()
+
+     def get_vocab(self):
+         """Get the vocabulary as a dictionary mapping tokens to their IDs.
+
+         Returns:
+             dict: A dictionary mapping tokens to their corresponding IDs.
+         """
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text):
+         """Tokenize text using SentencePiece.
+
+         Args:
+             text (str): The text to tokenize.
+
+         Returns:
+             list: A list of tokens.
+         """
+         return self.sp_model.encode_as_pieces(text)
+
+     def _convert_token_to_id(self, token):
+         """Convert a token (str) to an ID using the vocabulary.
+
+         Args:
+             token (str): The token to convert.
+
+         Returns:
+             int: The corresponding token ID.
+         """
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, id):
+         """Convert an ID to a token (str) using the vocabulary.
+
+         Args:
+             id (int): The token ID to convert.
+
+         Returns:
+             str: The corresponding token.
+         """
+         if id >= self.vocab_size:
+             return self.unk_token
+         else:
+             return self.sp_model.id_to_piece(id)
+
+     def convert_tokens_to_string(self, tokens):
+         """Convert a sequence of tokens back to a single string.
+
+         Args:
+             tokens (List[str]): A list of tokens to convert.
+
+         Returns:
+             str: The reconstructed string.
+         """
+         current_sub_tokens = []
+         out_string = ""
+         for token in tokens:
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+         out_string += self.sp_model.decode(current_sub_tokens)
+         return out_string
+
+     def prepare_for_model(self, *args, **kwargs):
+         if "add_special_tokens" in kwargs:
+             kwargs.pop("add_special_tokens")
+         return super().prepare_for_model(*args, **kwargs)
+
+     def save_vocabulary(
+         self, save_directory, filename_prefix: Optional[str] = None
+     ) -> Tuple[str]:
+         """
+         Save the vocabulary and special tokens file to a directory.
+
+         Args:
+             save_directory (str): The directory in which to save the vocabulary.
+             filename_prefix (Optional[str]): Optional prefix for the saved filename.
+
+         Returns:
+             Tuple[str]: Paths to the files saved.
+
+         Raises:
+             ValueError: If the save_directory is not a valid directory.
+         """
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "")
+             + self.vocab_files_names["vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(
+             out_vocab_file
+         ) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def _decode(self, *args, **kwargs):
+         kwargs.pop("clean_up_tokenization_spaces", None)
+         kwargs.pop("spaces_between_special_tokens", None)
+         return super()._decode(
+             *args,
+             **kwargs,
+             clean_up_tokenization_spaces=False,
+             spaces_between_special_tokens=False,
+         )
+
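
The class above is a thin SentencePiece wrapper: `_tokenize` and `_convert_token_to_id` delegate directly to `tokenizer.model`, `convert_tokens_to_string` flushes the SentencePiece buffer around special tokens so they are emitted verbatim, and the `_decode` override pins `clean_up_tokenization_spaces` and `spaces_between_special_tokens` to `False`. A minimal usage sketch follows, assuming it is run from a local checkout that contains both `tokenization_ernie4_5.py` and `tokenizer.model`; the token overrides are taken from the special-tokens map earlier in this commit.

```python
# Minimal sketch, instantiating the class directly rather than via AutoTokenizer.
from tokenization_ernie4_5 import Ernie4_5_Tokenizer

tok = Ernie4_5_Tokenizer(
    vocab_file="tokenizer.model",
    mask_token="<mask:1>",
    pad_token="<unk>",
)

ids = tok("Hello, ERNIE 4.5!")["input_ids"]
print(ids)
# Decoding keeps spacing exactly as SentencePiece produced it: no clean-up
# pass and no extra spaces injected around special tokens.
print(tok.decode(ids))
```
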
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34ef7db83df785924fb83d7b887b6e822a031c56e15cff40aaf9b982988180df
+ size 1614363
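
`tokenizer.model` is stored through Git LFS, so the diff only shows the pointer file: the blob's SHA-256 and its size in bytes. A short sketch for checking that a locally downloaded copy matches the pointer:

```python
# Verify a local tokenizer.model against the LFS pointer recorded above.
import hashlib
from pathlib import Path

data = Path("tokenizer.model").read_bytes()   # assumed local download
assert len(data) == 1614363, f"unexpected size: {len(data)}"
print(hashlib.sha256(data).hexdigest())
# Expected: 34ef7db83df785924fb83d7b887b6e822a031c56e15cff40aaf9b982988180df
```
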
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
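
Since the `tokenizer_config.json` diff is too large for the web view, the raw file can be pulled and inspected locally. A minimal sketch, with the repository id left as a placeholder:

```python
# Fetch and summarize the un-rendered tokenizer_config.json.
import json
from huggingface_hub import hf_hub_download

cfg_path = hf_hub_download(repo_id="<this-repo-id>", filename="tokenizer_config.json")
with open(cfg_path, "r", encoding="utf-8") as f:
    cfg = json.load(f)

# Files this large are usually dominated by added-token definitions;
# the top-level keys give a quick overview.
print(sorted(cfg.keys()))
```
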