alexmarques committed on
Commit
441be35
·
verified ·
1 Parent(s): b8783f0

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +86 -8
README.md CHANGED
@@ -355,16 +355,20 @@ evalplus.evaluate \
355
  #### Open LLM Leaderboard evaluation scores
356
  <table>
357
  <tr>
358
- <td><strong>Benchmark</strong>
359
- </td>
360
- <td><strong>Mistral-Small-3.1-24B-Instruct-2503</strong>
361
- </td>
362
- <td><strong>Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16<br>(this model)</strong>
363
- </td>
364
- <td><strong>Recovery</strong>
365
- </td>
 
 
366
  </tr>
367
  <tr>
 
 
368
  <td>MMLU (5-shot)
369
  </td>
370
  <td>80.67
@@ -434,5 +438,79 @@ evalplus.evaluate \
434
  <td><strong>99.5%</strong>
435
  </td>
436
  </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  </table>
438
 
 
355
  #### Open LLM Leaderboard evaluation scores
356
  <table>
357
  <tr>
358
+ <th>Category
359
+ </th>
360
+ <th>Benchmark
361
+ </th>
362
+ <th>Mistral-Small-3.1-24B-Instruct-2503
363
+ </th>
364
+ <th>Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16<br>(this model)
365
+ </th>
366
+ <th>Recovery
367
+ </th>
368
  </tr>
369
  <tr>
370
+ <td rowspan="7" ><strong>OpenLLM v1</strong>
371
+ </td>
372
  <td>MMLU (5-shot)
373
  </td>
374
  <td>80.67
 
438
  <td><strong>99.5%</strong>
439
  </td>
440
  </tr>
441
+ <tr>
442
+ <td rowspan="3" ><strong></strong>
443
+ </td>
444
+ <td>MMLU-Pro (5-shot)
445
+ </td>
446
+ <td>67.25
447
+ </td>
448
+ <td>66.56
449
+ </td>
450
+ <td>99.0%
451
+ </td>
452
+ </tr>
453
+ <tr>
454
+ <td>GPQA CoT main (5-shot)
455
+ </td>
456
+ <td>42.63
457
+ </td>
458
+ <td>47.10
459
+ </td>
460
+ <td>110.5%
461
+ </td>
462
+ </tr>
463
+ <tr>
464
+ <td>GPQA CoT diamond (5-shot)
465
+ </td>
466
+ <td>45.96
467
+ </td>
468
+ <td>44.95
469
+ </td>
470
+ <td>97.8%
471
+ </td>
472
+ </tr>
473
+ <tr>
474
+ <td rowspan="4" ><strong>Coding</strong>
475
+ </td>
476
+ <td>HumanEval pass@1
477
+ </td>
478
+ <td>84.70
479
+ </td>
480
+ <td>84.60
481
+ </td>
482
+ <td>99.9%
483
+ </td>
484
+ </tr>
485
+ <tr>
486
+ <td>HumanEval+ pass@1
487
+ </td>
488
+ <td>79.50
489
+ </td>
490
+ <td>79.90
491
+ </td>
492
+ <td>100.5%
493
+ </td>
494
+ </tr>
495
+ <tr>
496
+ <td>MBPP pass@1
497
+ </td>
498
+ <td>71.10
499
+ </td>
500
+ <td>70.10
501
+ </td>
502
+ <td>98.6%
503
+ </td>
504
+ </tr>
505
+ <tr>
506
+ <td>MBPP+ pass@1
507
+ </td>
508
+ <td>60.60
509
+ </td>
510
+ <td>60.70
511
+ </td>
512
+ <td>100.2%
513
+ </td>
514
+ </tr>
515
  </table>
516