nm-research committed
Commit 20798f6 · verified · Parent: 72696de

Update README.md

Files changed (1)
  1. README.md +43 -43
README.md CHANGED
@@ -172,88 +172,88 @@ lm_eval \
  <tr>
  <td rowspan="7"><b>OpenLLM V1</b></td>
  <td>arc_challenge</td>
- <td>60.24</td>
- <td>59.04</td>
- <td>98.01%</td>
+ <td>50.60</td>
+ <td>50.09</td>
+ <td>99.00%</td>
  </tr>
  <tr>
  <td>gsm8k</td>
- <td>60.12</td>
- <td>70.81</td>
- <td>117.79%</td>
+ <td>48.07</td>
+ <td>54.51</td>
+ <td>113.40%</td>
  </tr>
  <tr>
  <td>hellaswag</td>
- <td>74.94</td>
- <td>73.28</td>
- <td>97.79%</td>
+ <td>67.78</td>
+ <td>65.67</td>
+ <td>96.89%</td>
  </tr>
  <tr>
  <td>mmlu</td>
- <td>64.14</td>
- <td>64.82</td>
- <td>101.06%</td>
+ <td>59.92</td>
+ <td>60.16</td>
+ <td>100.40%</td>
  </tr>
  <tr>
  <td>truthfulqa_mc2</td>
- <td>54.87</td>
- <td>54.61</td>
- <td>99.53%</td>
+ <td>49.98</td>
+ <td>49.48</td>
+ <td>99.00%</td>
  </tr>
  <tr>
  <td>winogrande</td>
- <td>68.35</td>
- <td>67.72</td>
- <td>99.08%</td>
+ <td>65.11</td>
+ <td>63.85</td>
+ <td>98.06%</td>
  </tr>
  <tr>
  <td><b>Average</b></td>
- <td>63.78</td>
- <td>65.05</td>
- <td><b>101.99%</b></td>
+ <td>56.91</td>
+ <td>57.29</td>
+ <td><b>100.67%</b></td>
  </tr>
  <tr>
  <td rowspan="7"><b>Leaderboard</b></td>
  <td>bbh</td>
- <td>55.46</td>
- <td>55.20</td>
- <td>99.53%</td>
+ <td>53.32</td>
+ <td>52.99</td>
+ <td>99.38%</td>
  </tr>
  <tr>
  <td>mmlu_pro</td>
- <td>34.38</td>
- <td>34.28</td>
- <td>99.71%</td>
+ <td>29.76</td>
+ <td>29.36</td>
+ <td>98.66%</td>
  </tr>
  <tr>
  <td>musr</td>
- <td>33.20</td>
- <td>34.26</td>
- <td>103.19%</td>
+ <td>34.52</td>
+ <td>35.85</td>
+ <td>103.85%</td>
  </tr>
  <tr>
  <td>ifeval</td>
- <td>84.41</td>
- <td>83.93</td>
- <td>99.43%</td>
+ <td>80.22</td>
+ <td>80.58</td>
+ <td>100.45%</td>
  </tr>
  <tr>
  <td>gpqa</td>
- <td>30.87</td>
- <td>31.38</td>
- <td>101.65%</td>
+ <td>30.54</td>
+ <td>29.36</td>
+ <td>96.14%</td>
  </tr>
  <tr>
  <td>math_hard</td>
- <td>45.54</td>
- <td>46.60</td>
- <td>102.33%</td>
+ <td>34.52</td>
+ <td>34.97</td>
+ <td>101.30%</td>
  </tr>
  <tr>
  <td><b>Average</b></td>
- <td>47.31</td>
- <td>47.61</td>
- <td><b>100.63%</b></td>
+ <td>43.81</td>
+ <td>43.85</td>
+ <td><b>100.09%</b></td>
  </tr>
  </tbody>
- </table>
+ </table>
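
The last column in each row is consistent with a recovery score: the second (candidate) score as a percentage of the first (baseline) score, e.g. for gsm8k, 100 × 54.51 / 48.07 ≈ 113.40%. Below is a minimal sketch that checks this reading against a few of the updated rows; the column pairing is an assumption inferred from the numbers, not stated in the diff, and the last digit can differ when the source rounds raw scores before display.

```python
# Sanity-check of the recovery column, assuming it equals
# 100 * (candidate score / baseline score) rounded to two decimals.
# The (baseline, candidate) column pairing is inferred, not documented;
# a ~0.01 gap can appear when the source rounds raw scores first.
rows = {
    # task: (baseline, candidate, reported recovery %)
    "gsm8k":     (48.07, 54.51, 113.40),
    "hellaswag": (67.78, 65.67, 96.89),
    "ifeval":    (80.22, 80.58, 100.45),
}

for task, (baseline, candidate, reported) in rows.items():
    computed = round(100 * candidate / baseline, 2)
    status = "ok" if abs(computed - reported) <= 0.01 else "mismatch"
    print(f"{task}: computed {computed:.2f}%, reported {reported:.2f}% ({status})")
```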