nm-research committed on
Commit
9229c55
·
verified ·
1 Parent(s): 55c49e8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +34 -15
README.md CHANGED
@@ -101,7 +101,7 @@ MAX_SEQUENCE_LENGTH = 2048
101
  ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
102
  ds = ds.shuffle(seed=42)
103
 
104
- dampening_frac=0.01
105
 
106
  def data_collator(batch):
107
  assert len(batch) == 1, "Only batch size of 1 is supported for calibration"
@@ -193,44 +193,63 @@ lm_eval \
193
  <td rowspan="7"><b>OpenLLM V1</b></td>
194
  <td>ARC Challenge</td>
195
  <td>72.53%</td>
196
- <td>%</td>
197
- <td>%</td>
198
  </tr>
199
  <tr>
200
  <td>GSM8K</td>
201
  <td>92.12%</td>
202
- <td>%</td>
203
- <td>%</td>
204
  </tr>
205
  <tr>
206
  <td>Hellaswag</td>
207
  <td>85.78%</td>
208
- <td>%</td>
209
- <td>%</td>
210
  </tr>
211
  <tr>
212
  <td>MMLU</td>
213
  <td>77.53%</td>
214
- <td>%</td>
215
- <td>%</td>
216
  </tr>
217
  <tr>
218
  <td>Truthfulqa (mc2)</td>
219
  <td>62.20%</td>
220
- <td>%</td>
221
- <td>%</td>
222
  </tr>
223
  <tr>
224
  <td>Winogrande</td>
225
  <td>79.40%</td>
226
- <td>%</td>
227
- <td>%</td>
228
  </tr>
229
  <tr>
230
  <td><b>Average Score</b></td>
231
  <td><b>78.26%</b></td>
232
- <td><b>%</b></td>
233
- <td><b>%</b></td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  </tr>
235
  </tbody>
236
  </table>
 
101
  ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
102
  ds = ds.shuffle(seed=42)
103
 
104
+ dampening_frac=0.05
105
 
106
  def data_collator(batch):
107
  assert len(batch) == 1, "Only batch size of 1 is supported for calibration"
 
193
  <td rowspan="7"><b>OpenLLM V1</b></td>
194
  <td>ARC Challenge</td>
195
  <td>72.53%</td>
196
+ <td>70.82%</td>
197
+ <td>97.65%</td>
198
  </tr>
199
  <tr>
200
  <td>GSM8K</td>
201
  <td>92.12%</td>
202
+ <td>85.75%</td>
203
+ <td>93.09%</td>
204
  </tr>
205
  <tr>
206
  <td>Hellaswag</td>
207
  <td>85.78%</td>
208
+ <td>85.05%</td>
209
+ <td>99.15%</td>
210
  </tr>
211
  <tr>
212
  <td>MMLU</td>
213
  <td>77.53%</td>
214
+ <td>76.37%</td>
215
+ <td>98.50%</td>
216
  </tr>
217
  <tr>
218
  <td>Truthfulqa (mc2)</td>
219
  <td>62.20%</td>
220
+ <td>61.73%</td>
221
+ <td>99.24%</td>
222
  </tr>
223
  <tr>
224
  <td>Winogrande</td>
225
  <td>79.40%</td>
226
+ <td>79.72%</td>
227
+ <td>100.40%</td>
228
  </tr>
229
  <tr>
230
  <td><b>Average Score</b></td>
231
  <td><b>78.26%</b></td>
232
+ <td><b>76.57%</b></td>
233
+ <td><b>97.84%</b></td>
234
+ </tr>
235
+ <tr>
236
+ <td rowspan="3"><b>Vision Evals</b></td>
237
+ <td>MMMU (val)</td>
238
+ <td>50.89%</td>
239
+ <td>51.78%</td>
240
+ <td>101.75%</td>
241
+ </tr>
242
+ <tr>
243
+ <td>ChartQA</td>
244
+ <td>72.16%</td>
245
+ <td>72.20%</td>
246
+ <td>100.06%</td>
247
+ </tr>
248
+ <tr>
249
+ <td><b>Average Score</b></td>
250
+ <td><b>61.53%</b></td>
251
+ <td><b>61.99%</b></td>
252
+ <td><b>100.75%</b></td>
253
  </tr>
254
  </tbody>
255
  </table>