| {"created_at": "2025-08-28T10:33:55.059371", "global_step": 30000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883523}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41943835889265085, "acc_stderr,none": 0.004924586362301651, "acc_norm,none": 0.5455088627763394, "acc_norm_stderr,none": 0.004969070188763761}, "mmlu": {"acc,none": 0.22959692351516878, "acc_stderr,none": 0.003544581077655137, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24272051009564294, "acc_stderr,none": 0.006248239474862417, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501943}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598035}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733393}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23978113936272932, "acc_stderr,none": 0.007646972330614246, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.21579460513487164, "acc_stderr,none": 0.007413422208060848, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538787}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.017090573804217885}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884603}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17142857142857143, "acc_stderr,none": 0.024127463462650146}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.02992941540834839}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.2134475103076435, "acc_stderr,none": 0.007288945306107154, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.031546980450822305}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.020742740560122652}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.02488211685765511}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.874, "acc_stderr,none": 0.010499249222408037, "acc_norm,none": 0.81, "acc_norm_stderr,none": 0.012411851354816318}} | |
| {"created_at": "2025-08-28T18:06:09.067572", "global_step": 60000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584395}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.433379804819757, "acc_stderr,none": 0.004945291270072418, "acc_norm,none": 0.5802628958374826, "acc_norm_stderr,none": 0.004925072159723817}, "mmlu": {"acc,none": 0.2318758011679248, "acc_stderr,none": 0.00355531490586192, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24909670563230607, "acc_stderr,none": 0.006302659942242834, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.022598703804321617}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26033519553072626, "acc_stderr,none": 0.014676252009319468}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23720630833601544, "acc_stderr,none": 0.007616406791854159, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.18497109826589594, "acc_stderr,none": 0.029605623981771214}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.03089861088247751}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.02974504857267406}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.015384352284543953}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.023929155517351294}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2198581560283688, "acc_stderr,none": 0.024706141070705477}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680588}, "mmlu_social_sciences": {"acc,none": 0.22066948326291844, "acc_stderr,none": 0.00747326932821189, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538787}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371383}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1963302752293578, "acc_stderr,none": 0.017030719339154364}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2118617189977799, "acc_stderr,none": 0.00726091419550542, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.16447368421052633, "acc_stderr,none": 0.03016753346863271}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179963}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20105820105820105, "acc_stderr,none": 0.020641810782370158}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.02203721734026784}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.14285714285714285, "acc_stderr,none": 0.024620785269489683}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.02488211685765511}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613423}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.861, "acc_stderr,none": 0.010945263761042958, "acc_norm,none": 0.78, "acc_norm_stderr,none": 0.01310617304066178}} | |
| {"created_at": "2025-08-29T01:08:19.315168", "global_step": 90000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19082719082719082, "acc_stderr,none": 0.011250215810979045}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43815972913762197, "acc_stderr,none": 0.004951470301995878, "acc_norm,none": 0.5717984465245967, "acc_norm_stderr,none": 0.004938068627349492}, "mmlu": {"acc,none": 0.23052271756160092, "acc_stderr,none": 0.0035487270388323043, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24420828905419767, "acc_stderr,none": 0.006260942728979899, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2470664928292047, "acc_stderr,none": 0.011015752255279333}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24010299324106857, "acc_stderr,none": 0.007646527819706578, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21741956451088723, "acc_stderr,none": 0.007433477439782672, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2134475103076435, "acc_stderr,none": 0.00728564266372319, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.021037331505262893}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.025960300064605576}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.024227629273728356}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.798, "acc_stderr,none": 0.012702651587655116, "acc_norm,none": 0.75, "acc_norm_stderr,none": 0.013699915608779773}} | |
| {"created_at": "2025-08-29T08:22:00.664423", "global_step": 120000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.44682334196375223, "acc_stderr,none": 0.004961481380023775, "acc_norm,none": 0.5821549492133041, "acc_norm_stderr,none": 0.004921964133874026}, "mmlu": {"acc,none": 0.23479561316051845, "acc_stderr,none": 0.0035744204670951586, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23953241232731137, "acc_stderr,none": 0.006225688157304655, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.0284588209914603}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.040191074725573483}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.01433352205921789}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.02226819625878321}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005723}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2405475880052151, "acc_stderr,none": 0.010916406735478947}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.2423559703894432, "acc_stderr,none": 0.007663656993324164, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.20754716981132076, "acc_stderr,none": 0.02495991802891127}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.1907514450867052, "acc_stderr,none": 0.029957851329869337}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.039166677628225836}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.030236389942173092}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2515964240102171, "acc_stderr,none": 0.015517322365529627}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290392}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494753}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.0355092018568963}, "mmlu_social_sciences": {"acc,none": 0.2235944101397465, "acc_stderr,none": 0.007511054931718876, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386417}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.18717948717948718, "acc_stderr,none": 0.019776601086550032}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715487}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21100917431192662, "acc_stderr,none": 0.017493922404112648}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596917}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528044}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.0252069631542254}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772432}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.23120837297811608, "acc_stderr,none": 0.007505969352918169, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174023}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364396}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.034559302019248124}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02201908001221789}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2161290322580645, "acc_stderr,none": 0.02341529343356852}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.028247350122180277}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959343}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.043270409325787296}, "sciq": {"alias": "sciq", "acc,none": 0.834, "acc_stderr,none": 0.011772110370812194, "acc_norm,none": 0.789, "acc_norm_stderr,none": 0.01290913032104209}} | |
| {"created_at": "2025-08-29T15:42:00.893364", "global_step": 150000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3995220075682135, "acc_stderr,none": 0.004887991225950289, "acc_norm,none": 0.5097590121489743, "acc_norm_stderr,none": 0.00498883088413164}, "mmlu": {"acc,none": 0.2282438399088449, "acc_stderr,none": 0.0035370333367014967, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.240807651434644, "acc_stderr,none": 0.0062319599148075696, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265816}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24511082138200782, "acc_stderr,none": 0.010986307870045517}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.23624074670099773, "acc_stderr,none": 0.0076076198539781386, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749884}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869982}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23371647509578544, "acc_stderr,none": 0.015133383278988822}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.20915032679738563, "acc_stderr,none": 0.023287685312334813}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266733}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.21579460513487164, "acc_stderr,none": 0.0074135491365507995, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.026552207828215293}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817244}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246794}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936087}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884603}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007657}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_stem": {"acc,none": 0.21376466856961623, "acc_stderr,none": 0.007287152806027947, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678317}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410296}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.02103733150526289}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18064516129032257, "acc_stderr,none": 0.021886178567172544}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.025960300064605576}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "sciq": {"alias": "sciq", "acc,none": 0.773, "acc_stderr,none": 0.013253174964763935, "acc_norm,none": 0.725, "acc_norm_stderr,none": 0.014127086556490528}} | |
| {"created_at": "2025-08-30T08:01:34.747363", "global_step": 180000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883525}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4159529974108743, "acc_stderr,none": 0.0049187816623739306, "acc_norm,none": 0.5428201553475404, "acc_norm_stderr,none": 0.004971449552787179}, "mmlu": {"acc,none": 0.26634382566585957, "acc_stderr,none": 0.003722357010863062, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24718384697130713, "acc_stderr,none": 0.006289788186432777, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04040610178208841}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.03114557065948678}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.027985699387036423}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098823}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.022894082489925992}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2659217877094972, "acc_stderr,none": 0.014776765066438892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.023350225475471414}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658537}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23598435462842243, "acc_stderr,none": 0.010844802669662689}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.2729320888316704, "acc_stderr,none": 0.007966569466151438, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.28679245283018867, "acc_stderr,none": 0.02783491252754407}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.034564257450869995}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.027157150479563824}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260597}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564404}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30718954248366015, "acc_stderr,none": 0.026415601914388995}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.027464708442022142}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.029289413409403192}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3132530120481928, "acc_stderr,none": 0.03610805018031024}, "mmlu_social_sciences": {"acc,none": 0.2892427689307767, "acc_stderr,none": 0.00814414946137002, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.03135305009533085}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29533678756476683, "acc_stderr,none": 0.032922966391551414}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32564102564102565, "acc_stderr,none": 0.02375966576741229}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3697478991596639, "acc_stderr,none": 0.031357095996135904}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25504587155963304, "acc_stderr,none": 0.01868850085653584}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.01759348689536683}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3836734693877551, "acc_stderr,none": 0.031130880396235926}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935557}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_stem": {"acc,none": 0.2660957817951158, "acc_stderr,none": 0.007869837107593291, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614866}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.30638297872340425, "acc_stderr,none": 0.030135906478517563}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.02185150982203171}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3161290322580645, "acc_stderr,none": 0.02645087448904277}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.0307127300709826}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184406}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046937}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841043}, "sciq": {"alias": "sciq", "acc,none": 0.772, "acc_stderr,none": 0.013273740700804478, "acc_norm,none": 0.737, "acc_norm_stderr,none": 0.013929286594259726}} | |
| {"created_at": "2025-08-30T09:46:44.984837", "global_step": 210000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3926508663612826, "acc_stderr,none": 0.004873421833291583, "acc_norm,none": 0.49681338378809003, "acc_norm_stderr,none": 0.004989680072717478}, "mmlu": {"acc,none": 0.2296681384418174, "acc_stderr,none": 0.003544644134978251, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24208289054197663, "acc_stderr,none": 0.00624341271351515, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927235}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.23945928548439008, "acc_stderr,none": 0.007637810706207164, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21611959701007474, "acc_stderr,none": 0.007418036077134387, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402507}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.02520696315422542}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2147161433555344, "acc_stderr,none": 0.007303644775035364, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.02103733150526289}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.19032258064516128, "acc_stderr,none": 0.02233170761182307}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.026874337276808352}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "sciq": {"alias": "sciq", "acc,none": 0.715, "acc_stderr,none": 0.014282120955200497, "acc_norm,none": 0.683, "acc_norm_stderr,none": 0.01472167543888022}} | |
| {"created_at": "2025-08-30T14:02:28.793346", "global_step": 240000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19164619164619165, "acc_stderr,none": 0.01126862497880165}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.36387173869747064, "acc_stderr,none": 0.00480129095438708, "acc_norm,none": 0.4462258514240191, "acc_norm_stderr,none": 0.00496083998609953}, "mmlu": {"acc,none": 0.2581541091012676, "acc_stderr,none": 0.00368583735078824, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24909670563230607, "acc_stderr,none": 0.006298968842058454, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.12396694214876033, "acc_stderr,none": 0.03008309871603522}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650741}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.21098265895953758, "acc_stderr,none": 0.021966309947043114}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.014551553659369923}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.024296594034763426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.02368359183700855}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.01116770601490415}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.030611116557432528}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.00770585718144021, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3320754716981132, "acc_stderr,none": 0.0289854556523344}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736413}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.13901345291479822, "acc_stderr,none": 0.023219352834474478}, "mmlu_management": {"alias": " - management", "acc,none": 0.32038834951456313, "acc_stderr,none": 0.0462028408228004}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.21367521367521367, "acc_stderr,none": 0.02685345037700917}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.015384352284543936}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826507}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629919}, "mmlu_social_sciences": {"acc,none": 0.2674683132921677, "acc_stderr,none": 0.007968642426549297, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03358618145732523}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.02345467488940429}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.0295973297309781}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.018599206360287415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159463}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.238562091503268, "acc_stderr,none": 0.0172423858287796}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.028920583220675585}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.2730732635585157, "acc_stderr,none": 0.00793558554598318, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993179}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19148936170212766, "acc_stderr,none": 0.02572214999263779}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309994}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491842}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2838709677419355, "acc_stderr,none": 0.02564938106302926}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617715}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.30092592592592593, "acc_stderr,none": 0.03128039084329881}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952685}, "sciq": {"alias": "sciq", "acc,none": 0.642, "acc_stderr,none": 0.015167928865407559, "acc_norm,none": 0.609, "acc_norm_stderr,none": 0.015438826294681783}} | |
| {"created_at": "2025-08-30T20:59:08.788619", "global_step": 270000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.011448447996728393}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.33957379008165706, "acc_stderr,none": 0.004725967684806405, "acc_norm,none": 0.4117705636327425, "acc_norm_stderr,none": 0.004911481830909232}, "mmlu": {"acc,none": 0.2543797179888905, "acc_stderr,none": 0.0036720049187128115, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26928799149840593, "acc_stderr,none": 0.006463141556007274, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.03268454013011745}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511783}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869326}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.04266416363352167}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.0395783547198098}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.025407197798890162}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539258}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.03546976959393163}, "mmlu_other": {"acc,none": 0.2375281622143547, "acc_stderr,none": 0.007620313967861259, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2037735849056604, "acc_stderr,none": 0.024790784501775406}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.01588988836256049}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.024739981355113592}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590627}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113025}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.031417842916639245}, "mmlu_social_sciences": {"acc,none": 0.24211894702632433, "acc_stderr,none": 0.0077263523748471305, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715494}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.0181256691808615}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.0180540274588152}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073142}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355568}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_stem": {"acc,none": 0.26070409134157946, "acc_stderr,none": 0.007821439248688587, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640766}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2127659574468085, "acc_stderr,none": 0.02675439134803976}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525214}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885203}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485967}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "sciq": {"alias": "sciq", "acc,none": 0.547, "acc_stderr,none": 0.015749255189977582, "acc_norm,none": 0.547, "acc_norm_stderr,none": 0.015749255189977582}} | |
| {"created_at": "2025-08-31T04:18:54.980256", "global_step": 300000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091197}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3434574785899223, "acc_stderr,none": 0.004738920624724474, "acc_norm,none": 0.4148575980880303, "acc_norm_stderr,none": 0.0049169050958108446}, "mmlu": {"acc,none": 0.25395242842899873, "acc_stderr,none": 0.003668487491257294, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26992561105207225, "acc_stderr,none": 0.006465292148147697, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15079365079365079, "acc_stderr,none": 0.03200686497287394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624335}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869326}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.043913262867240704}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.023948512905468358}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.02600330111788513}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.02508947852376513}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539258}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.035650796707083106}, "mmlu_other": {"acc,none": 0.23881557772771161, "acc_stderr,none": 0.007633216007047615, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19730941704035873, "acc_stderr,none": 0.02670985334496796}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004264}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500514}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494767}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1927710843373494, "acc_stderr,none": 0.030709824050565267}, "mmlu_social_sciences": {"acc,none": 0.2382190445238869, "acc_stderr,none": 0.0076848230906898215, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.029778663037752964}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715494}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22752293577981653, "acc_stderr,none": 0.017974463578776502}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815205}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073142}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401464}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2603869330796067, "acc_stderr,none": 0.007811469665462428, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462833}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "sciq": {"alias": "sciq", "acc,none": 0.534, "acc_stderr,none": 0.015782683329937625, "acc_norm,none": 0.531, "acc_norm_stderr,none": 0.015788865959539006}} | |