# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Custom evaluation tasks for LightEval."""
import random
from lighteval.metrics.dynamic_metrics import (
    ExprExtractionConfig,
    IndicesExtractionConfig,
    LatexExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language
latex_gold_metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    fallback_mode="first_match",
    precision=5,
    gold_extraction_target=(LatexExtractionConfig(),),
    # Match boxed first before trying other regexes
    pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
    aggregation_function=max,
)
expr_gold_metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    fallback_mode="first_match",
    precision=5,
    gold_extraction_target=(ExprExtractionConfig(),),
    # Match boxed first before trying other regexes
    pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
    aggregation_function=max,
)
gpqa_metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    precision=5,
)
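# Note on the metrics above: multilingual_extractive_match_metric builds an extractive accuracy
# metric -- it pulls the final answer out of the generation (a \boxed{...}/LaTeX expression, a
# bare numeric expression, or an answer letter for GPQA) and scores it against the gold target
# via mathematical-equivalence checking rather than exact string match.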
def prompt_fn(line, task_name: str = None):
    """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically"""
    return Doc(
        task_name=task_name,
        query=line["problem"],
        choices=[line["solution"]],
        gold_index=0,
    )
def aime_prompt_fn(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["problem"],
        choices=[line["answer"]],
        gold_index=0,
    )
def gpqa_prompt_fn(line, task_name: str = None):
    """Prompt template adapted from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
    gold_index = random.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])
    query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
    query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
    return Doc(
        task_name=task_name,
        query=query,
        choices=["A", "B", "C", "D"],
        gold_index=gold_index,
        instruction=query,
    )
# Define tasks
aime24 = LightevalTaskConfig(
    name="aime24",
    suite=["custom"],
    prompt_function=aime_prompt_fn,
    hf_repo="HuggingFaceH4/aime_2024",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,
    metric=[expr_gold_metric],
    version=1,
)
aime25 = LightevalTaskConfig(
    name="aime25",
    suite=["custom"],
    prompt_function=aime_prompt_fn,
    hf_repo="yentinglin/aime_2025",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,
    metric=[expr_gold_metric],
    version=1,
)
math_500 = LightevalTaskConfig(
    name="math_500",
    suite=["custom"],
    prompt_function=prompt_fn,
    hf_repo="HuggingFaceH4/MATH-500",
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,
    metric=[latex_gold_metric],
    version=1,
)
gpqa_diamond = LightevalTaskConfig(
    name="gpqa:diamond",
    suite=["custom"],
    prompt_function=gpqa_prompt_fn,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_diamond",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # needed for reasoning models like R1
    metric=[gpqa_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=1,
)
# Add tasks to the table
TASKS_TABLE = []
TASKS_TABLE.append(aime24)
TASKS_TABLE.append(aime25)
TASKS_TABLE.append(math_500)
TASKS_TABLE.append(gpqa_diamond)
# MODULE LOGIC
if __name__ == "__main__":
    # LightevalTaskConfig is a dataclass, so access the task name as an attribute
    print([t.name for t in TASKS_TABLE])
    print(len(TASKS_TABLE))
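# Usage sketch (assumptions: this file lives at src/open_r1/evaluate.py and your installed
# lighteval version exposes these CLI flags -- check `lighteval vllm --help` if it differs):
#
#   lighteval vllm "pretrained=<model-id>" "custom|aime24|0|0" \
#       --custom-tasks src/open_r1/evaluate.py \
#       --use-chat-template \
#       --output-dir ./evals
#
# The task spec "custom|aime24|0|0" selects the `custom` suite and the `aime24` task defined in
# TASKS_TABLE above; swap in aime25, math_500, or gpqa:diamond as needed.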