AnyTool/toolbench/tooleval/evaluators/registered_cls/base.py
2024-02-23 15:13:06 +08:00

124 lines
No EOL
4.4 KiB
Python

import random
from typing import List, Union, Dict, Any, Callable
import os
import yaml
from .utils import register_evaluator
def process_answer(answer: Dict,
                   max_final_answer_len: Union[int, None] = 1000,
                   max_answer_details_len: Union[int, None] = 3000) -> Dict:
    """Trim an answer record before it is handed to an evaluator.

    The record is modified IN PLACE and the same object is returned, so
    callers that keep a reference to the original dict see the trimmed
    version as well.

    Parameters:
    ----------
    answer : Dict
        Answer record. Must contain the keys `final_answer` and
        `answer_details`, whose values must support slicing. A `method`
        key, if present, is removed.
    max_final_answer_len : int, optional
        Upper bound applied to `answer['final_answer']` by slicing
        (default 1000). `None` keeps the value at full length.
    max_answer_details_len : int, optional
        Upper bound applied to `answer['answer_details']` by slicing
        (default 3000). `None` keeps the value at full length.

    Returns:
    -------
    answer : Dict
        The same dict object that was passed in, after trimming.

    Raise:
    -----
    KeyError
        If `final_answer` or `answer_details` is missing.
    """
    answer['final_answer'] = answer['final_answer'][:max_final_answer_len]
    answer['answer_details'] = answer['answer_details'][:max_answer_details_len]
    # 'method' is optional, so a missing key is not an error.
    answer.pop('method', None)
    return answer
def process_tools(tools: List[Dict]):
    """Strip the `description` and `parameters` entries from every tool.

    Each dict in `tools` is modified in place; keys that are absent are
    simply skipped. The same list object is returned.
    """
    dropped_keys = ('description', 'parameters')
    for entry in tools:
        for key in dropped_keys:
            entry.pop(key, None)
    return tools
@register_evaluator
class BaseEvaluator:
    """Base class for evaluators.

    Attributes:
    ----------
    fn_completions : Callable
        The completion function of the evaluator, used to get annotated results.
        `annotate_preference` calls it with four positional arguments: a task
        description dict (keys `query` and `available_tools`), the list of
        candidate answers, `task_status` and `answer_statuss`. It must return
        an int: the index of the best answer within the answer list it was given.

    Functions:
    ---------
    annotate_preference : Callable
        Annotate and return the index of the preferred answer.
    """
    def __init__(self,
                 fn_completions: Callable[[Dict,List[Dict]],int] = None,
                 *args,
                 **kwargs):
        # Extra positional/keyword arguments are accepted and ignored.
        self.fn_completions = fn_completions

    def annotate_preference(self,
                            query: str,
                            available_tools: List[Dict[Any, Any]],
                            answers: List[Dict],
                            multisample=False,
                            sample_n=4,
                            task_status=None,
                            answer_statuss=None) -> Union[List[int], int]:
        """Annotate and return the index of the preferred answer.

        For given query, available tools, and candidate answers, return the index of the
        preferred answer by calling function `fn_completions` of the evaluator. The answers
        are presented to `fn_completions` in a random order on every call, and the index it
        returns is mapped back to the position in the `answers` list passed by the caller.

        Note that `answers` and `available_tools` are trimmed in place by `process_answer`
        and `process_tools` before being sent to `fn_completions`.

        Parameters:
        ----------
        query : str
            The query of the task.
        available_tools : List[Dict[Any, Any]]
            The list of available tools for the task. The specific format of the tool is defined in `tooleval/evaluation/dataclass.py`
        answers : List[Dict]
            The list of answers for comparison.
        multisample : bool, optional
            Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference.
        sample_n : int, optional
            The number of samples to get the preference.
        task_status : optional
            Forwarded unchanged to `fn_completions`.
        answer_statuss : optional
            Forwarded to `fn_completions`. When it is a list/tuple with one entry per
            answer, it is reordered with the same permutation as the answers so that
            entry k still describes the k-th answer `fn_completions` receives. Any other
            value is forwarded as is. Defaults to `[None, None]` (a fresh list per call).

        Returns:
        -------
        preference : Union[List[int], int]
            The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference.

        Raise:
        -----
        ValueError
            If `fn_completions` returns something that is not a valid index into the answers.
        """
        if answer_statuss is None:
            # Build the default per call; a list literal in the signature would be
            # shared (and mutable) across every invocation.
            answer_statuss = [None, None]

        answers_processed = [process_answer(ans) for ans in answers]
        available_tools = process_tools(available_tools)

        # Statuses can only be kept aligned with the answers when there is exactly
        # one status per answer.
        statuses_align = (
            isinstance(answer_statuss, (list, tuple))
            and len(answer_statuss) == len(answers_processed)
        )

        def shuffle_run() -> int:
            # order[k] is the original index of the answer shown at position k.
            order = list(range(len(answers_processed)))
            random.shuffle(order)

            answers_projected = [answers_processed[idx] for idx in order]
            if statuses_align:
                statuses_projected = [answer_statuss[idx] for idx in order]
            else:
                statuses_projected = answer_statuss

            preferred_index = self.fn_completions(
                {
                    'query':query,
                    'available_tools':available_tools,
                },
                answers_projected,
                task_status,
                statuses_projected
            )
            if preferred_index in order:
                # Translate the shuffled position back to the caller's index.
                # (`order.index(preferred_index)` would be the inverse mapping,
                # which is only correct when there are at most two answers.)
                return order[int(preferred_index)]
            raise ValueError(f'Preferred index {preferred_index} is invalid!')

        if not multisample:
            return shuffle_run()
        return [shuffle_run() for _ in range(sample_n)]
@register_evaluator
class ToolEvalEvaluator(BaseEvaluator):
    """ToolEval common evaluator class.

    The constructor reads `config.yaml` from `cfg_path`, loads the prompt
    template file named by the config's `prompt_template` entry, and uses the
    config's `fn_completions` entry as the name of the method on this instance
    that serves as the completion function.

    Parameters:
    ----------
    cfg_path : str
        A path store the configuration of the evaluator.

    Attributes:
    ----------
    eval_config :
        The parsed content of `config.yaml`.
    template : str
        The text of the prompt template file.
    """
    def __init__(self,
                 cfg_path: str = None,
                 ):
        # Use context managers so both file handles are closed deterministically.
        with open(os.path.join(cfg_path, 'config.yaml')) as config_file:
            # NOTE(review): FullLoader is kept for compatibility; it is only
            # appropriate while the config directory is trusted. Consider
            # yaml.safe_load if cfg_path can ever point at external data.
            eval_config = yaml.load(config_file, Loader=yaml.FullLoader)
        with open(os.path.join(cfg_path, eval_config['prompt_template'])) as template_file:
            template = template_file.read()

        # The completion function is looked up by name on the instance, so it
        # must be defined on this class or a subclass.
        super().__init__(
            fn_completions=getattr(self, eval_config['fn_completions'])
        )
        self.eval_config = eval_config
        self.template = template