137 lines
5.6 KiB
Python
137 lines
5.6 KiB
Python
import os
|
|
import json
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor,as_completed
|
|
from tqdm import tqdm
|
|
import numpy as np
|
|
import argparse
|
|
import random
|
|
from evaluation import UserEvaluation,BaseToolMethod
|
|
from evaluators import load_registered_automatic_evaluator
|
|
from typing import List,Dict,Callable
|
|
import pandas as pd
|
|
|
|
abs_dir = os.path.split(__file__)[0]
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('--output',default=os.path.join(abs_dir,'dataset','test.json'),help='where to store the method output.')
|
|
parser.add_argument('--method',default='unknown',help='what the name of the method.')
|
|
parser.add_argument('--ref_method',default='gpt-3.5-turbo_CoT',help='what the reference method is')
|
|
parser.add_argument('--ref_output',default=os.path.join(abs_dir,'dataset','ref_sample.json'),help='where the reference answer stored')
|
|
parser.add_argument('--evaluators_cfg_path',default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored')
|
|
parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use')
|
|
parser.add_argument('--max_eval_threads',default=1,type=int,help='how many threads to use for evaluation')
|
|
parser.add_argument('--evalset',default='default_evalset',help='which the evaluation dataset to use')
|
|
parser.add_argument('--eval_server_address',default='http://localhost:8000',help='the address of the evaluation server')
|
|
parser.add_argument('--use_existed_output',default=False,action='store_true',help='whether to use the existed output')
|
|
|
|
return parser.parse_args()
|
|
|
|
|
|
## !!define your method here !!
|
|
class SampleMethod(BaseToolMethod):
|
|
def __init__(self):
|
|
super().__init__()
|
|
def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
|
|
return {}
|
|
def convert_result_to_dict(self,result):
|
|
return {
|
|
'method': 'sample',
|
|
'total_steps': 0,
|
|
'final_answer': '',
|
|
'answer_details': []
|
|
}
|
|
def process_answer(answer: Dict):
|
|
# answer['final_answer'] = answer['final_answer'][:1000]
|
|
# answer['answer_details'] = answer['answer_details'][:3000]
|
|
# answer.pop('method', None)
|
|
return answer
|
|
if __name__=='__main__':
|
|
args = parse_args()
|
|
|
|
# exec_generating_method_outputs = True
|
|
# if os.path.exists(args.output):
|
|
# print('Output file {} already exists!'.format(args.output))
|
|
# if args.use_existed_output:
|
|
# exec_generating_method_outputs = False
|
|
# else:
|
|
# print('Overwrite? (y/n)')
|
|
# exec_generating_method_outputs = input()=='y'
|
|
|
|
# if exec_generating_method_outputs:
|
|
# ## change the SampleMethod to your method
|
|
# usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset)
|
|
# print('Generating method outputs...')
|
|
# results = usereval.run()
|
|
# print('Saving method outputs...')
|
|
# with open(args.output,'w') as f:
|
|
# json.dump(results,f)
|
|
# else:
|
|
# print('Use existed output.')
|
|
results = json.load(open(args.output))
|
|
|
|
print('Loading reference answer for evaluation...')
|
|
try:
|
|
output = json.load(open(args.output))
|
|
except:
|
|
raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output))
|
|
|
|
print('Loading automatic evaluators...')
|
|
evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)]
|
|
|
|
def check_query_solved(qid,query, ans):
|
|
global evaluators
|
|
evaluator = random.choice(evaluators)
|
|
is_solved = evaluator.check_solve_query(
|
|
query,
|
|
ans)
|
|
return qid, is_solved
|
|
|
|
|
|
print('Evaluating...')
|
|
prefer_dict = {}
|
|
with ThreadPoolExecutor(args.max_eval_threads) as pool:
|
|
future = []
|
|
solved = []
|
|
data_list = []
|
|
for qid in output.keys():
|
|
try:
|
|
results[qid]['answer'] = process_answer(results[qid]['answer'])
|
|
future.append(pool.submit(
|
|
check_query_solved,
|
|
qid,
|
|
output[qid]['query'],
|
|
results[qid]['answer']['final_answer']
|
|
))
|
|
except KeyError as e:
|
|
print('Warning : Missing answer for query {} in answer file! '.format(e))
|
|
|
|
for thd in tqdm(as_completed(future),total=len(future),ncols=100):
|
|
qid, is_solved = thd.result()
|
|
solved.append(is_solved)
|
|
# print(ref_output[qid]['query'], file=open('output/check_solved.txt','a'))
|
|
# print(results[qid]['answer'], file=open('output/check_solved.txt','a'))
|
|
data = {'query':output[qid]['query'],
|
|
'answer':results[qid]['answer']['final_answer'],
|
|
'solved':is_solved,
|
|
'qid':qid}
|
|
data_list.append(data)
|
|
print(np.mean(solved))
|
|
file_name = args.output.split('/')[-1]
|
|
prefix = os.path.dirname(args.output)
|
|
json.dump(data_list, open(os.path.join(prefix, f'{file_name}_check_solved.json'),'w'), indent=4)
|
|
|
|
|
|
|
|
# df = pd.DataFrame.from_dict([{
|
|
# 'Method':args.method,
|
|
# 'Win Rate':prefer.mean(),
|
|
# 'Std Error':np.std(prefer)/np.sqrt(len(prefer))
|
|
# }])
|
|
# print('###### Leaderboard vs {} ######'.format(args.ref_method))
|
|
# print(df)
|
|
# save_file = os.path.join(abs_dir,'results',args.evalset,args.method)
|
|
# os.makedirs(save_file,exist_ok=True)
|
|
# df.to_csv(os.path.join(save_file,'win.csv'))
|