186 lines
8.4 KiB
Python
186 lines
8.4 KiB
Python
from toolbench.tooleval.evaluators import load_registered_automatic_evaluator
|
|
import os
|
|
import json
|
|
import csv
|
|
from toolbench.tooleval.evaluators.registered_cls.rtl import AnswerStatus, TaskStatus, AnswerPass
|
|
import random
|
|
from concurrent.futures import ThreadPoolExecutor,as_completed
|
|
import argparse
|
|
from tqdm import tqdm
|
|
from toolbench.tooleval.utils import test_sets, get_steps
|
|
|
|
# Directory part of this script's path; used to locate the bundled
# 'evaluators' configuration folder. Despite the name, it is only absolute
# when __file__ itself is absolute.
abs_dir = os.path.dirname(__file__)
|
|
|
|
def parse_args():
    """Parse the command-line options of the pass-rate evaluation script.

    Returns:
        argparse.Namespace with the attributes ``converted_answer_path``,
        ``save_path``, ``reference_model``, ``test_ids``, ``evaluator``,
        ``max_eval_threads`` and ``evaluate_times``.

    Raises:
        SystemExit: raised by argparse when a required option
            (``--converted_answer_path`` or ``--test_ids``) is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--converted_answer_path', type=str, default="", required=True,
                        help='converted answer path')
    parser.add_argument('--save_path', type=str, default="", required=False,
                        help='result save path')
    # Answers are read from <converted_answer_path>/<reference_model>/<test_set>.json.
    parser.add_argument('--reference_model', type=str, default="", required=False,
                        help='name of the model sub-directory under converted_answer_path')
    # One <test_set>.json per test set; its top-level keys are the query ids to evaluate.
    parser.add_argument('--test_ids', type=str, default="", required=True,
                        help='directory holding one <test_set>.json file of test ids per test set')
    parser.add_argument('--evaluator', type=str, default="tooleval_gpt-3.5-turbo_default", required=False,
                        help='which evaluator to use.')
    parser.add_argument('--max_eval_threads', type=int, default=30, required=False,
                        help='max threads nums')
    parser.add_argument('--evaluate_times', type=int, default=4, required=False,
                        help='how many times to predict with the evaluator for each solution path.')
    return parser.parse_args()
|
|
|
|
def write_results(filename: str, reference_model: str, label_cnt: dict) -> None:
    """Write the per-query evaluation summary as a tab-separated file.

    One header row is written, followed by one row per entry of
    ``label_cnt``. The ``pass_rate_label`` column is the majority of the
    "passed"/"failed" vote counts; a tie is broken by a coin flip.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        reference_model: Value written to the ``model`` column of every row.
        label_cnt: Mapping of query id to a record with the keys "passed",
            "failed", "query", "task_solvable", "tool_names",
            "answer_steps", "final_step", "is_solved", "reason" and
            "not_hallucinate".

    Raises:
        KeyError: if a record lacks one of the keys listed above.
        OSError: if the file cannot be opened for writing.
    """
    header = [
        "query", "solvable", "available_tools", "model_intermediate_steps",
        "model_final_step", "model", "query_id", "is_solved",
        "pass_rate_label", "reason", "not_hallucinate",
    ]
    # Explicit UTF-8 so queries/reasons with non-ASCII text are written the
    # same way on every platform instead of depending on the locale encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter="\t")
        writer.writerow(header)
        for query_id, record in label_cnt.items():
            passed_votes = record["passed"]
            failed_votes = record["failed"]
            if passed_votes > failed_votes:
                final_label = "passed"
            elif passed_votes < failed_votes:
                final_label = "failed"
            else:
                # Tie between the votes: choose at random.
                final_label = "passed" if random.random() < 0.5 else "failed"
            writer.writerow([
                record["query"],
                record["task_solvable"],
                record["tool_names"],
                record["answer_steps"],
                record["final_step"],
                reference_model,
                query_id,
                record["is_solved"],
                final_label,
                record["reason"],
                record["not_hallucinate"],
            ])
|
|
|
|
|
|
def compute_pass_rate(query_id, example):
    """Run one evaluator pass over a single answer and label it.

    An evaluator is picked at random from the module-level ``evaluators``
    list. The answer is checked for hallucination, then for whether it
    solves the query and whether the task is solvable; those results are
    combined by the evaluator into a pass/fail verdict.

    Args:
        query_id: Identifier of the query; returned unchanged so the caller
            can match the result to its example.
        example: Mapping with the keys 'query', 'available_tools' and
            'answer'; it is also handed to ``get_steps``.

    Returns:
        Tuple ``(query_id, task_solvable, is_solved, label, reason,
        not_hallucinate)`` where ``label`` is "passed" or "failed". When
        the final step contains no 'Finish' call the tuple is
        ``(query_id, TaskStatus.Solvable, AnswerStatus.Unsolved, "failed",
        "No answer", not_hallucinate)`` and the evaluator is not asked
        anything further.
    """
    # Read-only access to the evaluator pool created by the script entry point.
    global evaluators
    evaluator = random.choice(evaluators)

    # NOTE(review): the method is named check_has_hallucination but its result
    # is stored as "not_hallucinate" -- confirm the polarity against the
    # evaluator implementation.
    try:
        not_hallucinate = evaluator.check_has_hallucination(
            example['available_tools'],
            example['answer']
        )
    except Exception:
        # Best effort: a failing hallucination check must not abort the
        # evaluation, so fall back to True. Only Exception is caught so that
        # SystemExit/KeyboardInterrupt still propagate.
        not_hallucinate = True

    answer_steps, final_step = get_steps(example)

    # Without a 'Finish' call in the final step there is no answer to judge.
    if "'name': 'Finish'" not in final_step:
        return query_id, TaskStatus.Solvable, AnswerStatus.Unsolved, "failed", "No answer", not_hallucinate

    task_description = {
        'query': example['query'],
        'available_tools': example['available_tools'],
    }

    is_solved, is_solved_reason = evaluator.check_is_solved(
        dict(task_description),
        example['answer'],
        return_reason=True
    )
    # Anything other than an explicit "Solved" counts as not solved.
    is_solved_flag = bool(is_solved == AnswerStatus.Solved)

    task_solvable, task_solvable_reason = evaluator.check_task_solvable(
        dict(task_description),
        has_been_solved=is_solved_flag,
        return_reason=True
    )

    is_passed = evaluator.is_passed(
        dict(task_description),
        example['answer'],
        answer_status=is_solved,
        task_status=task_solvable
    )

    reason = f"Is solved: {is_solved_reason}\nTask solvable: {task_solvable_reason}"
    if is_passed == AnswerPass.Passed:
        label = "passed"
    elif is_passed == AnswerPass.Failed:
        label = "failed"
    else:
        # The evaluator is unsure: choose at random.
        label = "passed" if random.random() < 0.5 else "failed"
    return query_id, task_solvable, is_solved, label, reason, not_hallucinate
|
|
|
|
# Script entry point. For every test set: load the converted answers and the
# test ids, evaluate each not-yet-evaluated query `evaluate_times` times in a
# thread pool, accumulate the passed/failed votes per query, checkpoint them
# to JSON, export a CSV summary and print the pass rate.
if __name__ == "__main__":
    args = parse_args()
    # Module-level list read by compute_pass_rate, which picks one evaluator
    # at random per call; one instance is created per worker thread.
    evaluators = [
        load_registered_automatic_evaluator(
            evaluator_name=args.evaluator,
            evaluators_cfg_path=os.path.join(abs_dir, 'evaluators'),
        )
        for _ in range(args.max_eval_threads)
    ]
    # --save_path defaults to "": os.makedirs("") raises and "/<file>" would
    # point at the filesystem root, so fall back to the current directory.
    save_path = args.save_path or "."
    os.makedirs(save_path, exist_ok=True)

    reference_model = args.reference_model
    # Not used in the visible code; kept so the module-level name still exists.
    output_list = []
    for test_set in test_sets:
        reference_path = f"{args.converted_answer_path}/{reference_model}/{test_set}.json"
        if not os.path.exists(reference_path):
            print(f"Warning: {reference_path} not exists.")
            continue

        # Only the top-level keys of the test-id file are used; a set gives
        # constant-time membership tests in the submission loop.
        with open(os.path.join(args.test_ids, test_set + ".json"), "r", encoding="utf-8") as ids_file:
            test_ids = set(json.load(ids_file).keys())
        with open(reference_path, "r", encoding="utf-8") as reference_file:
            reference_examples = json.load(reference_file)

        # Resume support: queries already present in the checkpoint are skipped.
        result_json_path = f"{save_path}/{test_set}_{reference_model}.json"
        if os.path.exists(result_json_path):
            with open(result_json_path, "r", encoding="utf-8") as checkpoint_file:
                label_cnt = json.load(checkpoint_file)
        else:
            label_cnt = {}
        existed_ids = set(label_cnt)

        with ThreadPoolExecutor(args.max_eval_threads) as pool:
            future = []
            cnt = 0
            for query_id in reference_examples:
                cnt += 1
                if str(query_id) not in test_ids:
                    continue
                if query_id in existed_ids:
                    continue
                example = reference_examples[query_id]
                # Each submission is one independent vote for this query.
                for _ in range(args.evaluate_times):
                    future.append(pool.submit(compute_pass_rate, query_id, example))
            # NOTE(review): the original indentation was lost; this print is
            # assumed to run once, after all submissions -- confirm.
            print(cnt)

            for thd in tqdm(as_completed(future), total=len(future), ncols=100):
                query_id, task_solvable, is_solved, machine_label, reason, not_hallucinate = thd.result()
                example = reference_examples[query_id]
                tool_names = [tool_dict["name"] for tool_dict in example["available_tools"]]
                answer_steps, final_step = get_steps(example)

                record = label_cnt.setdefault(query_id, {"passed": 0, "failed": 0})
                if machine_label == "passed":
                    record["passed"] += 1
                else:
                    record["failed"] += 1
                # The descriptive fields are overwritten by every vote, so the
                # record keeps those of the last completed evaluation.
                record["query"] = example["query"]
                record["task_solvable"] = str(task_solvable)
                record["tool_names"] = tool_names
                record["answer_steps"] = answer_steps
                record["final_step"] = final_step
                record["is_solved"] = str(is_solved)
                record["reason"] = reason
                record["not_hallucinate"] = not_hallucinate

                # Checkpoint after every completed evaluation; the context
                # manager closes (and flushes) the file each time.
                with open(result_json_path, "w", encoding="utf-8") as checkpoint_file:
                    json.dump(label_cnt, checkpoint_file, ensure_ascii=False, indent=4)

        # Final write, so the file also exists when nothing new was evaluated.
        with open(result_json_path, "w", encoding="utf-8") as checkpoint_file:
            json.dump(label_cnt, checkpoint_file, ensure_ascii=False, indent=4)

        filename = f"{save_path}/{test_set}_{reference_model}.csv"
        write_results(filename, reference_model, label_cnt)

        # A query counts as passed when it has at least as many "passed" as
        # "failed" votes (ties count as passed here).
        passed_queries = sum(
            1 for record in label_cnt.values()
            if record["failed"] <= record["passed"]
        )
        # Guard against an empty result set, which would divide by zero.
        pass_rate = passed_queries / len(label_cnt) if label_cnt else 0.0
        print(f"Test set: {test_set}. Model: {reference_model}. Pass rate: {str(pass_rate)}")
|
|
|
|
|
|
|