first commit

2024-02-23 15:13:06 +08:00 · 2024-02-23 15:13:06 +08:00 · 9460d4dd94
commit 9460d4dd94
106 changed files with 316239 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,2 @@
+api_test_results_with_docs2.json filter=lfs diff=lfs merge=lfs -text
+category_tool_details_add_nonfree.json filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,45 @@
+data/
+data
+data_0830/
+data*.zip
+*.zip
+*io.txt
+error.txt
+miss*.txt
+output.txt
+token_count_in.txt
+token_count_out.txt
+*local.sh
+*.DS_store
+openchat*/
+toolllama*/
+ws/
+.history/
+reproduction_data*/
+output/
+*result/
+result*/
+__MACOSX/
+api_test_results_with_docs.json
+customized_api_test_results_with_docs.json
+model_list.txt
+run.bash
+tool_data*
+api_test_results*
+api_details.json
+api_details*
+category_tool_details*
+config.py
+*dy.py
+*dy.json
+OAI_CONFIG_LIST
+repos/
+rapidapi_key_list.json
+openai_utils_dy.py
+.chroma/
+
+*.pyc
+**/__pycache__
+.vscode/
+.cache/42/
+retrieval_model/
--- a/201
+++ b/201
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
@ -0,0 +1,57 @@
+# AnyTool
+This is the implementation of the paper [AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale API Calls](https://arxiv.org/abs/2402.04253)
+![Figure](https://media.discordapp.net/attachments/1202909094470492163/1202909161755648010/image.png?ex=65d865f5&is=65c5f0f5&hm=a399dda2c4b1c6caf17d3a0d29bc7dc9c504012ba7a4cc856283ce9dc9a3ebd5&=&format=webp&quality=lossless&width=781&height=601)
+
+# Installation
+## Dependencies
+Requires Python 3.9+
+
+Quick install 
+```bash
+pip install -r requirements.txt
+```
+
+# Data
+**ToolBench**
+
+Refer to [ToolBench](https://github.com/OpenBMB/ToolBench).
+
+**AnyToolBench**
+
+# AnyToolBench Generation
+```
+python data_generation_by_gpt4.py
+```
+
+We provide sample data in the `anytoolbench.json` file.
+
+
+
+# Run AnyTool
+Fill in your OpenAI config and ToolBench key in `config.py`.
+
+Run ToolBench
+```
+python anytool.py --output_dir result/test_instruction/G1_instruction --query_path data/test_instruction/G1_instruction.json --max_api_number 64
+```
+Run AnyToolBench
+```
+python anytool.py --output_dir result/anytoolbench --query_path anytoolbench.json --max_api_number 64
+```
+# AnyToolBench Generation
+```
+python data_generation_by_gpt4.py
+```
+# Acknowledgement
+This repo is built on [ToolBench](https://github.com/OpenBMB/ToolBench).
+
+# Citation
+If you find this project helpful for your research, please consider citing our paper:
+```
+@article{du2024anytool,
+  title={AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale API Calls},
+  author={Du, Yu and Wei, Fangyun and Zhang, Hongyang},
+  journal={arXiv preprint arXiv:2402.04253},
+  year={2024}
+}
+```
--- a/anytool.py
+++ b/anytool.py
--- a/anytoolbench.json
+++ b/anytoolbench.json
--- a/api_database_function.py
+++ b/api_database_function.py
@ -0,0 +1,626 @@
+import json
+from copy import deepcopy
+from autogen.retrieve_utils import TEXT_FORMATS
+# from openai_function_calling import FunctionInferer
+import autogen
+from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
+import chromadb
+import openai
+import random
+import re
+import os
+from tqdm import tqdm
+from openai_utils import call_gpt
+from arguments import parse_args
+from config import *
+if api_type == "azure":
+    from openai import AzureOpenAI as Client
+else:
+    from openai import OpenAI as Client
+client = Client(
+    api_version=api_version,
+    api_key = api_key,
+    azure_endpoint = api_base
+)
+def get_embedding(text, model="text-embedding-ada-002"):
+    if isinstance(text, list):
+        print(len(text))
+        result = []
+        for single_text in tqdm(text):
+            result.append(client.embeddings.create(input = single_text.replace("\n", " "), model=model).data[0].embedding)
+        return result
+    text = text.replace("\n", " ")
+    return client.embeddings.create(input = [text], model=model).data[0].embedding
+args = parse_args()
+output_dir = args.output_dir
+
+# Load the extracted and restructured data from file
+with open('tool_data.json', 'r', encoding='utf-8') as file:
+    database = json.load(file)
+api_details_dict = json.load(open('api_details.json', 'r', encoding='utf-8'))
+category_tool_details_dict = json.load(open('category_tool_details.json', 'r', encoding='utf-8'))
+
+sample_api_number = args.all_api_number
+if sample_api_number == 1000:
+    sampled_api_list = json.load(open('sampled_api_list1000.json', 'r', encoding='utf-8'))
+elif sample_api_number == 5000:
+    sampled_api_list = json.load(open('sampled_api_list5000.json', 'r', encoding='utf-8'))
+elif sample_api_number == 10000:
+    sampled_api_list = json.load(open('sampled_api_list10000.json', 'r', encoding='utf-8'))
+else:
+    sampled_api_list = []
+print('database size ', sum([sum([len(database[category][tool_name]['api_list_names']) for tool_name in database[category].keys()]) for category in database.keys()]))
+# """
+if len(sampled_api_list) > 0:
+    cnt = 0
+    total_cnt = 0
+    total_cnt1 = 0
+    database_copy = deepcopy(database)
+    for category in database_copy.keys():
+        for tool_name, tool_data in database_copy[category].items():
+            total_cnt += len(tool_data["api_list_names"])
+            assert isinstance(tool_data['api_list_names'], list)
+            for api in tool_data["api_list_names"]:
+                total_cnt1 += 1
+                try:
+                    if category+tool_name+api not in sampled_api_list:
+                        database[category][tool_name]["api_list_names"].remove(api)
+                        for api_dict in api_details_dict[category][tool_name]["api_list"]:
+                            if api_dict["name"] == api:
+                                api_details_dict[category][tool_name]["api_list"].remove(api_dict)
+                    else:
+                        cnt += 1
+                except:
+                    pass
+    print('total api number ', total_cnt, total_cnt1)
+    print('total api number after filtering ', cnt)
+# """
+# """
+# Define the query functions
+def query_all_categories() -> list:
+    """query all categories in the database"""
+
+    return random.sample(list(database.keys()), len(database.keys()))
+
+def get_tools_in_category(category_name: str=None) -> list:
+    """query all tools in a specific category"""
+    if category_name is None:
+        return {'Error': 'Category name is required', 'response':''}
+    if category_name not in database:
+        return 'Illegal category name'
+    return list(database[category_name].keys()) if category_name in database else None
+
+def query_all_tools_in_all_categories() -> list:
+    """query all tools in all categories"""
+    return {category: list(tools.keys()) for category, tools in database.items()}
+
+def get_apis_in_tool(category_name: str=None, tool_name: str=None) -> list:
+    """query all apis in a specific tool"""
+    if category_name is None:
+        return {'Error': 'Category name is required', 'response':''}
+    if category_name not in database:
+        return 'Illegal category name'
+    if tool_name not in database[category_name]:
+        return 'Illegal tool name'
+    return database[category_name][tool_name]['api_list_names']
+# def query_api_details(api_name):
+#     if api_name in api_details_dict:
+#         return api_details_dict[api_name]
+#     return None
+def get_api_details(category_name: str=None, tool_name: str=None, api_name: str=None) -> dict:
+    """query the details of a specific api"""
+    if category_name is None:
+        return {'Error': 'Category name is required', 'response':''}
+    if tool_name is None:
+        return {'Error': 'Tool name is required', 'response':''}
+    if api_name is None:
+        return {'Error': 'API name is required', 'response':''}
+    for category, tools in api_details_dict.items():
+        if category != category_name:
+            continue
+        for tool, tool_data in tools.items():
+            if tool != tool_name:
+                continue
+            for api in tool_data["api_list"]:
+                if api["name"] == api_name:
+                    return api
+    return {}
+
+def locate_api(api_name: str=None) -> dict:
+    """query the details of a specific api"""
+    for category, tools in api_details_dict.items():
+        for tool, tool_data in tools.items():
+            for api in tool_data["api_list"]:
+                if api["name"] == api_name:
+                    return {"category_name": category, "tool_name": tool, "api_name": api_name}
+    return 'api not found'
+
+def sample_apis(gt_apis, num=200):
+    categories_origin = database.keys()
+    sampled_categories = random.sample(categories_origin, 5)
+    categories = []
+    tools = []
+    apis = []
+    for api in gt_apis:
+        if api['category_name'] not in categories:
+            categories.append(api['category_name'])
+        if api['tool_name'] not in tools:
+            tools.append(api['tool_name'])
+        if api['api_name'] not in apis:
+            apis.append(api['api_name'])
+    for cate in sampled_categories:
+        if cate in categories:
+            continue
+        categories.append(cate)
+        tools_origin = get_tools_in_category(cate)
+        sampled_tools = random.sample(tools_origin, min(25, len(tools_origin)))
+        tools.extend(sampled_tools)
+        for tool in sampled_tools:
+            apis_origin = get_apis_in_tool(cate, tool)
+            if apis_origin is None:
+                continue
+            sampled_apis = random.sample(apis_origin, min(10, len(apis_origin)))
+            apis.extend(sampled_apis)
+    
+    return categories, tools, apis
+get_api_details_function = {
+    'name': 'get_api_details',
+    'description': 'get the details of a specific api',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'category_name': {'type': 'string'}, 
+            'tool_name': {'type': 'string'}, 
+            'api_name': {'type': 'string'}
+        },
+        'required': ['category_name', 'tool_name', 'api_name']
+    }
+}
+
+locate_api_function = {
+    'name': 'locate_api',
+    'description': 'locate a specific api in the database',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'api_name': {"type": "string"}
+        },
+        'required': ['api_name']
+    }
+}
+
+get_apis_in_tool_function = {
+    'name': 'get_apis_in_tool',
+    'description': 'query all apis in a specific tool',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'category_name': {'type': 'string'}, 
+            'tool_name': {'type': 'string'}
+        },
+        'required': ['category_name', 'tool_name']
+    }
+}
+
+get_tools_in_category_function = {
+    'name': 'get_tools_in_category',
+    'description': 'get all tools in a specific category',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'category_name': {'type': 'string'}
+        },
+        'required': ['category_name']
+    }
+}
+
+
+get_tools_descriptions_function = {
+    'name': 'get_tools_descriptions',
+    'description': 'get the descriptions of some tools in a specific category. Require input to be list of tool names. You should query no more than 10 tools at a time.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'category_name':{'type':'string'}, 
+            'tool_list': {
+                'type': 'array', 
+                'items': {'type': 'string'}
+            }
+        },
+        'required': ['category_name', 'tool_list']
+    }
+}
+
+retrieve_context_function = {
+    'name': 'retrieve_context',
+    'description': 'retrieve the context relevant to a specific query, the context must contain the search_string',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'search_string': {'type': 'string'}
+        },
+        'required': ['search_string']
+    }
+}
+
+check_if_request_solvable_function = {
+    'name': 'check_if_request_solvable',
+    'description': 'check if the current apis are sufficient to solve the query',
+    'parameters': {
+        'type': 'object',
+        'properties': {}
+    }
+}
+
+add_apis_into_api_pool_function = {
+    'name': 'add_apis_into_api_pool',
+    'description': 'add apis to the final api list. required input to be list of dictionaries describing with the keys category_name, tool_name, api_name',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'api_list': {'type': 'null'}
+        },
+        'required': ['api_list']
+    }
+}
+
+remove_apis_function = {'name': 'remove_apis', 'description': 'remove apis from the final api list. require input to be list of dictionaries describing with the keys category_name, tool_name, api_name', 'parameters': {'type': 'object', 'properties': {'api_list': {'type': 'null'}},'required': ['api_list']}}
+
+def query_all_tool_info(category:str, tools: list) -> list:
+    """query all tool info of a list of tools"""
+    if tools is None:
+        return {'Error': 'Tool list is required', 'response':''}
+    if not isinstance(tools, list):
+        return {'Error': 'Tools must be a list', 'response':''}
+    res = {}
+    all_tools = api_details_dict[category]
+    
+    for tool in tools:
+        if tool not in all_tools:
+            return {'Error': f'Tool name {tool} not found', 'response':''}
+        res[tool] = all_tools[tool]
+        res[tool]['description'] = category_tool_details_dict[category][tool]['tool_description']
+    return res
+
+def query_all_tool_info_in_category(cate):
+    """query all category tool"""
+    return category_tool_details_dict[cate]
+
+def get_tool_description(category_name: str, tool_name: str) -> dict:
+    """get the description of a specific tool"""
+    if category_name not in category_tool_details_dict:
+        return 'category name not found'
+    if tool_name not in category_tool_details_dict[category_name]:
+        return 'tool name not found'
+    return category_tool_details_dict[category_name][tool_name]['tool_description']
+
+def get_tools_descriptions(category_name: str, tool_list: str) -> dict:
+    """query the details of a tool list"""
+    if category_name not in category_tool_details_dict:
+        return {'Error': 'category name not found', 'response':''}
+    if not isinstance(tool_list, list):
+        return {'Error': 'tool_list must be a list', 'response':''}
+    if isinstance(tool_list, str):
+        tool_list = eval(tool_list)
+    for tool_name in tool_list:
+        if tool_name not in category_tool_details_dict[category_name]:
+            return f'tool name {tool_name} not found'
+    return {tool_name: category_tool_details_dict[category_name][tool_name]['tool_description'] for tool_name in tool_list}
+
+def get_response_example(api_name: str) -> str:
+    """get the response example of a specific api"""
+    api_details = get_api_details(api_name)
+    if api_details is None:
+        return 'api name not found'
+    # return api_details['response_example']
+split_function = lambda x: x.split("}")
+# # 1. create an RetrieveAssistantAgent instance named "assistant"
+# assistant = RetrieveAssistantAgent(
+#     name="assistant", 
+#     system_message="You are a helpful assistant. You should help the user find the relevant apis for their tasks. Return the category_name, tool_name and api_name exactly as in your context. Do not make up them",
+#     llm_config={
+#         # "request_timeout": 600,
+#         "seed": 42,
+#         "config_list": config_list,
+#     },
+# )
+# config_list = autogen.config_list_from_json(
+#     env_or_file="OAI_CONFIG_LIST",
+#     file_location=".",
+#     filter_dict={
+#         "model": {
+#             "gpt-4",
+#             "gpt4",
+#             "gpt-4-32k",
+#             "gpt-4-turbo",
+#             "gpt-4-32k-0314",
+#             "gpt-35-turbo",
+#             "gpt-3.5-turbo",
+#         }
+#     },
+# )
+
+# assert len(config_list) > 0
+# print("models to use: ", [config_list[i]["model"] for i in range(len(config_list))])
+
+
+# Print the file formats autogen accepts for `docs_path`, i.e. those that
+# can be stored in a vector database instance.
+from autogen.retrieve_utils import TEXT_FORMATS
+
+print("Accepted file formats for `docs_path`:")
+print(TEXT_FORMATS)
+
+
+# Splits text on closing braces; within this section it is only referenced
+# by the commented-out "custom_split_function" entry below.
+split_function = lambda x: x.split("}")
+from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
+# Embeddings are computed by this module's get_embedding helper.
+embedding_function = get_embedding
+# Retrieval agent over data_for_retrieval.json; retrieve_context() uses it
+# to build the context message for a query.
+ragproxyagent = RetrieveUserProxyAgent(
+    name="ragproxyagent",
+    human_input_mode="NEVER",
+    max_consecutive_auto_reply=10,
+    retrieve_config={
+        "task": "qa",
+        "docs_path": "data_for_retrieval.json",
+        "chunk_token_size": 1000,
+        "model": model_name,
+        # "client": chromadb.PersistentClient(path="/tmp/chromadb"),
+        # "embedding_model": "text-embedding-ada-002",
+        # "embedding_model": "all-mpnet-base-v2",
+        "embedding_function": embedding_function,
+        "get_or_create": True,  # set to True if you want to recreate the collection
+        # "custom_split_function": split_function,
+        "collection_name": "toolbench",
+        "must_break_at_empty_line": False
+    },
+)
+def summarize_context(query, context):
+    messages = [{
+        "role": "system",
+        "content": """You are a helpful assistant. Given a task description, you should help the user find  the relevant APIs in the context. Each API consists of category_name, tool_name and api_name. Do not make up them. 
+        You should call Finish function with the api list. Each element of the list is a dictionary with keys 'category_name', 'tool_name', 'api_name'. Remember, you must call Finish function at one step.""",
+    },
+        {"role": "user", 
+        "content": f"Task description: {query}. Can you help me find the relevant category_names, tool_names, and api_names in following context: {context}"}
+        ]
+    functions = [finish_function]
+    for i in range(5):
+        response = call_gpt(
+                        messages=messages,
+                        functions=functions
+                    )
+        tool_calls = response.choices[0].message.tool_calls
+        print('Thought:', response.choices[0].message.content)
+        if tool_calls:
+            for tool_call in tool_calls:
+                function_name = tool_call.function.name
+                function_args = tool_call.function.arguments
+                if function_name.lower() == 'finish':
+                    try:
+                        api_list = json.loads(function_args)['api_list']
+                        if api_list is None:
+                            continue
+                    except:
+                        continue
+                        
+                else:
+                    continue
+                return api_list
+        else:
+            print('Thought:', response.choices[0].message.content)
+            continue
+    return []
+# assistant.reset()
+finish_function = {
+    'name': 'Finish',
+    'description': 'Finish with the api list. required input to be list of dictionaries describing with the keys category_name, tool_name, api_name',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'api_list': {'type': 'null'}
+        },
+        'required': ['api_list']
+    }
+}
+def retrieve_context(query, search_string=None):
+    """retrieve the context relevant to a specific query, the context must contain the search_string"""
+    print('search_string:', search_string)
+    # try:
+    context = ragproxyagent.generate_init_message(problem=query,n_results=64, search_string=search_string)
+    # except:
+    #     return 'No context found'
+    # if 'Context is' not in context:
+    #     return 'No context found'
+    return summarize_context(query, context.split('Context is')[1])
+
+def is_iterator(obj):
+    return hasattr(obj, '__iter__') and hasattr(obj, '__next__')
+
+def standardize(string):
+    """Turn a free-form name into a lowercase, underscore-separated identifier.
+
+    Steps:
+      1. every character outside the regex class below becomes "_" (the class
+         keeps \\u4e00-\\u9fa5, ASCII letters, digits, "_" and, because the
+         carets inside the class are literal, "^");
+      2. runs of "_" are collapsed to one and the result is lowercased;
+      3. leading and trailing "_" are removed;
+      4. a result starting with a digit is prefixed with "get_".
+
+    An input that reduces to nothing yields the empty string.
+    """
+    disallowed = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
+    replaced = disallowed.sub("_", string)
+    collapsed = re.sub(r"(_)\1+", "_", replaced).lower()
+    trimmed = collapsed.strip("_")
+    if not trimmed:
+        return trimmed
+    if trimmed[0].isdigit():
+        return "get_" + trimmed
+    return trimmed
+
+# For pipeline environment preparation
+def get_white_list(tool_root_dir):
+    """Build the tool white list from the JSON files under ``tool_root_dir``.
+
+    The directory is expected to look like ``<tool_root_dir>/<category>/<tool>.json``.
+    Non-directory entries at the category level and non-``.json`` files are ignored.
+
+    Returns:
+        dict mapping ``standardize(tool_name)`` to
+        ``{"description": <tool_description>, "standard_tool_name": <file stem>}``.
+
+    A file without a usable ``"tool_name"`` entry is reported on stdout and
+    skipped. A missing ``"tool_description"`` still raises ``KeyError``.
+    """
+    white_list = {}
+    for cate in tqdm(os.listdir(tool_root_dir)):
+        cate_dir = os.path.join(tool_root_dir, cate)
+        if not os.path.isdir(cate_dir):
+            continue
+        for file in os.listdir(cate_dir):
+            if not file.endswith(".json"):
+                continue
+            # The file stem (text before the first dot) is the standardized tool name.
+            standard_tool_name = file.split(".")[0]
+            tool_path = os.path.join(cate_dir, file)
+            with open(tool_path) as reader:
+                js_data = json.load(reader)
+            try:
+                origin_tool_name = js_data["tool_name"]
+            except (KeyError, TypeError, IndexError):
+                # Skip the file: continuing would reuse the previous tool's name
+                # (or hit an unbound variable on the first file).
+                print('#'*100)
+                print('error:', 'js_data', tool_path)
+                continue
+
+            white_list[standardize(origin_tool_name)] = {"description": js_data["tool_description"], "standard_tool_name": standard_tool_name}
+    return white_list
+
+# Root directory of the per-category tool JSON files (relative path).
+tool_root_dir = "data/toolenv/tools"
+# Built once at import time; get_white_list scans tool_root_dir on disk.
+white_list = get_white_list(tool_root_dir)
+
+def change_name(name):
+    """Return ``name`` prefixed with ``is_`` when it is one of a fixed set of words.
+
+    Any other name is returned unchanged.
+    """
+    reserved = ("from", "class", "return", "false", "true", "id", "and")
+    return "is_" + name if name in reserved else name
+
+def contain(candidate_list, white_list):
+    """Look up every candidate in ``white_list``.
+
+    Returns the list of matching white-list values (in candidate order), or
+    ``False`` as soon as one candidate is not a key of ``white_list``.
+    An empty ``candidate_list`` yields an empty list.
+    """
+    matched = []
+    for candidate in candidate_list:
+        if candidate in white_list:
+            matched.append(white_list[candidate])
+        else:
+            return False
+    return matched
+
+
+def fetch_api_json(api_list):
+    """Resolve requested APIs against the tool JSON files on disk.
+
+    Each item of ``api_list`` must provide ``category_name``, ``tool_name`` and
+    ``api_name``. The tool file is read from
+    ``<tool_root_dir>/<category_name>/<standardized tool_name>.json`` and its
+    ``api_list`` is searched for an entry whose standardized name equals the
+    standardized requested ``api_name``.
+
+    Returns:
+        (api_list_new, index_list): the API descriptions that were found and,
+        in parallel, the index of each one in the input ``api_list``.
+        Requests with no match are printed together with the names available
+        in that tool and are left out of both lists.
+
+    Raises whatever ``open``/``json.load`` raise when the tool file is missing
+    or malformed.
+    """
+    api_list_new = []
+    index_list = []
+    for k, item in enumerate(api_list):
+        cate_name = item["category_name"]
+        tool_name = standardize(item["tool_name"])
+        api_name = change_name(standardize(item["api_name"]))
+        tool_path = os.path.join(tool_root_dir, cate_name, tool_name + ".json")
+        # Context manager so the handle is closed even if parsing fails.
+        with open(tool_path, "r") as reader:
+            tool_json = json.load(reader)
+        api_dict_names = []
+        for api_dict in tool_json["api_list"]:
+            api_dict_names.append(api_dict["name"])
+            if change_name(standardize(api_dict["name"])) != api_name:
+                continue
+            api_list_new.append({
+                "category_name": cate_name,
+                "api_name": api_dict["name"],
+                "api_description": api_dict["description"],
+                "required_parameters": api_dict["required_parameters"],
+                "optional_parameters": api_dict["optional_parameters"],
+                "tool_name": tool_json["tool_name"],
+            })
+            index_list.append(k)
+            break
+        else:
+            # No API of this tool matched: show what was requested vs. what was seen.
+            print(api_name, api_dict_names)
+    return api_list_new, index_list
+    
+def _build_param_prompt(para, map_type, description_max_length):
+    """Return ``(name, schema)`` for one parameter description dict."""
+    name = change_name(standardize(para["name"]))
+    prompt = {
+        # Unknown parameter types fall back to "string".
+        "type": map_type.get(para["type"], "string"),
+        "description": para["description"][:description_max_length],
+    }
+    # A missing or empty default means there is no example value to show.
+    default_value = para.get("default", "")
+    if len(str(default_value)) != 0:
+        prompt["example_value"] = default_value
+    return name, prompt
+
+
+def api_json_to_openai_json(api_json,standard_tool_name):
+    """Convert one API description into an OpenAI-style function schema.
+
+    Args:
+        api_json: dict with ``api_name``, ``api_description``, ``category_name``
+            and, optionally, ``required_parameters`` / ``optional_parameters``
+            (lists of dicts with ``name``, ``type``, ``description`` and
+            optionally ``default``).
+        standard_tool_name: standardized name of the owning tool; it is
+            appended to the function name as ``_for_<standard_tool_name>``.
+
+    Returns:
+        (schema, category_name, pure_api_name) where ``schema`` has the keys
+        ``name``, ``description`` and ``parameters``.
+
+    Optional parameters are processed whether or not the API has required
+    parameters; previously they were dropped when it had none.
+    """
+    description_max_length = 256
+    map_type = {
+        "NUMBER": "integer",
+        "STRING": "string",
+        "BOOLEAN": "boolean"
+    }
+
+    pure_api_name = change_name(standardize(api_json["api_name"]))
+    # Keep only the last 64 characters of the composed function name.
+    function_name = (pure_api_name + f"_for_{standard_tool_name}")[-64:]
+
+    description = f"This is the subfunction for tool \"{standard_tool_name}\", you can use this tool."
+    api_description = api_json["api_description"].strip()
+    if api_description != "":
+        # Mentions of the raw API name are rewritten to the exposed function name.
+        tuncated_description = api_description.replace(api_json['api_name'], function_name)[:description_max_length]
+        description = description + f"The description of this function is: \"{tuncated_description}\""
+
+    properties = {}
+    required = []
+    optional = []
+    for para in api_json.get("required_parameters") or []:
+        name, prompt = _build_param_prompt(para, map_type, description_max_length)
+        properties[name] = prompt
+        required.append(name)
+    for para in api_json.get("optional_parameters") or []:
+        name, prompt = _build_param_prompt(para, map_type, description_max_length)
+        properties[name] = prompt
+        optional.append(name)
+
+    templete = {
+        "name": function_name,
+        "description": description,
+        "parameters": {
+            "type": "object",
+            "properties": properties,
+            "required": required,
+            "optional": optional,
+        }
+    }
+    return templete, api_json["category_name"],  pure_api_name
+
+# Lowercase words/phrases of apology or inability. Their consumer is not visible
+# in this part of the file (presumably used to flag refusal-like answers — TODO confirm).
+exclusion_words = ["sorry", "apologize", "apology", "unfortunately", "couldn't", "could not", "can't", "cannot", 'unable', 'regret', 'not successfully']
+
+if __name__ == "__main__":
+    # Manual smoke test: retrieve and print the summarized context for one sample query.
+    qa_problem = "I'm interested in buying NFTs and would like to know the order-related information. Could you provide me with the fee rate, base token, fee token, and lower limit using the GetOrderInfo API? Additionally, I would like to know the balance of a specific stark key and asset ID using the Balanceofstark_keyandasset_id API."
+    print(retrieve_context(qa_problem))
+    # print(retrieve_context(qa_problem, search_string="GetOrderInfo"))
+    # print(get_apis_in_tool('Search', 'Amazon Search'))
--- a/arguments.py
+++ b/arguments.py
@ -0,0 +1,30 @@
+import argparse
+def parse_args():
+    """Parse the command-line options, print a short summary and return the namespace."""
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_table = (
+        # String options
+        ("--query_path", dict(type=str, default='', help="Path to the query data")),
+        ("--output_dir", dict(type=str, default='', help="Path for the output file")),
+        ("--model", dict(type=str, default='32k', help="openai model name")),
+        ("--solver", dict(type=str, default='dfs', help="solver")),
+        # Integer options and boolean switches
+        ("--max_api_number", dict(type=int, default=64, help="Maximum number of API calls")),
+        ("--check_solvable", dict(action='store_true', default=False, help="check solvable")),
+        ("--recheck_solved", dict(action='store_true', default=False, help="check solvable")),
+        ("--include_unsolvable", dict(action='store_true', default=False, help="whether skip unsolvable")),
+        ("--use_original_prompt", dict(action='store_true', default=False, help="whether use original prompt")),
+        ("--leaf_tool_number", dict(type=int, default=5, help="Maximum number of leaf tools")),
+        ("--all_api_number", dict(type=int, default=17000, help="Total number of API calls")),
+    )
+
+    parser = argparse.ArgumentParser(description="Process paths and numbers.")
+    for flag, options in option_table:
+        parser.add_argument(flag, **options)
+
+    # Parse the command line
+    parsed = parser.parse_args()
+
+    # Echo the main settings
+    print(f"Query Path: {parsed.query_path}")
+    print(f"Output Path: {parsed.output_dir}")
+    print(f"OpenAI Model: {parsed.model}")
+    print(f"Maximum API Number: {parsed.max_api_number}")
+    print(f"All API Number: {parsed.all_api_number}")
+    return parsed
--- a/check_solved.py
+++ b/check_solved.py
@ -0,0 +1,280 @@
+from toolbench.tooleval.eval_pass_rate import compute_pass_rate, write_results, get_steps, load_registered_automatic_evaluator
+import json
+from concurrent.futures import ThreadPoolExecutor,as_completed
+import argparse
+import os
+from tqdm import tqdm
+import random
+from toolbench.tooleval.evaluators.registered_cls.rtl import AnswerStatus, TaskStatus, AnswerPass
+from toolbench.tooleval.convert_to_answer_format import process_invalid_data, process_valid_data
+import numpy as np
+# Directory component of this script's path, as given by __file__.
+abs_dir = os.path.split(__file__)[0]
+def parse_args():
+    """Parse the evaluation script's command-line options and return the namespace."""
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_table = (
+        ('--save_path', dict(type=str, default="", required=False, help='result save path')),
+        ('--reference_model', dict(type=str, default="", required=False, help='model predictions path')),
+        ('--evaluator', dict(type=str, default="tooleval_gpt-3.5-turbo_default", required=False, help='which evaluator to use.')),
+        ('--max_eval_threads', dict(type=int, default=20, required=False, help='max threads nums')),
+        ('--evaluate_times', dict(type=int, default=7, required=False, help='how many times to predict with the evaluator for each solution path.')),
+        ("--query_path", dict(type=str, default='', help="Path to the query directory")),
+        ("--output_dir", dict(type=str, default='', help="Path for the output file")),
+        ("--check_solvable", dict(action='store_true', default=False, help="check solvable")),
+        ("--recheck_solved", dict(action='store_true', default=False, help="check solvable")),
+        ("--include_unsolvable", dict(action='store_true', default=False, help="whether skip unsolvable")),
+        ("--use_original_prompt", dict(action='store_true', default=False, help="whether use original prompt")),
+        ("--model", dict(type=str, default='32k', help="openai model name")),
+        ("--solver", dict(type=str, default='dfs', help="solver")),
+        ("--leaf_tool_number", dict(type=int, default=5, help="Maximum number of leaf tools")),
+        # Integer options
+        ("--max_api_number", dict(type=int, default=64, help="Maximum number of API calls")),
+        ("--all_api_number", dict(type=int, default=17000, help="Total number of API calls")),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, options in option_table:
+        parser.add_argument(flag, **options)
+    return parser.parse_args()
+# Parsed at import time; the evaluator pool below is sized from these options.
+args = parse_args()
+# Pool of args.max_eval_threads evaluator instances; compute_pass_rate picks one at random per call.
+evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join('toolbench/tooleval','evaluators')) for _ in range(args.max_eval_threads)]
+def compute_pass_rate(query_id, example, task_solvable=None, task_solvable_reason=None):
+    """Run one evaluator pass over ``example`` and return its verdict.
+
+    Note: this definition replaces the ``compute_pass_rate`` name imported
+    from ``toolbench.tooleval.eval_pass_rate`` at the top of the file.
+
+    Args:
+        query_id: identifier returned unchanged as the first tuple element.
+        example: dict with at least ``query``, ``available_tools`` and ``answer``.
+        task_solvable: pre-assigned solvability label; when ``None`` the
+            evaluator is asked to judge it.
+        task_solvable_reason: explanation that accompanies ``task_solvable``.
+
+    Returns:
+        ``(query_id, task_solvable, is_solved, label, reason, not_hallucinate, tokens)``
+        where ``label`` is ``"passed"`` or ``"failed"``. When the final step
+        has no ``Finish`` call the function returns early with
+        ``TaskStatus.Solvable``, ``AnswerStatus.Unsolved``, ``"failed"``,
+        ``"No answer"`` and ``0`` tokens.
+    """
+    evaluator = random.choice(evaluators)
+    try:
+        # NOTE(review): the result of check_has_hallucination is stored under the
+        # name not_hallucinate — confirm the intended polarity.
+        not_hallucinate = evaluator.check_has_hallucination(
+            example['available_tools'],
+            example['answer']
+        )
+    except Exception:
+        # Best effort: a failing hallucination check must not abort the evaluation.
+        not_hallucinate = True
+    answer_steps, final_step = get_steps(example)
+
+    if "'name': 'Finish'" not in final_step:
+        return query_id, TaskStatus.Solvable, AnswerStatus.Unsolved, "failed", "No answer", not_hallucinate, 0
+
+    task_description = {
+        'query': example['query'],
+        'available_tools': example['available_tools'],
+    }
+
+    is_solved, is_solved_reason, tokens = evaluator.check_is_solved(
+        dict(task_description),
+        example['answer'],
+        return_reason=True
+    )
+    # Anything other than an explicit "Solved" counts as not solved.
+    is_solved_flag = is_solved == AnswerStatus.Solved
+
+    if task_solvable is None:
+        task_solvable, task_solvable_reason, _ = evaluator.check_task_solvable(
+            dict(task_description),
+            has_been_solved=is_solved_flag,
+            return_reason=True
+        )
+
+    is_passed, _ = evaluator.is_passed(
+        dict(task_description),
+        example['answer'],
+        answer_status=is_solved,
+        task_status=task_solvable
+    )
+
+    reason = f"Is solved: {is_solved_reason}\nTask solvable: {task_solvable_reason}"
+    if is_passed == AnswerPass.Passed:
+        label = "passed"
+    elif is_passed == AnswerPass.Failed:
+        label = "failed"
+    else:
+        # Unsure verdicts are resolved by a coin flip.
+        label = "passed" if random.random() < 0.5 else "failed"
+    return query_id, task_solvable, is_solved, label, reason, not_hallucinate, tokens
+# output_dir = f'result1/generated_solve_given_api_solvable_multicat_complex_r1/stack_reassign_solve_results_turbo_r16'
+if __name__ == '__main__':
+    # For each test set: load the predictions, score every query
+    # args.evaluate_times times, then aggregate pass rate and token usage.
+    # reassign = False
+    test_sets = ["G1_instruction", "G1_tool", "G1_category", "G2_instruction", "G2_category", "G3_instruction"]
+    # test_sets = ["G1_tool", "G1_category", "G2_instruction", "G2_category", "G3_instruction"]
+    # test_sets = ['custom_data']
+    # test_sets = ['G1_instruction']
+    # test_sets = [ "G1_category","G1_instruction", "G1_tool", "G2_instruction"]
+    # test_sets = ['G2_instruction', 'G3_instruction']
+    # Query ids listed in unsolvable.json are skipped everywhere below.
+    unsolvable_list = json.load(open("unsolvable.json", "r", encoding="utf-8"))
+    # unsolvable_list = []
+    pass_rate_list = []
+    average_tokens_list = []
+    for test_set in test_sets:
+        total_tokens = 0
+        # query_dir = f'data/test_instruction/{test_set}'
+        # output_dir = f'result2/test_instruction/{test_set}'
+        # output_dir = f'result0111/turbo/test_instruction/{test_set}_r1'
+        # output_dir = f'data/reproduction_data/model_predictions/chatgpt_dfs/{test_set}'
+        # output_dir = f'data/reproduction_data/model_predictions/toolllama_dfs/{test_set}'
+        output_dir = f'data/reproduction_data/model_predictions/toolllama_dfs_retriever/{test_set}'
+        # output_dir = f'data/reproduction_data/model_predictions/gpt-4-0613_dfs/{test_set}'
+        # output_dir = f'data/reproduction_data/model_predictions/chatgpt_cot/{test_set}'
+        # output_dir = f'data/reproduction_data/model_predictions/gpt-4-0613_cot/{test_set}'
+        # 33.5&33.5&41.0&23.5&29.5&3.0 27.3
+        # output_dir = f'result0111/32k/test_instruction/{test_set}_r1'
+        # output_dir = f'result0111/32k/max32/test_instruction/{test_set}_r1'
+        # output_dir = f'result_final/toolbench/{test_set}'
+        # output_dir = f'result0126/toolbench/{test_set}'
+        # output_dir = f'repos/toolbench_ori/{test_set}_filtered/gpt4_retriever_dfs'
+        # output_dir = f'repos/toolbench_ori/{test_set}_filtered/toolllama_retriever_ada_dfs'
+        # output_dir = f'data/reproduction_data/model_predictions/gpt-35-turbo_dfs/{test_set}'
+        # output_dir = 'result_final/custom_data/gpt_dfs_retriever'
+        # output_dir = 'result_final/custom_data/toolllama_dfs_retriever'
+        # output_dir = 'result_final/custom_data/gpt4_gt_dfs'
+        # output_dir = 'result0111/32k_aus/custom_data'
+        # Paths containing 'reproduction' or 'ori' are read as {id}_{method}.json files;
+        # every other directory is read as {index}.json plus a per-query DFS file.
+        if 'reproduction' in output_dir or 'ori' in output_dir:
+            reassign = False
+        else:
+            reassign = True
+        # reassign = False
+        if reassign:
+            test_ids = list(range(200))
+        else:
+            test_ids = json.load(open(f'data/test_query_ids/{test_set}.json', 'r', encoding='utf-8'))
+        if 'cot' in output_dir:
+            method = 'CoT@1'
+        else:
+            method = 'DFS_woFilter_w2'
+        # Test sets without a prediction directory are skipped silently.
+        if not os.path.exists(output_dir):
+            continue
+        # evaluation_output_dir = f'result2/test_instruction/{test_set}/pass_rate_result_reeval_32k'
+        # os.system(f'mv {evaluation_output_dir} {output_dir}')
+        # evaluation_output_dir = f'result2/test_instruction/{test_set}/pass_rate_result_35'
+        # evaluation_output_dir = f'{output_dir}/pass_rate_result_reeval_32k_3times_nounsure_aus_r1'
+        # evaluation_output_dir = f'{output_dir}/pass_rate_result_reeval_32k_r1'
+        # final
+        evaluation_output_dir = f'{output_dir}/pass_rate_result_reeval_32k_3times'
+        # evaluation_output_dir = f'{output_dir}/pass_rate_result_35'
+        # evaluation_output_dir = f'{output_dir}/pass_rate_result_reeval_32k'
+        # continue
+        os.makedirs(evaluation_output_dir, exist_ok=True)
+        # label_cnt = {}
+        # answer_dict = {}
+        # Resume support: reuse the counts and answers saved by an earlier run.
+        if os.path.exists(f"{evaluation_output_dir}/label_cnt.json"):
+            label_cnt = json.load(open(f"{evaluation_output_dir}/label_cnt.json", "r", encoding="utf-8"))
+        else:
+            label_cnt = {}
+        future = []
+        if os.path.exists(f"{evaluation_output_dir}/answer_dict.json"):
+            answer_dict = json.load(open(f"{evaluation_output_dir}/answer_dict.json", "r", encoding="utf-8"))
+        else:
+            answer_dict = {}
+        # result_data = json.load(open(f'data/reproduction_data/model_predictions/gpt-4-0613_dfs/{test_set}.json', 'r', encoding='utf-8'))
+        # query_id -> converted example, looked up again when results come back.
+        referenced_examples = {}
+
+        with ThreadPoolExecutor(args.max_eval_threads) as pool:
+            for i in test_ids:
+                # print(i)
+                if reassign:
+                    try:
+                        # print(f'{output_dir}/{i}.json')
+                        data = json.load(open(f'{output_dir}/{i}.json', 'r', encoding='utf-8'))
+                    except:
+                        continue
+                    query_id = data['query_id']
+                    if int(query_id) in unsolvable_list:
+                        continue
+                else:
+                    query_id = i
+                    if int(query_id) in unsolvable_list:
+                        continue
+                    if 'chatgpt' in output_dir and 'cot' not in output_dir:
+                        data = json.load(open(f'{output_dir}/{i}_ChatGPT_{method}.json', 'r', encoding='utf-8'))
+                    elif 'chatgpt' in output_dir:
+                        data = json.load(open(f'{output_dir}/{i}_{method}.json', 'r', encoding='utf-8'))
+                    else:
+                        data = json.load(open(f'{output_dir}/{i}_{method}.json', 'r', encoding='utf-8'))
+                        
+                # Token usage is accumulated even for queries skipped just below.
+                if not reassign:
+                    total_tokens += data['answer_generation']['total_tokens'] 
+                else:
+                    if 'total_tokens' in data:
+                        total_tokens += data['total_tokens']
+                # Already evaluated in a previous run (keys loaded from JSON are strings).
+                if str(query_id) in label_cnt:
+                    continue
+                if reassign:
+                    # print(i)
+                    if 'last_solve_time' not in data:
+                        try:
+                            data_dict = json.load(open(f'{output_dir}/{i}_DFS_woFilter_w2.json', 'r', encoding='utf-8'))
+                        except:
+                            continue
+                    else:
+                        last_solve_time = data['last_solve_time']
+                        data_dict = json.load(open(f'{output_dir}/{i}_{last_solve_time}_DFS_woFilter_w2.json', 'r', encoding='utf-8'))
+                else:
+                    data_dict = data
+                if not data_dict['answer_generation']['valid_data']:
+                    answer_dict[i] = process_invalid_data(method,data_dict)
+                else:
+                    answer_dict[i] = process_valid_data(method,data_dict['answer_generation'])
+                example = answer_dict[i]
+                # query_id = i
+                # example['available_tools'] = query_data[str(query_id)]['available_tools']
+                referenced_examples[query_id] = example
+                # Submit the same example evaluate_times times; the votes are tallied below.
+                for _ in range(args.evaluate_times):
+                    future.append(pool.submit(
+                        compute_pass_rate,
+                        query_id,
+                        example,
+                        'Solvable',
+                        'Task solvable human label'
+                    ))
+            for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+                query_id, task_solvable, is_solved, machine_label, reason, not_hallucinate, tokens = thd.result()
+                example = referenced_examples[query_id]
+                query = example["query"]
+                tool_names = []
+                for tool_dict in example["available_tools"]:
+                    tool_name = tool_dict["name"]
+                    tool_names.append(tool_name)
+                answer_steps, final_step = get_steps(example)
+                if query_id not in label_cnt:
+                    label_cnt[query_id] = {"passed":0, "failed":0, "unsure":0}
+                if machine_label == "passed":
+                    label_cnt[query_id]["passed"] += 1
+                elif machine_label == "failed":
+                    label_cnt[query_id]["failed"] += 1
+                else:
+                    label_cnt[query_id]["unsure"] += 1
+                # The descriptive fields below are overwritten by each completed vote.
+                label_cnt[query_id]["query"] = query
+                label_cnt[query_id]["task_solvable"] = str(task_solvable)
+                label_cnt[query_id]["tool_names"] = tool_names
+                label_cnt[query_id]["answer_steps"] = answer_steps
+                label_cnt[query_id]["final_step"] = final_step
+                label_cnt[query_id]["is_solved"] = str(is_solved)
+                label_cnt[query_id]["reason"] = reason
+                label_cnt[query_id]["not_hallucinate"] = not_hallucinate
+                # Checkpoint after every completed evaluation.
+                json.dump(label_cnt, open(f"{evaluation_output_dir}/label_cnt.json", "w"), ensure_ascii=False, indent=4)
+            filename = f"{evaluation_output_dir}/label_cnt.csv"
+            write_results(filename, 'result', label_cnt)
+            pass_rate = 0
+            total_num = 0
+            print('#'*100)
+            for query_id in label_cnt:
+                if int(query_id) in unsolvable_list:
+                    continue
+                # Majority vote; a tie counts as passed.
+                if label_cnt[query_id]["failed"] <= label_cnt[query_id]["passed"]:
+                    pass_rate += 1
+                # if label_cnt[query_id]["unsure"] > 0:
+                #     print('unsure')
+                total_num += 1
+            # NOTE(review): raises ZeroDivisionError when no query was counted.
+            pass_rate /= total_num
+            pass_rate_list.append(pass_rate)
+            average_tokens_list.append(total_tokens/total_num)
+            print(f"Pass rate: {str(pass_rate)} total num {total_num} average tokens {total_tokens/total_num} {test_set}")
+            json.dump(answer_dict, open(f"{evaluation_output_dir}/answer_dict.json", "w"), ensure_ascii=False, indent=4)
+        # Running summary over the test sets processed so far: per-set values joined by '&', then their mean.
+        print('&'.join([str(round(x*100,1)) for x in pass_rate_list]),round(np.mean(pass_rate_list)*100,1))
+        print('&'.join([str(round(x,1)) for x in average_tokens_list]),round(np.mean(average_tokens_list),1))
+    
+    
+
--- a/config_example.py
+++ b/config_example.py
@ -0,0 +1,7 @@
+# Example configuration: every value except api_type is left blank here.
+# NOTE(review): presumably meant to be copied to a local config.py (which .gitignore excludes) — confirm.
+api_version = ""
+model_name = ""
+api_key = ""
+api_base = ""
+api_type = "azure"
+toolbench_key = ""
+    
--- a/custom_query_data0129.json
+++ b/custom_query_data0129.json
--- a/data_for_retrieval.json
+++ b/data_for_retrieval.json
--- a/data_generation_by_gpt4.py
+++ b/data_generation_by_gpt4.py
@ -0,0 +1,763 @@
+#encoding:utf-8
+
+import openai
+import os
+from typing import List, Dict, Any
+import re
+from tqdm import tqdm
+import time
+import requests
+from termcolor import colored
+import random
+from api_database_function import *
+from server import get_rapidapi_response
+import tiktoken
+from copy import deepcopy
+from verifier import check_task_complete, check_task_solved
+from prompt_template import FORMAT_INSTRUCTIONS_DATA_GENERATION
+from openai_utils import call_gpt
+# Tokenizer used for the token counts that CoT_Runner.run appends to token_count_in.txt / token_count_out.txt.
+# Each assert is a round-trip sanity check of the encoding that was just loaded.
+enc = tiktoken.get_encoding("cl100k_base")
+assert enc.decode(enc.encode("hello world")) == "hello world"
+# To get the tokeniser corresponding to a specific model in the OpenAI API:
+# (this rebinds `enc`, so the gpt-4 encoding is the one in effect for the rest of the module)
+enc = tiktoken.encoding_for_model("gpt-4")
+# enc = tiktoken.get_encoding("cl100k_base")
+assert enc.decode(enc.encode("hello world")) == "hello world"
+token_cnt = 0
+# Substrings looked for in a service response's "error" field; a match makes CoT_Runner.call_api retry the
+# request with the next entry of its rapidapi key list.
+error_list = ['Too many requests error...', 'Rate limit...', 'Unsubscribed', 'Unauthorized', 'not working error...', 'Quota','quota', 'Blocked', 'Rate limit', 'Unauthorized error']
+
+# def retrieve_context(search_string=None):
+#     """retrieve the context containing the search_string"""
+#     context = ragproxyagent.generate_init_message(problem=query,n_results=5, search_string=search_string)
+#     return summarize_context(query, context.split('Context is')[1][:8000])
+# To help you explore the api database, you can leverage the retrieve_context meta function, which retrieves the relevant context in the database based on your query. And you can specify 
+# the search_string that the context must contain. The retrieved context may contain the potential category_names, tool_names and api_names you are interested in. 
+
+
+
+# User prompt for extending a previously generated query. CoT_Runner.run substitutes the literal
+# "{categories}" placeholder with str.replace (not str.format) and appends the previous query and answer.
+FORMAT_INSTRUCTIONS_CONTINUAL_DATA_GENERATION = """
+You have access to a database of tools and functions (apis). Function is same to api in our context. 
+You need to help me extend a user query which can be answered by the apis in the database.
+The database has the categorites of {categories},
+You can use the meta functions to retrieve the relevant functions. For example, you can use the meta
+function query_tools_in_category to retrieve the available tools of a specific category. Then, you can use the meta
+function query_apis_in_tool to retrieve the api list of a specific tool. 
+If you are unsure about the functioinality of some tools, you can use the meta api query_tool_details to retrieve the details of a specific tool. 
+If you are unsure about the functioinality of some apis, you can use the meta api get_api_details to retrieve the details of a specific api.
+After you get some functions, use the add_apis function to add the functions you find to the available function list which you can call them later.
+Please note that the original function names will be transformed to a standard form, 
+so you should not use the original function names when calling.
+You must synthsize some parameters to test each of these functions!
+You can try multiple times with different parameters.
+Then, you should use the function responses for formulating the query.
+If you find that some functions are not valid now or cannot be used to form a query, use remove_apis to remove the apis from the available api list.
+Please make sure that the extended query contains all the information or parameters needed to call the apis. 
+Do not use ambiguous words like 'a specific', 'my friend'. 
+You should mention the detailed information. The query should be related to the test results of the functions 
+but it should not  mention the tool names or api names. Make sure that extended part can be answered by the current functions.
+If you finished extending the query, 
+call the Finish function with the final extened query and the corresponding extended answer. 
+You must include the original query in the extended query.
+The answer should directly answer the query instead of giving a plan.
+You should call the initial meta functions no more than 20 times.
+The extended part should consist of a minimum of thirty words.
+"""
+#  "\nPlease produce three queries in line with the given requirements and inputs. These three queries should display a diverse range of sentence structures: some queries should be in the form of imperative sentences, others declarative, and yet others, interrogative. Equally, they should encompass a variety of tones, with some being polite, others straightforward. Ensure they vary in length and contain a wide range of subjects: myself, my friends, family, and company. Aim to include a number of engaging queries as long as they relate to API calls. Try to avoid explicitly specifying which API to employ in the query. Each query should consist of a minimum of thirty words
+# At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually execute your step.
+# All the thought is short, at most 5 sentences. 
+
+
+# These ten queries should display a diverse range of sentence structures: some queries should be in the form of imperative sentences,
+# others declarative, and yet others, interrogative. Equally, they should encompass a variety of tones, with some being polite,
+# others straightforward.
+# Alternative, step-by-step wording of the generation prompt.
+# NOTE(review): no use of this constant is visible in this part of the file (CoT_Runner.run uses the
+# FORMAT_INSTRUCTIONS_DATA_GENERATION imported from prompt_template) — confirm whether it is still needed.
+FORMAT_INSTRUCTIONS_DATA_GENERATION_OPTIMIZED="""
+You are an advanced AutoGPT interface designed for dynamic interaction with a comprehensive database of tools and APIs. Your primary function is to assist in generating user queries that can be resolved using the appropriate APIs within the database. To navigate this task efficiently, you have access to five initial meta APIs: query_all_categories, query_tools_in_category, query_apis_in_tool, query_tool_details, and get_api_details. Additionally, you have the capability to test APIs with the add_apis function and can finalize a process with the Finish function.
+
+You must articulate your analytical process at each step, providing a rationale for your choices and outlining the next action, all in a succinct manner not exceeding five sentences. Use the initial meta APIs to identify relevant functions, remembering that your total calls to these APIs should not exceed 10. Ensure that each formulated query is comprehensive, containing all necessary information to be addressed by the selected APIs.
+
+Here is your task:
+
+Call query_all_categories to start identifying potential API categories relevant to common user queries.
+Select a category and use query_tools_in_category to find available tools within that category.
+Choose a tool and employ query_apis_in_tool to obtain a list of APIs associated with it.
+If clarification on tool functionality is needed, use query_tool_details; similarly, use get_api_details for specifics on an API.
+Test potential APIs with add_apis if you deem it necessary.
+Once the suitable APIs are determined, compose a user query that these API can answer and execute the Finish function with this query.
+Begin your task with an initial analysis based on common user query needs and proceed strategically, justifying your decisions at each step and ensuring efficient use of your limited meta API calls."""
+
+
+import os
+import json
+from flask import Flask, jsonify, request
+
+# Root of the tool definitions, laid out as <category>/<standardized tool name>.json (see fetch_api_json);
+# the path is relative, so it is resolved against the current working directory.
+tool_root_dir = "data/toolenv/tools"
+
+def standardize(string):
+    """Normalize an arbitrary name into a lowercase, underscore-separated identifier.
+
+    Every character not kept by the pattern below (it keeps CJK ideographs
+    U+4E00-U+9FA5, ASCII letters and digits, ``_`` and the literal ``^``) is
+    replaced by ``_``.  Runs of underscores are then collapsed, the text is
+    lower-cased and leading/trailing underscores are removed.  A result that
+    starts with a digit gets a ``get_`` prefix; an empty result is returned
+    as ``""``.
+    """
+    cleaned = re.sub("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]", "_", string)
+    cleaned = re.sub(r"(_)\1+", "_", cleaned).lower().strip("_")
+    if not cleaned:
+        return cleaned
+    return "get_" + cleaned if cleaned[0].isdigit() else cleaned
+
+def fetch_api_json(api_list):
+    """Resolve requested apis against the tool JSON files on disk.
+
+    Args:
+        api_list: list of dicts with at least ``category_name``, ``tool_name``
+            and ``api_name``.
+
+    Returns:
+        ``(api_list_new, index_list)``: the records found in
+        ``<tool_root_dir>/<category>/<standardized tool>.json`` (one dict per
+        matched api) and, in parallel, the positions of the matched requests in
+        ``api_list``.  Requests whose api name is not found in the tool file are
+        reported with ``print`` and left out of both lists.
+
+    Raises:
+        OSError / ValueError: if a tool file is missing or is not valid JSON.
+    """
+    api_list_new = []
+    index_list = []
+    for k, item in enumerate(api_list):
+        cate_name = item["category_name"]
+        tool_name = standardize(item["tool_name"])
+        api_name = change_name(standardize(item["api_name"]))
+        tool_path = os.path.join(tool_root_dir, cate_name, tool_name + ".json")
+        # Context manager so the handle is closed right away instead of being leaked.
+        with open(tool_path, "r") as reader:
+            tool_json = json.load(reader)
+        api_dict_names = []
+        for api_dict in tool_json["api_list"]:
+            api_dict_names.append(api_dict["name"])
+            # Names are compared in standardized form on both sides.
+            if change_name(standardize(api_dict["name"])) != api_name:
+                continue
+            api_list_new.append({
+                "category_name": cate_name,
+                "api_name": api_dict["name"],
+                "api_description": api_dict["description"],
+                "required_parameters": api_dict["required_parameters"],
+                "optional_parameters": api_dict["optional_parameters"],
+                "tool_name": tool_json["tool_name"],
+            })
+            index_list.append(k)
+            break
+        else:
+            # No api of this tool matched: show what was asked for and what was scanned.
+            print(api_name, api_dict_names)
+    return api_list_new, index_list
+    
+def api_json_to_openai_json(api_json,standard_tool_name):
+    """Convert one api record (as produced by fetch_api_json) into a function schema.
+
+    Args:
+        api_json: dict with ``api_name``, ``api_description``, ``category_name``
+            and the ``required_parameters`` / ``optional_parameters`` lists.
+        standard_tool_name: standardized tool name, used as the ``_for_<tool>``
+            suffix of the schema name.
+
+    Returns:
+        ``(schema, category_name, pure_api_name)`` where ``schema`` has ``name``,
+        ``description`` and ``parameters`` and ``pure_api_name`` is the
+        standardized api name without the tool suffix.
+
+    Fix over the previous version: optional parameters were only processed when
+    the api also had required parameters, so apis with optional parameters only
+    were exposed without any parameters.
+    """
+    description_max_length = 256
+    map_type = {
+        "NUMBER": "integer",
+        "STRING": "string",
+        "BOOLEAN": "boolean"
+    }
+
+    def build_property(para):
+        """Return ``(standardized name, property dict)`` for one parameter record."""
+        name = change_name(standardize(para["name"]))
+        prop = {
+            # Unknown parameter types fall back to "string".
+            "type": map_type.get(para["type"], "string"),
+            "description": para["description"][:description_max_length],
+        }
+        default_value = para['default']
+        # A non-empty default is passed on as an example value.
+        if len(str(default_value)) != 0:
+            prop["example_value"] = default_value
+        return name, prop
+
+    pure_api_name = change_name(standardize(api_json["api_name"]))
+    templete = {
+        # Only the last 64 characters of the name are kept.
+        "name": (pure_api_name + f"_for_{standard_tool_name}")[-64:],
+        "description": f"This is the subfunction for tool \"{standard_tool_name}\", you can use this tool.",
+        "parameters": {
+            "type": "object",
+            "properties": {},
+            "required": [],
+            "optional": [],
+        },
+    }
+
+    if api_json["api_description"].strip() != "":
+        # Mentions of the original api name are rewritten to the schema name before truncation.
+        tuncated_description = api_json['api_description'].strip().replace(api_json['api_name'],templete['name'])[:description_max_length]
+        templete["description"] = templete["description"] + f"The description of this function is: \"{tuncated_description}\""
+
+    for para in api_json.get("required_parameters") or []:
+        name, prop = build_property(para)
+        templete["parameters"]["properties"][name] = prop
+        templete["parameters"]["required"].append(name)
+
+    # Handled independently of the required parameters (see docstring).
+    for para in api_json.get("optional_parameters") or []:
+        name, prop = build_property(para)
+        templete["parameters"]["properties"][name] = prop
+        templete["parameters"]["optional"].append(name)
+
+    return templete, api_json["category_name"],  pure_api_name
+
+# For pipeline environment preparation
+def get_white_list(tool_root_dir):
+    """Scan ``tool_root_dir`` and index every tool definition found there.
+
+    Each ``<category>/<file>.json`` is loaded; the result maps the standardized
+    ``tool_name`` from the file to ``{"description": <tool_description>,
+    "standard_tool_name": <file name up to the first dot>}``.
+
+    Files without a readable ``tool_name`` are reported and skipped.  Previously
+    execution continued after the error message and either raised NameError or
+    registered the file under the name of the previously processed tool.
+    """
+    white_list = {}
+    for cate in tqdm(os.listdir(tool_root_dir)):
+        cate_dir = os.path.join(tool_root_dir, cate)
+        if not os.path.isdir(cate_dir):
+            continue
+        for file in os.listdir(cate_dir):
+            if not file.endswith(".json"):
+                continue
+            standard_tool_name = file.split(".")[0]
+            file_path = os.path.join(cate_dir, file)
+            with open(file_path) as reader:
+                js_data = json.load(reader)
+            try:
+                origin_tool_name = js_data["tool_name"]
+            except (KeyError, TypeError, IndexError):
+                print('#'*100)
+                # Report the offending file; indexing js_data here could itself fail.
+                print('error:', 'js_data', file_path)
+                continue
+
+            white_list[standardize(origin_tool_name)] = {"description": js_data["tool_description"], "standard_tool_name": standard_tool_name}
+    return white_list
+
+def contain(candidate_list, white_list):
+    """Look up every candidate in ``white_list``.
+
+    Returns the list of matching values in candidate order, or ``False`` as
+    soon as one candidate is not a key of ``white_list``.
+    """
+    matched = []
+    for name in candidate_list:
+        if name in white_list.keys():
+            matched.append(white_list[name])
+        else:
+            return False
+    return matched
+
+
+# Identity helper: returns its argument unchanged (its entry in api_mapping below is commented out).
+# NOTE(review): the docstring describes a list of api dictionaries, whereas CoT_Runner.call_api requires the
+# "Finish" arguments to be JSON with 'query', 'answer' and 'plan' keys — confirm which contract is intended.
+def Finish(answer: str):
+    """finish the conversation, required answer to be list of dictionaries describing with the keys category_name, tool_name, api_name"""
+    return answer
+
+# Mutable module-level state shared by add_apis, remove_apis and CoT_Runner.
+functions = []  # function schemas passed to call_gpt; add_apis appends one per added api
+api_name_reflect = {}  # schema name -> standardized api name without the "_for_<tool>" suffix
+api2origin = {}  # schema name -> original category_name / tool_name / api_name
+tool_names = []  # add_apis appends the standard tool name of each added api
+cate_names = []  # add_apis appends the category name of each added api
+# Built at import time by scanning tool_root_dir on disk.
+white_list = get_white_list(tool_root_dir)
+def add_apis(api_list):
+    """add apis to the current available api list. required input to be list of dictionaries describing with the keys category_name, tool_name, api_name"""
+    # The docstring above is kept verbatim in case it is surfaced to the model as a tool description.
+    #
+    # api_list may be a list of dicts or its string form.  Every api that can be resolved on disk is
+    # converted to a function schema and registered in the module-level state (functions,
+    # api_name_reflect, api2origin, tool_names, cate_names, call_cnt, raw_api_list).
+    # Returns a human-readable status string; invalid input yields an 'illegal input, ...' message.
+    import ast  # local import so this block does not depend on the file's import section
+
+    global raw_api_list, functions, api_name_reflect, tool_names, cate_names, api2origin, call_cnt
+    if isinstance(api_list, str):
+        # SECURITY: the string originates from model output.  It used to be passed to eval(), which
+        # executes arbitrary code; literal_eval only accepts Python literals.
+        try:
+            api_list = ast.literal_eval(api_list)
+        except (ValueError, SyntaxError, TypeError):
+            return 'illegal input, input should be list, each element in the list should have category_name, tool_name, api_name'
+    if not isinstance(api_list, list) or any(not isinstance(ele, dict) or 'category_name' not in ele or 'tool_name' not in ele or 'api_name' not in ele for ele in api_list):
+        return 'illegal input, input should be list, each element in the list should have category_name, tool_name, api_name'
+    if not all([isinstance(ele['category_name'],str) and isinstance(ele['tool_name'],str) and isinstance(ele['api_name'],str) for ele in api_list]):
+        return 'illegal input, category_name, tool_name, api_name should be string'
+    # Keep the caller's spelling of the names; the entries themselves are enriched below.
+    origin_api_list = deepcopy(api_list)
+    for api in api_list:
+        api.update(get_api_details(api['category_name'], api['tool_name'], api['api_name']))
+    api_list, indexs = fetch_api_json(api_list)
+    origin_api_list = [origin_api_list[k] for k in indexs]
+    origin_tool_names = [standardize(cont["tool_name"]) for cont in api_list]
+    tool_des = contain(origin_tool_names,white_list)
+    if tool_des is False:
+        # contain() signals an unknown tool with False; stop before any shared state is modified
+        # (previously this raised TypeError after raw_api_list had already been extended).
+        return 'illegal input, some tools are not in the tool white list'
+    tool_des = [[cont["standard_tool_name"], cont["description"]] for cont in tool_des]
+    raw_api_list.extend(origin_api_list)
+    for k,api_json in enumerate(api_list):
+        standard_tool_name = tool_des[k][0]
+        openai_function_json, cate_name, pure_api_name = api_json_to_openai_json(api_json,standard_tool_name)
+        functions.append(openai_function_json)
+
+        api_name_reflect[openai_function_json["name"]] = pure_api_name
+        api2origin[openai_function_json["name"]] = {'category_name': origin_api_list[k]['category_name'], 'tool_name': origin_api_list[k]['tool_name'], 'api_name': origin_api_list[k]['api_name']}
+
+        tool_names.append(standard_tool_name)
+        cate_names.append(cate_name)
+        # Starts at zero: call_api refuses "Finish" while any added api has never been called.
+        call_cnt[openai_function_json["name"]] = 0
+    return 'apis added successfully. The mapping from the standard api names to the original category_names, tool_names and  api_names is: ' + str(api2origin)
+
+def remove_apis(api_list):
+    """remove apis from the current available api list. required input to be list of dictionaries describing with the keys category_name, tool_name, api_name"""
+    # The docstring above is kept verbatim in case it is surfaced to the model as a tool description.
+    #
+    # api_list may be a list of dicts or its string form.  For every api that can be resolved on disk
+    # the schema and its bookkeeping entries are removed from the module-level state, then the matching
+    # entries are dropped from raw_api_list.  Returns a human-readable status string; invalid input
+    # yields an 'illegal input, ...' message.  Removing an api that was never added still raises
+    # (ValueError / KeyError), as before.
+    import ast  # local import so this block does not depend on the file's import section
+
+    global raw_api_list, functions, api_name_reflect, tool_names, cate_names, api2origin, call_cnt
+    if isinstance(api_list, str):
+        # SECURITY: the string originates from model output.  It used to be passed to eval(), which
+        # executes arbitrary code; literal_eval only accepts Python literals.
+        try:
+            api_list = ast.literal_eval(api_list)
+        except (ValueError, SyntaxError, TypeError):
+            return 'illegal input, input should be list, each element in the list should have category_name, tool_name, api_name'
+    if not isinstance(api_list, list) or any(not isinstance(ele, dict) or 'category_name' not in ele or 'tool_name' not in ele or 'api_name' not in ele for ele in api_list):
+        return 'illegal input, input should be list, each element in the list should have category_name, tool_name, api_name'
+    if not all([isinstance(ele['category_name'],str) and isinstance(ele['tool_name'],str) and isinstance(ele['api_name'],str) for ele in api_list]):
+        return 'illegal input, category_name, tool_name, api_name should be string'
+    origin_api_list = deepcopy(api_list)
+    for api in api_list:
+        api.update(get_api_details(api['category_name'], api['tool_name'], api['api_name']))
+    api_list, indexs = fetch_api_json(api_list)
+    origin_api_list = [origin_api_list[k] for k in indexs]
+    origin_tool_names = [standardize(cont["tool_name"]) for cont in api_list]
+    tool_des = contain(origin_tool_names,white_list)
+    if tool_des is False:
+        # contain() signals an unknown tool with False (previously this surfaced as a TypeError).
+        return 'illegal input, some tools are not in the tool white list'
+    tool_des = [[cont["standard_tool_name"], cont["description"]] for cont in tool_des]
+    for k,api_json in enumerate(api_list):
+        standard_tool_name = tool_des[k][0]
+        # Rebuild the schema so the exact entry registered by add_apis can be located.
+        openai_function_json, cate_name, pure_api_name = api_json_to_openai_json(api_json,standard_tool_name)
+        functions.remove(openai_function_json)
+
+        api_name_reflect.pop(openai_function_json["name"])
+        api2origin.pop(openai_function_json["name"])
+
+        tool_names.remove(standard_tool_name)
+        cate_names.remove(cate_name)
+        call_cnt.pop(openai_function_json["name"])
+
+    for api in origin_api_list:
+        for ele in raw_api_list:
+            if ele['category_name'] == api['category_name'] and ele['tool_name'] == api['tool_name'] and ele['api_name'] == api['api_name']:
+                # Only the first match is removed; breaking right away keeps the in-loop removal safe.
+                raw_api_list.remove(ele)
+                break
+    return 'apis removed successfully. The mapping from the standard api names to the original category_names, tool_names and  api_names is: ' + str(api2origin)
+
+# Define the API endpoints
+def get_categories():
+    """Return the result of query_all_categories wrapped in a Flask JSON response.
+
+    NOTE(review): ``database`` is not defined in the visible part of this file,
+    and CoT_Runner.run calls query_all_categories() without arguments — confirm
+    that this endpoint helper is still in use.
+    """
+    return jsonify(query_all_categories(database))
+
+def get_current_weather(location: str, unit: str = "fahrenheit") -> str:
+    """Return a fixed "current weather" sentence for ``location`` (no real lookup is made)."""
+    summary = "It is currently sunny in {} and 75 degrees {}.".format(location, unit)
+    return summary
+
+
+def get_tomorrows_weather(location: str, unit: str = "fahrenheit") -> str:
+    """Return a fixed "tomorrow's weather" sentence for ``location`` (no real lookup is made)."""
+    summary = "Tomorrow it will be rainy in {} and 60 degrees {}.".format(location, unit)
+    return summary
+
+# Infer the function definitions.
+
+# Dispatch table used by CoT_Runner.call_api: action name requested by the model -> local callable.
+# The callable receives the JSON-decoded arguments as keyword arguments.
+api_mapping = {
+    "query_all_categories": query_all_categories,
+    "get_tools_in_category": get_tools_in_category,
+    "get_apis_in_tool": get_apis_in_tool,
+    # "Finish": Finish,
+    "get_api_details": get_api_details,
+    "get_tools_descriptions": get_tools_descriptions,
+    "add_apis_into_api_pool": add_apis,
+    "remove_apis": remove_apis,
+}
+
+# Schema name -> number of calls made so far.  add_apis starts each added api at 0, and
+# CoT_Runner.call_api rejects "Finish" while any entry is still 0.
+call_cnt = {}
+
+def standardize_category(category):
+    """Turn a category label into a name without spaces, commas or slashes.
+
+    Each space, comma and slash becomes ``_``; afterwards double underscores
+    are replaced once, left to right (so ``___`` ends up as ``__``).
+    """
+    underscored = category.translate(str.maketrans(" ,/", "___"))
+    return underscored.replace("__", "_")
+
+
+def change_name(name):
+    """Prefix ``name`` with ``is_`` when it is one of a fixed set of words; otherwise return it unchanged."""
+    renamed_words = ("from", "class", "return", "false", "true", "id", "and")
+    return "is_" + name if name in renamed_words else name
+
+def contain(candidate_list, white_list):
+    """Return the white-list entries for all candidates, or ``False`` if any candidate is unknown.
+
+    This repeats the definition of ``contain`` that appears earlier in the file,
+    with the same behavior.
+    """
+    entries = []
+    for key in candidate_list:
+        if key not in white_list.keys():
+            # The first unknown candidate aborts the whole lookup.
+            return False
+        entries += [white_list[key]]
+    return entries
+
+class CoT_Runner(object):
+    """Runs one query-generation conversation and executes the tool calls the model requests.
+
+    ``run`` drives the chat loop through ``call_gpt``; ``call_api`` dispatches each
+    requested action to a meta function from ``api_mapping``, to the "Finish"
+    checks, or to the remote service at ``service_url``.
+
+    NOTE(review): ``toolbench_key`` and ``service_url`` are hard-coded below; a
+    credential committed to source should probably move to the git-ignored config.
+    """
+
+    def __init__(self):
+        self.toolbench_key = 'VvZd8bIZV2Lu6wz63hAp1oVwIFRgpniyJrHG6bVU3zzOIAC3wC'
+        self.service_url = "http://8.218.239.54:8080/rapidapi"
+        self.max_observation_length = 1024
+        self.observ_compress_method = 'truncate'
+        self.CALL_MAX_TIME = 3
+        self.task_description = f'''You should use functions to help handle the real time user querys. Remember:
+1.ALWAYS call \"Finish\" function at the end of the task. And the final answer should contain enough information to show to the user,If you can't handle the task, or you find that function calls always fail(the function is not valid now), use function Finish->give_up_and_restart.
+2.Do not use origin tool names, use only subfunctions' names.
+\n'''
+        # Optional list of fallback keys.  json.load needs a file object: the previous code passed the
+        # path string, which always raised and left the list empty even when the file existed.
+        try:
+            with open('rapidapi_key_list.json', 'r') as reader:
+                self.rapidapi_key_list = json.load(reader)
+        except (OSError, ValueError):
+            self.rapidapi_key_list = []
+        self.use_rapidapi_key = True
+        self.api_customization = True
+
+    @staticmethod
+    def _append_to_file(path, *values):
+        """Append ``values`` (formatted as by ``print``) as one line to ``path``, creating its directory if needed."""
+        directory = os.path.dirname(path)
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+        with open(path, 'a') as writer:
+            print(*values, file=writer)
+
+    def call_api(self, action_name="", action_input=""):
+        """Execute one action requested by the model.
+
+        Args:
+            action_name: name of the function the model called.
+            action_input: the call's arguments as a JSON string.
+
+        Returns:
+            ``(observation, status_code)``.  Status codes:
+            0 means normal response
+            1 means there is no corresponding api name
+            2 is returned for every meta function in ``api_mapping`` (success or input error)
+            3 represents the end of the generation and the final answer appears
+            4 means that the model decides to pruning by itself
+            5 represents api call timeout
+            6 for 404
+            7 means not subscribed
+            8 represents unauthorized
+            9 represents too many requests
+            10 stands for rate limit
+            11 message contains "error" field
+            12 error sending request
+            13 the request to the service raised (reported as "connection timeout")
+            15 the call was rejected (a "Finish" precondition failed, or the service reply was not JSON)
+
+            NOTE(review): the code maps "Unauthorized error..." to 7 and "Unsubscribed error..." to 8,
+            the reverse of the legend above — confirm which one is intended.
+        """
+        global call_cnt
+        if action_name in api_mapping:
+            try:
+                result = api_mapping[action_name](**json.loads(action_input))
+            except Exception as e:
+                # Best effort: any failure of a meta function is reported to the model as a format error.
+                self._append_to_file('output/generation_error.txt', e)
+                result = 'input format error'
+            return result, 2
+        if action_name == "Finish":
+            if len(call_cnt) > 0 and min(call_cnt.values()) == 0:
+                function_never_called = [function_name for function_name in call_cnt if call_cnt[function_name] == 0]
+                return json.dumps({"error": f"{function_never_called} have not been called. You should call them at least once before formulating the final query", "response": ""}), 15
+            # NOTE(review): 7 is presumably the number of meta-function schemas seeded by generate_main — confirm.
+            if len(functions) == 7:
+                return json.dumps({"error": "There must be apis successfully added using the add_apis function, and you should formulate your query based on the found apis", "response": ""}), 15
+
+            try:
+                json_data = json.loads(action_input,strict=False)
+            except Exception:
+                json_data = {}
+            if 'query' not in json_data:
+                return json.dumps({"error": "You should formulate a query", "response": ""}), 15
+            if 'answer' not in json_data:
+                return json.dumps({"error": "You should formulate an answer", "response": ""}), 15
+            if 'plan' not in json_data:
+                return json.dumps({"error": "You should formulate a plan", "response": ""}), 15
+            solved, reason = check_task_solved(json_data['query'], json_data['answer'])
+            if solved != 'Solved':
+                return json.dumps({"error": f"The query is not solved by the answer. The reason is: {reason}", "response": ""}), 15
+            return json_data, 3
+
+        for k, function in enumerate(functions):
+            # Suffix match: schema names may have been cut to their last 64 characters.
+            if not function["name"].endswith(action_name):
+                continue
+            assert function["name"] in call_cnt, (function["name"], call_cnt)
+            call_cnt[function["name"]] += 1
+            pure_api_name = api_name_reflect[function["name"]]
+            payload = {
+                "category": cate_names[k],
+                "tool_name": tool_names[k],
+                "api_name": pure_api_name,
+                "tool_input": action_input,
+                "strip": self.observ_compress_method,
+                "toolbench_key": self.toolbench_key
+            }
+            print(colored(f"query to {cate_names[k]}-->{tool_names[k]}-->{action_name}",color="yellow"))
+            time.sleep(2) # rate limit: 30 per minute
+            headers = {"toolbench_key": self.toolbench_key}
+            try:
+                response = requests.post(self.service_url, json=payload, headers=headers, timeout=15)
+            except Exception:
+                self._append_to_file('output/timeout.txt', payload)
+                return json.dumps({"error": "connection timeout", "response": ""}), 13
+            if response.status_code != 200:
+                return json.dumps({"error": f"request invalid, data error. status_code={response.status_code}", "response": ""}), 12
+            try:
+                response = response.json()
+            except Exception:
+                print(response)
+                return json.dumps({"error": f"request invalid, data error", "response": ""}), 15
+
+            # While the error looks key/quota related, retry with each fallback key in turn.
+            cnt = 0
+            while any([word in response["error"] for word in error_list]):
+                if cnt >= len(self.rapidapi_key_list):
+                    break
+                self._append_to_file('output/rapidapi_key_usage.txt', f'use rapidapi key {cnt}')
+                print(colored(f'use rapidapi key {cnt}', 'red'))
+                payload["rapidapi_key"] = self.rapidapi_key_list[cnt]
+                response = get_rapidapi_response(payload, api_customization=self.api_customization)
+                self._append_to_file('output/rapidapi_key_usage.txt', response['error'])
+                cnt += 1
+
+            if response["error"] == "API not working error...":
+                status_code = 6
+            elif response["error"] == "Unauthorized error...":
+                status_code = 7
+            elif response["error"] == "Unsubscribed error...":
+                status_code = 8
+            elif response["error"] == "Too many requests error...":
+                status_code = 9
+            elif response["error"] == "Rate limit per minute error...":
+                print("Reach api calling limit per minute, sleeping...")
+                time.sleep(10)
+                status_code = 10
+            elif response["error"] == "Message error...":
+                status_code = 11
+            else:
+                status_code = 0
+            return json.dumps(response), status_code
+        return json.dumps({"error": f"No such function name: {action_name}", "response": ""}), 1
+
+    def run(self, query, answer):
+        """Run the generation conversation for at most 20 model turns.
+
+        Args:
+            query: a previously generated query to extend, or ``""`` to start a fresh generation.
+            answer: the answer belonging to ``query`` (only used when ``query`` is non-empty).
+
+        Returns:
+            ``(result, messages)``: ``result`` is the observation of the accepted "Finish" call, or the
+            string ``'Exceed_max_iterations'`` when no "Finish" call was accepted; ``messages`` is the
+            full conversation.
+        """
+        messages = [
+                    {'role':'system',
+                     'content': 'You are QueryGPT, a helpful assistant who can strictly follow my instructions to generate diverse real queries'},
+                    ]
+        if len(query) > 0:
+            messages.append(
+                {'role':'user',
+                'content': FORMAT_INSTRUCTIONS_CONTINUAL_DATA_GENERATION.replace('{categories}', str(random.sample(query_all_categories(),5))) + 'Here is the query generated at the previous step: ' + query + ' And the answer is: ' + answer + 'You should extend this query to involve more api calls and also extend the answer. The extended part should be related to the current query.'})
+        else:
+            messages.append({'role':'user',
+                     'content': FORMAT_INSTRUCTIONS_DATA_GENERATION.replace('{generated_queries}', str(generated_query_list[-5:])).replace('{categories}', str(random.sample(query_all_categories(), 49)))})
+        i = 0
+        while i < 20:
+            print('#'*100)
+            print(len(functions), len(raw_api_list))
+            self._append_to_file('token_count_in.txt', len(enc.encode(str(messages))))
+            response = call_gpt(
+                messages,
+                functions
+            )
+            if response == 'bad request':
+                # A str has no `.choices`, so continuing used to crash with AttributeError;
+                # end the run and report it through the regular failure return instead.
+                break
+            elif isinstance(response, str):
+                # Any other string result is retried without counting as an iteration.
+                continue
+            print(messages)
+            i = i + 1
+            tool_calls = response.choices[0].message.tool_calls
+            print('Thought:', response.choices[0].message.content)
+            self._append_to_file('token_count_out.txt', len(enc.encode(str(response.choices[0].message.content))))
+            print(response.choices[0].finish_reason)
+            if tool_calls:
+                messages.append(
+                {
+                    "role": "assistant",
+                    "tool_calls": tool_calls,
+                    "content": response.choices[0].message.content if response.choices[0].message.content is not None else '',
+                }
+                )
+                for tool_call in tool_calls:
+                    function_name = tool_call.function.name
+                    function_args = tool_call.function.arguments
+
+                    function_call_result, status_code = self.call_api(function_name, function_args)
+                    if function_name == 'get_api_details':
+                        function_call_result = str(function_call_result)
+                    print('Thought:', response.choices[0].message.content)
+                    print('function call:', function_name, function_args)
+                    print('function response:', function_call_result)
+                    messages.append(
+                            {
+                                "tool_call_id": tool_call.id,
+                                "role": "tool",
+                                "name": function_name,
+                                "content": str(function_call_result),
+                            })
+                    # Status 15 marks a rejected call; any other status on "Finish" ends the run.
+                    if function_name == 'Finish' and status_code != 15:
+                        return function_call_result, messages
+            else:
+                messages.append({'role': "assistant",
+                    'content': response.choices[0].message.content})
+                print('Thought:', response.choices[0].message.content)
+        return 'Exceed_max_iterations', messages
+
+            
+# Seed list of already generated queries; generate_main() appends each new answer to it.
+generated_query_list = [
+    'What is the current weather in Seattle, and what is the weather forecast for the next five days?',
+]
+
+
+def generate_main():
+    """Run a single CoT_Runner session with the database-browsing tool set.
+
+    Rebinds the module-level ``functions``, ``cate_names`` and ``tool_names``
+    before the run and appends the produced answer to
+    ``generated_query_list``.
+
+    Returns:
+        A ``(answer, messages)`` tuple taken from the runner's result.
+    """
+    global functions, tool_names, cate_names, generated_query_list
+
+    finish_tool = {
+        "name": "Finish",
+        "description": "If you believe that you have obtained a query that can answered by the api database, please call this function to provide the final answer.",
+        "parameters": {
+            "type": "object",
+            "properties": {"answer": {"type": "string"}},
+            "required": ["answer"],
+        },
+    }
+    functions = [
+        get_tools_in_category_function.to_json_schema(),
+        {
+            'name': 'query_apis_in_tool',
+            'description': 'query all apis in a specific tool',
+            'parameters': {
+                'type': 'object',
+                'properties': {'category': {'type': 'string'}, 'tool_name': {'type': 'string'}},
+                'required': ['category', 'tool_name'],
+            },
+        },
+        {
+            'name': 'query_tool_details',
+            'description': 'query the details of a specific tool',
+            'parameters': {
+                'type': 'object',
+                'properties': {'tool_name': {'type': 'string'}},
+                'required': ['tool_name'],
+            },
+        },
+        {
+            'name': 'add_apis',
+            'description': 'add apis to the current available api list. required input to be list of dictionaries describing with the keys category_name, tool_name, api_name',
+            'parameters': {
+                'type': 'object',
+                'properties': {'api_list': {'type': 'null'}},
+                'required': ['api_list'],
+            },
+        },
+        finish_tool,
+    ]
+    # One empty category label and one empty tool label per exposed function.
+    cate_names = [''] * len(functions)
+    tool_names = [''] * len(functions)
+
+    result, messages = CoT_Runner().run()
+    answer = result['answer']
+    generated_query_list.append(answer)
+    return answer, messages
+    
+    
+# Substrings that mark an answer as an apology/failure rather than a real answer.
+exclusion_words = ["sorry", "apologize", "apology", "unfortunately", "couldn't"]
+
+
+def generate_return_api_main(query, answer):
+    """Generate a (query, answer, plan) sample and the APIs collected for it.
+
+    Each attempt resets the module-level ``raw_api_list`` and ``call_cnt``,
+    rebuilds ``functions`` (API-database tools plus ``Finish``) and runs a new
+    ``CoT_Runner``. Attempts repeat until the runner returns a non-string
+    result that either has an 'openai' key or whose answer contains none of
+    ``exclusion_words``.
+
+    Args:
+        query: not read; the runner is always started with empty strings.
+        answer: not read; see ``query``.
+
+    Returns:
+        ``(result, messages, raw_api_list)`` of the accepted attempt.
+    """
+    global functions, tool_names, cate_names, generated_query_list, raw_api_list, call_cnt
+    while True:
+        raw_api_list = []
+        call_cnt = {}
+        finish_tool = {
+            "name": "Finish",
+            "description": "If you believe that you have obtained a query that can answered by the api database, please call this function to provide the query, the corresponding answer and the plan of using the functions to answer the query.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {"type": "string"},
+                    "answer": {"type": "string"},
+                    "plan": {"type": "string"},
+                },
+                "required": ["query", "answer", 'plan'],
+            },
+        }
+        functions = [
+            get_tools_in_category_function,
+            get_apis_in_tool_function,
+            get_api_details_function,
+            get_tools_descriptions_function,
+            add_apis_into_api_pool_function,
+            remove_apis_function,
+            finish_tool,
+        ]
+        cate_names = [''] * len(functions)
+        tool_names = [''] * len(functions)
+
+        result, messages = CoT_Runner().run('', '')
+        if isinstance(result, str):
+            # The runner returned a status string instead of a result dict: try again.
+            continue
+        if 'openai' in result:
+            return result, messages, raw_api_list
+        generated_query_list.append(result['query'])
+        lowered_answer = result['answer'].lower()
+        if not any(word in lowered_answer for word in exclusion_words):
+            return result, messages, raw_api_list
+
+
+import time
+if __name__ == '__main__':
+    # Batch driver: produce up to 1000 samples, one JSON file per index.
+    # Indices whose output file already exists are skipped, so an interrupted
+    # run can simply be restarted.
+    exclusion_words = ["sorry", "apologize", "apology", "unfortunately", "couldn't"]
+    # output_dir = 'result1/custom_data'
+    output_dir = 'result1/custom_data_0129'
+    os.makedirs(output_dir, exist_ok=True)
+    generated_query_list = []
+    for i in range(1000):
+        print('#' * 100)
+        print(i)
+
+        output_path = f'{output_dir}/{i}.json'
+        if os.path.exists(output_path):
+            continue
+        query = ''
+        answer = ''
+        plan = ''
+        try:
+            result, generate_messages, api_list = generate_return_api_main(query, answer)
+        except Exception as exc:
+            # Best effort: a failed sample is reported and skipped. Only
+            # Exception is caught so Ctrl-C / SystemExit still stop the run.
+            print('generation failed:', repr(exc))
+            continue
+
+        # NOTE(review): `query` is still '' at this point, so an empty string is
+        # appended here; kept as in the original - confirm whether intended.
+        generated_query_list.append(query)
+        if isinstance(result, dict):
+            query = result['query']
+            answer = result['answer']
+            plan = result['plan']
+        data = {
+            'query': query,
+            'plan': plan,
+            'gt_api_list': api_list,
+            'final_answer': answer,
+            'generate_messages': generate_messages,
+        }
+        # Context managers close both files on every iteration.
+        with open(os.path.join(output_dir, 'generated_query_given_api_list.txt'), 'a') as query_log:
+            print(query, file=query_log)
+        with open(output_path, 'w') as out_file:
+            json.dump(data, out_file, indent=4)
--- a/dfs_gt.py
+++ b/dfs_gt.py
@ -0,0 +1,231 @@
+#encoding:utf-8
+
+import openai
+import os
+from typing import List, Dict, Any
+import re
+from tqdm import tqdm
+import time
+import requests
+from termcolor import colored
+from copy import deepcopy
+from api_database_function import *
+from verifier import check_solved_toolbench
+import os
+from rapidapi import pipeline_runner
+
+from typing import Any, Callable
+from openai_function_calling import FunctionInferer
+import openai
+import json
+# query_data = json.load(open('G1_instruction_query_failed.json', 'r', encoding='utf-8'))
+# Define example functions.
+from flask import Flask, jsonify, request
+
+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+    # Attribute reads go through dict.get, so a missing key yields None
+    # instead of raising AttributeError.
+    __getattr__ = dict.get
+    # Attribute writes and deletes map straight onto item assignment/deletion.
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+# For pipeline environment preparation
+def get_white_list(tool_root_dir):
+    """Build the tool white list from the JSON files under *tool_root_dir*.
+
+    Every sub-directory of *tool_root_dir* is scanned for ``*.json`` files.
+
+    Args:
+        tool_root_dir: directory that contains one sub-directory per category.
+
+    Returns:
+        dict mapping ``standardize(tool_name)`` to
+        ``{"description": <tool_description>, "standard_tool_name": <file stem>}``.
+        Files without a readable "tool_name" entry are reported and skipped.
+    """
+    white_list = {}
+    for cate in tqdm(os.listdir(tool_root_dir)):
+        cate_dir = os.path.join(tool_root_dir, cate)
+        if not os.path.isdir(cate_dir):
+            continue
+        for file in os.listdir(cate_dir):
+            if not file.endswith(".json"):
+                continue
+            standard_tool_name = file.split(".")[0]
+            file_path = os.path.join(cate_dir, file)
+            with open(file_path) as reader:
+                js_data = json.load(reader)
+            try:
+                origin_tool_name = js_data["tool_name"]
+            except (KeyError, TypeError, IndexError):
+                # Previously execution fell through and used an undefined (or
+                # stale, from the prior file) tool name; skip the file instead.
+                print('#'*100)
+                print('error:', 'js_data', file_path)
+                continue
+
+            white_list[standardize(origin_tool_name)] = {"description": js_data["tool_description"], "standard_tool_name": standard_tool_name}
+    return white_list
+
+def standardize(string):
+    """Normalise a name into a lowercase, underscore-separated identifier.
+
+    Characters outside the CJK range U+4E00-U+9FA5, ASCII letters, digits,
+    ``_`` and ``^`` are replaced by ``_``; runs of underscores are collapsed,
+    the text is lowercased and leading/trailing underscores are removed. A
+    result that starts with a digit is prefixed with ``get_``. An input with
+    no usable characters yields the empty string.
+    """
+    if not isinstance(string, str):
+        # Diagnostic output only: the substitution below still raises.
+        print('*'*100)
+        print(string)
+    disallowed = re.compile("[^\\u4e00-\\u9fa5^a-z^A-Z^0-9^_]")
+    collapsed = re.sub(r"(_)\1+", "_", disallowed.sub("_", string)).lower()
+    trimmed = collapsed.strip("_")
+    if not trimmed:
+        return trimmed
+    if trimmed[0].isdigit():
+        return "get_" + trimmed
+    return trimmed
+
+# Directory scanned by get_white_list(); it is expected to hold one
+# sub-directory per category, each containing the tool JSON files.
+tool_root_dir = "data/toolenv/tools"
+# Built eagerly when this module is imported.
+white_list = get_white_list(tool_root_dir)
+def contain(candidate_list, white_list):
+    """Look up every candidate in *white_list*.
+
+    Returns the list of matching white-list entries (in candidate order), or
+    ``False`` as soon as one candidate is missing.
+    """
+    matched = []
+    for name in candidate_list:
+        if name in white_list:
+            matched.append(white_list[name])
+        else:
+            return False
+    return matched
+def change_name(name):
+    """Prefix *name* with ``is_`` when it is one of the listed special words."""
+    special_words = ("from", "class", "return", "false", "true", "id", "and")
+    return "is_" + name if name in special_words else name
+
+def solve_given_api_main(query, api_list, i, messages=None):
+    """Run the DFS solver on one query restricted to the given APIs.
+
+    Args:
+        query: the user query text.
+        api_list: list of dicts; the "tool_name" of each entry is read here.
+        i: index used to name the answer file ``{i}_DFS_woFilter_w2.json``.
+        messages: forwarded unchanged to ``dfs_runner.run``.
+
+    Returns:
+        ``(solved, result_data)``. ``solved`` is True only when the run
+        finished with ``give_answer`` and a non-empty ``final_answer``.
+        ``result_data['result']`` holds the parsed (or salvaged) answer, or the
+        string 'no tool description' when a tool is missing from ``white_list``.
+    """
+    answer_dir = dfs_args.output_answer_file
+    if not os.path.exists(answer_dir):
+        os.mkdir(answer_dir)
+    # Remove any answer file left by an earlier run for this index
+    # (presumably so the load below reads a fresh result - confirm).
+    if os.path.exists(os.path.join(answer_dir, f'{i}_DFS_woFilter_w2.json')):
+        os.remove(os.path.join(answer_dir, f'{i}_DFS_woFilter_w2.json'))
+    method = dfs_args.method
+    backbone_model = dfs_runner.backbone_model
+    data_dict = {}
+    result_data = {}
+    data_dict['query'] = query
+    data_dict['api_list'] = api_list
+    origin_tool_names = [standardize(cont["tool_name"]) for cont in api_list]
+    # contain() returns False when any tool name is not in the white list.
+    tool_des = contain(origin_tool_names,white_list)
+    if tool_des == False:
+        result_data = {'result': 'no tool description'}
+        return False, result_data
+    tool_des = [[cont["standard_tool_name"], cont["description"]] for cont in tool_des]
+    task = (method, backbone_model, i, data_dict, dfs_args, answer_dir, tool_des)
+    # Up to three attempts; only the `continue` below starts another one,
+    # every other path returns during the current pass.
+    for _ in range(3):
+        dfs_runner.run(task, messages)
+        result = json.load(open(os.path.join(answer_dir, f'{i}_DFS_woFilter_w2.json'), 'r', encoding='utf-8'))
+        try:
+            result_data['result'] = json.loads(result['answer_generation']['final_answer'])
+        except:
+            # final_answer could not be parsed as JSON: salvage the individual
+            # fields from the raw text by substring search.
+            print(result['answer_generation']['final_answer'])
+            final_answer_str = result['answer_generation']['final_answer']
+            return_type = final_answer_str[final_answer_str.find('"return_type": "')+len('"return_type": "'):final_answer_str.find('",')]
+            result_data['result'] = {
+                "return_type": return_type,
+            }
+            if '"final_answer": "' in final_answer_str:
+                final_answer = final_answer_str[final_answer_str.find('"final_answer": "')+len('"final_answer": "'):]
+                result_data['result']['final_answer'] = final_answer
+            elif return_type == 'give_answer':
+                # An answer was announced but no final_answer field was found: rerun.
+                continue
+            if '"reason": "' in final_answer_str:
+                reason = final_answer_str[final_answer_str.find('"reason": "')+len('"reason": "'):]
+                result_data['result']['reason'] = reason
+            result['answer_generation']['final_answer'] = json.dumps(result_data['result'])
+        if result['answer_generation']['finish_type'] == 'give_answer' and 'final_answer' in result_data['result'] and  result_data['result']['final_answer'] != '':
+            # and not any(word in str(result['answer_generation']['final_answer']).lower() for word in exclusion_words)
+            solved = True
+        else:
+            solved = False
+        return solved, result_data
+    # Reached only when all three attempts hit the `continue` above.
+    result_data['result']['final_answer'] = ''
+    return False, result_data
+      
+from arguments import parse_args
+# Command-line arguments are parsed when this module is imported;
+# args.output_dir is the directory the answer files are written to.
+args = parse_args()
+output_path = args.output_dir
+
+# output_path = f'{query_dir}/reassign_toolllama_dfs_r1'
+os.makedirs(output_path, exist_ok=True)
+# Settings handed to pipeline_runner as an attribute-style dict.
+# NOTE(review): `toolbench_key` is not defined in this file - presumably it
+# arrives through `from api_database_function import *`; confirm.
+dfs_args = dotdict(dict(backbone_model='chatgpt_function', openai_key='', model_path='your_model_path/', tool_root_dir='data/toolenv/tools/', lora=False, lora_path='your_lora_path if lora', max_observation_length=1024, max_source_sequence_length=4096, max_sequence_length=8192, observ_compress_method='truncate', method='DFS_woFilter_w2', input_query_file='data/test_instruction/G1_tool.json', output_answer_file=output_path, toolbench_key=toolbench_key, rapidapi_key='', use_rapidapi_key=False, api_customization=False))
+dfs_runner = pipeline_runner(dfs_args)
+
+if __name__ == '__main__':
+    # Evaluation driver: for query files 0..261 under args.query_dir, run the
+    # DFS solver with the ground-truth API list, have check_solved_toolbench
+    # judge the answer and write one `{i}.json` summary per query into
+    # output_path, plus running counters in success_cnt.txt / failed.txt.
+    retrieved_api_nums = 10
+    query_list = []
+    cnt = 0
+    success = 0
+    no_return_type_cnt = 0
+    failed = []
+    task_solvable = 'Solvable'
+    solvable_reason = 'Solvable checked by human'
+    # for root, dirs, files in os.walk('result/generated_solve_given_api_solvable2'):
+    solved_dict = json.load(open('solved_dict.json', 'r', encoding='utf-8'))
+    for i in range(262):
+        t_s = time.time()
+        comparison_data = {}
+        # for file in files:
+        #     if file.endswith('.json'):
+        #         print(file)
+        data_load = json.load(open(f'{args.query_dir}/{i}.json', 'r', encoding='utf-8'))
+        # Queries recorded in solved_dict with any status other than 'Solved' are skipped.
+        if str(data_load['query_id']) in solved_dict and solved_dict[str(data_load['query_id'])]['solved'] != 'Solved':
+            continue
+        
+        query = data_load['query']
+        # continue
+        cnt += 1
+        # if cnt > 50:
+        #     break
+        # Resume path: an answer file already exists, so only re-score it.
+        if os.path.exists(os.path.join(output_path, f'{i}_DFS_woFilter_w2.json')):
+            data = json.load(open(os.path.join(output_path, f'{i}_DFS_woFilter_w2.json'), 'r', encoding='utf-8'))
+            final_data = json.load(open(os.path.join(output_path, f'{i}.json'), 'r', encoding='utf-8'))
+            if data['answer_generation']['finish_type'] != 'give_answer':
+                print(i)
+            # NOTE(review): `exclusion_words` is not defined in this file -
+            # presumably provided by a star import; confirm.
+            if 'final_answer' in data['answer_generation'] and not any(word in data['answer_generation']['final_answer'].lower() for word in exclusion_words):
+                if 'check_solved' in final_data:
+                    check_solved = final_data['check_solved']
+                    reason = final_data['reason']
+                else:
+                    # NOTE(review): `output_dir` is not defined in this file (the local
+                    # name is `output_path`), and the result is unpacked into two
+                    # names here but three names further down - verify both.
+                    check_solved, reason = check_solved_toolbench(f'{output_dir}/{i}_DFS_woFilter_w2.json', i, data_load['query_id'], task_solvable, solvable_reason)
+                if check_solved == 'Solved':
+                    success += 1
+                else:
+                    check_solved = 'Unsolved'
+            else:
+                check_solved = 'Unsolved'
+                print(output_path, i, file=open(os.path.join(output_path, 'failed.txt'), 'a'))
+            print(success, cnt, i+1, file=open(os.path.join(output_path, 'success_cnt.txt'), 'a'))
+            continue
+        find_api_messages_to_save = []
+        messages_to_save = []
+        # Best effort: a missing or differently shaped 'api_list' is ignored.
+        try:
+            gt_api_list = [{'category_name': api.get('category_name', ''), 'tool_name':api.get('tool_name', ''),'api_name':api.get('api_name', '') }for api in data_load['api_list'][-1]]
+            # gt_api_list = [{'category_name': api.get('category_name', ''), 'tool_name':api.get('tool_name', ''),'api_name':api.get('api_name', '') }for api in data_load['gt_api_list']]
+            comparison_data['gt_api_list'] = gt_api_list
+        except:
+            pass
+        print('#'*100, file=open(os.path.join(output_path, 'time.txt'), 'a'))
+        # The solver receives data_load['gt_api_list'], not the gt_api_list built above.
+        solved, result_data = solve_given_api_main(query, data_load['gt_api_list'], i)
+        if solved:
+            check_solved, reason, _ = check_solved_toolbench(f'{output_dir}/{i}_DFS_woFilter_w2.json', i, data_load['query_id'], task_solvable, solvable_reason)
+            if check_solved == 'Solved':
+                success += 1
+        else:
+            check_solved = 'Unsolved'
+            reason = ''
+            print(output_path, i, file=open(os.path.join(output_path, 'failed.txt'),'a'))
+        print(success, cnt, i+1, file=open(os.path.join(output_path, 'success_cnt.txt'), 'a'))
+        final_data = {}
+        final_data['query_id'] = data_load['query_id']
+        final_data['query'] = data_load['query']
+        final_data['gt_api_list'] = data_load['gt_api_list']
+        final_data['gt_answer'] = data_load['final_answer']
+        final_data['result'] = result_data
+        final_data['check_solved'] = check_solved
+        final_data['reason'] = reason
+        json.dump(final_data, open(os.path.join(output_path, f'{i}.json'), 'w', encoding='utf-8'), ensure_ascii=False, indent=4)
+
+        print(f'time: {time.time()-t_s}', file=open(os.path.join(output_path, 'time.txt'),'a'))
+      
--- a/extract_api_details.py
+++ b/extract_api_details.py
@ -0,0 +1,110 @@
+import zipfile
+import os
+import json
+from copy import deepcopy
+# Extract the new zip file
+# with zipfile.ZipFile(zip_file_path_small, 'r') as zip_ref:
+#     zip_ref.extractall(extracted_folder_path_small)
+# Root directory that holds one sub-directory of tool JSON files per category.
+extracted_folder_path_small = 'data/toolenv/tools'
+
+# Walk the tool files and build two structures:
+#   detailed_data_small: {category: {tool_name: {"api_list": [api record, ...]}}}
+#   data_for_retrieval:  flat list with one record per API.
+detailed_data_small = {}
+api_name_list = []
+data_for_retrieval = []
+for root, dirs, files in os.walk(extracted_folder_path_small):
+    for file in files:
+        # Ensure we are only processing .json files
+        if not file.endswith(".json"):
+            continue
+        file_path = os.path.join(root, file)
+        print(file_path)
+        # The category is the name of the directory that contains the file.
+        category = file_path.split('/')[-2]
+        with open(file_path, 'r', encoding='utf-8') as json_file:
+            json_data = json.load(json_file)
+        tool_name = json_data['name'] if 'name' in json_data else json_data['tool_name']
+        # A null "api_list" is treated like a missing one.
+        api_list = json_data.get('api_list') or []
+
+        category_tools = detailed_data_small.setdefault(category, {})
+        if tool_name in category_tools:
+            raise ValueError('duplicate tool name')
+        category_tools[tool_name] = {"api_list": []}
+
+        # Previously a trailing comma turned this value into a 1-tuple, so the
+        # slice below never shortened the text and JSON stored it as a list.
+        tool_description = json_data.get('tool_description', 'No description available.')
+        if tool_description is not None:
+            tool_description = tool_description[:100]
+
+        for api in api_list:
+            api_name = api.get('name', 'Unknown API')
+            api_name_list.append(api_name)
+            description = api.get('description', 'No description available.')
+            if description is not None:
+                description = description[:100]
+            required_parameters = [param.get('name', 'Unknown Parameter') for param in api.get('required_parameters', [])]
+            optional_parameters = [param.get('name', 'Unknown Parameter') for param in api.get('optional_parameters', [])]
+            data_for_retrieval.append({
+                "category_name": category,
+                "tool_name": tool_name,
+                "api_name": api_name,
+                "tool_description": tool_description,
+                "api_description": description,
+                "required_parameters": required_parameters,
+                "optional_parameters": optional_parameters,
+            })
+            category_tools[tool_name]["api_list"].append({
+                "name": api_name,
+                "description": description,
+                "required_parameters": required_parameters,
+                "optional_parameters": optional_parameters,
+            })
+
+# Total number of API records that were collected.
+cnt = sum(
+    len(tool['api_list'])
+    for tools in detailed_data_small.values()
+    for tool in tools.values()
+)
+print('total api number:', cnt)
+
+print(len(data_for_retrieval))
+with open('data_for_retrieval.json', 'w', encoding='utf-8') as retrieval_file:
+    json.dump(data_for_retrieval, retrieval_file, indent=4, ensure_ascii=False)
+with open('api_details.json', 'w', encoding='utf-8') as details_file:
+    json.dump(detailed_data_small, details_file, indent=4, ensure_ascii=False)
+print(cnt)
+
--- a/extract_category_tool_details.py
+++ b/extract_category_tool_details.py
@ -0,0 +1,32 @@
+import os
+import json
+
+def extract_tool_data():
+    """Collect every tool's description, grouped by category directory.
+
+    Walks ``data/toolenv/tools`` and reads each ``.json`` file. The category is
+    the last path component of the containing directory; the tool name comes
+    from "tool_name" or "name", falling back to the file stem. When a tool name
+    is already present in its category, the entry is stored under the name
+    with ``_new`` appended.
+
+    Returns:
+        ``{category: {tool_name: {"tool_description": <text>}}}``
+    """
+    tool_data = {}
+    for root, _dirs, files in os.walk("data/toolenv/tools"):
+        category = root.split('/')[-1]
+        for file in files:
+            if not file.endswith(".json"):
+                continue
+            bucket = tool_data.setdefault(category, {})
+            with open(os.path.join(root, file), "r") as f:
+                data = json.load(f)
+                try:
+                    tool_name = data["tool_name"] if "tool_name" in data else data["name"]
+                except:
+                    tool_name = os.path.basename(file).split(".")[0]
+                tool_description = data["tool_description"]
+                # The names are not used afterwards; the lookup is kept because
+                # it still raises for a missing or malformed "api_list".
+                _api_names = [api['name'] for api in data['api_list']]
+            if tool_name in bucket:
+                tool_name += '_new'
+            bucket[tool_name] = {"tool_description": tool_description}
+    return tool_data
+# Script entry: build the mapping and write it to category_tool_details.json.
+tool_data = extract_tool_data()
+print(tool_data.keys())
+# The context manager closes (and flushes) the output file deterministically.
+with open("category_tool_details.json", "w", encoding='utf-8') as out_file:
+    json.dump(tool_data, out_file, indent=4)
+# json.dump(tool_data, open("category_tool_details_add_nonfree.json", "w", encoding='utf-8'), indent=4)
--- a/extract_tool_database.py
+++ b/extract_tool_database.py
@ -0,0 +1,34 @@
+import os
+import json
+
+def extract_tool_data():
+    """Collect the API names of every tool, grouped by category directory.
+
+    Walks ``data/toolenv/tools`` and reads each ``.json`` file. The category is
+    the last path component of the containing directory; the tool name comes
+    from "tool_name" or "name", falling back to the file stem. Files whose
+    "api_list" is null are skipped. The total number of APIs is printed.
+
+    Returns:
+        ``{category: {tool_name: {"api_list_names": [name, ...]}}}``. When a
+        tool name occurs again in the same category, its API names are
+        collected under ``<tool_name>_new``.
+    """
+    tool_data = {}
+    cnt = 0
+    for root, dirs, files in os.walk("data/toolenv/tools"):
+        for file in files:
+            if not file.endswith(".json"):
+                continue
+            tools = tool_data.setdefault(root.split('/')[-1], {})
+            with open(os.path.join(root, file), "r") as f:
+                data = json.load(f)
+            try:
+                tool_name = data["tool_name"] if "tool_name" in data else data["name"]
+            except (KeyError, TypeError):
+                tool_name = os.path.basename(file).split(".")[0]
+            api_list = data["api_list"]
+            if api_list is None:
+                continue
+            cnt += len(api_list)
+            api_names = [api['name'] for api in api_list]
+            if tool_name not in tools:
+                tools[tool_name] = {"api_list_names": api_names}
+            else:
+                # The '_new' entry did not exist on the first duplicate, so the
+                # old direct index raised KeyError; create it on demand.
+                duplicate_entry = tools.setdefault(tool_name + '_new', {"api_list_names": []})
+                duplicate_entry['api_list_names'].extend(api_names)
+    print(cnt)
+    return tool_data
+# Script entry: build the mapping and write it to tool_data_add_nonfree.json.
+tool_data = extract_tool_data()
+print(tool_data.keys())
+# The context manager closes (and flushes) the output file deterministically.
+with open("tool_data_add_nonfree.json", "w", encoding='utf-8') as out_file:
+    json.dump(tool_data, out_file, indent=4)
--- a/openai_utils.py
+++ b/openai_utils.py
@ -0,0 +1,206 @@
+import openai
+from tenacity import retry, wait_random_exponential, stop_after_attempt
+import time
+import os
+from datetime import datetime
+import tiktoken
+from copy import deepcopy
+import json
+from config import *
+from arguments import parse_args
+import importlib
+from termcolor import colored
+# Tokenizer used for the fallback token estimate in call_gpt().
+enc = tiktoken.encoding_for_model("gpt-4")
+# Command-line arguments are parsed when this module is imported.
+args = parse_args()
+output_dir = args.output_dir
+# api_type, api_key, api_version and api_base are not defined in this file;
+# presumably they come from `from config import *` - confirm.
+if api_type == "azure":
+    from openai import AzureOpenAI as Client
+    client = Client(
+        api_key=api_key,
+        api_version=api_version,
+        azure_endpoint=api_base,
+    )
+else:
+    from openai import OpenAI as Client
+    # The plain OpenAI client takes no api_version/azure_endpoint arguments,
+    # so passing them raised TypeError. An empty api_base falls back to the
+    # library's default endpoint.
+    client = Client(
+        api_key=api_key,
+        base_url=api_base or None,
+    )
+# turbo_client = Client(
+# api_key=api_key,
+# api_version=api_version,
+# azure_endpoint = api_base
+# )
+
+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+    # Attribute reads go through dict.get, so a missing key yields None
+    # instead of raising AttributeError.
+    __getattr__ = dict.get
+    # Attribute writes and deletes map straight onto item assignment/deletion.
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+
+
+def call_gpt(messages, functions=None, **kwargs):
+    """Send a chat-completion request through the legacy ``functions`` interface.
+
+    Args:
+        messages: chat history. It is rewritten IN PLACE: an assistant entry's
+            ``tool_calls`` becomes ``function_call`` (first call only) and a
+            tool-result entry (one carrying ``tool_call_id``) gets role
+            ``'function'``. Callers see these changes in their own list.
+        functions: function schemas to expose. When None, the ``functions``
+            parameter is omitted from the request instead of being sent as null.
+        **kwargs: extra arguments for ``client.chat.completions.create``;
+            ``model`` defaults to the module-level ``model_name``.
+
+    Returns:
+        The response object. When the model requested a function call,
+        ``response.choices[0].message.tool_calls`` is set to a one-element
+        list of ``dotdict(id='111', function=dotdict(name=..., arguments=...))``.
+        When the API reports no usage, ``response.usage`` is filled with an
+        estimate computed with ``enc``.
+
+    Raises:
+        Whatever the retried request ends up raising; failures are not
+        converted into a return value.
+    """
+    if 'model' not in kwargs:
+        kwargs['model'] = model_name
+    # Convert tool-style history entries to the legacy function-call format.
+    for message in messages:
+        if "tool_calls" in message:
+            message['function_call'] = message['tool_calls'][0]['function']
+            message.pop('tool_calls')
+        if "tool_call_id" in message:
+            message.pop('tool_call_id')
+            message['role'] = 'function'
+    if functions is not None:
+        kwargs['functions'] = functions
+
+    @retry(wait=wait_random_exponential(multiplier=10, max=50), stop=stop_after_attempt(5))
+    def call_gpt_retry():
+        ts = time.time()
+        try:
+            response = client.chat.completions.create(
+                seed=123,
+                messages=messages,
+                **kwargs
+            )
+        except openai.RateLimitError:
+            # Extra cool-down before the retry decorator schedules the next attempt.
+            time.sleep(50)
+            raise
+        return response, time.time() - ts
+
+    t_s = time.time()
+    response, t_real = call_gpt_retry()
+    t = time.time() - t_s
+    # Time spent outside the final successful request (waits and failed attempts).
+    with open(os.path.join(output_dir, "time.txt"), "a") as time_log:
+        print('minus:', t-t_real, file=time_log)
+
+    choice = response.choices[0]
+    if choice.finish_reason == 'function_call':
+        response_json = json.loads(response.json())
+        function_call = response_json['choices'][0]['message']['function_call']
+        tool_call = {'arguments': function_call['arguments'], 'name': function_call['name']}
+        choice.message.tool_calls = [dotdict({'id':'111', 'function':dotdict(tool_call)})]
+    elif model_name == 'gpt-4-turbo':
+        choice.message.tool_calls = []
+
+    if response.usage is None:
+        token_cnt = len(enc.encode(str(functions))) + len(enc.encode(str(messages))) + len(enc.encode(str(choice.message.content)))
+        response.usage = dotdict({'total_tokens': token_cnt})
+    else:
+        print(colored('tokens', 'blue'), colored(response.usage.total_tokens, 'blue'))
+    return response
+
+def call_gpt_no_func(messages):
+    """Request a plain chat completion (no function schemas) for *messages*.
+
+    Uses the module-level ``client`` and ``model_name``; the request is wrapped
+    in a retry of up to 10 attempts with randomised exponential back-off.
+    """
+    backoff = wait_random_exponential(multiplier=60, max=100)
+
+    @retry(wait=backoff, stop=stop_after_attempt(10))
+    def _request(payload):
+        return client.chat.completions.create(
+            model=model_name,
+            messages=payload,
+        )
+
+    return _request(messages)
+
+#
+
+
+def call_gpt_turbo(messages, functions):
+    """Send a chat-completion request to "gpt-4-turbo" through the ``tools`` interface.
+
+    Args:
+        messages: chat history, passed to the API unchanged.
+        functions: function schemas; each is wrapped as
+            ``{"type": "function", "function": schema}``.
+
+    Returns:
+        The response object, or the string "bad request" /
+        "internal server error" when the API raised the matching error.
+
+    Raises:
+        Whatever the retried request ends up raising for other failures.
+
+    Timing and token usage are appended to ``<output_dir>/time.txt`` and the
+    full exchange to ``output/log.txt``.
+    """
+    tools = [{"type": "function", "function": function} for function in functions]
+
+    @retry(wait=wait_random_exponential(multiplier=5, max=20), stop=stop_after_attempt(10))
+    def call_gpt_retry(messages, tools):
+        t_s = time.time()
+        try:
+            response = client.chat.completions.create(
+                model="gpt-4-turbo",
+                messages=messages,
+                seed=123,
+                tools=tools,
+                tool_choice="auto",  # auto is default, but we'll be explicit
+            )
+        except openai.BadRequestError:
+            return "bad request", 0
+        except openai.RateLimitError:
+            # Extra cool-down before the retry decorator schedules the next attempt.
+            time.sleep(50)
+            raise
+        except openai.InternalServerError:
+            return "internal server error", 0
+        return response, time.time() - t_s
+
+    t_s = time.time()
+    response, t_real = call_gpt_retry(messages, tools)
+    t = time.time() - t_s
+    with open(os.path.join(output_dir, "time.txt"), "a") as time_log:
+        print(f'{datetime.now()}', file=time_log)
+        if not isinstance(response, str):
+            print(response.usage.total_tokens, file=time_log)
+        print('minus:', t-t_real, file=time_log)
+    # The log directory may not exist yet; create it rather than fail the call.
+    os.makedirs('output', exist_ok=True)
+    with open(os.path.join('output', "log.txt"), "a") as call_log:
+        print('#'*100, '\n\n', messages, '\n\n', functions, '\n\n', response, file=call_log)
+    return response
+# Example dummy function hard coded to return the same weather
+# In production, this could be your backend API or an external API
+def get_current_weather(location, unit="fahrenheit"):
+    """Return a canned weather report for *location* as a JSON string.
+
+    Tokyo, San Francisco and Paris are matched case-insensitively as
+    substrings of *location*; any other location gets temperature "unknown"
+    and no "unit" field.
+    """
+    needle = location.lower()
+    canned_reports = (
+        ("tokyo", "Tokyo", "10"),
+        ("san francisco", "San Francisco", "72"),
+        ("paris", "Paris", "22"),
+    )
+    for keyword, city, temperature in canned_reports:
+        if keyword in needle:
+            return json.dumps({"location": city, "temperature": temperature, "unit": unit})
+    return json.dumps({"location": location, "temperature": "unknown"})
+if __name__ == "__main__":
+    # Manual smoke test: one weather question with one function schema.
+    weather_schema = {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+            },
+            "required": ["location"],
+        },
+    }
+    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
+    tools = [weather_schema]
+    response = call_gpt(messages, tools)
+    print(response)
--- a/prompt_template.py
+++ b/prompt_template.py
@ -0,0 +1,313 @@
+from datetime import datetime
+from arguments import parse_args
+# NOTE: command-line arguments are parsed as a side effect of importing this module.
+args = parse_args()
+# Substituted into CATEGORY_AGENT_PROMPT below as the per-agent tool limit.
+leaf_tool_number = args.leaf_tool_number
+# Timestamp taken at import time; the only references to it in this file are commented out.
+current_date_time = datetime.now()
+
+# Prompt for the category-selection step: the model explores categories with
+# get_tools_in_category / get_tools_descriptions, hands each relevant category
+# to an agent via create_agent_category_level, then calls Finish.
+# Placeholder left for the caller to fill: {categories}.
+META_AGENT_PROMPT = """
+You are APIGPT, You have access to a database of apis. The database has the following categories: {categories}.
+You should help the user find the relevant categories for a task. You can use the get_tools_in_category function to retrieve the available tools of a specific category. 
+If you are unsure about the functionality of some tools, you can use the get_tools_descriptions function to retrieve the details of these tools. 
+This will help you understand the general functionality of each category.
+You can use the create_agent_category_level function to assign a relevant category to a agent. 
+Each agent should be assigned only one category. 
+You can assign multiple categories to different agents. 
+You should explore as many categories as possible. The query may be solved by tools in unexpected categories.
+Remember, you do not need to answer the query, all you need is to find all possible relevant categories and assign them to agents.
+When you finish the assignment, call the Finish function. 
+ At each step, you need to give your thought to analyze the status now and what to do next, with the function calls to actually excute your step.
+ All the thought is short, at most in 3 sentence. 
+"""
+
+"""
+You are APIGPT, with access to a database of APIs. This database is organized
+into the following categories: {categories}. Your task is to help users
+identify the relevant categories for their needs. To do this, you can
+use the 'get_tools_in_category' function to retrieve the available tools
+within a specific category. If you are unsure about the functionality of
+some tools, the 'get_tools_descriptions' function can be used to obtain
+detailed information about these tools. This information will aid you in
+understanding the general functionality of each category. Additionally, the
+'create_agent_category_level' function allows you to assign a relevant category
+to an agent, with each agent being assigned only one category. However,
+you can assign multiple categories to different agents. It is important
+to explore as many categories as possible, as the solution to a query may
+be found in unexpected categories. Remember, your goal is not to answer
+the query directly but to identify all potentially relevant categories and
+assign them to agents. Once you have completed the assignment, call the
+'Finish' function. 
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+"""
+
+# Prompt for picking tools inside one category and assigning them to agents via
+# create_agent_tool_level. {leaf_tool_number} is substituted here at import time
+# with str.replace (not str.format), so {category} stays in place for the caller.
+CATEGORY_AGENT_PROMPT = """
+You are APIGPT, You have access to a database of apis. The database has many categories. Each category has many tools. Each tool has many apis.
+Now, you should help the user find the relevant tools in '{category}' category for a task.
+If you are unsure about the functioinality of some tools, you can use the get_tools_descriptions function to retrieve the details of these tools.
+Then you can use the create_agent_tool_level function to assign a subset of relevant tools to a agent. You should assign similar tools to the same agent and no more than {leaf_tool_number} tools to each agent.
+You can assign multiple subsets to different agents. 
+Remember, you do not need to answer the query but you need to assign all possible tools. 
+When you finish the assignment or you think the query is irrelevant to tools in this category, call the Finish function.
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+""".replace('{leaf_tool_number}', str(leaf_tool_number))
+
+
+"""
+You are APIGPT, with access to a database of APIs categorized into various
+groups. Each category contains numerous tools, and each tool encompasses
+multiple APIs. Your task is to assist users in finding relevant tools within
+the category: {category}. If uncertain about the functionality of some tools, use
+the 'get_tools_descriptions' function to obtain detailed information. Then,
+employ the 'create agent tool level' function to allocate a subset of pertinent
+tools to an agent, ensuring that similar tools are assigned to the same agent
+and limiting the allocation to no more than five tools per agent. You may
+assign different subsets to multiple agents. Remember, your role is not to
+answer queries directly, but to assign all possible tools. Once you complete
+the assignment, or if you determine the query is irrelevant to the tools in
+the specified category, invoke the 'Finish' function.
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+"""
+
+# Prompt for picking APIs inside a given set of tools and adding them with
+# add_apis_into_api_pool. Placeholders left for the caller: {tools}, {category}.
+TOOL_AGENT_PROMPT = """
+You are APIGPT, You have access to a database of apis. The database has many categories. Each category has many tools. Each tool has many apis.
+Now, you should help the user find the relevant apis in the tools {tools} of category '{category}' for a task. You will be given all the tool description and the contained api list and their details
+When you determine the api names, use the add_apis_into_api_pool function to add them to the final api list. 
+If you think you have explored all the possible apis or you think there are no relevant apis in these tools, call the Finish function.
+In the middle step, you may be provided with feedback on these apis.
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+"""
+
+"""
+You are APIGPT with access to a database of APIs, categorized into various
+sections. Each category contains multiple tools, and each tool encompasses
+numerous APIs. Your task is to assist users in finding relevant APIs within
+the tools '{tools}' of the '{category}' category. You will be provided with
+descriptions and details of these tools and their APIs. Upon identifying
+relevant API names, use the 'add_apis_into_api_pool' function to add them to
+the final API list. If you conclude that all possible APIs have been explored,
+or if there are no relevant APIs in these tools, invoke the Finish function.
+During the process, you may receive feedback on these APIs. 
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+"""
+
+
+
+
+# Prompt for synthesizing a user query (plus its answer) from the API database.
+# The {email}, {phone number} and {url} placeholders are replaced at import
+# time with the fixed sample values in the .replace() chain at the end;
+# {categories} is left for the caller.
+# NOTE(review): several function names in the text are spelled inconsistently
+# (gets_tools_descriptions, "get api details", "add_apis_into_pool into api
+# pool", "remove_apis from api pool") — confirm against the registered names.
+FORMAT_INSTRUCTIONS_DATA_GENERATION = """
+Your task is to interact with a sophisticated database of tools and functions,
+often referred to as APIs, to construct a user query that will be answered
+using the capabilities of these APIs. This database is organized into various
+categories, indicated by {categories}. To guide your exploration and selection
+of the appropriate APIs, the database offers several meta functions:
+Exploration Functions:
+1. Use get_tools_in_category to explore tools in a specific category.
+2. Employ get_apis_in_tool to discover the list of APIs available within a
+selected tool.
+3. If you need detailed information about some tools, gets_tools_descriptions will
+provide it.
+4. For in-depth understanding of an API's functionality, turn to
+get api details.
+Selection and Testing Functions:
+1. As you identify relevant functions, add them to your working list using
+add_apis_into_pool into api pool.
+2. Test these functions by synthesizing and applying various parameters.
+This step is crucial to understand how these functions can be practically
+applied in formulating your query.
+3. Should you find any function obsolete or not fitting your query context,
+remove them using remove_apis from api pool.
+Query Formulation Guidelines:
+1.Your formulated query should be comprehensive, integrating APIs from 2
+to 5 different categories. This cross-functional approach is essential to
+demonstrate the versatility and broad applicability of the database.
+2.Avoid using ambiguous terms. Instead, provide detailed, specific
+information. For instance, if your query involves personal contact details,
+use provided placeholders like {email} for email, {phone number} for phone
+number, and URLs like {url} for a company website.
+3.The query should be relatable and understandable to users without requiring
+knowledge of the specific tools or API names used in the background. It
+should reflect a real-world user scenario.
+4. Aim for a query length of at least thirty words to ensure depth and
+complexity.
+Final Steps:
+1.Once you've crafted the query, use the Finish function to submit it along
+with the corresponding answer. The answer should be direct and concise,
+addressing the query without delving into the operational plan of the APIs.
+2.Remember, the total number of calls to the initial meta functions should not
+exceed 20.
+3.Consider various use cases while formulating your query, such as data
+analysis in business contexts or educational content in academic settings.
+Your approach should be creative and inclusive, catering to users with
+different skill levels and cultural backgrounds. Ensure that the query is
+globally relevant and straightforward, serving a singular purpose without
+diverging into unrelated areas. The complexity of your query should stem from
+the synthesis of information from multiple APIs.
+""".replace('{email}', "devon58425@trackden.com").replace('{phone number}', "+6285360071764").replace('{url}', "https://deepmind.google/")
+
+
+# Prompt asking the model to label a query "Complete" or "Incomplete" with
+# respect to the information needed to make function calls; no placeholders.
+CHECK_COMPLETE_PROMPT = """
+Please check whether the given task has complete infomation for function calls with following rules:
+1. If the `query` provide invalid or ambiguous information (e.g. invalid email address or phone number), return "Incomplete"
+2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task, the name of my friend or company), return "Incomplete"
+3. If the `query` has complete information , return "Complete"
+Remember, you do not need to answer the query, all you need is to check whether the query has complete information for calling the functions to solve.
+You must call the Finish function at one step
+"""
+
+# Knowledge cutoff: 2023-04
+# Current date: {current_date_time}
+
+# Prompt asking the model to grade an answer to a query with an `answer_status`
+# of "Solved", "Unsolved" or "Unsure"; no placeholders.
+CHECK_SOLVED_PROMPT = """
+You are a AI assistant. 
+Giving the query and answer, you need give `answer_status` of the answer by following rules:
+1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved".
+2. If the answer is a positive/straight response for the given query, you have to further check.
+2.1 If the answer is not sufficient to determine whether the solve the query or not, return "Unsure".
+2.2 If you are confident that the answer is sufficient to determine whether the solve the query or not, return "Solved" or "Unsolved".
+"""
+# .replace('{current_date_time}', str(current_date_time))
+
+
+# Follow-up prompt telling the model that the current APIs failed and asking it
+# to find more, or to call Finish if the tools have nothing relevant.
+# NOTE(review): the placeholder is written with doubled braces; str.format
+# would render it as the literal text {failed_reason} instead of substituting
+# a value — confirm how callers fill it in.
+REFIND_API_PROMPT = """
+Current APIs failed to solve the query. The result is: {{failed_reason}}. 
+You need to analyze the result, and find more apis.
+It is possible that the tools do not have the relevant apis. In this case, you should call the Finish function. Do not make up the tool names or api names.
+"""
+# You need to analyze why the apis failed, remove some of the apis you add before and find alternative apis.
+
+# Follow-up prompt telling the model that the current APIs failed and asking it
+# to assign more, not yet explored categories to agents.
+# NOTE(review): doubled braces around failed_reason — str.format would emit the
+# literal {failed_reason}; confirm how callers substitute the value.
+REFIND_CATEGORY_PROMPT = """
+Current APIs failed to solve the query and the result is: {{failed_reason}}. 
+Please assign more unexplored categories to the agents.
+"""
+
+# Follow-up prompt telling the model that the current APIs failed and asking it
+# to assign more, not yet explored tools to agents.
+# NOTE(review): doubled braces around failed_reason — str.format would emit the
+# literal {failed_reason}; confirm how callers substitute the value.
+REFIND_TOOL_PROMPT = """
+Current APIs failed  to solve the query. The result is: {{failed_reason}}. 
+Please assign more unexplored tools to the agents.
+"""
+
+# Giving the query and answer, you need give `answer_status` of the answer by following rules:
+# 1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved".
+# 2. If the answer is a positive/straight response for the given query, you have to further check.
+# 2.1 If the answer is not sufficient to determine whether the solve the query or not, return "Unsure".
+# 2.2 If you are confident that the answer is sufficient to determine whether the solve the query or not, return "Solved" or "Unsolved".
+# System prompt describing the thought-then-function-call loop used to solve a
+# task, including when to give up. Placeholder left for the caller:
+# {task_description}.
+# NOTE(review): the sentence about incomplete or ambiguous queries contains a
+# duplicated opening (`say "say "I give up and restart"`), which leaves the
+# quotes unbalanced in the text sent to the model.
+FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools(functions) to do the following task.
+First I will give you the task description, and your task start.
+At each step, you need to give your thought to analyze the status now and what to do next, with function calls to actually excute your step.
+After the call, you will get the call result, and you are now in a new state.
+Then you will analyze your status now, then decide what to do next...
+After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.
+Remember: 
+1.the state change is irreversible, you can't go back to one of the former state, if you think you cannot finish the task with the current functions, 
+say "I give up and restart" and return give_up_feedback including the function name list you think unuseful 
+and the reason why they are unuseful. 
+If you think the query cannot be answered due to incomplete or ambiguous information, you should also say "say "I give up and restart" and return give_up_feedback with 
+just the reason why this query cannot answered.
+2.All the thought is short, at most in 5 sentence.
+3.You can do more then one trys, so if your plan is to continuously try some conditions, you can do one of the conditions per try.
+Let's Begin!
+Task description: {task_description}"""
+
+# User-turn template: the {input_description} placeholder (left for the caller)
+# followed by "Begin!".
+FORMAT_INSTRUCTIONS_USER_FUNCTION = """
+{input_description}
+Begin!
+"""
+
+# Prompt for finding APIs through the query_* meta APIs, capped at 10 meta-API
+# calls. The continuation lines keep their source indentation, so the leading
+# spaces are part of the string.
+# NOTE(review): the text announces "five initial meta apis" but goes on to name
+# six (query_all_categories, query_tools_in_category, query_apis_in_tool,
+# query_tool_details, query_api_details, retrieve_relevant_apis_using_knn).
+FORMAT_INSTRUCTIONS_FIND_API = """You are an AutoGPT. You have access to a database of tools and functions (apis). 
+                I will give you a task description and you need to find the relevant function (apis) for solving the task.
+                You can use five initial meta apis to retrieve the relevant apis. For example, you can use the 
+                meta api query_all_categories to retrieve all the categories in the api database. Then you can use the second meta
+                api query_tools_in_category to retrieve the available tools of a specific category. Then, you can use the meta
+                api query_apis_in_tool to retrieve the api list of a specific tool. 
+                If you are unsure about the functioinality of some tools, you can use the meta api query_tool_details to retrieve the details of a specific tool. 
+                If you are unsure about the functioinality of some apis, you can use the meta api query_api_details to retrieve the details of a specific api. 
+                Additionally, you can use the meta api retrieve_relevant_apis_using_knn to retrieve the relevant apis according to the query using a knn retriever. 
+                When you get the api names, call the Finish function with the final answer. You should call the initial meta apis no more than 10 times.
+                At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.
+                All the thought is short, at most in 5 sentence."""
+
+# Numbered, step-by-step variant of the API-finding prompt. It allows 20
+# meta-API calls and omits the knn-retriever step, which survives only as the
+# commented-out line after this constant. No placeholders.
+FORMAT_INSTRUCTIONS_FIND_API_OPTIMIZED = """As an AutoGPT with access to a suite of meta APIs, your role is to navigate an API database to find the tools necessary to complete a given task. Here's how you'll proceed:
+
+1. When presented with the task description, begin by calling the <query_all_categories> meta API to obtain a list of all categories in the API database.
+
+2. Analyze the task and determine the most relevant category. Use the <query_tools_in_category> meta API to list the tools within this selected category.
+
+3. Choose the most appropriate tool for the task and employ the <query_apis_in_tool> meta API to find the specific APIs available under that tool.
+
+4. If clarification is needed on the functionality of any tools, invoke the <query_tool_details> to gather more detailed information.
+
+5. Similarly, use the <query_api_details> meta API for detailed insights into the functionalities of specific APIs if required.
+
+6. Throughout each step, provide a brief analysis (no more than five sentences) of your current status and your next action, including the actual function call to execute your step.
+
+7. Once you have determined the best APIs for the task, conclude by calling the <Finish> function with the final API names.
+
+Remember, you have a limit of 20 calls to the initial meta APIs. Prioritize efficiency and clarity in each step of your analysis and actions.
+"""
+# 6. To enhance the selection process, leverage the <retrieve_relevant_apis_using_knn> meta API, which utilizes a k-nearest neighbors algorithm to find the most pertinent APIs based on your query.
+# Prompt for finding APIs by querying the whole database directly
+# (query_tools_in_category, query_apis_in_tool, query_tools_details,
+# query_api_details) and collecting them with add_apis.
+# Placeholder left for the caller: {categories}.
+FIND_API_NO_HIER_PROMPT = """
+You are APIGPT, You have access to a database of apis. The database has many categories. Each category has many tools. Each tool has many apis.
+Now, you should help the user find the relevant apis in the database. 
+You are provided with some functions to retrieve the relevant apis. The database has the following categories: {categories}.
+You can use the query_tools_in_category function to retrieve the available tools of a specific category. Then, you can use the query_apis_in_tool function to retrieve the api list of a specific tool. 
+If you are unsure about the functioinality of some tools, you can use the function query_tools_details to retrieve the details of these tools. 
+If you are unsure about the functioinality of some apis, you can use the function query_api_details to retrieve the details of a specific api. 
+When you determine the api names, use the add_apis function to add them to the final api list.
+Remember, you should explore as many apis as possible and you should not omit any  possible apis.
+If you think you have explored all the possible apis or you think there are no relevant apis in the database, call the Finish function.
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+"""
+# Follow-up prompt telling the model that the current apis failed and asking it
+# to find more, or to call Finish if the database has nothing relevant.
+# NOTE(review): doubled braces around failed_reason — str.format would emit the
+# literal {failed_reason}; confirm how callers substitute the value.
+REFIND_API_NO_HIER_PROMPT = """
+Current apis failed to solve the query. The result is: {{failed_reason}}. 
+You need to analyze the result, and find more apis.
+It is possible that the database do not have the relevant apis. In this case, you should call the Finish function. Do not make up the tool names or api names.
+"""
+# You are APIGPT, You have access to a database of apis. The database has many categories. Each category has many tools. Each tool has many apis.
+# Now, you should help the user find the relevant apis in the tools {tools} of category '{category}' for a task. You will be given all the tool description and the contained api list and their details
+# When you determine the api names, use the add_apis function to add them to the final api list. 
+# If you think you have explored all the possible apis or you think there are no relevant apis in these tools, call the Finish function.
+# In the middle step, you may be provided with feedback on these apis.
+# At each step,  you should call functions to actually excute your step.
+# All the thought is short, at most in 3 sentence.
+"""
+You are APIGPT, You have access to a database of apis. The database has many categories. Each category has many tools. Each tool has many apis.
+Now, you should help the user find the relevant apis in the database. 
+You are provided with some functions to retrieve the relevant apis. For example, you can use the 
+function query_all_categories to retrieve all the categories in the api database. 
+Then you can use the second function query_tools_in_category to retrieve the available tools of a specific category. Then, you can use the meta
+api query_apis_in_tool to retrieve the api list of a specific tool. 
+If you are unsure about the functioinality of some tools, you can use the function query_tools_details to retrieve the details these tools. 
+If you are unsure about the functioinality of some apis, you can use the function query_api_details to retrieve the details of a specific api. 
+When you get the relevant api names, use the add_apis function to add them to the final api list.
+Remember, you should explore as many apis as possible.
+If you think you have explored all the possible apis or you think there are no relevant apis in the database, call the Finish function.
+In the middle step, you may be provided with feedback on these apis.
+You can use the remove_apis function to remove the apis from the api list.
+At each step,  you should call functions to actually excute your step.
+All the thought is short, at most in 3 sentence.
+"""
+
+# Prompt asking the model to label a task "Solvable", "Unsolvable" or "Unsure",
+# taking the `available_tools` into account; no placeholders.
+CHECK_SOLVABLE_BY_FUNCTION_PROMPT = """
+Please check whether the given task solvable with following rules:
+1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable"
+2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable"
+3. If you are unable to draw a conclusion, return "Unsure"
+4. If the query is illegal or unethical or sensitive, return "Unsure"
+5. If the currently `available_tools` are enough to solve the query, return "Solvable"
+You must call the Finish function at one step.
+"""
+
+# Prompt asking the model to label a task "Solvable", "Unsolvable" or "Unsure"
+# while assuming every needed tool is available; no placeholders.
+# NOTE(review): the rule numbering jumps from 3 to 5; rule 4 exists only as the
+# commented-out line that follows this constant.
+CHECK_SOLVABLE_PROMPT = """
+Please check whether the given task solvable with following rules:
+1. If the `query` provide invalid information (e.g. invalid email address or phone number), return "Unsolvable"
+2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable"
+3. If you are unable to draw a conclusion, return "Unsure"
+5. Otherwise, return "Solvable"
+Remember, you should assume you have all the tools to solve the query but you do not need to answer the query at this time.
+
+You must call the Finish function at one step.
+"""
+# 4. If the query is illegal or unethical or sensitive, return "Unsure"
--- a/rapidapi.py
+++ b/rapidapi.py
@ -0,0 +1,671 @@
+import re
+import os
+import json
+import time
+import requests
+from tqdm import tqdm
+from termcolor import colored
+import random
+from copy import deepcopy
+from toolbench.inference.LLM.chatgpt_function_model import ChatGPTFunction, GPT4Function
+from toolbench.inference.LLM.davinci_model import Davinci
+from toolbench.inference.LLM.tool_llama_lora_model import ToolLLaMALoRA
+from toolbench.inference.LLM.tool_llama_model import ToolLLaMA
+from toolbench.inference.LLM.retriever import ToolRetriever
+from toolbench.inference.Algorithms.single_chain import single_chain
+from toolbench.inference.Algorithms.DFS import DFS_tree_search
+from toolbench.inference.server import get_rapidapi_response
+from toolbench.utils import (
+    standardize,
+    change_name,
+    replace_llama_with_condense
+)
+
+from toolbench.inference.Downstream_tasks.base_env import base_env
+from arguments import parse_args
+args = parse_args()
+# from toolbench.inference.Downstream_tasks.find_api_by_gpt4 import find_apis_with_details
+                    # while response["error"] == "Too many requests error..." or response["error"] == "Rate limit per minute error..." or response["error"] == "Unsubscribed error..." or response["error"] == "Unauthorized error..." or response["error"] == "API not working error..." or 'Quota' in response["error"] or 'blocked' in response['error'].lower() or 'Rate limit' in response['error']: 
+
+# Fragments of API error messages. How they are matched is not visible in this
+# part of the file — presumably by substring against response["error"], as in
+# the commented-out loop condition above; TODO confirm.
+# NOTE(review): under substring matching, 'Rate limit...' and
+# 'Unauthorized error' would be redundant with 'Rate limit' and 'Unauthorized'.
+error_list = ['Too many requests error...', 'Rate limit...', 'Unsubscribed', 'Unauthorized', 'not working error...', 'Quota','quota', 'Blocked', 'Rate limit', 'Unauthorized error']
+
+
+# For pipeline environment preparation
+def get_white_list(tool_root_dir):
+    """Index every tool JSON file found under *tool_root_dir*.
+
+    Each sub-directory of the root is scanned for ``*.json`` files. Every file
+    contributes one entry keyed by ``standardize(<its "tool_name">)`` whose
+    value holds the file's "tool_description" and the standard tool name,
+    i.e. the file name up to its first dot.
+
+    Returns:
+        dict mapping standardized tool name to
+        ``{"description": ..., "standard_tool_name": ...}``.
+    """
+    root = os.path.join(tool_root_dir)
+    tools_by_name = {}
+    for category in tqdm(os.listdir(root)):
+        category_dir = os.path.join(root, category)
+        if not os.path.isdir(category_dir):
+            continue  # stray files next to the category folders
+        for filename in os.listdir(category_dir):
+            if not filename.endswith(".json"):
+                continue
+            standard_tool_name = filename.split(".")[0]
+            with open(os.path.join(root, category, filename)) as reader:
+                tool_json = json.load(reader)
+            original_name = tool_json["tool_name"]
+            tools_by_name[standardize(original_name)] = {
+                "description": tool_json["tool_description"],
+                "standard_tool_name": standard_tool_name,
+            }
+    return tools_by_name
+
+def contain(candidate_list, white_list):
+    """Look up every candidate in *white_list*.
+
+    Returns the list of white-list entries in candidate order, or ``False`` as
+    soon as one candidate is not a key of *white_list*.
+    """
+    known_names = white_list.keys()
+    matched = []
+    for name in candidate_list:
+        if name in known_names:
+            matched.append(white_list[name])
+        else:
+            return False
+    return matched
+
+
+# rapidapi env wrapper
+class rapidapi_wrapper(base_env):
+    def __init__(self, query_json, tool_descriptions, retriever, args, process_id=0):
+        """Prepare the function list and task description for one query.
+
+        Args:
+            query_json: Dict holding the user "query". When no retriever is
+                given it is also handed to fetch_api_json to obtain the API list.
+            tool_descriptions: Sequence of (standard_tool_name, description)
+                pairs, indexed in step with the API list. Rebuilt from the
+                retrieved APIs when a retriever is given.
+            retriever: Optional retriever. When it is not None, the APIs are
+                retrieved from the query text instead of taken from query_json.
+            args: Parsed run arguments (tool_root_dir, toolbench_key,
+                rapidapi_key, use_rapidapi_key, api_customization,
+                max_observation_length, observ_compress_method,
+                retrieved_api_nums, use_original_prompt).
+            process_id: Stored on the instance unchanged.
+        """
+        # The previous one-argument form, super(rapidapi_wrapper).__init__(),
+        # only initialised an unbound super object and never ran the base
+        # class initialiser.
+        super().__init__()
+
+        self.tool_root_dir = args.tool_root_dir
+        self.toolbench_key = args.toolbench_key
+        self.rapidapi_key = args.rapidapi_key
+        self.use_rapidapi_key = args.use_rapidapi_key
+        self.api_customization = args.api_customization
+        self.service_url = "http://8.218.239.54:8080/rapidapi"
+        self.max_observation_length = args.max_observation_length
+        self.observ_compress_method = args.observ_compress_method
+        self.retriever = retriever
+        self.process_id = process_id
+        try:
+            with open('rapidapi_key_list.json', 'r') as key_file:
+                self.rapidapi_key_list = json.load(key_file)
+        except (OSError, ValueError):
+            # Best effort: a missing, unreadable or malformed key file simply
+            # means there are no extra keys.
+            self.rapidapi_key_list = []
+
+        self.tool_names = []
+        self.cate_names = []
+
+        self.input_description = query_json["query"]
+        self.functions = []
+        self.api_name_reflect = {}
+
+        if self.retriever is not None:
+            query_json = self.retrieve_rapidapi_tools(self.input_description, args.retrieved_api_nums, args.tool_root_dir)
+            data_dict = self.fetch_api_json(query_json)
+            tool_descriptions = self.build_tool_description(data_dict)
+        else:
+            data_dict = self.fetch_api_json(query_json)
+        self.api2origin = {}
+        # Snapshot taken before the conversion loop so api2origin records the
+        # names as fetched.
+        origin_api_list = deepcopy(data_dict["api_list"])
+
+        for k, api_json in enumerate(data_dict["api_list"]):
+            standard_tool_name = tool_descriptions[k][0]
+            openai_function_json, cate_name, pure_api_name = self.api_json_to_openai_json(api_json, standard_tool_name)
+            self.functions.append(openai_function_json)
+
+            self.api_name_reflect[openai_function_json["name"]] = pure_api_name
+            self.tool_names.append(standard_tool_name)
+            self.cate_names.append(cate_name)
+            self.api2origin[openai_function_json["name"]] = {
+                'category_name': origin_api_list[k]['category_name'],
+                'tool_name': origin_api_list[k]['tool_name'],
+                'api_name': origin_api_list[k]['api_name'],
+            }
+
+        # The Finish function comes in two flavours: the original prompt only
+        # knows give_answer / give_up_and_restart, the extended one adds
+        # give_up plus a "reason" field.
+        if args.use_original_prompt:
+            finish_func = {
+                "name": "Finish",
+                "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "return_type": {
+                            "type": "string",
+                            "enum": ["give_answer","give_up_and_restart"],
+                        },
+                        "final_answer": {
+                            "type": "string",
+                            "description": "The final answer you want to give the user. You should have this field if \"return_type\"==\"give_answer\"",
+                        }
+                    },
+                    "required": ["return_type"],
+                }
+            }
+        else:
+            finish_func = {
+                "name": "Finish",
+                "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart and give reason. If you think you cannot finish the task with the current tools, you should also call this function and give the reason which should mention the names of the failed function. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "return_type": {
+                            "type": "string",
+                            "enum": ["give_answer","give_up_and_restart", "give_up"],
+                        },
+                        "final_answer": {
+                            "type": "string",
+                            "description": "The final answer you want to give the user. It should not contain anly sorry message. You should have this field if \"return_type\"==\"give_answer\"",
+                        },
+                        "reason": {
+                            "type": "string",
+                            "description": "The reason why you give up. You should mention the names of the failed functions. You should have this field if \"return_type\"==\"give_up\" or \"return_type\"==\"give_up_and_restart\"",
+                        }
+                    },
+                    "required": ["return_type"],
+                }
+            }
+        self.functions.append(finish_func)
+        self.CALL_MAX_TIME = 3
+        self.task_description = f'''You should use functions to help handle the real time user querys. Remember:
+1.ALWAYS call \"Finish\" function at the end of the task. And the final answer should contain enough information to show to the user,If you can't handle the task, or you find that function calls always fail(the function is not valid now), use function Finish->give_up_and_restart.
+2.Do not use origin tool names, use only subfunctions' names.
+You have access of the following tools:\n'''
+
+        # Collapse repeated tools: one line per standard tool name, with the
+        # last description seen for that name.
+        unduplicated_reflection = {}
+        for standardize_tool_name, tool_des in tool_descriptions:
+            unduplicated_reflection[standardize_tool_name] = tool_des
+
+        for k, (standardize_tool_name, tool_des) in enumerate(unduplicated_reflection.items()):
+            # Descriptions are cut to 512 characters and flattened to one line.
+            striped = tool_des[:512].replace('\n','').strip()
+            if striped == "":
+                striped = "None"
+            self.task_description += f"{k+1}.{standardize_tool_name}: {striped}\n"
+
+        self.success = 0
+
+    def build_tool_description(self, data_dict):
+        """Build ``[standard_tool_name, description]`` pairs for a query's tools.
+
+        The tool names found under ``data_dict["api_list"]`` are standardized
+        and handed to ``contain`` together with the white list loaded from
+        ``self.tool_root_dir``; each entry it returns becomes one pair.
+        """
+        allowed_tools = get_white_list(self.tool_root_dir)
+        requested_names = []
+        for api_entry in data_dict["api_list"]:
+            requested_names.append(standardize(api_entry["tool_name"]))
+        matched_tools = contain(requested_names, allowed_tools)
+        # NOTE(review): generate_task_list treats a ``False`` result from
+        # ``contain`` as "no match"; that case is not handled here -- confirm.
+        return [[tool["standard_tool_name"], tool["description"]] for tool in matched_tools]
+    
+    def retrieve_rapidapi_tools(self, query, top_k, jsons_path):
+        """Retrieve APIs for ``query`` and keep those whose tool JSON exists on disk.
+
+        Candidates come from ``self.retriever.retrieving``. A candidate is kept
+        only when ``jsons_path/<category>/<tool_name>.json`` exists, and at most
+        ``top_k`` candidates are kept.
+
+        Returns:
+            ``{"api_list": [...]}`` with ``category_name``, ``tool_name`` and
+            ``api_name`` for every kept candidate.
+        """
+        candidates = self.retriever.retrieving(query, top_k=top_k)
+        selected = []
+        for candidate in candidates:
+            if len(selected) == top_k:
+                break
+            category = candidate["category"]
+            tool_name = candidate["tool_name"]
+            api_name = candidate["api_name"]
+            # Check the path level by level, exactly as deep as needed.
+            if not os.path.exists(jsons_path):
+                continue
+            if not os.path.exists(os.path.join(jsons_path, category)):
+                continue
+            if not os.path.exists(os.path.join(jsons_path, category, tool_name+".json")):
+                continue
+            selected.append({
+                "category_name": category,
+                "tool_name": tool_name,
+                "api_name": api_name
+            })
+        return {"api_list": selected}
+    
+    def fetch_api_json(self, query_json):
+        """Load the full API records for the entries of ``query_json["api_list"]``.
+
+        For every entry the tool file ``<tool_root_dir>/<category>/<tool>.json``
+        is read and its ``api_list`` is searched for the API whose normalized
+        name matches the requested one.
+
+        Returns:
+            ``{"api_list": [...]}`` where each record carries ``category_name``,
+            ``api_name``, ``api_description``, ``required_parameters``,
+            ``optional_parameters`` and ``tool_name``. Entries whose tool file
+            cannot be read or parsed are skipped; entries with no matching API
+            are reported on stdout and skipped.
+        """
+        data_dict = {"api_list":[]}
+        for item in query_json["api_list"]:
+            cate_name = item["category_name"]
+            tool_name = standardize(item["tool_name"])
+            api_name = change_name(standardize(item["api_name"]))
+            tool_path = os.path.join(self.tool_root_dir, cate_name, tool_name + ".json")
+            try:
+                # Context manager so the handle is closed even if parsing fails.
+                with open(tool_path, "r") as tool_file:
+                    tool_json = json.load(tool_file)
+            except (OSError, ValueError):
+                # Best effort: a missing or malformed tool file only drops this API.
+                continue
+            api_dict_names = []
+            for api_dict in tool_json["api_list"]:
+                api_dict_names.append(api_dict["name"])
+                if change_name(standardize(api_dict["name"])) != api_name:
+                    continue
+                data_dict["api_list"].append({
+                    "category_name": cate_name,
+                    "api_name": api_dict["name"],
+                    "api_description": api_dict["description"],
+                    "required_parameters": api_dict["required_parameters"],
+                    "optional_parameters": api_dict["optional_parameters"],
+                    "tool_name": tool_json["tool_name"],
+                })
+                break
+            else:
+                # No API in the tool file matched: show what was available.
+                print(api_name, api_dict_names)
+        return data_dict
+
+    def api_json_to_openai_json(self, api_json,standard_tool_name):
+        """Convert one API record into an OpenAI function-calling schema.
+
+        The function name is ``<normalized api name>_for_<standard_tool_name>``
+        cut to its last 64 characters; descriptions are cut to 256 characters.
+        Parameter types outside NUMBER/STRING/BOOLEAN are exposed as strings,
+        and a non-empty ``default`` is exposed as ``example_value``.
+
+        Returns:
+            ``(schema, category_name, pure_api_name)``.
+        """
+        description_max_length = 256
+        type_lookup = {
+            "NUMBER": "integer",
+            "STRING": "string",
+            "BOOLEAN": "boolean"
+        }
+
+        def describe(para):
+            # Returns (normalized parameter name, schema entry) for one parameter.
+            para_name = change_name(standardize(para["name"]))
+            entry = {
+                "type": type_lookup[para["type"]] if para["type"] in type_lookup else "string",
+                "description": para["description"][:description_max_length],
+            }
+            default_value = para['default']
+            if len(str(default_value)) != 0:
+                entry["example_value"] = default_value
+            return para_name, entry
+
+        pure_api_name = change_name(standardize(api_json["api_name"]))
+        schema = {
+            "name": (pure_api_name + f"_for_{standard_tool_name}")[-64:],
+            "description": f"This is the subfunction for tool \"{standard_tool_name}\", you can use this tool.",
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+                "optional": [],
+            },
+        }
+
+        raw_description = api_json["api_description"].strip()
+        if raw_description != "":
+            shortened = raw_description.replace(api_json['api_name'], schema['name'])[:description_max_length]
+            schema["description"] = schema["description"] + f"The description of this function is: \"{shortened}\""
+
+        parameters = schema["parameters"]
+        if "required_parameters" in api_json and len(api_json["required_parameters"]) > 0:
+            for para in api_json["required_parameters"]:
+                para_name, entry = describe(para)
+                parameters["properties"][para_name] = entry
+                parameters["required"].append(para_name)
+            # NOTE(review): optional parameters are only emitted when at least one
+            # required parameter exists -- this looks unintended; confirm before
+            # changing, since it alters the schemas shown to the model.
+            for para in api_json["optional_parameters"]:
+                para_name, entry = describe(para)
+                parameters["properties"][para_name] = entry
+                parameters["optional"].append(para_name)
+
+        return schema, api_json["category_name"],  pure_api_name
+
+    def check_success(self):
+        """Return ``self.success`` (set to 1 by ``_step`` once a final answer is accepted)."""
+        return self.success
+
+    def to_json(self):
+        """Return an empty dict."""
+        return {}
+
+    def restart(self):
+        """Do nothing."""
+        pass
+
+    def get_score(self):
+        """Return the constant score ``0.0``."""
+        return 0.0
+
+    def step(self,**args):
+        """Run ``_step`` and clip the observation to ``self.max_observation_length``.
+
+        A clipped observation gets ``"..."`` appended. The status code returned
+        by ``_step`` is passed through unchanged.
+        """
+        observation, status_code = self._step(**args)
+        limit = self.max_observation_length
+        if len(observation) <= limit:
+            return observation, status_code
+        return observation[:limit] + "...", status_code
+
+    def _step(self, action_name="", action_input=""):
+        """Execute one action and return ``(observation, status_code)``.
+
+        ``action_name == "Finish"`` is handled locally. Any other name is
+        matched against ``self.functions`` (by name suffix, first match wins)
+        and sent either to ``get_rapidapi_response`` or to ``self.service_url``.
+
+        Status codes produced here:
+            0 means normal response
+            1 means there is no corresponding api name
+            2 means there is an error in the input
+            3 represents the end of the generation and the final answer appears
+            4 means that the model decides to pruning by itself
+            6 for "API not working error..."
+            7 for "Unauthorized error..."
+            8 for "Unsubscribed error..."
+            9 represents too many requests
+            10 stands for rate limit per minute
+            11 message contains "error" field
+            12 the service answered with a non-200 status or a non-JSON body
+            13 sending the request to the service failed (e.g. timeout)
+        """
+        if action_name == "Finish":
+            return self._finish_step(action_input)
+
+        for k, function in enumerate(self.functions):
+            if not function["name"].endswith(action_name):
+                continue
+            pure_api_name = self.api_name_reflect[function["name"]]
+            payload = {
+                "category": self.cate_names[k],
+                "tool_name": self.tool_names[k],
+                "api_name": pure_api_name,
+                "tool_input": action_input,
+                "strip": self.observ_compress_method,
+                "toolbench_key": self.toolbench_key
+            }
+            if self.process_id == 0:
+                print(colored(f"query to {self.cate_names[k]}-->{self.tool_names[k]}-->{action_name}",color="yellow"))
+            if self.use_rapidapi_key or self.api_customization:
+                payload["rapidapi_key"] = self.rapidapi_key
+                response = get_rapidapi_response(payload, api_customization=self.api_customization)
+            else:
+                time.sleep(2) # rate limit: 30 per minute
+                headers = {"toolbench_key": self.toolbench_key}
+                try:
+                    response = requests.post(self.service_url, json=payload, headers=headers, timeout=15)
+                except Exception:
+                    self._append_log('output/timeout.txt', payload)
+                    return json.dumps({"error": "connection timeout", "response": ""}), 13
+                if response.status_code != 200:
+                    return json.dumps({"error": f"request invalid, data error. status_code={response.status_code}", "response": ""}), 12
+                try:
+                    response = response.json()
+                except Exception:
+                    print(response)
+                    return json.dumps({"error": f"request invalid, data error", "response": ""}), 12
+
+            self._append_log('error.txt', 111, response['error'])
+            # Retry with each backup key, in order, for as long as the error
+            # is one of the key-related errors listed in ``error_list``.
+            for cnt, backup_key in enumerate(self.rapidapi_key_list):
+                if not any(word in response["error"] for word in error_list):
+                    break
+                self._append_log('output/rapidapi_key_usage.txt', f'use rapidapi key {cnt}')
+                print(colored(f'use rapidapi key {cnt}', 'red'))
+                payload["rapidapi_key"] = backup_key
+                response = get_rapidapi_response(payload, api_customization=self.api_customization)
+                self._append_log('output/rapidapi_key_usage.txt', response['error'])
+
+            if response["error"] == "API not working error...":
+                status_code = 6
+            elif response["error"] == "Unauthorized error...":
+                status_code = 7
+            elif response["error"] == "Unsubscribed error...":
+                status_code = 8
+            elif response["error"] == "Too many requests error...":
+                status_code = 9
+            elif response["error"] == "Rate limit per minute error...":
+                print("Reach api calling limit per minute, sleeping...")
+                time.sleep(10)
+                status_code = 10
+            elif response["error"] == "Message error...":
+                status_code = 11
+            else:
+                status_code = 0
+            return json.dumps(response), status_code
+        return json.dumps({"error": f"No such function name: {action_name}", "response": ""}), 1
+
+    @staticmethod
+    def _append_log(path, *values):
+        """Append ``values`` (formatted like ``print``) to ``path``, creating its directory first."""
+        directory = os.path.dirname(path)
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+        with open(path, 'a') as log_file:
+            print(*values, file=log_file)
+
+    def _salvage_finish_arguments(self, action_input):
+        """Recover Finish arguments from text that is not valid JSON, by substring search."""
+        json_data = {}
+        marker = '"return_type": "'
+        if marker in action_input:
+            for known_type in ("give_answer", "give_up_and_restart", "give_up"):
+                if f'{marker}{known_type}"' in action_input:
+                    return_type = known_type
+                    break
+            else:
+                return_type = action_input[action_input.find(marker)+len(marker):action_input.find('",')]
+            json_data["return_type"] = return_type
+        if '"final_answer": "' in action_input:
+            # Everything after the key is taken as the answer.
+            json_data["final_answer"] = action_input[action_input.find('"final_answer": "')+len('"final_answer": "'):]
+        if '"reason": "' in action_input:
+            reason = action_input[action_input.find('"reason": "')+len('"reason": "'):]
+            json_data["reason"] = reason
+            self._append_log('output/reason.txt', reason)
+        return json_data
+
+    def _finish_step(self, action_input):
+        """Validate the arguments of a ``Finish`` call; returns ``(observation, status_code)``."""
+        self._append_log('output/finish.txt', action_input)
+        try:
+            json_data = json.loads(action_input,strict=False)
+            if 'reason' in json_data.keys():
+                self._append_log('output/reason.txt', json_data)
+        except Exception:
+            # Not usable as JSON: fall back to a best-effort textual extraction.
+            json_data = self._salvage_finish_arguments(str(action_input))
+        if "return_type" not in json_data.keys():
+            self._append_log('output/return_type.txt', json_data.keys())
+            return "{error:\"must have \"return_type\"\"}", 2
+        if json_data["return_type"] == "give_up_and_restart":
+            return "{\"response\":\"chose to give up and restart\"}", 4
+        elif json_data["return_type"] == "give_up":
+            if "reason" not in json_data.keys():
+                return "{error:\"must have \"reason\"\"}", 2
+            return "{\"response\":\"chose to give up\"}", 3
+        elif json_data["return_type"] == "give_answer":
+            if "final_answer" not in json_data.keys():
+                return "{error:\"must have \"final_answer\"\"}", 2
+
+            self.success = 1 # succesfully return final_answer
+            return "{\"response\":\"successfully giving the final answer.\"}", 3
+        else:
+            return "{error:\"\"return_type\" is not a valid choice\"}", 2
+
+
+class pipeline_runner:
+    """Run tool-use queries: pick the backbone model, build the environment and search chain, store results.
+
+    ``args`` is the parsed argument object; the attributes read from it here are
+    ``backbone_model``, ``max_sequence_length``, ``max_source_sequence_length``,
+    ``lora``, ``model_path``, ``lora_path``, ``corpus_tsv_path``,
+    ``retrieval_model_path``, ``input_query_file``, ``output_answer_file``,
+    ``method``, ``tool_root_dir`` and ``openai_key``.
+    """
+
+    def __init__(self, args, add_retrieval=False, process_id=0, server=False):
+        self.args = args
+        self.add_retrieval = add_retrieval
+        self.process_id = process_id
+        self.server = server
+        self.backbone_model = self.get_backbone_model()
+
+    def get_backbone_model(self):
+        """Return a loaded ToolLLaMA model, or the backbone name itself for any other backbone."""
+        args = self.args
+        if args.backbone_model == "toolllama":
+            # ratio = 4 means the sequence length is expanded by 4, remember to change the model_max_length to 8192 (2048 * ratio) for ratio = 4
+            ratio = int(args.max_sequence_length/args.max_source_sequence_length)
+            replace_llama_with_condense(ratio=ratio)
+            if args.lora:
+                backbone_model = ToolLLaMALoRA(base_name_or_path=args.model_path, model_name_or_path=args.lora_path, max_sequence_length=args.max_sequence_length)
+            else:
+                backbone_model = ToolLLaMA(model_name_or_path=args.model_path, max_sequence_length=args.max_sequence_length)
+        else:
+            backbone_model = args.backbone_model
+        return backbone_model
+
+    def get_retriever(self):
+        """Build a ``ToolRetriever`` from the corpus and model paths in ``self.args``."""
+        return ToolRetriever(corpus_tsv_path=self.args.corpus_tsv_path, model_path=self.args.retrieval_model_path)
+
+    def get_args(self):
+        """Return the argument object this runner was created with."""
+        return self.args
+
+    def generate_task_list(self):
+        """Read the query file and build one task tuple per query.
+
+        Each task is ``(method, backbone_model, query_id, data_dict, args,
+        answer_dir, tool_des)``. Queries whose ``api_list`` is rejected by
+        ``contain`` (it returns ``False``) are dropped; queries without an
+        ``api_list`` get ``tool_des = None``.
+        """
+        args = self.args
+        query_dir = args.input_query_file
+        answer_dir = args.output_answer_file
+        # makedirs also creates missing parents and tolerates an existing directory.
+        os.makedirs(answer_dir, exist_ok=True)
+        method = args.method
+        # Reuse the model created in __init__ instead of loading a second copy.
+        backbone_model = self.backbone_model
+        white_list = get_white_list(args.tool_root_dir)
+        task_list = []
+        with open(query_dir, "r") as query_file:
+            querys = json.load(query_file)
+        for query_id, data_dict in enumerate(querys):
+            if "query_id" in data_dict:
+                query_id = data_dict["query_id"]
+            if "api_list" in data_dict:
+                origin_tool_names = [standardize(cont["tool_name"]) for cont in data_dict["api_list"]]
+                tool_des = contain(origin_tool_names,white_list)
+                if tool_des is False:
+                    continue
+                tool_des = [[cont["standard_tool_name"], cont["description"]] for cont in tool_des]
+            else:
+                tool_des = None
+            task_list.append((method, backbone_model, query_id, data_dict, args, answer_dir, tool_des))
+        return task_list
+
+    def method_converter(self, backbone_model, openai_key, method, env, process_id, single_chain_max_step=12, max_query_count=60, callbacks=None, messages=None):
+        """Build the search chain named by ``method`` and run it.
+
+        ``method`` starting with ``"CoT"`` is expected to end in ``@<pass_at>``;
+        ``method`` starting with ``"DFS"`` must contain ``_w<width>`` and may
+        contain ``woFilter`` to disable filtering.
+
+        Returns:
+            ``(chain, result)``.
+
+        Raises:
+            NotImplementedError: if ``method`` starts with neither prefix.
+        """
+        if callbacks is None:
+            callbacks = []
+        if backbone_model == "chatgpt_function":
+            # Alternatives tried here: gpt-4-0613, gpt-4-deployment, gpt-3.5-turbo-16k-0613.
+            llm_forward = GPT4Function(model="gpt-4-32k", openai_key=openai_key)
+        elif backbone_model == "davinci":
+            llm_forward = Davinci(model="text-davinci-003", openai_key=openai_key)
+        else:
+            # Already a model object (or an unknown name): hand it on as-is.
+            llm_forward = backbone_model
+
+        if method.startswith("CoT"):
+            passat = int(method.split("@")[-1])
+            chain = single_chain(llm=llm_forward, io_func=env,process_id=process_id)
+            result = chain.start(
+                                pass_at=passat,
+                                single_chain_max_step=single_chain_max_step,
+                                answer=1)
+        elif method.startswith("DFS"):
+            pattern = r".+_w(\d+)"
+            re_result = re.match(pattern,method)
+            assert re_result is not None, f"DFS method without a _w<width> part: {method}"
+            width = int(re_result.group(1))
+            with_filter = "woFilter" not in method
+            chain = DFS_tree_search(llm=llm_forward, io_func=env,process_id=process_id, callbacks=callbacks)
+            result = chain.start(
+                                single_chain_max_step=single_chain_max_step,
+                                tree_beam_size = width,
+                                max_query_count = max_query_count,
+                                answer=1,
+                                with_filter=with_filter,
+                                messages=messages)
+        else:
+            print("invalid method")
+            raise NotImplementedError
+        return chain, result
+
+    def run_single_task(self, method, backbone_model, query_id, data_dict, args, output_dir_path, tool_des, messages, retriever=None, process_id=0, callbacks=None, server= None):
+        """Solve one query and write ``<output_dir_path>/<query_id>_<method>.json``.
+
+        Outside server mode an existing output file means the task was already
+        done; the method then returns ``None`` without running anything.
+        Otherwise it returns the value produced by the search chain.
+        """
+        if server is None:
+            server = self.server
+        if callbacks is None:
+            if server:
+                print("Warning: no callbacks are defined for server mode")
+            callbacks = []
+        os.makedirs(output_dir_path, exist_ok=True)
+        output_file_path = os.path.join(output_dir_path,f"{query_id}_{method}.json")
+        if (not server) and os.path.exists(output_file_path):
+            return
+        for callback in callbacks:
+            callback.on_tool_retrieval_start()
+        env = rapidapi_wrapper(data_dict, tool_des, retriever, args, process_id=process_id)
+        for callback in callbacks:
+            callback.on_tool_retrieval_end(tools=env.functions)
+        query = data_dict["query"]
+        if process_id == 0:
+            print(colored(f"[process({process_id})]now playing {query}, with {len(env.functions)} APIs", "green"))
+        for callback in callbacks:
+            callback.on_request_start(
+                user_input=query,
+                method=method,
+            )
+        chain,result = self.method_converter(
+            backbone_model=backbone_model,
+            openai_key=args.openai_key,
+            method=method,
+            env=env,
+            process_id=process_id,
+            single_chain_max_step=12,
+            max_query_count=200,
+            callbacks=callbacks,
+            messages=messages,
+        )
+        for callback in callbacks:
+            callback.on_request_end(
+                chain=chain.terminal_node[0].messages,
+                outputs=chain.terminal_node[0].description,
+            )
+        print('#'*100)
+        print(output_file_path)
+        # Serialize before opening the file: a failure here must not leave an
+        # empty result file behind, because an existing file marks the task done.
+        data = chain.to_json(answer=True,process=True)
+        data["answer_generation"]["query"] = query
+        data["api2origin"] = env.api2origin
+        with open(output_file_path,"w") as writer:
+            json.dump(data, writer, indent=2)
+        success = data["answer_generation"]["valid_data"] and "give_answer" in data["answer_generation"]["final_answer"]
+        print(colored(f"[process({process_id})]valid={success}", "green"))
+        return result
+
+    def run(self, task, messages):
+        """Run one task tuple (as built by ``generate_task_list``) with ``messages``."""
+        retriever = self.get_retriever() if self.add_retrieval else None
+        result = self.run_single_task(*task, messages, retriever=retriever, process_id=self.process_id)
+        return result
+
--- a/rapidapi_key_list_example.json
+++ b/rapidapi_key_list_example.json
@ -0,0 +1 @@
+[]
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,34 @@
+accelerate==0.20.3
+fastapi==0.95.1
+gradio==3.23.0
+httpx==0.24.0
+markdown-it-py==2.2.0
+numpy==1.24.3
+prompt-toolkit==3.0.38
+pydantic==1.10.7
+requests==2.30.0
+rich==13.3.5
+rouge==1.0.1
+sentencepiece==0.1.99
+shortuuid==1.0.11
+tiktoken==0.4.0
+tokenizers==0.13.3
+torch>=1.12.0
+transformers==4.28.1
+uvicorn==0.22.0
+bitsandbytes==0.38.1
+peft==0.3.0
+langchain==0.0.229
+deepspeed==0.9.2
+sentence_transformers==2.2.2
+tensorboard
+openai
+scipy
+termcolor
+flask
+flask_cors
+sentence_transformers
+# openai-function-calling
+pypdf
+chromadb
+IPython
--- a/run_main.sh
+++ b/run_main.sh
@ -0,0 +1,10 @@
+#main 
+python anytool.py --model aus --output_dir result/aus/test_instruction/G1_instruction_customrapidapi --query_path data/test_instruction/G1_instruction.json --max_api_number 64
+
+python anytool.py --model aus --output_dir result/aus/test_instruction/G1_tool_customrapidapi --query_path data/test_instruction/G1_tool.json --max_api_number 64
+
+ python anytool.py --model 32k --output_dir result/32k/test_instruction/G1_instruction_customrapidapi --query_path data/test_instruction/G1_instruction.json --max_api_number 64
+
+proxychains4 python anytool.py --model aus --output_dir result/aus/test_instruction/G1_tool_customrapidapi_oriprompt_r1 --query_path data/test_instruction/G1_tool.json --max_api_number 64
+
+proxychains4 python anytool.py --model aus --output_dir result/aus/test_instruction/G1_instruction_customrapidapi_oriprompt_r1 --query_path data/test_instruction/G1_instruction.json --max_api_number 64
--- a/server.py
+++ b/server.py
@ -0,0 +1,182 @@
+from pydantic import BaseModel
+import json
+import os
+from typing import Union
+from toolbench.utils import standardize, change_name
+import random
+
+
+class Info(BaseModel):
+    """Fields describing one tool call: category, tool, API, its input and the strip mode."""
+    category: str
+    tool_name: str
+    api_name: str
+    tool_input: Union[str, dict]
+    strip: str
+
+def prepare_tool_name_and_url(tools_root, info):
+    """Derive the names and the import statement needed to call a tool API.
+
+    Returns:
+        ``(tool_name, standard_category, api_name, code_string)`` where
+        ``tool_name`` ends with ``_for_<standard_category>`` and ``code_string``
+        is ``from <tools_root>.<category>.<tool>.api import <api_name>``.
+    """
+    # Spaces, commas and slashes become underscores; one pass replaces them all.
+    standard_category = info.category.translate(str.maketrans(" ,/", "___"))
+    standard_category = standard_category.replace("__", "_")
+    suffix = f"_for_{standard_category}"
+
+    tool_name = info.tool_name
+    api_name = change_name(standardize(info.api_name))
+    if tool_name.endswith(suffix):
+        # Already suffixed: strip the suffix only to build the module path.
+        module_tool_name = standardize(tool_name.replace(suffix, ""))
+    else:
+        module_tool_name = standardize(info.tool_name)
+        tool_name = module_tool_name + suffix
+    code_string = f"""from {tools_root}.{standard_category}.{module_tool_name}.api import {api_name}"""
+    return tool_name, standard_category, api_name, code_string
+
+def process_error(response):
+    """Classify a raw API response by the marker text it contains.
+
+    Returns:
+        ``(return_dict, save_cache_flag, switch_flag)`` where ``return_dict`` is
+        ``{"error": <label or "">, "response": response}``. ``save_cache_flag``
+        is set for unauthorized and for error-free responses; ``switch_flag``
+        is set for the subscription, quota, gateway and blocked-user errors.
+    """
+    text = str(response)
+    save_cache_flag = False
+    switch_flag = False
+    # The timeout check is part of the chain: as a separate ``if`` its result
+    # was always overwritten by the branches below.
+    if "The request to the API has timed out. Please try again later, or if the issue persists" in text:
+        error = "API temporarily not working error..."
+    elif "Your Client (working) ---> Gateway (working) ---> API (not working)" in text:
+        error = "API not working error..."
+    elif "Unauthorized" in text or "unauthorized" in text:
+        save_cache_flag = True
+        error = "Unauthorized error..."
+    elif "You are not subscribed to this API." in text:
+        switch_flag = True
+        error = "Unsubscribed error..."
+    elif "Too many requests" in text:
+        switch_flag = True
+        error = "Too many requests error..."
+    elif "You have exceeded" in text or "you are being rate limited" in text:
+        switch_flag = True
+        error = "Rate limit error..."
+    elif "Access restricted. Check credits balance or enter the correct API key." in text:
+        switch_flag = True
+        error = "Rate limit error..."
+    elif "Oops, an error in the gateway has occurred." in text:
+        switch_flag = True
+        error = "Gateway error..."
+    elif "Blocked User. Please contact your API provider." in text:
+        switch_flag = True
+        error = "Blocked error..."
+    elif "error" in text:
+        error = "Message error..."
+    else:
+        save_cache_flag = True
+        error = ""
+    return_dict = {"error": error, "response": response}
+    return return_dict, save_cache_flag, switch_flag
+
+def run(toolbench_code_string, toolbench_api_name, toolbench_input_params_str):
+    """Import a tool function via ``toolbench_code_string`` and call it.
+
+    Args:
+        toolbench_code_string: import statement that brings the API function into scope.
+        toolbench_api_name: name of the function to call.
+        toolbench_input_params_str: argument list, as Python source text.
+
+    Returns:
+        ``(success_flag, switch_flag, response, save_cache)``. When the call
+        raises, ``success_flag`` stays ``False`` and ``response`` carries the
+        exception text. A failing import statement is not caught.
+    """
+    # get observation
+    success_flag = False
+    switch_flag = False
+    save_cache = False
+    os.makedirs("output", exist_ok=True)
+    with open("output/log.txt", "a") as log_file:
+        print(toolbench_code_string, file=log_file)
+    # SECURITY(review): exec/eval run text assembled from request data; only
+    # feed this function trusted input.
+    # An explicit namespace carries the imported name from exec to eval; relying
+    # on the implicit function locals stops working with Python 3.13.
+    namespace = {}
+    exec(toolbench_code_string, namespace)
+    try:
+        eval_func_str = f"{toolbench_api_name}({toolbench_input_params_str})"
+        new_func = eval(eval_func_str, namespace)
+        response, save_cache, switch_flag = process_error(new_func)
+        success_flag = True
+    except Exception as e:
+        response = {"error": f"Function executing {toolbench_code_string} error...\n{e}", "response": ""}
+        save_cache = False
+    return success_flag, switch_flag, response, save_cache
+
+
+def dict_shorten(origin: dict, schema: dict):
+    """Prune ``origin`` in place down to the keys present in ``schema`` and return it.
+
+    Nested dicts are pruned against ``schema[key]``; a non-empty list whose
+    first element is a dict has every element pruned against ``schema[key][0]``.
+    """
+    for key in list(origin):
+        if key not in schema:
+            del origin[key]
+            continue
+        value = origin[key]
+        if isinstance(value, dict):
+            dict_shorten(value, schema[key]) # schema[key] should be a dict
+        elif isinstance(value, list) and value and isinstance(value[0], dict):
+            for element in value:
+                dict_shorten(element, schema[key][0]) # schema[key] should be a list with only one dict element
+    return origin
+
+def observation_shorten(schema_root, response_dict, category, tool_name, api_name, strip_method):
+    """Return ``response_dict["response"]`` as a string, optionally pruned by a stored schema.
+
+    Pruning happens when ``strip_method`` is ``"filter"`` (or ``"random"`` and a
+    random draw exceeds 0.5), the response is a dict, and
+    ``<schema_root>/<category>/<tool_name>.json`` holds a non-empty ``schema``
+    for ``api_name``. In that case ``response_dict["response"]`` is pruned in
+    place with ``dict_shorten``.
+    """
+    should_filter = strip_method == "filter" or (strip_method == "random" and random.random() > 0.5)
+    if should_filter and isinstance(response_dict["response"], dict):
+        schema_path = os.path.join(schema_root, category, tool_name+".json")
+        if os.path.exists(schema_path):
+            with open(schema_path, "r") as schema_file:
+                schema_dicts = json.load(schema_file)
+            schema = None
+            for schema_dict in schema_dicts["api_list"]:
+                schema_api_name = change_name(standardize(schema_dict["name"]))
+                if schema_api_name == api_name and len(schema_dict["schema"]) > 0:
+                    schema = schema_dict["schema"]
+                    break
+            if schema is not None:
+                response_dict["response"] = dict_shorten(response_dict["response"], schema)
+    return str(response_dict["response"])
+
+
+def get_rapidapi_response(input_dict: dict, api_customization: bool=False, tools_root: str="data.toolenv.tools", schema_root: str="data/toolenv/response_examples"):
+    """Call one tool API described by ``input_dict`` and return its observation.
+
+    Args:
+        input_dict: must contain ``category``, ``tool_name``, ``api_name``,
+            ``tool_input`` (JSON text, a dict, or ``""``), ``strip`` and
+            ``rapidapi_key``.
+        api_customization: when false, the key is passed to the API function
+            as ``toolbench_rapidapi_key``.
+        tools_root: dotted package path under which the tool modules live.
+        schema_root: directory holding the response schemas used for pruning.
+
+    Returns:
+        ``{"error": <label>, "response": <text>}`` with the response text cut
+        to 2048 characters, or a "Tool input parse error" dict when
+        ``tool_input`` cannot be turned into keyword arguments.
+    """
+    from types import SimpleNamespace
+
+    # A per-call object: assigning onto the ``Info`` class itself would share
+    # the request state between all callers.
+    info = SimpleNamespace(
+        category=input_dict['category'],
+        tool_name=input_dict['tool_name'],
+        api_name=input_dict['api_name'],
+        tool_input=input_dict['tool_input'],
+        strip=input_dict['strip'],
+    )
+    rapidapi_key = input_dict['rapidapi_key']
+
+    tool_input = info.tool_input
+    strip_method = info.strip
+
+    if not isinstance(tool_input, dict):
+        try:
+            tool_input = json.loads(tool_input)
+        except Exception:
+            if tool_input == "":
+                tool_input = {}
+            else:
+                print(f"Can not parse tool input into json: {tool_input}")
+                response_dict = {"error": f"Tool input parse error...\n", "response": ""}
+                return response_dict
+        if not isinstance(tool_input, dict):
+            if tool_input:
+                # Valid JSON but not an object: there are no named arguments to pass.
+                print(f"Can not parse tool input into json: {tool_input}")
+                return {"error": f"Tool input parse error...\n", "response": ""}
+            tool_input = {}
+
+    tool_name, standard_category, api_name, code_string = prepare_tool_name_and_url(tools_root, info)
+
+    # repr() yields a correctly escaped literal, so quotes, backslashes and
+    # newlines inside string values cannot break or alter the generated call.
+    # NOTE(review): argument names are still inserted verbatim -- confirm they
+    # can only come from the API schema.
+    call_arguments = [f"{key}={value!r}" for key, value in tool_input.items()]
+    if not api_customization:
+        call_arguments.append(f"toolbench_rapidapi_key={str(rapidapi_key)!r}")
+    input_params_str = ", ".join(call_arguments)
+
+    success_flag, switch_flag, response_dict, save_cache = run(code_string, api_name, input_params_str)
+    observation = observation_shorten(schema_root, response_dict, standard_category, tool_name.replace(f"_for_{standard_category}", ""), api_name, strip_method)
+    result = str(observation)[:2048]
+    return {"error": response_dict['error'], "response": result}
+
+
+if __name__ == "__main__":
+    # Manual smoke test: issue a single call with an empty rapidapi key and
+    # print whatever comes back.
+    result = get_rapidapi_response({
+        "category": "Social",
+        "tool_name": "olato_quotes",
+        "api_name": "love_quote",
+        "tool_input": '{}',
+        "strip": "filter",
+        "rapidapi_key": ""
+    })
+    print(result)
--- a/solved_dict.json
+++ b/solved_dict.json
--- a/tool_details.json
+++ b/tool_details.json
--- a/toolbench/inference/Algorithms/DFS.py
+++ b/toolbench/inference/Algorithms/DFS.py
@ -0,0 +1,406 @@
+import re
+from toolbench.inference.Tree.Tree import my_tree, tree_node
+from toolbench.inference.Prompts.ReAct_prompts import FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION, FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ADAPTED, FORMAT_INSTRUCTIONS_USER_FUNCTION
+from toolbench.inference.Prompts.Tree_search_prompts import DIVERSITY_PROMPT
+from toolbench.inference.Algorithms.base_search import base_search_method
+from copy import deepcopy
+from toolbench.inference.LLM_rank.rank_candidate import sum_based_rankn, rank2_subfix
+import json
+import random
+import os
+from arguments import parse_args
+# Command-line options are parsed once, at import time. The search code below
+# reads `args.use_original_prompt` (prompt selection in `start`) and
+# `args.solver` (back-tracking lengths in `DFS`) from this object.
+args = parse_args()
+# Log file that DFS_tree_search appends to: a separator line per search and the
+# prompt messages of every LLM call.
+file_name = 'llama_io.txt'
+
+class DFS_tree_search(base_search_method):
+
+    def __init__(self, llm, io_func, process_id=0, callbacks=None):
+        """Depth-first search over LLM-generated thoughts and tool calls.
+
+        The `with_filter` flag given to `start` selects the variant:
+        with_filter=True: every time child nodes are generated, they are
+            ranked and the best ones are expanded first.
+        with_filter=False: children are expanded as they are generated
+            (preorder traversal).
+
+        Args:
+            llm: model wrapper; `change_messages` and `parse` are called on it.
+            io_func: the environment; it is deep-copied into the root node
+                when a search starts.
+            process_id: id forwarded to the LLM and to node printing.
+            callbacks: optional list of callback objects. None means
+                "no callbacks" and is stored as an empty list.
+        """
+        super(DFS_tree_search, self).__init__(
+            llm, io_func, process_id, callbacks)
+        self.io_func = io_func
+        self.llm = llm
+        self.process_id = process_id
+        self.restart()
+
+        self.callbacks = callbacks if callbacks is not None else []
+
+    def restart(self):
+        """Reset all per-search bookkeeping to its initial state."""
+        # 0 until a terminal (final answer) node is reached, then 1.
+        self.status = 0
+        # Counters: nodes visited, LLM queries issued, tokens consumed.
+        self.now_expand_num = self.query_count = self.total_tokens = 0
+        # Nodes that ended with a final answer / nodes pruned with
+        # observation code 4 (two independent lists).
+        self.terminal_node, self.give_up_node = [], []
+
+    def send_agent_chain_end(self, depth, agent_block_ids, chain_block_ids):
+        """Tell every callback that the current chain / agent step is over.
+
+        `chain_block_ids[i]` and `agent_block_ids[i]` are the ids that belong
+        to `self.callbacks[i]`. `agent_block_ids` may be shorter than the
+        callback list; callbacks without an agent id only get `on_chain_end`.
+        """
+        agent_count = len(agent_block_ids)
+        for idx, cb in enumerate(self.callbacks):
+            cb.on_chain_end(depth=depth, block_id=chain_block_ids[idx])
+            if idx >= agent_count:
+                continue
+            cb.on_agent_end(depth=depth, block_id=agent_block_ids[idx])
+
+    def to_json(self, answer=False, process=True):
+        """Serialize the search into a JSON-compatible dict.
+
+        Args:
+            process: when true, include the whole search tree ("tree"), the
+                win flag, the recorded start arguments and one chain per
+                unpruned terminal node ("compare_candidates").
+            answer: when true, add an "answer_generation" entry describing the
+                chosen final answer.
+
+        Returns:
+            dict. "answer_generation" always carries "train_messages" (an
+            empty list when no usable node was found), so consumers can read
+            the key without checking "valid_data" first.
+        """
+        if process:
+            json_obj = {
+                "win": self.status == 1,
+                "tree": self.tree.to_json_recursive(),
+                "forward_args": self.forward_args,
+                "compare_candidates": [
+                    node.get_chain_result_from_this_node(use_messages=False)
+                    for node in self.terminal_node
+                    if not node.pruned  # has answer
+                ],
+            }
+        else:
+            json_obj = {}
+
+        if answer:
+            answer_info = {
+                "valid_data": False,
+                "query_count": self.query_count,
+                "total_tokens": self.total_tokens,
+                "final_answer": "",
+                "finish_type": "give_answer",
+                "function": self.io_func.functions,
+                "chain": [],
+                "train_messages": [],
+            }
+            json_obj["answer_generation"] = answer_info
+
+            # The first unpruned terminal node provides the answer.
+            for node in self.terminal_node:
+                if node.pruned:
+                    continue
+                if 'give_up' in node.description.lower():
+                    answer_info["finish_type"] = "give_up"
+                else:
+                    answer_info["finish_type"] = "give_answer"
+                answer_info["final_answer"] = node.description
+                answer_info["valid_data"] = True
+                answer_info["train_messages"] = node.get_train_messages_from_this_node()
+                break
+
+            # do not have final answer, look for give_up
+            if not answer_info["valid_data"] and len(self.give_up_node) > 0:
+                choose_give_up_node = random.choice(self.give_up_node)
+                answer_info["valid_data"] = True
+                answer_info["finish_type"] = "give_up"
+                answer_info["final_answer"] = choose_give_up_node.description
+                answer_info["train_messages"] = choose_give_up_node.get_train_messages_from_this_node()
+        return json_obj
+
+    def start(self, single_chain_max_step, tree_beam_size, max_query_count, answer=1, with_filter=True, messages=None):
+        """Build the root node and run the depth-first search from it.
+
+        Args:
+            single_chain_max_step: the maximum depth of the tree.
+            tree_beam_size: how many child nodes are generated for one node.
+            max_query_count: once the number of LLM queries reaches this
+                value, the model is asked to finish.
+            answer: the search exits after finding this many "give_answer" nodes.
+            with_filter: the difference between normal DFS (with_filter=True)
+                and DFSDT (with_filter=False).
+            messages: optional earlier conversation to resume from. The list
+                is modified in place: its first entry is replaced by the
+                system prompt, its last entry is dropped, function messages
+                of functions that are no longer available are removed
+                together with the message before them, and earlier
+                "maximum query count" user notices are removed.
+
+        Returns:
+            Whatever `DFS` returns for the root node.
+        """
+        # The call arguments are deliberately not recorded: the original code
+        # took locals() (which always contains "self") and then reset the
+        # attribute to None.
+        self.forward_args = None
+        self.tree = my_tree()
+        self.tree.root.node_type = "Action Input"
+        self.tree.root.io_state = deepcopy(self.io_func)
+        if args.use_original_prompt:
+            system = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION
+        else:
+            system = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ADAPTED
+        system = system.replace("{task_description}",
+                                self.io_func.task_description)
+        user = FORMAT_INSTRUCTIONS_USER_FUNCTION
+        user = user.replace("{input_description}",
+                        self.io_func.input_description)
+        if messages is None:
+            self.tree.root.messages.append({"role": "system", "content": system})
+            self.tree.root.messages.append({"role": "user", "content": user})
+        else:
+            messages[0] = {"role": "system", "content": system}
+            messages.pop()
+            function_names = {function["name"] for function in self.io_func.functions}
+            # Walk a snapshot backwards so that popping does not shift the
+            # indices that are still to be visited.
+            for i, message in reversed(list(enumerate(messages))):
+                if message["role"] == "function":
+                    if message["name"] not in function_names:
+                        messages.pop(i)
+                        messages.pop(i-1)
+                if message["role"] == "user":
+                    if 'maximum query count' in message["content"]:
+                        messages.pop(i)
+
+            self.tree.root.messages = messages
+        # Separator between searches in the log; the handle is closed right away.
+        with open(file_name, 'a') as log_file:
+            print('#'*100, file=log_file)
+
+        return self.DFS(self.tree.root, single_chain_max_step, tree_beam_size, max_query_count, answer, with_filter)
+
+    def DFS(self, now_node, single_chain_max_step, tree_beam_size, max_query_count, answer, with_filter=True):
+        """Expand `now_node` depth-first and return how far the caller should back up.
+
+        Returns the number of grids (levels) to go back. When a child node of
+        a node generates a final answer or gives up, the search should go back
+        a few more grids. In a sense, the larger this value is, the more
+        diverse the search is, and it is GreedySearch@n when it is enlarged to
+        infinity. A return value of 1 means "continue with the next sibling";
+        10000 is used to unwind the whole recursion.
+
+        Args:
+            now_node: node to expand.
+            single_chain_max_step: depth at which a node is pruned.
+            tree_beam_size: number of children generated for this node.
+            max_query_count: query budget; when reached, a user message asking
+                the model to finish is appended to the child's messages.
+            answer: number of terminal nodes after which the search stops.
+            with_filter: True ranks the children with the LLM before expanding
+                them, False recurses into each child as soon as it is created.
+        """
+
+        # These two values declare the rate to go back; the algorithm degrades to CoT when the value is infinite.
+        if args.solver == 'dfs':
+            final_answer_back_length = 2
+            prune_back_length = 2
+        else:
+            final_answer_back_length = 10000
+            prune_back_length = 10000
+
+        now_node.expand_num = self.now_expand_num
+        self.now_expand_num += 1
+        if now_node.get_depth() >= single_chain_max_step or now_node.pruned or now_node.is_terminal:
+            if now_node.is_terminal:  # final answer
+                self.status = 1
+                self.terminal_node.append(now_node)
+                return final_answer_back_length
+            else:
+                now_node.pruned = True
+                if now_node.observation_code == 4:
+                    self.give_up_node.append(now_node)
+                    return prune_back_length
+                else:
+                    return 1
+
+        next_tree_split_nodes = []
+        for _ in range(tree_beam_size):
+            temp_now_node = now_node
+
+            # If a node has children now, we prompt the model to generate
+            # nodes that differ from all the existing ones.
+            delete_former_diversity_message = False
+            diversity_message = None
+            if len(temp_now_node.children) > 0:
+
+                former_candidates_des = ""
+                js_list = []
+                for child in temp_now_node.children:
+                    # Follow the first-child chain down to the "Action Input"
+                    # node that holds the call arguments and the observation.
+                    temp_node = child
+                    while not temp_node.is_terminal and temp_node.node_type != "Action Input" and len(temp_node.children) > 0:
+                        temp_node = temp_node.children[0]
+                    if temp_node.node_type == "Action Input":
+                        obj_dict = {
+                            "name": temp_node.father.description,
+                            "arguments": temp_node.description,
+                            "function_output": temp_node.observation,
+                            "mento-carlo-action-value": temp_node.compute_weight(),
+                        }
+                        js_list.append(obj_dict)
+
+                if len(js_list) > 0:
+                    former_candidates_des = former_candidates_des + \
+                        f"{json.dumps(js_list,indent=2)}\n"
+                    if temp_now_node.observation != "":
+                        former_candidates_des = former_candidates_des + \
+                            f"again, your former observation: {temp_now_node.observation}\n"
+                    diverse_prompt = DIVERSITY_PROMPT
+                    diverse_prompt = diverse_prompt.replace(
+                        "{previous_candidate}", former_candidates_des)
+                    diversity_message = {
+                        "role": "user", "content": diverse_prompt}
+                    temp_now_node.messages.append(diversity_message)
+
+                    delete_former_diversity_message = True
+            # on_chain_start
+            now_depth = temp_now_node.get_depth() // 3
+            chain_block_ids = [callback.on_chain_start(
+                depth=now_depth,
+                inputs=temp_now_node.messages
+            ) for callback in self.callbacks]
+            agent_block_ids = []
+            self.llm.change_messages(temp_now_node.messages)
+            # on_llm_start
+            for callback in self.callbacks:
+                callback.on_llm_start(
+                    depth=now_depth,
+                    messages=temp_now_node.messages
+                )
+            new_message, error_code, total_tokens = self.llm.parse(
+                self.io_func.functions, process_id=self.process_id)
+            # Log the prompt messages with the function_call payload blanked.
+            # One handle is opened for the whole batch and closed afterwards;
+            # shallow copies keep the node's own messages untouched.
+            with open(file_name, 'a') as log_file:
+                for message in temp_now_node.messages:
+                    logged_message = dict(message)
+                    if 'function_call' in logged_message:
+                        logged_message['function_call'] = {}
+                    print(json.dumps(logged_message, indent=4), file=log_file)
+            # on_llm_end
+            for callback in self.callbacks:
+                callback.on_llm_end(
+                    depth=now_depth,
+                    response=new_message
+                )
+            self.query_count += 1
+            self.total_tokens += total_tokens
+
+            # We need to exclude the diversity_message, because it will influence child nodes
+            if delete_former_diversity_message:
+                temp_now_node.messages[-1]["valid"] = False
+
+            # parse nodes from OpenAI-message like CoT method
+            assert new_message["role"] == "assistant"
+            if "content" in new_message.keys() and new_message["content"] is not None:
+                temp_node = tree_node()
+                temp_node.node_type = "Thought"
+                temp_node.description = new_message["content"]
+                child_io_state = deepcopy(temp_now_node.io_state)
+                child_io_state.retriever=None
+
+                temp_node.io_state = child_io_state
+                temp_node.is_terminal = child_io_state.check_success() != 0
+                temp_node.messages = deepcopy(temp_now_node.messages)
+                temp_node.father = temp_now_node
+                temp_now_node.children.append(temp_node)
+                temp_node.print(self.process_id)
+                temp_now_node = temp_node
+
+                if error_code != 0:
+                    temp_now_node.observation_code = error_code
+                    temp_now_node.pruned = True
+
+            if "function_call" in new_message.keys():
+                # on_agent_action
+                agent_block_ids = [callback.on_agent_action(
+                    depth=now_depth,
+                    action=new_message["function_call"]["name"],
+                    action_input=new_message["function_call"]["arguments"]
+                ) for callback in self.callbacks]
+                function_name = new_message["function_call"]["name"]
+                temp_node = tree_node()
+                temp_node.node_type = "Action"
+                temp_node.description = function_name
+                child_io_state = deepcopy(temp_now_node.io_state)
+                child_io_state.retriever=None
+
+                temp_node.io_state = child_io_state
+                temp_node.is_terminal = child_io_state.check_success() != 0
+                temp_node.messages = deepcopy(temp_now_node.messages)
+                temp_node.father = temp_now_node
+                temp_now_node.children.append(temp_node)
+
+                temp_node.print(self.process_id)
+                temp_now_node = temp_node
+
+                function_input = new_message["function_call"]["arguments"]
+                temp_node = tree_node()
+                temp_node.node_type = "Action Input"
+                temp_node.description = function_input
+                child_io_state = deepcopy(temp_now_node.io_state)
+                child_io_state.retriever=None
+
+                # on_tool_start
+                for callback in self.callbacks:
+                    callback.on_tool_start(
+                        depth=now_depth,
+                        tool_name=temp_now_node.description,
+                        tool_input=function_input
+                    )
+                observation, status = child_io_state.step(
+                    action_name=temp_now_node.description, action_input=function_input)
+                if status == 1:
+                    print(observation)
+                temp_node.observation = observation
+                temp_node.observation_code = status
+
+                temp_node.io_state = child_io_state
+                temp_node.is_terminal = child_io_state.check_success() != 0
+                temp_node.messages = deepcopy(temp_now_node.messages)
+                temp_node.father = temp_now_node
+                temp_now_node.children.append(temp_node)
+                temp_node.print(self.process_id)
+                temp_now_node = temp_node
+                # on_tool_end
+                for callback in self.callbacks:
+                    callback.on_tool_end(
+                        depth=now_depth,
+                        output=observation,
+                        status=status
+                    )
+                if status != 0:
+                    # return code definition can be seen in Downstream_tasks/rapid_api
+                    if status == 4:
+                        temp_now_node.pruned = True
+                    elif status == 1:  # hallucination api name
+                        assert "function_call" in new_message.keys()
+                        os.makedirs('output', exist_ok=True)
+                        # Record the invented name, then replace it in the message.
+                        with open('output/hallucination.txt', 'a') as hallucination_log:
+                            print(new_message["function_call"]["name"], file=hallucination_log)
+                        new_message["function_call"]["name"] = "invalid_hallucination_function_name"
+                    elif status == 3:  # final answer
+                        temp_now_node.is_terminal = True
+                        temp_now_node.make_finish(final_answer_back_length)
+
+            temp_now_node.messages.append(new_message)
+            if temp_now_node.node_type == "Action Input":
+                temp_now_node.messages.append({
+                    "role": "function",
+                    "name": new_message["function_call"]["name"],
+                    "content": temp_now_node.observation,
+                })
+            if self.query_count >= max_query_count:
+                # Budget exhausted: ask the model to wrap up on its next turn.
+                temp_now_node.messages.append({
+                    "role": "user",
+                    "content": "you have reached the maximum query count, please call the finish function to give the answer or give up without restart.",
+                })
+            return_value = None
+            if not with_filter:  # DFSDT
+                result = self.DFS(temp_now_node, single_chain_max_step,
+                                  tree_beam_size, max_query_count, answer, with_filter)
+                if len(self.terminal_node) >= answer:
+                    return_value = 10000
+                elif result > 1:
+                    return_value = result-1
+
+            else:
+
+                next_tree_split_nodes.append(temp_now_node)
+            self.send_agent_chain_end(
+                now_depth, agent_block_ids, chain_block_ids)
+            if return_value is not None:
+                return return_value
+
+        # Sort the generated next_tree_split_nodes nodes when normal DFS
+        if len(next_tree_split_nodes) > 1:
+            # When using normal DFS, if we have many child nodes, we will refer to LLM to compare and choose the best one to expand first
+            # remember, this operator will cost extra OpenAI calls.
+            LLM_rank_args = {
+                "functions": self.io_func.functions,
+                "process_id": self.process_id,
+                "task_description": self.io_func.task_description,
+                "rank_func": rank2_subfix,
+            }
+            scores, rank_query_count, total_tokens = sum_based_rankn(
+                self.llm, LLM_rank_args=LLM_rank_args, candidates=next_tree_split_nodes)
+            self.query_count += rank_query_count
+            self.total_tokens += total_tokens
+            for score, node in zip(scores, next_tree_split_nodes):
+                node.prior_score = score
+            zip_value = list(
+                zip(next_tree_split_nodes, range(len(next_tree_split_nodes))))
+            zip_value.sort(
+                key=lambda x: x[0].prior_score, reverse=True)  # expand higher scores first
+            next_tree_split_nodes, filtered_order = zip(*zip_value)
+
+        # Choose one to expand
+        for next_node in next_tree_split_nodes:
+            result = self.DFS(
+                next_node, single_chain_max_step, tree_beam_size, max_query_count, answer)
+            if len(self.terminal_node) >= answer:
+                return 10000
+            elif result > 1:
+                now_node.make_finish(2)
+                return result - 1
+
+        return 1
--- a/toolbench/inference/Algorithms/init.py
+++ b/toolbench/inference/Algorithms/init.py
--- a/toolbench/inference/Algorithms/base_search.py
+++ b/toolbench/inference/Algorithms/base_search.py
@ -0,0 +1,33 @@
+from toolbench.inference.Downstream_tasks.base_env import base_env
+
+class base_search_method:
+    """Base class of the tree search methods; subclasses must support the functions below."""
+
+    def __init__(self, llm, io_func: base_env, process_id=0, callbacks=None):
+        """Accept the shared constructor arguments (nothing is stored here).
+
+        Args:
+            llm: The interface of the LLM.
+            io_func (base_env): Interface to the environment.
+            process_id (int, optional): In multiprocessing annotation, this describes the process id. Defaults to 0.
+            callbacks (optional): Callback objects supplied by the caller. Defaults to None.
+        """
+
+    def to_json(self, answer=False, process=True):
+        '''
+        Return a json object.
+        If "answer" is True, the result must contain the following field, used for answer annotation.
+        If "process" is True, the full information of the tree searching process must be provided.
+
+        "answer_generation": {
+            "valid_data": bool,
+            "final_answer": string,
+            "finish_type": enum["give_up","give_answer"]
+            "train_messages": [ [openAI-message] ],
+        }
+        '''
+        raise NotImplementedError
+
+    def start(self, **args):
+        """Entry point of the searching process; must be overridden."""
+        raise NotImplementedError
+
--- a/toolbench/inference/Algorithms/single_chain.py
+++ b/toolbench/inference/Algorithms/single_chain.py
@ -0,0 +1,189 @@
+import re
+from toolbench.inference.Tree.Tree import my_tree, tree_node
+from toolbench.inference.Prompts.ReAct_prompts import FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION, FORMAT_INSTRUCTIONS_USER_FUNCTION
+from toolbench.inference.Algorithms.base_search import base_search_method
+from copy import deepcopy
+
+class single_chain(base_search_method):
+    """Implement of CoT method
+    """
+    def __init__(self,llm,io_func,extra_prefix="",process_id=0,start_message_list=None):
+        """Keep the collaborators and reset the run statistics.
+
+        extra_prefix and start_message_list are used in the Reflection Algo.
+        """
+        super().__init__(llm, io_func, process_id, callbacks=None)
+        self.llm, self.io_func = llm, io_func
+        self.process_id = process_id
+        self.extra_prefix = extra_prefix
+        self.start_message_list = start_message_list
+
+        self.restart()
+    def restart(self):
+        """Forget the results and counters of earlier runs."""
+        self.status = 0
+        # Per-try summaries and the last node of every try (independent lists).
+        self.try_list, self.terminal_node = [], []
+
+        # query_count is the number of interactions with openai.
+        self.query_count = self.total_tokens = self.success_count = 0
+
+    def to_json(self, answer=False,process=True):
+        """Serialize the finished tries into a JSON-compatible dict.
+
+        Args:
+            process: when true, include the win flag, every try, the recorded
+                start arguments and one chain per unpruned terminal node.
+            answer: when true, add an "answer_generation" entry taken from the
+                first unpruned terminal node.
+
+        Returns:
+            dict. "answer_generation" contains "finish_type" ("give_answer" or
+            "give_up"), as required by `base_search_method.to_json` and as
+            produced by the DFS search.
+        """
+        if process:
+            json_obj = {
+                "win": self.status == 1,
+                "try_count": len(self.try_list),
+                "trys": self.try_list,
+                "compare_candidates": [
+                    node.get_chain_result_from_this_node(use_messages=False)
+                    for node in self.terminal_node
+                    if not node.pruned  # has final answer
+                ],
+                "forward_args":self.forward_args,
+            }
+        else:
+            json_obj = {}
+
+        if answer:
+            answer_info = {
+                "valid_data": False,
+                "final_answer": "",
+                "finish_type": "give_answer",
+                "function": self.io_func.functions,
+                "query_count": self.query_count,
+                "total_tokens": self.total_tokens,
+                "train_messages": [],
+                "chain": [],
+            }
+            json_obj["answer_generation"] = answer_info
+            for node in self.terminal_node:
+                if node.pruned:
+                    continue
+                # Same classification rule as the DFS search.
+                if 'give_up' in node.description.lower():
+                    answer_info["finish_type"] = "give_up"
+                answer_info["valid_data"] = True
+                answer_info["final_answer"] = node.description
+                answer_info["train_messages"] = node.get_train_messages_from_this_node()
+                break
+        return json_obj
+
+    def to_json_single(self):
+        """Summarize the most recent try.
+
+        Though the nodes are formed as a tree, we still know they are
+        actually a chain, so the chain ending in the last terminal node is
+        reported together with the win flag.
+        """
+        last_node = self.terminal_node[-1]
+        return {
+            "chain": last_node.get_chain_result_from_this_node(),
+            "win": self.status == 1,
+        }
+
+    def start(self,single_chain_max_step,pass_at=1,answer=1):
+        """Run up to `pass_at` independent chains.
+
+        The call arguments (without `self`) are recorded in `forward_args`.
+        Returns 1 as soon as `answer` chains have succeeded, otherwise 0.
+        """
+        self.forward_args = {
+            "single_chain_max_step": single_chain_max_step,
+            "pass_at": pass_at,
+            "answer": answer,
+        }
+
+        for attempt in range(pass_at):
+            if self.process_id == 0:
+                print(f"[single_chain]try for the {attempt+1} time")
+            # Every try starts from a fresh tree and a fresh environment copy.
+            self.tree = my_tree()
+            self.tree.root.node_type = "Action Input"
+            self.tree.root.io_state = deepcopy(self.io_func)
+            out_node = self.do_chain(self.tree.root, single_chain_max_step)
+            self.terminal_node.append(out_node)
+            self.try_list.append(self.to_json_single())
+            if out_node.io_state.check_success() != 1:
+                continue
+            self.status = 1
+            self.success_count += 1
+            if self.success_count >= answer:
+                return 1
+        return 0
+
+
+    def do_chain(self,now_node,single_chain_max_step):
+        """Run one chain from the tree root until a node is terminal or pruned.
+
+        Each LLM reply is unfolded into up to three nodes (Thought, Action,
+        Action Input); the call is executed in the environment when the
+        Action Input node is built. The passed `now_node` is replaced by
+        `self.tree.root` before the loop starts. Returns the last node of
+        the chain.
+        """
+        root = self.tree.root
+        if self.start_message_list is None:
+            system = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION.replace(
+                "{task_description}", self.io_func.task_description)
+            root.messages.append({"role":"system","content":system})
+
+            user = FORMAT_INSTRUCTIONS_USER_FUNCTION.replace(
+                "{input_description}", self.io_func.input_description)
+            root.messages.append({"role":"user","content":user})
+        else:
+            # In Reflection Algo, we start with former trials and reflections,
+            # so the caller will give the start messages.
+            root.messages = self.start_message_list
+
+        now_node = root
+        while True:
+            # parse the next message into nodes
+            self.llm.change_messages(now_node.messages)
+            new_message, error_code, total_tokens = self.llm.parse(
+                functions=self.io_func.functions, process_id=self.process_id)
+            self.total_tokens += total_tokens
+            self.query_count += 1
+            assert new_message["role"] == "assistant"
+
+            if "content" in new_message.keys() and new_message["content"] is not None:
+                thought_node = tree_node()
+                thought_node.node_type = "Thought"
+                thought_node.description = new_message["content"]
+                thought_state = deepcopy(now_node.io_state)
+
+                thought_node.io_state = thought_state
+                thought_node.is_terminal = thought_state.check_success() != 0
+                thought_node.messages = now_node.messages.copy()
+                thought_node.father = now_node
+                now_node.children.append(thought_node)
+                thought_node.print(self.process_id)
+                now_node = thought_node
+
+                if error_code != 0:
+                    now_node.observation_code = error_code
+                    now_node.pruned = True
+
+            if "function_call" in new_message.keys():
+                call = new_message["function_call"]
+
+                action_node = tree_node()
+                action_node.node_type = "Action"
+                action_node.description = call["name"]
+                action_state = deepcopy(now_node.io_state)
+
+                action_node.io_state = action_state
+                action_node.is_terminal = action_state.check_success() != 0
+                action_node.messages = now_node.messages.copy()
+                action_node.father = now_node
+                now_node.children.append(action_node)
+
+                action_node.print(self.process_id)
+                now_node = action_node
+
+                function_input = call["arguments"]
+                input_node = tree_node()
+                input_node.node_type = "Action Input"
+                input_node.description = function_input
+                input_state = deepcopy(now_node.io_state)
+
+                # Execute the call on the copied environment state.
+                observation, status = input_state.step(action_name=now_node.description, action_input=function_input)
+                input_node.observation = observation
+                input_node.observation_code = status
+
+                input_node.io_state = input_state
+                input_node.is_terminal = input_state.check_success() != 0
+                input_node.messages = now_node.messages.copy()
+                input_node.father = now_node
+                now_node.children.append(input_node)
+                input_node.print(self.process_id)
+                now_node = input_node
+
+                # return code refers to Downstream_tasks/rapidapi
+                if status == 4:
+                    now_node.pruned = True
+                elif status == 1: # hallucination api name
+                    assert "function_call" in new_message.keys()
+                    new_message["function_call"]["name"] = "invalid_hallucination_function_name"
+
+            now_node.messages.append(new_message)
+            if now_node.node_type == "Action Input":
+                now_node.messages.append({
+                    "role":"function",
+                    "name": new_message["function_call"]["name"],
+                    "content": now_node.observation,
+                })
+            # Too deep without reaching a terminal state: cut the chain here.
+            if now_node.get_depth() >= single_chain_max_step and not (now_node.is_terminal):
+                now_node.pruned = True
+
+            if now_node.pruned or now_node.is_terminal:
+                return now_node
+
+    
--- a/toolbench/inference/Downstream_tasks/init.py
+++ b/toolbench/inference/Downstream_tasks/init.py
--- a/toolbench/inference/Downstream_tasks/base_env.py
+++ b/toolbench/inference/Downstream_tasks/base_env.py
@ -0,0 +1,36 @@
+class base_env:
+    """Abstract environment a search method interacts with."""
+
+    def __init__(self):
+        # Texts that the search methods substitute into their prompts.
+        self.task_description = self.input_description = ""
+        # Tool names and function descriptions (two independent lists).
+        self.tool_names, self.functions = [], []
+
+    def restart(self):
+        '''
+        Restart the environment
+        '''
+        raise NotImplementedError
+
+    def get_score(self):
+        '''
+        Get the value of the current state.
+        A fake function, used to search in oracle mode, which is not actually used (and impossible to obtain)
+        '''
+        raise NotImplementedError
+
+    def step(self, action, input_str):
+        '''
+        Perform an interaction in natural language mode.
+        Return value: (output str, status code)
+        NOTE(review): the search methods call step() with the keywords
+        action_name= / action_input=, so concrete environments presumably
+        accept those names - confirm against the subclasses.
+        '''
+        raise NotImplementedError
+
+    def check_success(self):
+        '''
+        Returns 1 if successful, otherwise returns 0
+        '''
+        raise NotImplementedError
+
+    def to_json(self):
+        '''Serialize the environment; must be provided by subclasses.'''
+        raise NotImplementedError
--- a/toolbench/inference/Downstream_tasks/rapidapi.py
+++ b/toolbench/inference/Downstream_tasks/rapidapi.py
@ -0,0 +1,589 @@
+import re
+import os
+import json
+import time
+import requests
+from tqdm import tqdm
+from termcolor import colored
+import random
+from toolbench.inference.LLM.chatgpt_function_model import ChatGPTFunction, GPT4Function
+from toolbench.inference.LLM.davinci_model import Davinci
+from toolbench.inference.LLM.tool_llama_lora_model import ToolLLaMALoRA
+from toolbench.inference.LLM.tool_llama_model import ToolLLaMA
+from toolbench.inference.LLM.retriever import ToolRetriever
+from toolbench.inference.Algorithms.single_chain import single_chain
+from toolbench.inference.Algorithms.DFS import DFS_tree_search
+from toolbench.inference.server import get_rapidapi_response
+from toolbench.utils import (
+    standardize,
+    change_name,
+    replace_llama_with_condense
+)
+
+from toolbench.inference.Downstream_tasks.base_env import base_env
+# from toolbench.inference.Downstream_tasks.find_api_by_gpt4 import find_apis_with_details
+# Substrings of a response's "error" field; when any of them is present,
+# rapidapi_wrapper._step retries the call with another key from rapidapi_key_list.
+error_list = ['Too many requests error...', 'Rate limit...', 'Unsubscribed', 'Unauthorized', 'not working error...', 'Quota','quota', 'Blocked', 'Rate limit', 'Unauthorized error']
+
+
+# For pipeline environment preparation
+def get_white_list(tool_root_dir):
+    """Index every tool JSON found under ``tool_root_dir``.
+
+    The directory is expected to hold one sub-directory per category, each
+    containing ``<standard_tool_name>.json`` files. Returns a dict keyed by
+    ``standardize(tool_name)`` whose values hold the tool's ``description``
+    and its ``standard_tool_name`` (the file name up to the first dot).
+    """
+    root = os.path.join(tool_root_dir)
+    registry = {}
+    for category in tqdm(os.listdir(root)):
+        category_dir = os.path.join(root, category)
+        # Only category directories are scanned; stray files are ignored.
+        if os.path.isdir(category_dir):
+            for json_name in os.listdir(category_dir):
+                if json_name.endswith(".json"):
+                    file_stem = json_name.split(".")[0]
+                    with open(os.path.join(category_dir, json_name)) as handle:
+                        tool_info = json.load(handle)
+                    raw_tool_name = tool_info["tool_name"]
+                    registry[standardize(raw_tool_name)] = {
+                        "description": tool_info["tool_description"],
+                        "standard_tool_name": file_stem,
+                    }
+    return registry
+
+def contain(candidate_list, white_list):
+    """Look up every candidate in ``white_list``.
+
+    Returns the list of white-list entries in candidate order, or ``False``
+    as soon as one candidate is missing.
+    """
+    matched = []
+    for name in candidate_list:
+        if name in white_list.keys():
+            matched.append(white_list[name])
+        else:
+            return False
+    return matched
+
+
+# rapidapi env wrapper
+class rapidapi_wrapper(base_env):
+    """Environment exposing the RapidAPI tools of one query as callable functions.
+
+    The constructor resolves the APIs of a query (taken from the query JSON, or
+    from the retriever when one is supplied), converts each API into an
+    OpenAI-style function schema, appends the special ``Finish`` function and
+    builds the task prompt. ``step`` executes one function call and returns
+    ``(observation, status_code)``.
+    """
+
+    def __init__(self, query_json, tool_descriptions, retriever, args, process_id=0):
+        """Build the function list and task description for one query.
+
+        Args:
+            query_json: dict with a "query" string and (when no retriever is
+                used) an "api_list" of category/tool/api names.
+            tool_descriptions: list of ``[standard_tool_name, description]``
+                pairs, indexed in parallel with the resolved API list. It is
+                recomputed when ``retriever`` is given.
+            retriever: object with a ``retrieving(query, top_k=...)`` method,
+                or ``None`` to use the APIs listed in ``query_json``.
+            args: parsed command-line arguments.
+            process_id: worker index; only process 0 prints call traces.
+        """
+        # Zero-argument super(): the one-argument form used previously returns
+        # an unbound super object and never ran base_env.__init__.
+        super().__init__()
+
+        self.tool_root_dir = args.tool_root_dir
+        self.toolbench_key = args.toolbench_key
+        self.rapidapi_key = args.rapidapi_key
+        self.use_rapidapi_key = args.use_rapidapi_key
+        self.api_customization = args.api_customization
+        self.service_url = "http://8.218.239.54:8080/rapidapi"
+        self.max_observation_length = args.max_observation_length
+        self.observ_compress_method = args.observ_compress_method
+        self.retriever = retriever
+        self.process_id = process_id
+
+        self.tool_names = []
+        self.cate_names = []
+
+        self.input_description = query_json["query"]
+        self.functions = []
+        self.api_name_reflect = {}
+        # Optional pool of fallback RapidAPI keys. json.load needs a file
+        # object, not a path; a missing or malformed file means "no pool".
+        try:
+            with open('rapidapi_key_list.json') as reader:
+                key_pool = json.load(reader)
+        except (OSError, ValueError):
+            key_pool = []
+        self.rapidapi_key_list = key_pool if isinstance(key_pool, list) else []
+        # NOTE(review): these two assignments override the values read from
+        # args above, which makes the ToolBench-service branch in _step
+        # unreachable. Kept as in the original; confirm this is intended.
+        self.use_rapidapi_key = True
+        self.api_customization = True
+
+        if self.retriever is not None:
+            query_json = self.retrieve_rapidapi_tools(self.input_description, args.retrieved_api_nums, args.tool_root_dir)
+            data_dict = self.fetch_api_json(query_json)
+            tool_descriptions = self.build_tool_description(data_dict)
+        else:
+            data_dict = self.fetch_api_json(query_json)
+
+        # tool_descriptions is indexed in parallel with data_dict["api_list"].
+        for k, api_json in enumerate(data_dict["api_list"]):
+            standard_tool_name = tool_descriptions[k][0]
+            openai_function_json, cate_name, pure_api_name = self.api_json_to_openai_json(api_json, standard_tool_name)
+            self.functions.append(openai_function_json)
+
+            self.api_name_reflect[openai_function_json["name"]] = pure_api_name
+            self.tool_names.append(standard_tool_name)
+            self.cate_names.append(cate_name)
+
+        finish_func = {
+            "name": "Finish",
+            "description": "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. Alternatively, if you recognize that you are unable to proceed with the task in the current state, call this function to restart. Remember: you must ALWAYS call this function at the end of your attempt, and the only part that will be shown to the user is the final answer, so it should contain sufficient information.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "return_type": {
+                        "type": "string",
+                        "enum": ["give_answer","give_up_and_restart"],
+                    },
+                    "final_answer": {
+                        "type": "string",
+                        "description": "The final answer you want to give the user. You should have this field if \"return_type\"==\"give_answer\"",
+                    }
+                },
+                "required": ["return_type"],
+            }
+        }
+
+        self.functions.append(finish_func)
+        self.CALL_MAX_TIME = 3
+        self.task_description = f'''You should use functions to help handle the real time user querys. Remember:
+1.ALWAYS call \"Finish\" function at the end of the task. And the final answer should contain enough information to show to the user,If you can't handle the task, or you find that function calls always fail(the function is not valid now), use function Finish->give_up_and_restart.
+2.Do not use origin tool names, use only subfunctions' names.
+You have access of the following tools:\n'''
+
+        # Several APIs may belong to the same tool; list each tool only once.
+        unduplicated_reflection = {}
+        for standardize_tool_name, tool_des in tool_descriptions:
+            unduplicated_reflection[standardize_tool_name] = tool_des
+
+        for k, (standardize_tool_name, tool_des) in enumerate(unduplicated_reflection.items()):
+            striped = tool_des[:512].replace('\n','').strip()
+            if striped == "":
+                striped = "None"
+            self.task_description += f"{k+1}.{standardize_tool_name}: {striped}\n"
+
+        self.success = 0
+
+    def build_tool_description(self, data_dict):
+        """Return ``[standard_tool_name, description]`` pairs for ``data_dict["api_list"]``."""
+        white_list = get_white_list(self.tool_root_dir)
+        origin_tool_names = [standardize(cont["tool_name"]) for cont in data_dict["api_list"]]
+        tool_des = contain(origin_tool_names, white_list)
+        return [[cont["standard_tool_name"], cont["description"]] for cont in tool_des]
+
+    def retrieve_rapidapi_tools(self, query, top_k, jsons_path):
+        """Retrieve up to ``top_k`` APIs for ``query`` whose tool JSON exists on disk.
+
+        Returns a dict ``{"api_list": [...]}`` whose entries carry
+        ``category_name``, ``tool_name`` and ``api_name``.
+        """
+        retrieved_tools = self.retriever.retrieving(query, top_k=top_k)
+        query_json = {"api_list":[]}
+        for tool_dict in retrieved_tools:
+            if len(query_json["api_list"]) == top_k:
+                break
+            category = tool_dict["category"]
+            tool_name = tool_dict["tool_name"]
+            api_name = tool_dict["api_name"]
+            # The file existing implies its parent directories exist too.
+            if os.path.exists(os.path.join(jsons_path, category, tool_name+".json")):
+                query_json["api_list"].append({
+                    "category_name": category,
+                    "tool_name": tool_name,
+                    "api_name": api_name
+                })
+        return query_json
+
+    def fetch_api_json(self, query_json):
+        """Load the full API definitions for every entry of ``query_json["api_list"]``.
+
+        Each entry is matched by normalised API name inside its tool's JSON
+        file under ``self.tool_root_dir``. Entries with no match are printed
+        and left out of the result.
+        """
+        data_dict = {"api_list":[]}
+        for item in query_json["api_list"]:
+            cate_name = item["category_name"]
+            tool_name = standardize(item["tool_name"])
+            api_name = change_name(standardize(item["api_name"]))
+            with open(os.path.join(self.tool_root_dir, cate_name, tool_name + ".json"), "r") as reader:
+                tool_json = json.load(reader)
+            append_flag = False
+            api_dict_names = []
+            for api_dict in tool_json["api_list"]:
+                api_dict_names.append(api_dict["name"])
+                pure_api_name = change_name(standardize(api_dict["name"]))
+                if pure_api_name != api_name:
+                    continue
+                data_dict["api_list"].append({
+                    "category_name": cate_name,
+                    "api_name": api_dict["name"],
+                    "api_description": api_dict["description"],
+                    "required_parameters": api_dict["required_parameters"],
+                    "optional_parameters": api_dict["optional_parameters"],
+                    "tool_name": tool_json["tool_name"],
+                })
+                append_flag = True
+                break
+            if not append_flag:
+                # Diagnostic output: the requested name and the names that were available.
+                print(api_name, api_dict_names)
+        return data_dict
+
+    @staticmethod
+    def _build_param_schema(para, map_type, description_max_length):
+        """Return ``(normalised_name, schema)`` for one raw parameter description."""
+        name = change_name(standardize(para["name"]))
+        schema = {
+            # Types not present in map_type fall back to "string".
+            "type": map_type[para["type"]] if para["type"] in map_type else "string",
+            "description": para["description"][:description_max_length],
+        }
+        default_value = para.get("default", "")
+        # A non-empty default is exposed to the model as an example value.
+        if len(str(default_value)) != 0:
+            schema["example_value"] = default_value
+        return name, schema
+
+    def api_json_to_openai_json(self, api_json, standard_tool_name):
+        """Convert one API definition into an OpenAI function schema.
+
+        Returns ``(schema, category_name, pure_api_name)``. The function name
+        is ``<pure_api_name>_for_<standard_tool_name>``, truncated to its last
+        64 characters.
+        """
+        description_max_length=256
+        templete =     {
+            "name": "",
+            "description": "",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                },
+                "required": [],
+                "optional": [],
+            }
+        }
+
+        map_type = {
+            "NUMBER": "integer",
+            "STRING": "string",
+            "BOOLEAN": "boolean"
+        }
+
+        pure_api_name = change_name(standardize(api_json["api_name"]))
+        templete["name"] = pure_api_name+ f"_for_{standard_tool_name}"
+        templete["name"] = templete["name"][-64:]
+
+        templete["description"] = f"This is the subfunction for tool \"{standard_tool_name}\", you can use this tool."
+
+        if api_json["api_description"].strip() != "":
+            tuncated_description = api_json['api_description'].strip().replace(api_json['api_name'],templete['name'])[:description_max_length]
+            templete["description"] = templete["description"] + f"The description of this function is: \"{tuncated_description}\""
+
+        for para in api_json.get("required_parameters") or []:
+            name, prompt = self._build_param_schema(para, map_type, description_max_length)
+            templete["parameters"]["properties"][name] = prompt
+            templete["parameters"]["required"].append(name)
+        # Optional parameters are handled independently of the required ones;
+        # previously they were dropped whenever an API had no required parameter.
+        for para in api_json.get("optional_parameters") or []:
+            name, prompt = self._build_param_schema(para, map_type, description_max_length)
+            templete["parameters"]["properties"][name] = prompt
+            templete["parameters"]["optional"].append(name)
+
+        return templete, api_json["category_name"],  pure_api_name
+
+    def check_success(self):
+        """Return 1 once a final answer has been given through ``Finish``, else 0."""
+        return self.success
+
+    def to_json(self):
+        """Return an empty dict; this environment keeps no serialisable state."""
+        return {}
+
+    def restart(self):
+        """No-op: there is nothing to reset."""
+        pass
+
+    def get_score(self):
+        """Return a constant 0.0."""
+        return 0.0
+
+    def step(self,**args):
+        """Run ``_step`` and truncate the observation to ``max_observation_length``."""
+        obs, code = self._step(**args)
+        if len(obs) > self.max_observation_length:
+            obs = obs[:self.max_observation_length] + "..."
+        return obs, code
+
+    def _finish_step(self, action_input):
+        """Handle a call to the ``Finish`` function; return ``(observation, status_code)``."""
+        try:
+            json_data = json.loads(action_input,strict=False)
+        except (ValueError, TypeError):
+            # Best-effort recovery of the two fields from malformed JSON.
+            json_data = {}
+            if '"return_type": "' in action_input:
+                if '"return_type": "give_answer"' in action_input:
+                    return_type = "give_answer"
+                elif '"return_type": "give_up_and_restart"' in action_input:
+                    return_type = "give_up_and_restart"
+                else:
+                    return_type = action_input[action_input.find('"return_type": "')+len('"return_type": "'):action_input.find('",')]
+                json_data["return_type"] = return_type
+            if '"final_answer": "' in action_input:
+                final_answer = action_input[action_input.find('"final_answer": "')+len('"final_answer": "'):]
+                json_data["final_answer"] = final_answer
+        if not isinstance(json_data, dict):
+            # Valid JSON that is not an object carries no usable fields.
+            json_data = {}
+        if "return_type" not in json_data:
+            return "{error:\"must have \"return_type\"\"}", 2
+        if json_data["return_type"] == "give_up_and_restart":
+            return "{\"response\":\"chose to give up and restart\"}",4
+        elif json_data["return_type"] == "give_answer":
+            if "final_answer" not in json_data:
+                return "{error:\"must have \"final_answer\"\"}", 2
+
+            self.success = 1 # successfully return final_answer
+            return "{\"response\":\"successfully giving the final answer.\"}", 3
+        else:
+            return "{error:\"\"return_type\" is not a valid choice\"}", 2
+
+    def _step(self, action_name="", action_input=""):
+        """Execute one function call; return ``(observation string, status code)``.
+
+        Status codes produced by this method:
+            0  normal response
+            1  no function matches ``action_name``
+            2  error in the ``Finish`` input
+            3  final answer given
+            4  the model chose to give up and restart
+            6  "API not working error..."
+            7  "Unauthorized error..."
+            8  "Unsubscribed error..."
+            9  "Too many requests error..."
+            10 "Rate limit per minute error..." (after sleeping 10 seconds)
+            11 "Message error..."
+            12 non-200 or non-JSON reply from the ToolBench service
+            13 the request to the ToolBench service failed
+        """
+        if action_name == "Finish":
+            return self._finish_step(action_input)
+
+        for k, function in enumerate(self.functions):
+            # Only real APIs are callable here. The trailing "Finish" entry has
+            # no API mapping, so a name that is merely a suffix of "Finish"
+            # falls through to the "No such function name" reply.
+            if function["name"] not in self.api_name_reflect:
+                continue
+            if not function["name"].endswith(action_name):
+                continue
+            pure_api_name = self.api_name_reflect[function["name"]]
+            payload = {
+                "category": self.cate_names[k],
+                "tool_name": self.tool_names[k],
+                "api_name": pure_api_name,
+                "tool_input": action_input,
+                "strip": self.observ_compress_method,
+                "toolbench_key": self.toolbench_key
+            }
+            if self.process_id == 0:
+                print(colored(f"query to {self.cate_names[k]}-->{self.tool_names[k]}-->{action_name}",color="yellow"))
+            if self.use_rapidapi_key or self.api_customization:
+                payload["rapidapi_key"] = self.rapidapi_key
+                response = get_rapidapi_response(payload, api_customization=self.api_customization)
+            else:
+                time.sleep(2) # rate limit: 30 per minute
+                headers = {"toolbench_key": self.toolbench_key}
+                try:
+                    response = requests.post(self.service_url, json=payload, headers=headers, timeout=15)
+                except Exception:
+                    # Record the payload that could not be delivered.
+                    os.makedirs('output', exist_ok=True)
+                    with open('output/timeout.txt','a') as log:
+                        print(payload, file=log)
+                    return json.dumps({"error": "connection timeout", "response": ""}), 13
+                if response.status_code != 200:
+                    return json.dumps({"error": f"request invalid, data error. status_code={response.status_code}", "response": ""}), 12
+                try:
+                    response = response.json()
+                except ValueError:
+                    print(response)
+                    return json.dumps({"error": f"request invalid, data error", "response": ""}), 12
+            # Retry with each fallback key in turn while the error looks
+            # key-related (quota, rate limit, subscription, ...). The counter
+            # was previously never initialised.
+            cnt = 0
+            while cnt < len(self.rapidapi_key_list) and any(word in response["error"] for word in error_list):
+                os.makedirs('output', exist_ok=True)
+                with open('output/rapidapi_key_usage.txt','a') as log:
+                    print(f'use rapidapi key {cnt}', file=log)
+                print(colored(f'use rapidapi key {cnt}', 'red'))
+                payload["rapidapi_key"] = self.rapidapi_key_list[cnt]
+                response = get_rapidapi_response(payload, api_customization=self.api_customization)
+                with open('output/rapidapi_key_usage.txt','a') as log:
+                    print(response['error'], file=log)
+                cnt += 1
+            if response["error"] == "API not working error...":
+                status_code = 6
+            elif response["error"] == "Unauthorized error...":
+                status_code = 7
+            elif response["error"] == "Unsubscribed error...":
+                status_code = 8
+            elif response["error"] == "Too many requests error...":
+                status_code = 9
+            elif response["error"] == "Rate limit per minute error...":
+                print("Reach api calling limit per minute, sleeping...")
+                time.sleep(10)
+                status_code = 10
+            elif response["error"] == "Message error...":
+                status_code = 11
+            else:
+                status_code = 0
+            return json.dumps(response), status_code
+        return json.dumps({"error": f"No such function name: {action_name}", "response": ""}), 1
+
+
+class pipeline_runner:
+    """Build and run the list of inference tasks described by ``args``."""
+
+    def __init__(self, args, add_retrieval=False, process_id=0, server=False):
+        """Store the configuration and, outside server mode, build the task list.
+
+        Args:
+            args: parsed command-line arguments.
+            add_retrieval: when true, ``run`` creates a ``ToolRetriever``.
+            process_id: worker index used in log messages.
+            server: when true, no task list is generated up front.
+        """
+        self.args = args
+        self.add_retrieval = add_retrieval
+        self.process_id = process_id
+        self.server = server
+        self.task_list = [] if self.server else self.generate_task_list()
+
+    def get_backbone_model(self):
+        """Return the backbone: a loaded ToolLLaMA model, or the backbone name itself."""
+        args = self.args
+        if args.backbone_model == "toolllama":
+            # ratio = 4 means the sequence length is expanded by 4, remember to change the model_max_length to 8192 (2048 * ratio) for ratio = 4
+            ratio = int(args.max_sequence_length/args.max_source_sequence_length)
+            replace_llama_with_condense(ratio=ratio)
+            if args.lora:
+                backbone_model = ToolLLaMALoRA(base_name_or_path=args.model_path, model_name_or_path=args.lora_path, max_sequence_length=args.max_sequence_length)
+            else:
+                backbone_model = ToolLLaMA(model_name_or_path=args.model_path, max_sequence_length=args.max_sequence_length)
+        else:
+            backbone_model = args.backbone_model
+        return backbone_model
+
+    def get_retriever(self):
+        """Create a ``ToolRetriever`` from the corpus and model paths in ``args``."""
+        return ToolRetriever(corpus_tsv_path=self.args.corpus_tsv_path, model_path=self.args.retrieval_model_path)
+
+    def get_args(self):
+        """Return the arguments this runner was created with."""
+        return self.args
+
+    def generate_task_list(self):
+        """Read the query file and return one task tuple per runnable query.
+
+        Each tuple is ``(method, backbone_model, query_id, data_dict, args,
+        answer_dir, tool_des)``. Queries that list a tool missing from the
+        white list are skipped; queries without an "api_list" get
+        ``tool_des = None``.
+        """
+        args = self.args
+        query_dir = args.input_query_file
+        answer_dir = args.output_answer_file
+        # makedirs also creates missing parent directories, unlike mkdir.
+        os.makedirs(answer_dir, exist_ok=True)
+        method = args.method
+        backbone_model = self.get_backbone_model()
+        white_list = get_white_list(args.tool_root_dir)
+        task_list = []
+        with open(query_dir, "r") as reader:
+            querys = json.load(reader)
+        for query_id, data_dict in enumerate(querys):
+            # An explicit "query_id" field takes precedence over the position.
+            if "query_id" in data_dict:
+                query_id = data_dict["query_id"]
+            if "api_list" in data_dict:
+                origin_tool_names = [standardize(cont["tool_name"]) for cont in data_dict["api_list"]]
+                tool_des = contain(origin_tool_names,white_list)
+                # contain() returns False when any tool is missing.
+                if tool_des is False:
+                    continue
+                tool_des = [[cont["standard_tool_name"], cont["description"]] for cont in tool_des]
+            else:
+                tool_des = None
+            task_list.append((method, backbone_model, query_id, data_dict, args, answer_dir, tool_des))
+        return task_list
+
+    def method_converter(self, backbone_model, openai_key, method, env, process_id, single_chain_max_step=12, max_query_count=60, callbacks=None):
+        """Create the search algorithm named by ``method`` and run it on ``env``.
+
+        ``method`` is either ``CoT@<n>`` (single chain, ``pass_at=n``) or a
+        ``DFS...`` name ending in ``_w<width>``, optionally containing
+        ``woFilter``. Returns ``(chain, result)``.
+
+        Raises:
+            NotImplementedError: if ``method`` is neither CoT nor DFS.
+            AssertionError: if a DFS method does not match ``.+_w<digits>``.
+        """
+        if callbacks is None:
+            callbacks = []
+        if backbone_model == "chatgpt_function":
+            model = "gpt-4-32k"
+            llm_forward = GPT4Function(model=model, openai_key=openai_key)
+        elif backbone_model == "davinci":
+            model = "text-davinci-003"
+            llm_forward = Davinci(model=model, openai_key=openai_key)
+        else:
+            # Any other value is used directly as the model object.
+            model = backbone_model
+            llm_forward = model
+
+        if method.startswith("CoT"):
+            passat = int(method.split("@")[-1])
+            chain = single_chain(llm=llm_forward, io_func=env,process_id=process_id)
+            result = chain.start(
+                                pass_at=passat,
+                                single_chain_max_step=single_chain_max_step,
+                                answer=1)
+        elif method.startswith("DFS"):
+            pattern = r".+_w(\d+)"
+            re_result = re.match(pattern,method)
+            assert re_result is not None, f"DFS method must match '{pattern}': {method}"
+            width = int(re_result.group(1))
+            with_filter = "woFilter" not in method
+            chain = DFS_tree_search(llm=llm_forward, io_func=env,process_id=process_id, callbacks=callbacks)
+            result = chain.start(
+                                single_chain_max_step=single_chain_max_step,
+                                tree_beam_size = width,
+                                max_query_count = max_query_count,
+                                answer=1,
+                                with_filter=with_filter)
+        else:
+            print("invalid method")
+            raise NotImplementedError
+        return chain, result
+
+    def run_single_task(self, method, backbone_model, query_id, data_dict, args, output_dir_path, tool_des, retriever=None, process_id=0, callbacks=None, server= None):
+        """Run one query end to end and write its answer file.
+
+        Returns the search result, or ``None`` when (outside server mode) the
+        answer file already exists. When ``output_dir_path`` is ``None``
+        nothing is read from or written to disk.
+        """
+        if server is None:
+            server = self.server
+        if callbacks is None:
+            if server:
+                print("Warning: no callbacks are defined for server mode")
+            callbacks = []
+        output_file_path = None
+        if output_dir_path is not None:
+            os.makedirs(output_dir_path,exist_ok=True)
+            output_file_path = os.path.join(output_dir_path,f"{query_id}_{method}.json")
+            # Outside server mode an existing answer file means the task is done.
+            if (not server) and os.path.exists(output_file_path):
+                return None
+        for callback in callbacks:
+            callback.on_tool_retrieval_start()
+        env = rapidapi_wrapper(data_dict, tool_des, retriever, args, process_id=process_id)
+        for callback in callbacks:
+            callback.on_tool_retrieval_end(tools=env.functions)
+        query = data_dict["query"]
+        if process_id == 0:
+            print(colored(f"[process({process_id})]now playing {query}, with {len(env.functions)} APIs", "green"))
+        for callback in callbacks:
+            callback.on_request_start(
+                user_input=query,
+                method=method,
+            )
+        chain,result = self.method_converter(
+            backbone_model=backbone_model,
+            openai_key=args.openai_key,
+            method=method,
+            env=env,
+            process_id=process_id,
+            single_chain_max_step=12,
+            max_query_count=200,
+            callbacks=callbacks
+        )
+        for callback in callbacks:
+            callback.on_request_end(
+                chain=chain.terminal_node[0].messages,
+                outputs=chain.terminal_node[0].description,
+            )
+        if output_file_path is not None:
+            print('#'*100)
+            print(output_file_path)
+            with open(output_file_path,"w") as writer:
+                data = chain.to_json(answer=True,process=True)
+                data["answer_generation"]["query"] = query
+                json.dump(data, writer, indent=2)
+                success = data["answer_generation"]["valid_data"] and "give_answer" in data["answer_generation"]["final_answer"]
+                print(colored(f"[process({process_id})]valid={success}", "green"))
+        return result
+
+    def run(self):
+        """Run every task whose answer file does not exist yet.
+
+        The task list is shuffled in place with a fixed seed (42) before the
+        finished tasks are filtered out.
+        """
+        task_list = self.task_list
+        random.seed(42)
+        random.shuffle(task_list)
+        print(f"total tasks: {len(task_list)}")
+        # Task layout: index 2 is the query id, index -2 is the answer directory.
+        task_list = [
+            task for task in task_list
+            if not os.path.exists(os.path.join(task[-2], f"{task[2]}_{self.args.method}.json"))
+        ]
+        print(f"undo tasks: {len(task_list)}")
+        retriever = self.get_retriever() if self.add_retrieval else None
+        for k, task in enumerate(task_list):
+            print(f"process[{self.process_id}] doing task {k}/{len(task_list)}: real_task_id_{task[2]}")
+            self.run_single_task(*task, retriever=retriever, process_id=self.process_id)
+
--- a/toolbench/inference/LLM/init.py
+++ b/toolbench/inference/LLM/init.py
--- a/toolbench/inference/LLM/base_io.py
+++ b/toolbench/inference/LLM/base_io.py
@ -0,0 +1,4 @@
+import re
+
+def base_io(input_str):
+    """Placeholder I/O hook: accepts ``input_str`` and does nothing."""
+    return None
--- a/toolbench/inference/LLM/chatgpt_function_model.py
+++ b/toolbench/inference/LLM/chatgpt_function_model.py
@ -0,0 +1,240 @@
+import json
+import openai
+from tenacity import retry, wait_random_exponential, stop_after_attempt
+from termcolor import colored
+import time
+import random
+import openai
+import os
+from arguments import parse_args
+from config import *
+from openai_utils import call_gpt
+args = parse_args()
+output_dir = args.output_dir
+# import importlib
+# module = importlib.import_module(args.openai_config_path.replace('.py',''))
+# for name in dir(module):
+#     if not name.startswith('_'):
+#         globals()[name] = getattr(module, name)
+# api_key = globals()['api_key']
+# api_version = globals()['api_version']
+# model_name = globals()['model_name']
+# api_base = globals()['api_base']
+# `api_type`, `api_key`, `api_version` and `api_base` are expected to come from
+# `from config import *` above.
+if api_type == "azure":
+    from openai import AzureOpenAI as Client
+    client = Client(
+        api_key=api_key,
+        api_version=api_version,
+        azure_endpoint=api_base,
+    )
+else:
+    from openai import OpenAI as Client
+    # The plain OpenAI client does not accept `api_version` / `azure_endpoint`
+    # (passing them raises TypeError); a custom endpoint goes through `base_url`.
+    # An empty `api_base` falls back to the library default endpoint.
+    client = Client(
+        api_key=api_key,
+        base_url=api_base or None,
+    )
+@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
+def chat_completion_request(key, messages, functions=None,function_call=None,key_pos=None, model="gpt-4-32k",stop=None,process_id=0, **args):
+    """Send one chat-completion request through ``call_gpt``.
+
+    Args:
+        key: Accepted for interface compatibility; not used in this function.
+        messages: Conversation messages. Entries whose ``"valid"`` field equals
+            ``False`` are left out of the request.
+        functions: Optional function specifications, forwarded when not None.
+        function_call: Optional function-call directive, forwarded when not None.
+        key_pos: Accepted for interface compatibility; not used here.
+        model: Accepted for interface compatibility; the request is sent with the
+            module-level ``model_name`` instead.
+        stop: Optional stop sequence(s), forwarded when not None.
+        process_id: Accepted for interface compatibility; not used here.
+        **args: Extra request fields merged into the payload.
+
+    Returns:
+        The response decoded into a dict with ``tool_calls`` removed from the
+        first choice's message, or the caught exception object when the call or
+        the decoding fails. Because failures are returned instead of raised,
+        the ``@retry`` decorator does not re-run them.
+    """
+    use_messages = [
+        message
+        for message in messages
+        # Equality (not identity) on purpose: mirrors the original `== False` check.
+        if not ("valid" in message and message["valid"] == False)  # noqa: E712
+    ]
+    json_data = {
+        "model": model_name,
+        "messages": use_messages,
+        "max_tokens": 1024,
+        "frequency_penalty": 0,
+        "presence_penalty": 0,
+        **args
+    }
+    if stop is not None:
+        json_data.update({"stop": stop})
+    if functions is not None:
+        json_data.update({"functions": functions})
+    if function_call is not None:
+        json_data.update({"function_call": function_call})
+
+    try:
+        openai_response = call_gpt(
+            **json_data,
+        )
+        json_data = json.loads(openai_response.json())
+        # `tool_calls` may be missing from the serialized message; a default keeps
+        # an otherwise valid response from being reported as a failure.
+        json_data["choices"][0]['message'].pop('tool_calls', None)
+        return json_data
+
+    except Exception as e:
+        print("Unable to generate ChatCompletion response")
+        print(f"OpenAI calling Exception: {e}")
+        return e
+
+class ChatGPTFunction:
+    """Conversation holder that queries the model via ``chat_completion_request``."""
+
+    def __init__(self, model="gpt-3.5-turbo-16k-0613", openai_key=""):
+        self.model = model
+        self.conversation_history = []
+        self.openai_key = openai_key
+        self.time = time.time()
+        # Maximum number of request attempts made by parse().
+        self.TRY_TIME = 6
+
+    def add_message(self, message):
+        """Append one message dict to the conversation history."""
+        self.conversation_history.append(message)
+
+    def change_messages(self,messages):
+        """Replace the conversation history with ``messages`` (not copied)."""
+        self.conversation_history = messages
+
+    def display_conversation(self, detailed=False):
+        """Print every message, coloured by role. ``detailed`` is unused."""
+        role_to_color = {
+            "system": "red",
+            "user": "green",
+            "assistant": "blue",
+            "function": "magenta",
+        }
+        print("before_print"+"*"*50)
+        for message in self.conversation_history:
+            print_obj = f"{message['role']}: {message['content']} "
+            if "function_call" in message:
+                print_obj = print_obj + f"function_call: {message['function_call']}"
+            print(
+                colored(
+                    print_obj,
+                    role_to_color[message["role"]],
+                )
+            )
+        print("end_print"+"*"*50)
+
+    def parse(self,functions,process_id,key_pos=None,**args):
+        """Request the next assistant message for the current history.
+
+        Args:
+            functions: Function specifications; an empty list means none are sent.
+            process_id: Worker id used in log lines (token usage is printed only
+                for process 0).
+            key_pos: Forwarded to ``chat_completion_request``.
+            **args: Extra request fields forwarded to ``chat_completion_request``.
+
+        Returns:
+            ``(message, error_code, total_tokens)``. On success ``error_code`` is 0.
+            After ``TRY_TIME`` failed attempts the message content is the string
+            form of the last response, ``error_code`` is -1 and the tokens are 0.
+        """
+        self.time = time.time()
+        for conversation in self.conversation_history:
+            if 'content' not in conversation:
+                conversation['content'] = ''
+                # Dump the history for debugging; `with` closes the file handle.
+                with open('tmp.txt','a') as dump_file:
+                    print(self.conversation_history, file=dump_file)
+        conversation_history = self.conversation_history
+        extra = dict(args)
+        if functions != []:
+            extra["functions"] = functions
+        json_data = None
+        for attempt in range(self.TRY_TIME):
+            if attempt != 0:
+                # Back off before every retry.
+                time.sleep(15)
+            json_data = chat_completion_request(
+                self.openai_key, conversation_history, process_id=process_id, key_pos=key_pos, **extra
+            )
+            try:
+                total_tokens = json_data['usage']['total_tokens']
+                message = json_data["choices"][0]["message"]
+                if process_id == 0:
+                    print(f"[process({process_id})]total tokens: {json_data['usage']['total_tokens']}")
+
+                # Keep only the last dotted component of a namespaced function name.
+                if "function_call" in message and "." in message["function_call"]["name"]:
+                    message["function_call"]["name"] = message["function_call"]["name"].split(".")[-1]
+
+                return message, 0, total_tokens
+            except Exception as e:
+                # `Exception` (not `BaseException`) so Ctrl-C / SystemExit still
+                # stop the process instead of triggering another retry.
+                print(f"[process({process_id})]Parsing Exception: {repr(e)}. Try again.")
+                if json_data is not None:
+                    print(f"[process({process_id})]OpenAI return: {json_data}")
+
+        return {"role": "assistant", "content": str(json_data)}, -1, 0
+
+    
+class GPT4Function:
+    """Conversation holder that queries the model via ``chat_completion_request``,
+    forwarding ``self.model`` with every request."""
+
+    def __init__(self, model="gpt-4-0613", openai_key=""):
+        self.model = model
+        self.conversation_history = []
+        self.openai_key = openai_key
+        self.time = time.time()
+        # Maximum number of request attempts made by parse().
+        self.TRY_TIME = 6
+
+    def add_message(self, message):
+        """Append one message dict to the conversation history."""
+        self.conversation_history.append(message)
+
+    def change_messages(self,messages):
+        """Replace the conversation history with ``messages`` (not copied)."""
+        self.conversation_history = messages
+
+    def display_conversation(self, detailed=False):
+        """Print every message, coloured by role. ``detailed`` is unused."""
+        role_to_color = {
+            "system": "red",
+            "user": "green",
+            "assistant": "blue",
+            "function": "magenta",
+        }
+        print("before_print"+"*"*50)
+        for message in self.conversation_history:
+            print_obj = f"{message['role']}: {message['content']} "
+            if "function_call" in message:
+                print_obj = print_obj + f"function_call: {message['function_call']}"
+            print(
+                colored(
+                    print_obj,
+                    role_to_color[message["role"]],
+                )
+            )
+        print("end_print"+"*"*50)
+
+    def parse(self,functions,process_id,key_pos=None,**args):
+        """Request the next assistant message for the current history.
+
+        Args:
+            functions: Function specifications; an empty list means none are sent.
+            process_id: Worker id used in log lines (token usage is printed only
+                for process 0).
+            key_pos: Forwarded to ``chat_completion_request``.
+            **args: Extra request fields forwarded to ``chat_completion_request``.
+
+        Returns:
+            ``(message, error_code, total_tokens)``. On success ``error_code`` is 0.
+            After ``TRY_TIME`` failed attempts the message content is the string
+            form of the last response, ``error_code`` is -1 and the tokens are 0.
+        """
+        self.time = time.time()
+        for conversation in self.conversation_history:
+            if 'content' not in conversation:
+                conversation['content'] = ''
+                # Dump the history for debugging; `with` closes the file handle.
+                with open('tmp.txt','a') as dump_file:
+                    print(self.conversation_history, file=dump_file)
+        conversation_history = self.conversation_history
+        extra = dict(args)
+        if functions != []:
+            extra["functions"] = functions
+        json_data = None
+        for attempt in range(self.TRY_TIME):
+            if attempt != 0:
+                # Back off before every retry.
+                time.sleep(15)
+            json_data = chat_completion_request(
+                self.openai_key, conversation_history, process_id=process_id, key_pos=key_pos, model=self.model, **extra
+            )
+            try:
+                total_tokens = json_data['usage']['total_tokens']
+                message = json_data["choices"][0]["message"]
+                if process_id == 0:
+                    print(f"[process({process_id})]total tokens: {json_data['usage']['total_tokens']}")
+
+                # Keep only the last dotted component of a namespaced function name.
+                if "function_call" in message and "." in message["function_call"]["name"]:
+                    message["function_call"]["name"] = message["function_call"]["name"].split(".")[-1]
+
+                return message, 0, total_tokens
+            except Exception as e:
+                # `Exception` (not `BaseException`) so Ctrl-C / SystemExit still
+                # stop the process instead of triggering another retry.
+                print(f"[process({process_id})]Parsing Exception: {repr(e)}. Try again.")
+                if json_data is not None:
+                    print(f"[process({process_id})]OpenAI return: {json_data}")
+
+        return {"role": "assistant", "content": str(json_data)}, -1, 0
+
+
+if __name__ == "__main__":
+    # Manual smoke test. The prompt (in Chinese) asks the model to correct any
+    # grammar mistakes in the English sentence and to give the corrected form.
+    prompt = '''下面这句英文可能有语病，能不能把语病都改掉？
+If you think you get the result which can answer the task, call this function to give the final answer. Or, if you think you can't handle the task from this status, call this function to restart. Remember: you should ALWAYS call this function at the end of your try, and the final answer is the ONLY part that will be showed to user, so final answer should contain enough information.
+没语病的形式：
+'''
+    llm = GPT4Function()
+    llm.change_messages([
+        {"role":"system","content":""},
+        {"role":"user","content":prompt},
+    ])
+    # parse() returns (message, error_code, total_tokens); only the message is shown.
+    output, _error_code, _token_usage = llm.parse(functions=[],process_id=0)
+    print(output)
--- a/toolbench/inference/LLM/davinci_model.py
+++ b/toolbench/inference/LLM/davinci_model.py
@ -0,0 +1,132 @@
+#!/usr/bin/env python
+# coding=utf-8
+from typing import Optional, List, Mapping, Any
+from termcolor import colored
+import json
+import random
+import openai
+from typing import Optional
+from toolbench.model.model_adapter import get_conversation_template
+from toolbench.inference.utils import SimpleChatIO, react_parser
+from toolbench.inference.Prompts.ReAct_prompts import FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT
+
+
+class Davinci:
+    """Completion-model wrapper that builds a ReAct-style tool prompt."""
+
+    def __init__(self, model="text-davinci-003", openai_key="") -> None:
+        super().__init__()
+        self.model = model
+        self.openai_key = openai_key
+        self.chatio = SimpleChatIO()
+        # Initialised here so add_message()/parse() work before change_messages().
+        self.conversation_history = []
+
+    def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> tuple:
+        """Complete ``prompt`` and return ``(text, usage)``.
+
+        The request is retried on any exception; once the retry budget is used up
+        a fixed error text is returned. ``stop`` is accepted but not used: the
+        stop sequence is always ``"End Action"``.
+
+        Returns:
+            ``(text, usage)`` where ``usage`` is the ``"usage"`` entry of the last
+            response received, or an all-zero usage dict when no request ever
+            returned a response.
+        """
+        max_try = 10
+        response = None
+        # NOTE(review): `openai.Completion.create` is the pre-1.0 openai interface;
+        # confirm the installed openai version still provides it.
+        openai.api_key = self.openai_key
+        while True:
+            try:
+                response = openai.Completion.create(
+                    engine=self.model,
+                    prompt=prompt,
+                    temperature=0.5,
+                    max_tokens=512,
+                    top_p=1,
+                    frequency_penalty=0,
+                    presence_penalty=0,
+                    stop="End Action"
+                )
+                result = response['choices'][0]['text'].strip()
+                break
+            except Exception as e:
+                print(e)
+                max_try -= 1
+                if max_try < 0:
+                    result = "Exceed max retry times. Please check your davinci api calling."
+                    break
+        if response is None:
+            # Every attempt failed before a response existed; report zero usage
+            # instead of failing on an unbound `response`.
+            usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
+        else:
+            usage = response["usage"]
+        return result, usage
+
+    def add_message(self, message):
+        """Append one message dict to the conversation history."""
+        self.conversation_history.append(message)
+
+    def change_messages(self,messages):
+        """Replace the conversation history with ``messages`` (not copied)."""
+        self.conversation_history = messages
+
+    def display_conversation(self, detailed=False):
+        """Print every message, coloured by role. ``detailed`` is unused."""
+        role_to_color = {
+            "system": "red",
+            "user": "green",
+            "assistant": "blue",
+            "function": "magenta",
+        }
+        print("before_print"+"*"*50)
+        for message in self.conversation_history:
+            print_obj = f"{message['role']}: {message['content']} "
+            if "function_call" in message:
+                print_obj = print_obj + f"function_call: {message['function_call']}"
+            print(
+                colored(
+                    print_obj,
+                    role_to_color[message["role"]],
+                )
+            )
+        print("end_print"+"*"*50)
+
+    def parse(self,functions,process_id,**args):
+        """Build the prompt from the history and parse the model's ReAct reply.
+
+        Args:
+            functions: Function specifications (``name``, ``description``,
+                ``parameters``) described to the model in the prompt.
+            process_id: Accepted for interface compatibility; not used here.
+            **args: Accepted for interface compatibility; not used here.
+
+        Returns:
+            ``(message, 0, total_tokens)`` where ``message`` is an assistant
+            message carrying the parsed thought and function call.
+        """
+        conv = get_conversation_template("tool-llama-single-round")
+        roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
+        conversation_history = self.conversation_history
+        # The question is the content of the first user message.
+        question = ''
+        for message in conversation_history:
+            if roles[message['role']] == "User":
+                question = message['content']
+                break
+        desc_marker = "The description of this function is: "
+        func_str = ""
+        func_list = []
+        for function_dict in functions:
+            param_str = ""
+            api_name = function_dict["name"]
+            func_list.append(api_name)
+            if "Finish" in api_name:
+                param_str = '"return_type": string, "final_answer": string, '
+                api_desc = "If you believe that you have obtained a result that can answer the task, please call this function to provide the final answer. ALWAYS call this function at the end of your attempt to answer the question finally."
+                func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n"
+            else:
+                description = function_dict["description"]
+                # Keep the text after the marker; when the marker is absent use the
+                # whole description instead of slicing from a bogus offset.
+                _, found, tail = description.partition(desc_marker)
+                api_desc = tail if found else description
+                for param_name in function_dict["parameters"]["properties"]:
+                    data_type = function_dict["parameters"]["properties"][param_name]["type"]
+                    param_str += f'"{param_name}": {data_type}, '
+                # Doubled braces are collapsed to single braces further down.
+                param_str = "{{" + param_str + "}}"
+                func_str += f"{api_name}: {api_desc}. Your input should be a json (args json schema): {param_str} The Action to trigger this API should be {api_name} and the input parameters should be a json dict string. Pay attention to the type of parameters.\n\n"
+        func_list = str(func_list)
+        prompt = FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT.replace("{func_str}", func_str).replace("{func_list}", func_list).replace("{question}", question)
+        prompt = prompt.replace("{{", "{").replace("}}", "}")
+        for message in conversation_history:
+            role = roles[message['role']]
+            content = message['content']
+            if role == "Assistant":
+                prompt += f"\n{content}\n"
+            elif role == "Function":
+                prompt += f"Observation: {content}\n"
+        predictions, usage = self.prediction(prompt)
+
+        # react format prediction
+        thought, action, action_input = react_parser(predictions)
+        message = {
+            "role": "assistant",
+            "content": thought,
+            "function_call": {
+                "name": action,
+                "arguments": action_input
+            }
+        }
+        return message, 0, usage["total_tokens"]
+
+
+if __name__ == "__main__":
+    # Manual smoke test: send one prompt and print what prediction() returns.
+    print(Davinci().prediction("How old are you?"))
--- a/toolbench/inference/LLM/llama_model.py
+++ b/toolbench/inference/LLM/llama_model.py
@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# coding=utf-8
+from typing import Optional, List, Mapping, Any
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from termcolor import colored
+import time
+from typing import Optional
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM
+)
+from toolbench.utils import process_system_message
+from toolbench.model.model_adapter import get_conversation_template
+from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser
+
+
+class LlamaModel:
+    """Local HuggingFace causal LM driven with the ReAct-style tool prompt."""
+
+    def __init__(self, model_name_or_path: str, template:str="tool-llama-single-round", device: str="cuda", cpu_offloading: bool=False, max_sequence_length: int=2048) -> None:
+        super().__init__()
+        self.model_name = model_name_or_path
+        self.template = template
+        self.max_sequence_length = max_sequence_length
+        # Remembered so prediction() generates on the device selected here.
+        self.device = device
+        # Initialised here so add_message()/parse() work before change_messages().
+        self.conversation_history = []
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, model_max_length=self.max_sequence_length)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path, low_cpu_mem_usage=True
+        )
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>"})
+            self.model.resize_token_embeddings(len(self.tokenizer))
+        self.use_gpu = device == "cuda"
+        if (device == "cuda" and not cpu_offloading) or device == "mps":
+            self.model.to(device)
+        self.chatio = SimpleChatIO()
+
+    def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Generate a completion for ``prompt`` and return it stripped.
+
+        ``stop`` is accepted but not used; generation always stops on ``"</s>"``.
+        """
+        gen_params = {
+            "model": "",
+            "prompt": prompt,
+            "temperature": 0.5,
+            "max_new_tokens": 512,
+            "stop": "</s>",
+            "stop_token_ids": None,
+            "echo": False
+        }
+        # Use the configured device rather than a hard-coded "cuda" so "mps" and
+        # "cpu" setups generate on the device the model was placed on.
+        output_stream = generate_stream(self.model, self.tokenizer, gen_params, self.device, self.max_sequence_length, force_generate=True)
+        outputs = self.chatio.return_output(output_stream)
+        return outputs.strip()
+
+    def add_message(self, message):
+        """Append one message dict to the conversation history."""
+        self.conversation_history.append(message)
+
+    def change_messages(self,messages):
+        """Replace the conversation history with ``messages`` (not copied)."""
+        self.conversation_history = messages
+
+    def display_conversation(self, detailed=False):
+        """Print every message, coloured by role. ``detailed`` is unused."""
+        role_to_color = {
+            "system": "red",
+            "user": "green",
+            "assistant": "blue",
+            "function": "magenta",
+        }
+        print("before_print"+"*"*50)
+        for message in self.conversation_history:
+            print_obj = f"{message['role']}: {message['content']} "
+            if "function_call" in message:
+                print_obj = print_obj + f"function_call: {message['function_call']}"
+            print(
+                colored(
+                    print_obj,
+                    role_to_color[message["role"]],
+                )
+            )
+        print("end_print"+"*"*50)
+
+    def parse(self,functions,process_id,**args):
+        """Build the prompt from the history and parse the model's ReAct reply.
+
+        Args:
+            functions: Function specifications; when non-empty they are merged
+                into the system message via ``process_system_message``.
+            process_id: Worker id; the token count is printed only for process 0.
+            **args: Accepted for interface compatibility; not used here.
+
+        Returns:
+            ``(message, 0, token_count)`` where ``message`` is an assistant
+            message carrying the parsed thought and function call.
+
+        Raises:
+            ValueError: If ``self.template`` is not one of the supported names.
+        """
+        conv = get_conversation_template(self.template)
+        if self.template == "tool-llama":
+            roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+        elif self.template in ("tool-llama-single-round", "tool-llama-multi-rounds"):
+            roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
+        else:
+            # Fail with a clear message instead of an unbound `roles` further down.
+            raise ValueError(f"Unsupported conversation template: {self.template}")
+
+        self.time = time.time()
+        prompt = ''
+        for message in self.conversation_history:
+            role = roles[message['role']]
+            content = message['content']
+            if role == "System" and functions != []:
+                content = process_system_message(content, functions)
+            prompt += f"{role}: {content}\n"
+        prompt += "Assistant:\n"
+        predictions = self.prediction(prompt)
+
+        # Count tokens via "input_ids"; len() of the tokenizer output itself is the
+        # number of encoding fields, not the number of tokens.
+        decoded_token_len = len(self.tokenizer(predictions)["input_ids"])
+        if process_id == 0:
+            print(f"[process({process_id})]total tokens: {decoded_token_len}")
+
+        # react format prediction
+        thought, action, action_input = react_parser(predictions)
+        if len(thought.strip()) > 1:
+            print(thought)
+        message = {
+            "role": "assistant",
+            "content": thought,
+            "function_call": {
+                "name": action,
+                "arguments": action_input
+            }
+        }
+        return message, 0, decoded_token_len
+
+
+if __name__ == "__main__":
+    # Manual smoke test: load a model and run one parse() on a "24 game" task.
+    # can accept all huggingface LlamaModel family
+    llm = LlamaModel("decapoda-research/llama-7b-hf")
+    messages = [
+        {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do
+the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go
+back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each
+step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look
+at the input format'''}, 
+{'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'}
+]
+    functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way
+to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}]
+
+    llm.change_messages(messages)
+    # parse() requires `process_id`; omitting it raises TypeError.
+    output = llm.parse(functions=functions, process_id=0)
+    print(output)
--- a/toolbench/inference/LLM/retriever.py
+++ b/toolbench/inference/LLM/retriever.py
@ -0,0 +1,56 @@
+import time
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+import json
+import re
+from toolbench.utils import standardize, standardize_category, change_name, process_retrieval_ducoment
+
+
+class ToolRetriever:
+    """Embeds a TSV tool corpus once and retrieves the tools closest to a query."""
+
+    def __init__(self, corpus_tsv_path = "", model_path=""):
+        self.corpus_tsv_path = corpus_tsv_path
+        self.model_path = model_path
+        self.corpus, self.corpus2tool = self.build_retrieval_corpus()
+        self.embedder = self.build_retrieval_embedder()
+        self.corpus_embeddings = self.build_corpus_embeddings()
+
+    def build_retrieval_corpus(self):
+        """Read the TSV corpus and return ``(documents, document -> tool mapping)``."""
+        print("Building corpus...")
+        documents_df = pd.read_csv(self.corpus_tsv_path, sep='\t')
+        corpus, corpus2tool = process_retrieval_ducoment(documents_df)
+        # Flatten the id -> document mapping into a list in key order.
+        corpus = [corpus[cid] for cid in corpus]
+        return corpus, corpus2tool
+
+    def build_retrieval_embedder(self):
+        """Load the sentence-embedding model found at ``self.model_path``."""
+        print("Building embedder...")
+        return SentenceTransformer(self.model_path)
+
+    def build_corpus_embeddings(self):
+        """Encode every corpus document into a tensor of embeddings."""
+        print("Building corpus embeddings with embedder...")
+        return self.embedder.encode(self.corpus, convert_to_tensor=True)
+
+    def retrieving(self, query, top_k=5, excluded_tools=None):
+        """Return the tools whose documents are most similar to ``query``.
+
+        Args:
+            query: Query text to embed.
+            top_k: Size hint; ``10 * top_k`` candidates are fetched from the
+                semantic search and the result is not truncated to ``top_k`` here.
+            excluded_tools: Optional mapping ``category -> tool names`` to drop
+                from the result. ``None`` (the default) excludes nothing.
+
+        Returns:
+            A list of ``{"category", "tool_name", "api_name"}`` dicts in search
+            rank order.
+        """
+        if excluded_tools is None:
+            excluded_tools = {}
+        print("Retrieving...")
+        query_embedding = self.embedder.encode(query, convert_to_tensor=True)
+        hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=10*top_k, score_function=util.cos_sim)
+        retrieved_tools = []
+        for hit in hits[0]:
+            category, tool_name, api_name = self.corpus2tool[self.corpus[hit['corpus_id']]].split('\t')
+            if category in excluded_tools and tool_name in excluded_tools[category]:
+                continue
+            retrieved_tools.append({
+                "category": category,
+                "tool_name": tool_name,
+                "api_name": api_name
+            })
+        return retrieved_tools
--- a/toolbench/inference/LLM/tool_llama_lora_model.py
+++ b/toolbench/inference/LLM/tool_llama_lora_model.py
@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# coding=utf-8
+import time
+from termcolor import colored
+from typing import Optional, List
+from peft import PeftModel
+import torch
+from typing import Optional
+import torch
+from transformers import (
+    AutoTokenizer,
+    LlamaForCausalLM,
+)
+from toolbench.utils import process_system_message
+from toolbench.model.model_adapter import get_conversation_template
+from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser
+
+
+class ToolLLaMALoRA:
+    """LLaMA base model with a LoRA adapter, driven with the ReAct-style tool prompt."""
+
+    def __init__(
+            self, 
+            base_name_or_path: str, 
+            model_name_or_path: str, 
+            template:str="tool-llama-single-round", 
+            device: str="cuda", 
+            cpu_offloading: bool=False, 
+            load_8bit: bool=False,
+            max_sequence_length: int=8192
+        ) -> None:
+        super().__init__()
+        self.model_name = model_name_or_path
+        self.template = template
+        self.max_sequence_length = max_sequence_length
+        # Remembered so prediction() generates on the device selected here.
+        self.device = device
+        # Initialised here so add_message()/parse() work before change_messages().
+        self.conversation_history = []
+        self.tokenizer = AutoTokenizer.from_pretrained(base_name_or_path, use_fast=False, model_max_length=self.max_sequence_length, padding_side="right")
+        model = LlamaForCausalLM.from_pretrained(
+            base_name_or_path,
+            load_in_8bit=load_8bit,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+        # Wrap the base model with the adapter weights from `model_name_or_path`.
+        self.model = PeftModel.from_pretrained(
+            model,
+            model_name_or_path,
+            torch_dtype=torch.float16,
+        )
+        self.tokenizer.pad_token = self.tokenizer.unk_token
+
+        self.use_gpu = device == "cuda"
+        if (device == "cuda" and not cpu_offloading) or device == "mps":
+            self.model.to(device)
+        self.chatio = SimpleChatIO()
+
+    def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Generate a completion for ``prompt`` and return it stripped.
+
+        ``stop`` is accepted but not used; generation always stops on ``"</s>"``.
+        """
+        gen_params = {
+            "model": "",
+            "prompt": prompt,
+            "temperature": 0.5,
+            "max_new_tokens": 512,
+            "stop": "</s>",
+            "stop_token_ids": None,
+            "echo": False
+        }
+        # Use the configured device rather than a hard-coded "cuda" so "mps" and
+        # "cpu" setups generate on the device the model was placed on.
+        output_stream = generate_stream(self.model, self.tokenizer, gen_params, self.device, self.max_sequence_length, force_generate=True)
+        outputs = self.chatio.return_output(output_stream)
+        return outputs.strip()
+
+    def add_message(self, message):
+        """Append one message dict to the conversation history."""
+        self.conversation_history.append(message)
+
+    def change_messages(self,messages):
+        """Replace the conversation history with ``messages`` (not copied)."""
+        self.conversation_history = messages
+
+    def display_conversation(self, detailed=False):
+        """Print every message, coloured by role. ``detailed`` is unused."""
+        role_to_color = {
+            "system": "red",
+            "user": "green",
+            "assistant": "blue",
+            "function": "magenta",
+        }
+        print("before_print"+"*"*50)
+        for message in self.conversation_history:
+            print_obj = f"{message['role']}: {message['content']} "
+            if "function_call" in message:
+                print_obj = print_obj + f"function_call: {message['function_call']}"
+            print(
+                colored(
+                    print_obj,
+                    role_to_color[message["role"]],
+                )
+            )
+        print("end_print"+"*"*50)
+
+    def parse(self,functions,process_id,**args):
+        """Build the prompt from the history and parse the model's ReAct reply.
+
+        Args:
+            functions: Function specifications; when non-empty they are merged
+                into the system message via ``process_system_message``.
+            process_id: Worker id; the token count is printed only for process 0.
+            **args: Accepted for interface compatibility; not used here.
+
+        Returns:
+            ``(message, 0, token_count)`` where ``message`` is an assistant
+            message carrying the parsed thought and function call.
+
+        Raises:
+            ValueError: If ``self.template`` is not one of the supported names.
+        """
+        conv = get_conversation_template(self.template)
+        if self.template == "tool-llama":
+            roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+        elif self.template in ("tool-llama-single-round", "tool-llama-multi-rounds"):
+            roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
+        else:
+            # Fail with a clear message instead of an unbound `roles` further down.
+            raise ValueError(f"Unsupported conversation template: {self.template}")
+
+        self.time = time.time()
+        prompt = ''
+        for message in self.conversation_history:
+            role = roles[message['role']]
+            content = message['content']
+            if role == "System" and functions != []:
+                content = process_system_message(content, functions)
+            prompt += f"{role}: {content}\n"
+        prompt += "Assistant:\n"
+        predictions = self.prediction(prompt)
+
+        # Count tokens via "input_ids"; len() of the tokenizer output itself is the
+        # number of encoding fields, not the number of tokens.
+        decoded_token_len = len(self.tokenizer(predictions)["input_ids"])
+        if process_id == 0:
+            print(f"[process({process_id})]total tokens: {decoded_token_len}")
+
+        # react format prediction
+        thought, action, action_input = react_parser(predictions)
+        message = {
+            "role": "assistant",
+            "content": thought,
+            "function_call": {
+                "name": action,
+                "arguments": action_input
+            }
+        }
+        return message, 0, decoded_token_len
+
+
+if __name__ == "__main__":
+    # Manual smoke test: load a model and run one parse() on a "24 game" task.
+    # can accept all huggingface LlamaModel family
+    # NOTE(review): ToolLLaMALoRA requires both `base_name_or_path` and
+    # `model_name_or_path`; this call passes only one positional argument and
+    # raises TypeError until the second path is supplied.
+    llm = ToolLLaMALoRA("decapoda-research/llama-7b-hf")
+    messages = [
+        {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do
+the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go
+back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each
+step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look
+at the input format'''}, 
+{'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'}
+]
+    functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way
+to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}]
+
+    llm.change_messages(messages)
+    # parse() requires `process_id`; omitting it raises TypeError.
+    output = llm.parse(functions=functions, process_id=0)
+    print(output)
--- a/toolbench/inference/LLM/tool_llama_model.py
+++ b/toolbench/inference/LLM/tool_llama_model.py
@ -0,0 +1,143 @@
+#!/usr/bin/env python
+# coding=utf-8
+import time
+from termcolor import colored
+from typing import Optional, List
+import torch
+from typing import Optional
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+)
+from toolbench.utils import process_system_message
+from toolbench.model.model_adapter import get_conversation_template
+from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser
+
+
+class ToolLLaMA:
+    """Chat wrapper around a Hugging Face causal LM that emits ReAct-style function calls.
+
+    The conversation is stored as a list of OpenAI-style message dicts in
+    ``conversation_history``. ``parse`` renders that history into a text
+    prompt, generates a completion and converts it into an assistant message
+    carrying a ``function_call`` entry.
+    """
+
+    def __init__(
+            self, 
+            model_name_or_path: str, 
+            template:str="tool-llama-single-round", 
+            device: str="cuda", 
+            cpu_offloading: bool=False, 
+            max_sequence_length: int=8192
+        ) -> None:
+        """Load tokenizer and model and optionally move the model to ``device``.
+
+        Args:
+            model_name_or_path: identifier or path passed to ``from_pretrained``.
+            template: conversation template name used by ``parse``.
+            device: the model is moved to it when it is "mps", or when it is
+                "cuda" and ``cpu_offloading`` is False.
+            cpu_offloading: when True the model is not moved to "cuda".
+            max_sequence_length: passed to the tokenizer as ``model_max_length``
+                and to the generation helper.
+        """
+        super().__init__()
+        self.model_name = model_name_or_path
+        self.template = template
+        self.max_sequence_length = max_sequence_length
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, model_max_length=self.max_sequence_length)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path, low_cpu_mem_usage=True
+        )
+        if self.tokenizer.pad_token_id is None:
+            # No pad token: register the special tokens and grow the embedding
+            # matrix so the new ids have rows.
+            self.tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>"})
+            self.model.resize_token_embeddings(len(self.tokenizer))
+        self.use_gpu = device == "cuda"
+        if (device == "cuda" and not cpu_offloading) or device == "mps":
+            self.model.to(device)
+        self.chatio = SimpleChatIO()
+        # Start with an empty history so add_message() works even when
+        # change_messages() has not been called yet.
+        self.conversation_history = []
+
+    def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Generate a completion for ``prompt`` and return it stripped of whitespace.
+
+        ``stop`` is accepted for interface compatibility but is not used: the
+        stop string is always "</s>".
+        """
+        with torch.no_grad():
+            gen_params = {
+                "model": "",
+                "prompt": prompt,
+                "temperature": 0.5,
+                "max_new_tokens": 512,
+                "stop": "</s>",
+                "stop_token_ids": None,
+                "echo": False
+            }
+            # NOTE(review): the device is hard-coded to "cuda" regardless of the
+            # ``device`` given to __init__ -- confirm before running on mps/cpu.
+            output_stream = generate_stream(self.model, self.tokenizer, gen_params, "cuda", self.max_sequence_length, force_generate=True)
+            outputs = self.chatio.return_output(output_stream)
+            prediction = outputs.strip()
+        return prediction
+
+    def add_message(self, message):
+        """Append one message dict to the conversation history."""
+        self.conversation_history.append(message)
+
+    def change_messages(self,messages):
+        """Replace the whole conversation history with ``messages``."""
+        self.conversation_history = messages
+
+    def display_conversation(self, detailed=False):
+        """Print every message of the history, coloured by role.
+
+        ``detailed`` is accepted but not used.
+        """
+        role_to_color = {
+            "system": "red",
+            "user": "green",
+            "assistant": "blue",
+            "function": "magenta",
+        }
+        print("before_print"+"*"*50)
+        for message in self.conversation_history:
+            print_obj = f"{message['role']}: {message['content']} "
+            if "function_call" in message:
+                print_obj = print_obj + f"function_call: {message['function_call']}"
+            print(
+                colored(
+                    print_obj,
+                    role_to_color[message["role"]],
+                )
+            )
+        print("end_print"+"*"*50)
+
+    def parse(self, functions, process_id=0, **args):
+        """Render the history into a prompt, generate, and parse the ReAct output.
+
+        Args:
+            functions: function specifications; when non-empty they are merged
+                into the system message via ``process_system_message``.
+            process_id: only process 0 prints the token count. Defaults to 0 so
+                single-process callers may omit it.
+            **args: ignored.
+
+        Returns:
+            ``(message, 0, decoded_token_len)`` where ``message`` is an
+            assistant message dict with ``content`` and ``function_call``.
+
+        Raises:
+            ValueError: if ``self.template`` is not a supported template name.
+        """
+        conv = get_conversation_template(self.template)
+        if self.template == "tool-llama":
+            roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+        elif self.template in ("tool-llama-single-round", "tool-llama-multi-rounds"):
+            roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
+        else:
+            # Previously this fell through and failed later with an unbound name.
+            raise ValueError(f"Unsupported conversation template: {self.template!r}")
+
+        self.time = time.time()
+        prompt_parts = []
+        for message in self.conversation_history:
+            role = roles[message['role']]
+            content = message['content']
+            if role == "System" and functions != []:
+                content = process_system_message(content, functions)
+            prompt_parts.append(f"{role}: {content}\n")
+        prompt_parts.append("Assistant:\n")
+        prompt = "".join(prompt_parts)
+
+        predictions = self.prediction(prompt)
+
+        # Count token ids: len() of the tokenizer output itself would count its
+        # fields (input_ids, attention_mask, ...) rather than tokens.
+        decoded_token_len = len(self.tokenizer(predictions)["input_ids"])
+        if process_id == 0:
+            print(f"[process({process_id})]total tokens: {decoded_token_len}")
+
+        # react format prediction
+        thought, action, action_input = react_parser(predictions)
+        message = {
+            "role": "assistant",
+            "content": thought,
+            "function_call": {
+                "name": action,
+                "arguments": action_input
+            }
+        }
+        return message, 0, decoded_token_len
+
+
+if __name__ == "__main__":
+    # Manual smoke test: play one round of the "24 game" with a local model.
+    # can accept all huggingface LlamaModel family
+    llm = ToolLLaMA("decapoda-research/llama-7b-hf")
+    messages = [
+        {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do
+the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go
+back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each
+step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look
+at the input format'''}, 
+{'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'}
+]
+    functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way
+to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}]
+
+    llm.change_messages(messages)
+    # parse() takes a process_id; pass it explicitly so this call matches the
+    # method signature (process 0 is the one that prints the token count).
+    output = llm.parse(functions=functions, process_id=0)
+    print(output)
--- a/toolbench/inference/LLM_rank/init.py
+++ b/toolbench/inference/LLM_rank/init.py
--- a/toolbench/inference/LLM_rank/rank_candidate.py
+++ b/toolbench/inference/LLM_rank/rank_candidate.py
@ -0,0 +1,101 @@
+'''
+Evaluate the score of a query corresponding to different candidates
+'''
+
+from toolbench.inference.Prompts.rank_prompts import LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT, LLM_PAIRWISE_RANK_USER_PROMPT
+import random
+from toolbench.inference.Tree.Tree import tree_node
+
+
+def rank2symmetry(llm_interface, LLM_rank_args, cand1,cand2):
+    '''
+    Compare two candidates in both presentation orders to cancel position bias.
+
+    The pairwise function stored under LLM_rank_args["rank_func"] is called as
+    (cand1, cand2) and then as (cand2, cand1); each call casts one vote.
+    Returns (verdict, query_count, total_tokens) where verdict is 1 when cand1
+    collects more votes, -1 when cand2 does, and 0 on a tie. The counts are
+    the sums over both calls.
+    '''
+    compare = LLM_rank_args["rank_func"]
+    votes = [0, 0]  # votes[0] -> cand1, votes[1] -> cand2
+
+    # Forward order: a result of 1 means the first argument (cand1) won.
+    forward_pick, forward_queries, forward_tokens = compare(llm_interface, LLM_rank_args, cand1, cand2)
+    votes[1 - forward_pick] += 1
+
+    # Reverse order: a result of 1 now means cand2 won.
+    reverse_pick, reverse_queries, reverse_tokens = compare(llm_interface, LLM_rank_args, cand2, cand1)
+    votes[reverse_pick] += 1
+
+    queries_used = forward_queries + reverse_queries
+    tokens_used = forward_tokens + reverse_tokens
+    if votes[0] > votes[1]:
+        verdict = 1
+    elif votes[0] < votes[1]:
+        verdict = -1
+    else:
+        verdict = 0
+    return verdict, queries_used, tokens_used
+
+
+
+def rank2_subfix(llm_interface,LLM_rank_args, cand1,cand2):
+    '''
+    Ask the LLM which of two candidates is better, assuming they share a long common prefix.
+
+    The shared prefix (path up to the first common ancestor) and the two
+    diverging suffixes are substituted into the pairwise system prompt, then
+    the model is queried once.
+
+    Returns (pick, 1, total_tokens): pick is 1 when the reply ends with "a"
+    (candidate A, i.e. cand1) and 0 otherwise. An empty or missing reply is
+    treated as "not A" instead of raising.
+    '''
+    anscestor_interesction = tree_node.find_ancestor_intersection(cand1,cand2)
+    assert anscestor_interesction is not None
+    intersect_trice = anscestor_interesction.get_former_trice_from_this_node(end_node=None)
+    trice_1 = cand1.get_former_trice_from_this_node(end_node=anscestor_interesction)
+    trice_2 = cand2.get_former_trice_from_this_node(end_node=anscestor_interesction)
+
+    # Placeholders are filled with str.replace (not str.format), in this order.
+    system_message = LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT
+    for placeholder, value in (
+        ("{task_description}", LLM_rank_args["task_description"]),
+        ("{intersect_trice}", intersect_trice),
+        ("{candidate_A}", trice_1),
+        ("{candidate_B}", trice_2),
+    ):
+        system_message = system_message.replace(placeholder, value)
+    llm_interface.change_messages([{"role":"system","content":system_message},
+                                   {"role":"user","content":LLM_PAIRWISE_RANK_USER_PROMPT},
+                                   ])
+    output,error_code, total_tokens = llm_interface.parse(functions=LLM_rank_args["functions"],function_call="none",process_id=LLM_rank_args["process_id"])
+    # Normalise the reply: tolerate None/empty content and trailing quotes or
+    # punctuation, so answers such as 'A.' or '"A"' still count as A.
+    answer = (output["content"] or "").strip().lower().rstrip("\"'`.!) ")
+    if answer.endswith("a"):
+        return 1, 1, total_tokens
+    else:
+        return 0, 1, total_tokens
+    
+def sum_based_rankn(llm_interface,LLM_rank_args, candidates):
+    '''
+    Round-robin scoring: every unordered pair of candidates is compared once.
+
+    A pairwise win is worth 1 point and a tie gives 0.5 to each side.
+    Returns (scores, total_querys, total_tokens), with scores aligned to the
+    order of candidates.
+    '''
+    candidate_count = len(candidates)
+    scores = [0 for _ in range(candidate_count)]
+    query_total = 0
+    token_total = 0
+    index_pairs = (
+        (first, second)
+        for first in range(candidate_count - 1)
+        for second in range(first + 1, candidate_count)
+    )
+    for first, second in index_pairs:
+        verdict, queries, tokens = rank2symmetry(llm_interface, LLM_rank_args, candidates[first], candidates[second])
+        query_total += queries
+        token_total += tokens
+        if verdict > 0:
+            scores[first] += 1
+            continue
+        if verdict < 0:
+            scores[second] += 1
+            continue
+        # Tie: split the point.
+        scores[first] += 0.5
+        scores[second] += 0.5
+    return scores, query_total, token_total
+
+
+
+if __name__ ==  "__main__":
+    # Scratch entry point: it only seeds the RNG and defines sample candidate
+    # strings; none of the ranking functions above is called here.
+    random.seed(42)
+    # Earlier ordering of the sample candidates, kept for reference.
+    # candidates = [
+    #     "234",
+    #     "66.5",
+    #     "77.1",
+    #     "88.967",
+    #     "pi",
+    #     # "e",
+    #     # "ln(2)"
+    # ]
+    candidates = [
+        "77.1",
+        "88.967",
+        "pi",
+        "66.5",
+        "234",
+        "ln(2)"
+    ]
+    # The bare string below is a no-op expression used as a note.
+    # NOTE(review): what "starting_delta" and the percentages measure is not
+    # shown in this file -- confirm before relying on them.
+    '''
+    starting_delta:
+    50 -> 42.85%
+    100 -> 35.99%
+    150 -> 29.66%
+    200 -> 24.03%
+    '''
--- a/toolbench/inference/Prompts/ReAct_prompts.py
+++ b/toolbench/inference/Prompts/ReAct_prompts.py
@ -0,0 +1,60 @@
+
+
+
+# System prompt for function-calling runs. Compared with
+# FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION it additionally tells the model to give
+# up (naming the failed functions) when the task cannot be solved.
+# Contains one placeholder: {task_description}.
+# The text is sent to the model verbatim, so do not edit its wording casually.
+FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ADAPTED = """You are AutoGPT, you can use many tools(functions) to do the following task.
+First I will give you the task description, and your task start.
+At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.
+After the call, you will get the call result, and you are now in a new state.
+Then you will analyze your status now, then decide what to do next...
+After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer. 
+If you feel you cannot solve the task or can only solve it partially, you should choose to give up and give your reason which should mention the names of the failed functions.
+Remember: 
+1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say "I give up and restart" and give the reason.
+2.All the thought is short, at most in 5 sentence.
+3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try.
+Let's Begin!
+Task description: {task_description}"""
+
+# Base system prompt for function-calling runs: describes the
+# thought / function-call loop and the "I give up and restart" escape hatch.
+# Contains one placeholder: {task_description}.
+# The text is sent to the model verbatim, so do not edit its wording casually.
+FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION = """You are AutoGPT, you can use many tools(functions) to do the following task.
+First I will give you the task description, and your task start.
+At each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.
+After the call, you will get the call result, and you are now in a new state.
+Then you will analyze your status now, then decide what to do next...
+After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.
+Remember: 
+1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say "I give up and restart".
+2.All the thought is short, at most in 5 sentence.
+3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try.
+Let's Begin!
+Task description: {task_description}"""
+
+
+# User-turn template that carries the concrete task input.
+# Contains one placeholder: {input_description}.
+FORMAT_INSTRUCTIONS_USER_FUNCTION = """
+{input_description}
+Begin!
+"""
+
+# Zero-shot ReAct prompt in plain text (Thought / Action / Action Input /
+# End Action) rather than native function calling.
+# Placeholders: {func_str}, {func_list} (used twice) and {question}.
+# NOTE(review): the doubled braces around the Finish example suggest this is
+# filled with str.format -- confirm against the caller.
+FORMAT_INSTRUCTIONS_SYSTEM_FUNCTION_ZEROSHOT = """Answer the following questions as best you can. Specifically, you have access to the following APIs:
+
+{func_str}
+
+Use the following format:
+Thought: you should always think about what to do
+Action: the action to take, should be one of {func_list}
+Action Input: the input to the action
+End Action
+
+Begin! Remember: (1) Follow the format, i.e,
+Thought:
+Action:
+Action Input:
+End Action
+(2)The Action: MUST be one of the following:{func_list}
+(3)If you believe that you have obtained enough information (which can be judge from the history observations) that can answer the task, please call:
+Action: Finish
+Action Input: {{"return_type": "give_answer", "final_answer": your answer string}}.
+Question: {question}
+
+Here are the history actions and observations:
+"""
+        
--- a/toolbench/inference/Prompts/Tree_search_prompts.py
+++ b/toolbench/inference/Prompts/Tree_search_prompts.py
@ -0,0 +1,6 @@
+# Prompt asking the model to produce an action that differs from the actions
+# already tried at the current state.
+# Contains one placeholder: {previous_candidate}.
+DIVERSITY_PROMPT='''This is not the first time you try this task, all previous trails failed.
+Before you generate my thought for this state, I will first show you your previous actions for this state, and then you must generate actions that is different from all of them. Here are some previous actions candidates:
+{previous_candidate}
+Remember you are now in the intermediate state of a trail, you will first analyze the now state and previous action candidates, then make actions that is different from all the previous.'''
+
+
--- a/toolbench/inference/Prompts/init.py
+++ b/toolbench/inference/Prompts/init.py
--- a/toolbench/inference/Prompts/rank_prompts.py
+++ b/toolbench/inference/Prompts/rank_prompts.py
@ -0,0 +1,28 @@
+
+# System prompt for pairwise ranking of two candidates that share a common
+# prefix of steps.
+# Placeholders: {task_description}, {intersect_trice}, {candidate_A} and
+# {candidate_B}. rank2_subfix in LLM_rank/rank_candidate.py fills them with
+# str.replace, so the double-braced markers such as {{TASK_DESCRIPTION}}
+# reach the model with both braces intact.
+LLM_PAIRWISE_RANK_SUBFIX_SYSTEM_PROMPT = '''
+You are value-GPT, which is an expert of defining which trail is better, which trail is more close to solving the task. 
+All candidate tries to solve this task with some funciton calls:
+*******************************
+{{TASK_DESCRIPTION}}
+{task_description}
+{{END_TASK_DESCRIPTION}}
+*******************************
+First, all candidate do the following things:
+{intersect_trice}
+After that, there are two candidates A and B, they do different things:
+*******************************
+{{CANDIDATE_A_START}}
+{candidate_A}
+{{CANDIDATE_A_END}}
+*******************************
+{{CANDIDATE_B_START}}
+{candidate_B}
+{{CANDIDATE_B_END}}
+Which try do you think is more helpful to solving the task?
+'''
+
+
+
+
+# User turn of the pairwise ranking exchange: requests a one-word verdict,
+# "A" or "B". It has no placeholders.
+LLM_PAIRWISE_RANK_USER_PROMPT = '''
+Tell me which candidate is better in ONE Word: "A" or "B":'''
--- a/toolbench/inference/Tree/Tree.py
+++ b/toolbench/inference/Tree/Tree.py
@ -0,0 +1,240 @@
+from termcolor import colored
+import numpy as np
+from copy import deepcopy
+from toolbench.inference.utils import softmax_bias
+import math
+
+class my_tree:
+    '''
+    Container for a search tree: holds the root node and the node currently being processed.
+    '''
+
+    def __init__(self):
+        root_node = tree_node()
+        self.root = root_node
+        # Processing starts at the root.
+        self.now_deal_node = root_node
+
+    def to_json_recursive(self,use_messages=False):
+        '''
+        Serialise the whole tree together with its node count ("size") and
+        its maximum depth ("max_length").
+        '''
+        root_node = self.root
+        serialised_tree = root_node.to_json_recursive(use_messages=use_messages)
+        return {
+            "size": root_node.get_size(),
+            "max_length": root_node.get_max_depth(),
+            "tree": serialised_tree,
+        }
+
+
+class tree_node:
+    '''
+    One node of the search tree.
+
+    A node has a type (for example "Thought", "Action" or "Action Input"), a
+    textual description, an optional observation, a link to its father and a
+    list of children, plus the chat messages recorded at this node.
+    '''
+
+    def __init__(self):
+        self.is_terminal = False
+        self.pruned = False
+        self.finished = False
+
+        self.node_type = None
+        self.description = ""
+        self.observation = ""
+        self.observation_code = None
+        self.children = []
+
+        self.father = None
+
+        self.io_state = None
+
+        self.expand_num = 0 # The number of visits to the node, 0 means it has not been visited
+
+        self.Elo = 1000.0
+
+        # openai-messages of this node
+        self.messages = []
+
+    def compute_weight(self):
+        '''
+        Used in the UCT algorithm to calculate the node weight of each son during selection.
+        This base implementation always returns 0.0.
+        '''
+        return 0.0
+
+    def get_max_depth(self):
+        '''
+        maximum depth of subtrees including self (a leaf has depth 1)
+        '''
+        return 1 + max((child.get_max_depth() for child in self.children), default=0)
+
+    def get_depth(self):
+        '''
+        number of father links between this node and the root (the root has depth 0)
+        '''
+        depth = 0
+        node = self
+        while node.father is not None:
+            depth += 1
+            node = node.father
+        return depth
+
+    def get_size(self):
+        '''
+        number of nodes in the subtree, including itself
+        '''
+        return 1 + sum(child.get_size() for child in self.children)
+
+    def prune(self):
+        '''
+        pruning off the subtree: marks this node and all descendants as pruned
+        '''
+        self.pruned = True
+        for child in self.children:
+            child.prune()
+
+    def print(self,process_id = 0):
+        '''
+        Print this node (and a possibly truncated observation) in colour.
+        Only process 0 prints; other process ids return silently.
+        '''
+        if process_id != 0:
+            return
+        color_converter = {"Thought":"red", "Action": "blue", "Action Input": "cyan","Final Answer": "green","Reflection":"blue"}
+        # .get() so that a node without a known type (e.g. node_type None) is
+        # printed uncoloured instead of raising KeyError.
+        print(colored(f"{self.node_type}: {self.description}",color = color_converter.get(self.node_type)))
+        if self.observation != "":
+            if len(self.observation) < 1536:
+                print(colored(f"Observation: {self.observation}",color="yellow"))
+            else:
+                print(colored(f"Observation: {self.observation[:1536]}......(len={len(self.observation)})",color="yellow"))
+
+    @classmethod
+    def find_ancestor_intersection(cls, node1, node2):
+        '''
+        find the first common ancestor (a node counts as its own ancestor);
+        returns None when either argument is None or no common ancestor exists
+        '''
+        if node1 is None or node2 is None:
+            return None
+        depth1 = node1.get_depth()
+        depth2 = node2.get_depth()
+        while node1 is not node2:
+            # Lift the deeper node; on equal depth lift node2 first.
+            if depth1 > depth2:
+                node1 = node1.father
+                depth1 -= 1
+            else:
+                node2 = node2.father
+                depth2 -= 1
+            if node1 is None or node2 is None:
+                return None
+        return node1
+
+    def to_json_recursive(self,use_messages=False):
+        '''
+        Serialise this node and, under "children", its whole subtree.
+        use_messages is forwarded to every descendant.
+        '''
+        js_obj = self.to_json(use_messages=use_messages)
+        js_obj["children"] = [
+            child.to_json_recursive(use_messages=use_messages) for child in self.children
+        ]
+        return js_obj
+
+    def make_finish(self,inter_val=1):
+        '''
+        Recursively marked as finish, until the above inter_val nodes of action_input type (including yourself)
+        '''
+        self.finished = True
+        if self.node_type == "Action Input":
+            inter_val -= 1
+        if self.father is not None and inter_val >= 0:
+            self.father.make_finish(inter_val)
+
+    def get_train_messages_from_this_node(self):
+        '''
+        Collect one message list per "Action Input" / "Thought" node on the
+        path from the root's child down to this node (root first).
+
+        Each list is cut so that it ends with an assistant message, and only
+        the last invalid message is kept. Nodes whose messages leave nothing
+        usable after trimming are skipped.
+        '''
+        def sift_first_invalid_message(messages):
+            # Walk backwards: keep every valid message, keep only the first
+            # invalid one met (the one closest to the end), drop the rest.
+            kept_reversed = []
+            invalid_seen = False
+            for message in reversed(messages):
+                is_invalid = "valid" in message and message["valid"] == False
+                if is_invalid and invalid_seen:
+                    continue
+                if is_invalid:
+                    invalid_seen = True
+                kept_reversed.append(message)
+            kept_reversed.reverse()
+            return kept_reversed
+
+        now_node = self
+        result = []
+        while now_node.father is not None:
+            if now_node.node_type == "Action Input":
+                use_messages = deepcopy(now_node.messages)
+                # Guard against running off an empty list when no assistant
+                # message exists at this node.
+                while use_messages and use_messages[-1]["role"] != "assistant":
+                    use_messages.pop()
+                use_messages = sift_first_invalid_message(use_messages)
+                if use_messages:
+                    result.insert(0, use_messages)
+            elif now_node.node_type == "Thought":
+                use_messages = deepcopy(now_node.messages)
+                while use_messages and use_messages[-1]["role"] == "user":
+                    use_messages.pop()
+                use_messages = sift_first_invalid_message(use_messages)
+                if use_messages and use_messages[-1]["role"] == "assistant":
+                    result.insert(0, use_messages)
+            now_node = now_node.father
+        return result
+
+    def get_chain_result_from_this_node(self,use_messages=False):
+        '''
+        Returns chained results (to_json of every node), ordered from the
+        root's child down to this node; the root itself is not included
+        '''
+        now_node = self
+        result = []
+        while now_node.father is not None:
+            result.insert(0, now_node.to_json(use_messages=use_messages))
+            now_node = now_node.father
+        return result
+
+    def get_former_trice_from_this_node(self,valid_types=("Thought","Action","Action Input","Observation"),end_node = None):
+        '''
+        Return path description from end_node -> self
+        Does not contain end_node, never contains root node
+
+        Observations longer than 1024 characters are truncated. Returns the
+        string "None" when the path contributes no lines.
+        '''
+        node = self
+        output_str_list = []
+
+        while node is not end_node and node.father is not None:
+            now_node_des_list = []
+            if node.node_type in valid_types:
+                now_node_des_list.append(f"{node.node_type}: {node.description}\n")
+            if node.observation != "" and "Observation" in valid_types:
+                tuncated = node.observation
+                if len(node.observation) > 1024:
+                    tuncated = node.observation[:1024] + f"...(len={len(node.observation)})"
+                now_node_des_list.append(f"Observation: {tuncated}\n")
+            output_str_list = now_node_des_list + output_str_list
+            node = node.father
+
+        now_str = "".join(
+            f"step_{k+1}: {cont}\n" for k, cont in enumerate(output_str_list)
+        )
+        if now_str == "":
+            now_str = "None"
+        return now_str
+
+    def to_json(self, use_messages=False):
+        '''
+        Serialise this node (without children) into a plain dict.
+
+        With use_messages=True the dict also gets "messages": the role of
+        every recorded message, suffixed with "_invalid" for messages flagged
+        valid == False.
+        '''
+        json_obj = {}
+        # Report the node's real terminal flag rather than a constant.
+        json_obj["is_terminal"] = self.is_terminal
+        json_obj["pruned"] = self.pruned
+        json_obj["finished"] = self.finished
+
+        json_obj["depth"] = self.get_depth()
+        json_obj["node_type"] = self.node_type
+        json_obj["description"] = self.description
+        json_obj["Elo"] = self.Elo
+        if self.observation != "":
+            json_obj["observation"] = self.observation
+        if self.observation_code is not None:
+            json_obj["observation_code"] = self.observation_code
+        json_obj["child_count"] = len(self.children)
+        json_obj["expand_num"] = self.expand_num
+
+        if self.io_state is not None and self.node_type == "Action Input":
+            json_obj["io_state"] = self.io_state.to_json()
+
+        if use_messages:
+            json_obj["messages"] = []
+            for message in self.messages:
+                if not ("valid" in message and message["valid"] == False):
+                    json_obj["messages"].append(message["role"])
+                else:
+                    json_obj["messages"].append(message["role"] + "_invalid")
+
+        return json_obj
--- a/toolbench/inference/Tree/init.py
+++ b/toolbench/inference/Tree/init.py
--- a/toolbench/inference/callbacks/ServerEventCallback.py
+++ b/toolbench/inference/callbacks/ServerEventCallback.py
@ -0,0 +1,188 @@
+from typing import Any, Dict, List, Union
+import queue
+class ServerEventCallback():
+    """Callback handler that forwards pipeline events to a queue.
+
+    Every ``on_*`` method puts one dict on ``self.queue`` containing at least
+    ``method_name`` and ``block_id``. LLM-related events use ids of the form
+    ``"llm-<n>"`` and tool-related events ``"tool-<n>"``; the counters are
+    reset by ``on_request_start``.
+    """
+
+    def __init__(self, queue: queue.Queue, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.queue = queue
+        self.llm_block_id = 0
+        self.tool_block_id = 0
+        # Filled by on_tool_retrieval_end: tool name -> tool record.
+        self.tool_descriptions = {}
+
+    def add_to_queue(self, method_name: str, block_id, **kwargs: Any):
+        """Put one event dict (method name, block id and extra fields) on the queue."""
+        data = {
+            "method_name": method_name,
+            "block_id": block_id,
+        }
+        data.update(kwargs)
+        self.queue.put(data)
+
+    def on_tool_retrieval_start(self):
+        """Signal that tool retrieval has started."""
+        # tools should be of the form
+        # {tool_name, tool_desc}
+        self.add_to_queue(
+            "on_tool_retrieval_start",
+            "recommendation-1",
+        )
+        print("on_tool_retrieval_start method called")
+
+    def on_tool_retrieval_end(self, tools):
+        """Publish the retrieved tools and remember them by name."""
+        # tool should be of the form
+        # {tool_name, tool_desc}
+        self.add_to_queue(
+            "on_tool_retrieval_end",
+            "recommendation-1",
+            recommendations=tools
+        )
+        self.tool_descriptions = {
+            tool["name"]: tool for tool in tools
+        }
+        print("on_tool_retrieval_end method called")
+
+    def on_request_start(self, user_input: str, method: str) -> Any:
+        """Reset both block counters and announce a new request."""
+        self.tool_block_id = 0
+        self.llm_block_id = 0
+        self.add_to_queue(
+            "on_request_start",
+            block_id="start",
+            user_input=user_input,
+            method=method
+        )
+
+    def on_request_end(self, outputs: str, chain: List[Any]):
+        """Publish the final output and the chain of a finished request."""
+        self.add_to_queue(
+            "on_request_end",
+            block_id="end",
+            output=outputs,
+            chain=chain
+        )
+
+    def on_request_error(self, error: str):
+        """Publish a request-level error."""
+        self.add_to_queue(
+            "on_request_error",
+            block_id="error",
+            error=error
+        )
+
+    # keep
+    def on_chain_start(self, inputs: str, depth: int) -> Any:
+        """Run when chain starts running. Opens a new LLM block and returns its id."""
+        print("on_chain_start method called")
+        self.llm_block_id += 1
+        block_id = "llm-" + str(self.llm_block_id)
+        self.add_to_queue(
+            "on_chain_start",
+            block_id=block_id,
+            messages=inputs,
+            depth=depth
+        )
+        return block_id
+
+    # this one needs the block_id memorized
+    def on_chain_end(self, block_id: str, depth: int) -> Any:
+        """Close the chain block identified by ``block_id``."""
+        self.add_to_queue(
+            "on_chain_end",
+            block_id=block_id,
+            # output=output,
+            depth=depth
+        )
+        print("on_chain_end method called")
+
+    def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any:
+        """Publish a chain error.
+
+        ``block_id`` may be given via ``kwargs``; otherwise the current LLM
+        block id is used (add_to_queue requires one).
+        """
+        kwargs.setdefault("block_id", "llm-" + str(self.llm_block_id))
+        self.add_to_queue("on_chain_error", error=error, **kwargs)
+        print("on_chain_error method called")
+
+    def on_llm_start(
+            self, messages: str, depth: int
+    ) -> Any:
+        """Run when LLM starts running."""
+        self.add_to_queue(
+            "on_llm_start",
+            block_id="llm-" + str(self.llm_block_id),
+            messages=messages,
+            depth=depth
+        )
+        print("on_llm_start method called")
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> Any:
+        """Run on new LLM token. Only available when streaming is enabled.
+
+        ``block_id`` may be given via ``kwargs``; otherwise the current LLM
+        block id is used (add_to_queue requires one).
+        """
+        kwargs.setdefault("block_id", "llm-" + str(self.llm_block_id))
+        self.add_to_queue("on_llm_new_token", token=token, **kwargs)
+        print("on_llm_new_token method called")
+
+    def on_llm_end(self, response: str, depth: int) -> Any:
+        """Run when LLM ends running."""
+        self.add_to_queue(
+            "on_llm_end",
+            block_id="llm-" + str(self.llm_block_id),
+            response=response,
+            depth=depth
+        )
+        print("on_llm_end method called")
+
+    def on_llm_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any:
+        """Run when LLM errors."""
+        self.add_to_queue(
+            "on_llm_error",
+            block_id="llm-" + str(self.llm_block_id),
+            message=str(error),
+            error=error
+        )
+        print("on_llm_error method called")
+
+    def on_agent_action(self, action, action_input, depth: int) -> str:
+        """Open a new tool block for an agent action and return its id."""
+        self.tool_block_id += 1
+        block_id="tool-" + str(self.tool_block_id)
+        self.add_to_queue(
+            "on_agent_action",
+            block_id=block_id,
+            action=action,
+            action_input = action_input,
+            depth=depth
+        )
+        print("on_agent_action method called")
+        return block_id
+
+    def on_tool_start(self, tool_name: str, tool_input: str,  depth: int) -> Any:
+        """Publish the start of a tool call with the tool's stored description.
+
+        When ``tool_name`` is unknown a placeholder description is sent.
+        """
+        method_name = "on_tool_start"
+        tool_description = "Tool not found in tool descriptions"
+        if tool_name in self.tool_descriptions:
+            tool_description = self.tool_descriptions[tool_name]
+        else:
+            print(self.tool_descriptions)
+            print("Key", tool_name, "not found in tool descriptions")
+        self.add_to_queue(
+            method_name,
+            block_id="tool-" + str(self.tool_block_id),
+            tool_name=tool_name,
+            tool_description=tool_description,
+            tool_input=tool_input,
+            depth=depth
+        )
+        print("on_tool_start method called")
+
+    def on_tool_end(self, output: str, status:int, depth: int) -> Any:
+        """Publish the output and status of the current tool block."""
+        method_name = "on_tool_end"
+        self.add_to_queue(
+            method_name,
+            block_id="tool-" + str(self.tool_block_id),
+            output=output,
+            status= status,
+            depth=depth
+        )
+        print("on_tool_end method called")
+
+    def on_tool_error(self, error: Union[Exception, KeyboardInterrupt]) -> Any:
+        """Publish a tool error under the current tool block id."""
+        method_name = "on_tool_error"
+        self.add_to_queue(
+            method_name,
+            # add_to_queue requires a block id; use the current tool block.
+            block_id="tool-" + str(self.tool_block_id),
+            error=error
+        )
+        print("on_tool_error method called")
+
+    def on_agent_end(self, block_id:str, depth: int):
+        """Close the agent block identified by ``block_id``."""
+        self.add_to_queue(
+            "on_agent_end",
+            block_id=block_id,
+            depth=depth
+        )
+        print("on_agent_end method called")
--- a/toolbench/inference/qa_pipeline.py
+++ b/toolbench/inference/qa_pipeline.py
@ -0,0 +1,37 @@
+'''
+Closed-domain QA Pipeline
+'''
+
+import argparse
+from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
+import os
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the inference options, make sure the
+    # output directory exists and hand everything to the pipeline runner.
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
+    parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
+    parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
+    parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
+    parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
+    parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
+    parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
+    parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
+    parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
+    parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
+    parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
+    parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
+    parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
+    parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
+    parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
+    parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
+    parser.add_argument('--api_customization', action="store_true", help="To use customized api or not.")
+
+    args = parser.parse_args()
+    # os.makedirs("") raises FileNotFoundError, and "" is the default value of
+    # --output_answer_file, so only create the directory when a path was given.
+    if args.output_answer_file:
+        os.makedirs(args.output_answer_file, exist_ok=True)
+
+    # Use a distinct name so the imported pipeline_runner class is not shadowed.
+    runner = pipeline_runner(args)
+    runner.run()
+
--- a/toolbench/inference/qa_pipeline_open_domain.py
+++ b/toolbench/inference/qa_pipeline_open_domain.py
@ -0,0 +1,35 @@
+'''
+Open-domain QA Pipeline
+'''
+import argparse
+from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
+
+
+if __name__ == "__main__":
+    # Command-line entry point for the open-domain setting: same options as the
+    # closed-domain pipeline plus the retriever options, and the runner is
+    # created with add_retrieval=True.
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--corpus_tsv_path', type=str, default="your_retrival_corpus_path/", required=False, help='')
+    parser.add_argument('--retrieval_model_path', type=str, default="your_model_path/", required=False, help='')
+    parser.add_argument('--retrieved_api_nums', type=int, default=5, required=False, help='')
+    parser.add_argument('--backbone_model', type=str, default="toolllama", required=False, help='chatgpt_function or davinci or toolllama')
+    parser.add_argument('--openai_key', type=str, default="", required=False, help='openai key for chatgpt_function or davinci model')
+    parser.add_argument('--model_path', type=str, default="your_model_path/", required=False, help='')
+    parser.add_argument('--tool_root_dir', type=str, default="your_tools_path/", required=True, help='')
+    parser.add_argument("--lora", action="store_true", help="Load lora model or not.")
+    parser.add_argument('--lora_path', type=str, default="your_lora_path if lora", required=False, help='')
+    parser.add_argument('--max_observation_length', type=int, default=1024, required=False, help='maximum observation length')
+    parser.add_argument('--max_source_sequence_length', type=int, default=4096, required=False, help='original maximum model sequence length')
+    parser.add_argument('--max_sequence_length', type=int, default=8192, required=False, help='maximum model sequence length')
+    # The help text previously repeated 'maximum observation length' (copy-paste
+    # slip); it now matches the wording used by the other entry points.
+    parser.add_argument('--observ_compress_method', type=str, default="truncate", choices=["truncate", "filter", "random"], required=False, help='observation compress method')
+    parser.add_argument('--method', type=str, default="CoT@1", required=False, help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')
+    parser.add_argument('--input_query_file', type=str, default="", required=False, help='input path')
+    parser.add_argument('--output_answer_file', type=str, default="",required=False, help='output path')
+    parser.add_argument('--toolbench_key', type=str, default="",required=False, help='your toolbench key to request rapidapi service')
+    parser.add_argument('--rapidapi_key', type=str, default="",required=False, help='your rapidapi key to request rapidapi service')
+    parser.add_argument('--use_rapidapi_key', action="store_true", help="To use customized rapidapi service or not.")
+    parser.add_argument('--api_customization', action="store_true", help="To use customized api or not. NOT SUPPORTED currently under open domain setting.")
+
+    args = parser.parse_args()
+
+    # Use a distinct name so the imported pipeline_runner class is not shadowed.
+    runner = pipeline_runner(args, add_retrieval=True)
+    runner.run()
--- a/toolbench/inference/server.py
+++ b/toolbench/inference/server.py
@ -0,0 +1,177 @@
+from pydantic import BaseModel
+import json
+import os
+from typing import Union
+from toolbench.utils import standardize, change_name
+import random
+
+
+class Info(BaseModel):
+    """Description of a single tool/API call request."""
+    # Category name; spaces, commas and slashes are replaced with underscores
+    # by prepare_tool_name_and_url before it is used in an import path.
+    category: str
+    # Tool name, with or without the "_for_<category>" suffix.
+    tool_name: str
+    # API name; standardised via standardize()/change_name() before use.
+    api_name: str
+    # Call arguments, either a JSON string or an already-decoded dict.
+    tool_input: Union[str, dict]
+    # Observation compression method forwarded to observation_shorten.
+    strip: str
+
+def prepare_tool_name_and_url(tools_root, info):
+    """Derive the canonical names and the import statement for an API call.
+
+    Args:
+        tools_root: Dotted module prefix under which the tool packages live.
+        info: Object exposing ``category``, ``tool_name`` and ``api_name``.
+
+    Returns:
+        A tuple ``(tool_name, standard_category, api_name, code_string)``.
+        ``tool_name`` always ends with ``_for_<standard_category>`` and
+        ``code_string`` is a ``from ... import <api_name>`` statement.
+    """
+    # str.replace substitutes every occurrence, so one pass already removes all
+    # spaces, commas and slashes; the former retry loop could never execute.
+    standard_category = info.category.replace(" ", "_").replace(",", "_").replace("/", "_")
+    # Single, non-repeated collapse of double underscores, kept exactly as
+    # before so the generated import paths do not change.
+    standard_category = standard_category.replace("__", "_")
+
+    suffix = f"_for_{standard_category}"
+    tool_name = info.tool_name
+    api_name = change_name(standardize(info.api_name))
+    if not tool_name.endswith(suffix):
+        tool_name = standardize(info.tool_name)
+        code_string = f"""from {tools_root}.{standard_category}.{tool_name}.api import {api_name}"""
+        tool_name += suffix
+    else:
+        # The caller already supplied the suffixed name: strip the suffix only
+        # to build the module path and return the name unchanged.
+        tmp_tool_name = standardize(tool_name.replace(suffix, ""))
+        code_string = f"""from {tools_root}.{standard_category}.{tmp_tool_name}.api import {api_name}"""
+    return tool_name, standard_category, api_name, code_string
+
+def process_error(response):
+    """Classify a raw API response by the error markers found in its text.
+
+    Args:
+        response: The value returned by the API wrapper; it is inspected via
+            ``str(response)`` and passed through unchanged.
+
+    Returns:
+        A tuple ``(return_dict, save_cache_flag, switch_flag)`` where
+        ``return_dict`` is ``{"error": <label or "">, "response": response}``.
+        ``save_cache_flag`` is True for clean responses and for
+        "Unauthorized" responses; ``switch_flag`` is True for the subscription,
+        rate-limit, gateway and blocked-user errors.
+    """
+    text = str(response)
+    save_cache_flag = False
+    switch_flag = False
+    if "The request to the API has timed out. Please try again later, or if the issue persists" in text:
+        return_dict = {"error": "API temporarily not working error...", "response": response}
+
+    # This branch used to be a separate ``if``, so the timeout result above was
+    # overwritten by the chain below and reported as a cacheable success.
+    elif "Your Client (working) ---> Gateway (working) ---> API (not working)" in text:
+        return_dict = {"error": "API not working error...", "response": response}
+
+    elif "Unauthorized" in text or "unauthorized" in text:
+        save_cache_flag = True
+        return_dict = {"error": "Unauthorized error...", "response": response}
+
+    elif "You are not subscribed to this API." in text:
+        switch_flag = True
+        return_dict = {"error": "Unsubscribed error...", "response": response}
+
+    elif "Too many requests" in text:
+        switch_flag = True
+        return_dict = {"error": "Too many requests error...", "response": response}
+
+    elif "You have exceeded" in text or "you are being rate limited" in text:
+        switch_flag = True
+        return_dict = {"error": "Rate limit error...", "response": response}
+
+    elif "Access restricted. Check credits balance or enter the correct API key." in text:
+        switch_flag = True
+        return_dict = {"error": "Rate limit error...", "response": response}
+
+    elif "Oops, an error in the gateway has occurred." in text:
+        switch_flag = True
+        return_dict = {"error": "Gateway error...", "response": response}
+
+    elif "Blocked User. Please contact your API provider." in text:
+        switch_flag = True
+        return_dict = {"error": "Blocked error...", "response": response}
+
+    elif "error" in text:
+        return_dict = {"error": "Message error...", "response": response}
+
+    else:
+        save_cache_flag = True
+        return_dict = {"error": "", "response": response}
+    return return_dict, save_cache_flag, switch_flag
+
+def run(toolbench_code_string, toolbench_api_name, toolbench_input_params_str):
+    """Import an API wrapper and call it with the given argument string.
+
+    Args:
+        toolbench_code_string: Python source (an import statement) that brings
+            ``toolbench_api_name`` into scope.
+        toolbench_api_name: Name of the callable to invoke.
+        toolbench_input_params_str: Argument list, already formatted as Python
+            source, placed between the call parentheses.
+
+    Returns:
+        A tuple ``(success_flag, switch_flag, response, save_cache)``. When the
+        call raises, ``success_flag`` is False and ``response`` is an error
+        dict with an empty ``"response"`` field.
+
+    Raises:
+        Whatever executing ``toolbench_code_string`` raises; as before, only
+        the call itself is guarded.
+    """
+    # get observation
+    success_flag = False
+    switch_flag = False
+    save_cache = False
+    # SECURITY: exec/eval run arbitrary code. Both strings must come from a
+    # trusted source; never feed unvalidated external input into this function.
+    # An explicit namespace is used because names bound by exec() in a
+    # function's implicit locals are not reliably visible to a later eval()
+    # (they are not visible at all under Python 3.13 / PEP 667).
+    namespace = {}
+    exec(toolbench_code_string, namespace)
+    try:
+        eval_func_str = f"{toolbench_api_name}({toolbench_input_params_str})"
+        new_func = eval(eval_func_str, namespace)
+        response, save_cache, switch_flag = process_error(new_func)
+        success_flag = True
+    except Exception as e:
+        response = {"error": f"Function executing {toolbench_code_string} error...\n{e}", "response": ""}
+        save_cache = False
+    return success_flag, switch_flag, response, save_cache
+
+
+def dict_shorten(origin: dict, schema: dict):
+    """Prune ``origin`` in place so that only keys present in ``schema`` remain.
+
+    Nested dicts are pruned against ``schema[key]``. A non-empty list whose
+    first element is a dict has every element pruned against ``schema[key][0]``.
+    Returns the same ``origin`` object.
+    """
+    # Iterate over a snapshot because entries are removed during the walk.
+    for field, entry in tuple(origin.items()):
+        if field not in schema:
+            origin.pop(field)
+            continue
+        if isinstance(entry, dict):
+            # schema[field] is expected to be a dict here.
+            dict_shorten(entry, schema[field])
+        elif isinstance(entry, list) and entry and isinstance(entry[0], dict):
+            # schema[field] is expected to be a list holding one template dict.
+            template = schema[field][0]
+            for element in entry:
+                dict_shorten(element, template)
+    return origin
+
+def observation_shorten(schema_root, response_dict, category, tool_name, api_name, strip_method):
+    """Optionally prune an API response against its stored schema.
+
+    Pruning happens when ``strip_method`` is ``"filter"``, or with probability
+    one half when it is ``"random"``, and only if the response is a dict and a
+    schema file ``<schema_root>/<category>/<tool_name>.json`` with a non-empty
+    schema for ``api_name`` exists. ``response_dict["response"]`` is updated in
+    place in that case.
+
+    Returns:
+        ``str(response_dict["response"])`` after the optional pruning.
+    """
+    response = response_dict["response"]
+    # The former ``print(random.random())`` debug line is gone: it only emitted
+    # noise and was unrelated to the draw that makes the decision below.
+    use_schema = strip_method == "filter" or (strip_method == "random" and random.random() > 0.5)
+    if use_schema and isinstance(response, dict):
+        schema_path = os.path.join(schema_root, category, tool_name+".json")
+        # The file can only exist if its directory does, so one check suffices.
+        if os.path.exists(schema_path):
+            # Context manager so the schema file handle is always closed.
+            with open(schema_path, "r") as schema_file:
+                schema_dicts = json.load(schema_file)
+            schema = None
+            for schema_dict in schema_dicts["api_list"]:
+                schema_api_name = change_name(standardize(schema_dict["name"]))
+                if schema_api_name == api_name and len(schema_dict["schema"]) > 0:
+                    schema = schema_dict["schema"]
+                    break
+            if schema is not None:
+                response_dict["response"] = dict_shorten(response, schema)
+    return str(response_dict["response"])
+
+
+def get_rapidapi_response(input_dict: dict, api_customization: bool=False, tools_root: str="data.toolenv.tools", schema_root: str="data/toolenv/response_examples"):
+    """Call one tool API described by ``input_dict`` and return its observation.
+
+    Args:
+        input_dict: Mapping with the keys ``category``, ``tool_name``,
+            ``api_name``, ``tool_input`` (JSON object string, empty string or
+            dict), ``strip`` and ``rapidapi_key``.
+        api_customization: When False the RapidAPI key is appended to the call
+            as ``toolbench_rapidapi_key``.
+        tools_root: Dotted module prefix of the tool packages.
+        schema_root: Directory holding the response schema files.
+
+    Returns:
+        ``{"error": <label or "">, "response": <observation text>}`` with the
+        observation cut to 2048 characters. Tool input that cannot be decoded
+        into a JSON object yields a "Tool input parse error" dict.
+
+    Raises:
+        KeyError: If a required key is missing from ``input_dict``.
+    """
+    # Build a per-call instance. Assigning attributes on the Info class itself
+    # (as before) shared one mutable state between all callers.
+    info = Info(
+        category=input_dict['category'],
+        tool_name=input_dict['tool_name'],
+        api_name=input_dict['api_name'],
+        tool_input=input_dict['tool_input'],
+        strip=input_dict['strip'],
+    )
+    rapidapi_key = input_dict['rapidapi_key']
+
+    tool_name, standard_category, api_name, code_string = prepare_tool_name_and_url(tools_root, info)
+    strip_method = info.strip
+
+    tool_input = info.tool_input
+    if not isinstance(tool_input, dict):
+        try:
+            tool_input = json.loads(tool_input)
+        except (TypeError, ValueError):
+            # An empty string means "no arguments"; anything else is malformed.
+            if tool_input == "":
+                tool_input = {}
+    if not isinstance(tool_input, dict):
+        # Covers undecodable text as well as JSON that is not an object
+        # (list, number, null), which previously crashed further down.
+        print(f"Can not parse tool input into json: {tool_input}")
+        return {"error": "Tool input parse error...\n", "response": ""}
+
+    # repr() yields a correctly escaped Python literal, so quotes, backslashes
+    # or newlines inside string arguments can no longer break (or inject code
+    # into) the expression that run() evaluates.
+    call_params = [f"{key}={value!r}" for key, value in tool_input.items()]
+    if not api_customization:
+        call_params.append(f"toolbench_rapidapi_key={rapidapi_key!r}")
+    input_params_str = ", ".join(call_params)
+
+    _success_flag, _switch_flag, response_dict, _save_cache = run(code_string, api_name, input_params_str)
+    observation = observation_shorten(schema_root, response_dict, standard_category, tool_name.replace(f"_for_{standard_category}", ""), api_name, strip_method)
+    result = str(observation)[:2048]
+    return {"error": response_dict['error'], "response": result}
+
+
+if __name__ == "__main__":
+    # Manual smoke test: request one quote API and print the observation.
+    demo_request = dict(
+        category="Social",
+        tool_name="olato_quotes",
+        api_name="love_quote",
+        tool_input='{}',
+        strip="filter",
+        rapidapi_key="",
+    )
+    print(get_rapidapi_response(demo_request))
--- a/toolbench/inference/toolbench_server.py
+++ b/toolbench/inference/toolbench_server.py
@ -0,0 +1,176 @@
+from flask import Flask, Response, stream_with_context, request
+from flask_cors import CORS, cross_origin
+from callbacks.ServerEventCallback import ServerEventCallback
+import argparse
+from toolbench.inference.Downstream_tasks.rapidapi import pipeline_runner
+import subprocess
+import concurrent.futures
+import json
+import signal
+import time
+from queue import Queue
+import copy
+import time
+app = Flask(__name__)
+cors = CORS(app)
+
+
+class Model:
+    """Holds the loaded backbone model, retriever and event queue for the server."""
+
+    def __init__(self, gpu=0):
+        self.inuse = False
+        print("Initializing...")
+        started = time.time()
+        self.args = self.get_args()
+        self.pipeline = pipeline_runner(self.args, add_retrieval=False, server=True)
+        print("Loading model...")
+        self.llm = self.pipeline.get_backbone_model()
+        print("Model loaded in {} seconds".format(time.time() - started))
+        started = time.time()
+        print("Loading retriever...")
+        self.retriever = self.pipeline.get_retriever()
+        print("Retriever loaded in {} seconds".format(time.time() - started))
+        self.query_id = 0
+
+        # Events produced by the pipeline callbacks are read from this queue.
+        self.queue = Queue()
+        self.callback = ServerEventCallback(self.queue)
+        self.occupied = False
+
+        print("Server ready")
+
+    def run_pipeline(self, user_input, method, top_k):
+        """Run a single query through the pipeline, reporting via the callback."""
+        # Discard events left over from a previous request.
+        pending = self.queue
+        while not pending.empty():
+            pending.get()
+        self.query_id += 1
+        # Per-request copy so the shared args are not modified.
+        request_args = copy.deepcopy(self.args)
+        request_args.retrieved_api_nums = top_k
+        request_args.method = method
+        self.pipeline.run_single_task(
+            method=method,
+            backbone_model=self.llm,
+            query_id=self.query_id,
+            data_dict={"query": user_input},
+            output_dir_path=self.args.output_answer_file,
+            retriever=self.retriever,
+            args=request_args,
+            tool_des=None,
+            callbacks=[self.callback],
+        )
+
+    def get_queue(self):
+        """Yield queued events until the queue is observed empty."""
+        while not self.queue.empty():
+            yield self.queue.get()
+
+    def get_args(self):
+        """Parse the server's command-line options and return the namespace."""
+        option_specs = [
+            ('--corpus_tsv_path', dict(type=str, default="your_retrival_corpus_path/", required=False, help='')),
+            ('--retrieval_model_path', dict(type=str, default="your_model_path/", required=False, help='')),
+            ('--retrieved_api_nums', dict(type=int, default=5, required=False, help='')),
+            ('--backbone_model', dict(type=str, default="toolllama", required=False,
+                                      help='chatgpt_function or davinci or toolllama')),
+            ('--openai_key', dict(type=str, default="", required=False,
+                                  help='openai key for chatgpt_function or davinci model')),
+            ('--model_path', dict(type=str, default="your_model_path/", required=True, help='')),
+            ('--tool_root_dir', dict(type=str, default="your_tools_path/", required=True, help='')),
+            ("--lora", dict(action="store_true", help="Load lora model or not.")),
+            ('--lora_path', dict(type=str, default="your_lora_path if lora", required=False, help='')),
+            ('--max_observation_length', dict(type=int, default=1024, required=False,
+                                              help='maximum observation length')),
+            ('--observ_compress_method', dict(type=str, default="truncate",
+                                              choices=["truncate", "filter", "random"],
+                                              required=False, help='observation compress method')),
+            ('--method', dict(type=str, default="CoT@1", required=False,
+                              help='method for answer generation: CoT@n,Reflexion@n,BFS,DFS,UCT_vote')),
+            ('--input_query_file', dict(type=str, default="", required=False, help='input path')),
+            ('--output_answer_file', dict(type=str, default="", required=False, help='output path')),
+            ('--toolbench_key', dict(type=str, default="", required=False, help='your toolbench key')),
+            ('--rapidapi_key', dict(type=str, default="", required=False,
+                                    help='your rapidapi key to request rapidapi service')),
+            ('--use_rapidapi_key', dict(action="store_true",
+                                        help="To use customized rapidapi service or not.")),
+            ('--api_customization', dict(action="store_true", help="To use customized api or not.")),
+        ]
+        parser = argparse.ArgumentParser()
+        for flag, spec in option_specs:
+            parser.add_argument(flag, **spec)
+        return parser.parse_args()
+
+# Created at import time: Model.__init__ parses the command line and loads the
+# backbone model and retriever before any route is served.
+model = Model()
+
+
+@app.route('/stream', methods=['GET', 'POST'])
+@cross_origin()
+def stream():
+    """Run one query through the model and stream its events to the client.
+
+    The request body is JSON with the keys ``text``, ``top_k`` and ``method``.
+    Events are sent as newline-separated JSON objects; the final
+    ``on_request_end`` event is sent without a trailing newline. When the model
+    is already serving another request a 409 JSON response is returned.
+    """
+    data = json.loads(request.data)
+    user_input = data["text"]
+    top_k = data["top_k"]
+    method = data["method"]
+    print("Called stream")
+
+    # generate() is a generator function, so a Response returned from inside it
+    # is swallowed and never reaches the client. Reject busy requests here.
+    if model.inuse:
+        return Response(json.dumps({
+            "method_name": "error",
+            "error": "Model in use"
+        }), status=409, mimetype='application/json')
+
+    def generate(model):
+        print("Called generate")
+        if model.inuse:
+            # Another request claimed the model between the check above and
+            # the start of streaming; report it in-band.
+            yield json.dumps({
+                "method_name": "error",
+                "error": "Model in use"
+            }) + "\n"
+            return
+        model.inuse = True
+        try:
+            # run model.run_pipeline in the background
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(model.run_pipeline, user_input, method, top_k)
+                # forward events until the pipeline is done and the queue is drained
+                while True:
+                    if model.queue.empty():
+                        if future.done():
+                            print("Finished with future")
+                            break
+                        time.sleep(0.01)
+                        continue
+                    obj = model.queue.get()
+                    if obj["method_name"] == "unknown":
+                        continue
+                    try:
+                        payload = json.dumps(obj)
+                    except (TypeError, ValueError) as e:
+                        # Skip events that cannot be serialised; the pipeline is
+                        # still running, so the model must stay marked in use.
+                        print(obj)
+                        print(e)
+                        continue
+                    if obj["method_name"] == "on_request_end":
+                        yield payload
+                        break
+                    yield payload + "\n"
+
+                try:
+                    future.result()
+                except Exception as e:
+                    print(e)
+        finally:
+            # Also runs when the client disconnects mid-stream (the generator is
+            # closed), after the executor has waited for the pipeline to finish.
+            model.inuse = False
+
+    return Response(stream_with_context(generate(model)))
+
+@app.route('/methods', methods=['GET'])
+@cross_origin()
+def methods():
+    """Return the list of available answer-generation methods as JSON."""
+    # The payload used to be wrapped in a second pair of braces, i.e. a set
+    # containing a dict, which raises "TypeError: unhashable type: 'dict'"
+    # on every request.
+    return Response(json.dumps({
+        "methods": ["DFS_woFilter_w2"]
+    }), status=200, mimetype='application/json')
+
+def handle_keyboard_interrupt(signal, frame):
+    """Signal handler that terminates the process with exit status 0."""
+    # ``exit`` is injected by the ``site`` module and is not guaranteed to
+    # exist; raising SystemExit is the equivalent that always works.
+    raise SystemExit(0)
+
+# Exit cleanly on Ctrl-C instead of surfacing a KeyboardInterrupt traceback.
+signal.signal(signal.SIGINT, handle_keyboard_interrupt)
+
+if __name__ == '__main__':
+    # NOTE(review): debug=True together with host="0.0.0.0" exposes Flask's
+    # debug mode on every network interface - confirm this is never used
+    # outside a trusted development environment.
+    app.run(use_reloader=False, host="0.0.0.0", debug=True, port=5000)
--- a/toolbench/inference/utils.py
+++ b/toolbench/inference/utils.py
@ -0,0 +1,267 @@
+import gc
+import abc
+import numpy as np
+import math
+from typing import Iterable
+import torch
+from transformers.generation.logits_process import (
+    LogitsProcessorList,
+    RepetitionPenaltyLogitsProcessor,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
+)
+
+# For DFS
+def softmax_bias(answers,temperature=1):
+    """Turn rating-like scores into a normalised weight vector.
+
+    Each score is mapped to ``10 ** ((score / temperature) / 400)`` and the
+    weights are divided by their sum. Returns a NumPy array.
+    """
+    weights = [10 ** ((score / temperature) / 400) for score in answers]
+    total = 0.0
+    for weight in weights:
+        assert type(weight) in (float, int)
+        total += weight
+    return np.array([weight / total for weight in weights])
+
+def compute_epsilon_new_node(p_new_node):
+    '''
+    Convert a probability into a score using the delta formula:
+    1000 + 400 * log10(p / (1 - p)).
+    '''
+    odds = p_new_node / (1 - p_new_node)
+    return 1000 + 400 * math.log10(odds)
+
+# For prediction parsing, into ReACT format
+def react_parser(string):
+    """Split a ReACT-formatted prediction into (thought, action, action_input).
+
+    Each field is cut out purely by the positions of the "Thought: ",
+    "Action: " and "Action Input: " markers found with ``str.find``.
+    """
+    thought_tag, action_tag, input_tag = "Thought: ", "Action: ", "Action Input: "
+
+    thought_start = string.find(thought_tag) + len(thought_tag)
+    thought_end = string.find("\nAction: ")
+    action_start = string.find(action_tag) + len(action_tag)
+    action_end = string.find("\nAction Input: ")
+    input_start = string.find(input_tag) + len(input_tag)
+
+    thought = string[thought_start:thought_end]
+    action = string[action_start:action_end]
+    action_input = string[input_start:]
+    return thought, action, action_input
+
+# For toolllama's predictions 
+def prepare_logits_processor(
+    temperature: float, repetition_penalty: float, top_p: float, top_k: int
+) -> LogitsProcessorList:
+    """Build the list of logits processors implied by the sampling settings."""
+    # TemperatureLogitsWarper doesn't accept 0.0, and 1.0 makes it a no-op,
+    # so both cases are skipped.
+    candidates = (
+        (temperature != 1.0 and temperature >= 1e-5, TemperatureLogitsWarper, temperature),
+        (repetition_penalty > 1.0, RepetitionPenaltyLogitsProcessor, repetition_penalty),
+        (1e-8 <= top_p < 1.0, TopPLogitsWarper, top_p),
+        (top_k > 0, TopKLogitsWarper, top_k),
+    )
+    processors = LogitsProcessorList()
+    for enabled, factory, setting in candidates:
+        if enabled:
+            processors.append(factory(setting))
+    return processors
+
+@torch.inference_mode()
+def generate_stream(
+    model, tokenizer, params, device, context_len=8192, stream_interval=2, force_generate=False
+):
+    """Generate text token by token and yield the decoded result.
+
+    Args:
+        model: Model to sample from; encoder-decoder and decoder-only models
+            are handled separately via ``model.config.is_encoder_decoder``.
+        tokenizer: Tokenizer used to encode the prompt and decode the output.
+        params: Dict with ``prompt`` and the optional keys ``temperature``,
+            ``repetition_penalty``, ``top_p``, ``top_k``, ``max_new_tokens``,
+            ``stop``, ``echo`` and ``stop_token_ids``.
+        device: Device the input tensors are created on.
+        context_len: Context budget used to truncate the prompt from the left.
+        stream_interval: Not used by this implementation.
+        force_generate: When True, a stop token produced as the very first
+            token does not end generation.
+
+    Yields:
+        Dicts with ``text``, ``usage`` and ``finish_reason``. One is yielded
+        when generation stops (``finish_reason`` None), followed by a final one
+        carrying the finish reason ("length" or "stop").
+    """
+    prompt = params["prompt"]
+    len_prompt = len(prompt)
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    top_k = int(params.get("top_k", -1))  # -1 means disable
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    stop_str = params.get("stop", None)
+    echo = bool(params.get("echo", True))
+    # Copy before appending: the list belongs to the caller's ``params`` and
+    # would otherwise gain one more EOS id on every call.
+    stop_token_ids = list(params.get("stop_token_ids", None) or [])
+    stop_token_ids.append(tokenizer.eos_token_id)
+
+    logits_processor = prepare_logits_processor(
+        temperature, repetition_penalty, top_p, top_k
+    )
+
+    input_ids = tokenizer(prompt).input_ids
+    input_echo_len = len(input_ids)
+    output_ids = list(input_ids)
+
+    if model.config.is_encoder_decoder:
+        max_src_len = context_len
+    else:
+        max_src_len = context_len - max_new_tokens - 8
+
+    # Keep only the most recent tokens when the prompt exceeds the budget.
+    input_ids = input_ids[-max_src_len:]
+
+    if model.config.is_encoder_decoder:
+        encoder_output = model.encoder(
+            input_ids=torch.as_tensor([input_ids], device=device)
+        )[0]
+        start_ids = torch.as_tensor(
+            [[model.generation_config.decoder_start_token_id]],
+            dtype=torch.int64,
+            device=device,
+        )
+
+    past_key_values = out = None
+    for i in range(max_new_tokens):
+        if i == 0:
+            # First step: feed the whole prompt (or the decoder start token).
+            if model.config.is_encoder_decoder:
+                out = model.decoder(
+                    input_ids=start_ids,
+                    encoder_hidden_states=encoder_output,
+                    use_cache=True,
+                )
+                logits = model.lm_head(out[0])
+            else:
+                out = model(torch.as_tensor([input_ids], device=device), use_cache=True)
+                logits = out.logits
+            past_key_values = out.past_key_values
+        else:
+            # Later steps: feed only the last token and reuse the cache.
+            if model.config.is_encoder_decoder:
+                out = model.decoder(
+                    input_ids=torch.as_tensor([[token]], device=device),
+                    encoder_hidden_states=encoder_output,
+                    use_cache=True,
+                    past_key_values=past_key_values,
+                )
+
+                logits = model.lm_head(out[0])
+            else:
+                out = model(
+                    input_ids=torch.as_tensor([[token]], device=device),
+                    use_cache=True,
+                    past_key_values=past_key_values,
+                )
+                logits = out.logits
+            past_key_values = out.past_key_values
+
+        if logits_processor:
+            # Only the repetition penalty needs the ids generated so far.
+            if repetition_penalty > 1.0:
+                tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
+            else:
+                tmp_output_ids = None
+            last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
+        else:
+            last_token_logits = logits[0, -1, :]
+
+        if device == "mps":
+            # Switch to CPU by avoiding some bugs in mps backend.
+            last_token_logits = last_token_logits.float().to("cpu")
+
+        if temperature < 1e-5 or top_p < 1e-8:  # greedy
+            token = int(torch.argmax(last_token_logits))
+        else:
+            probs = torch.softmax(last_token_logits, dim=-1)
+            token = int(torch.multinomial(probs, num_samples=1))
+
+        output_ids.append(token)
+
+        if token in stop_token_ids:
+            stopped = True
+        else:
+            stopped = False
+        if i == 0 and force_generate:
+            stopped = False
+        # Decode and yield only on the last step or once a stop token was hit.
+        if i == max_new_tokens - 1 or stopped:
+            if echo:
+                tmp_output_ids = output_ids
+                rfind_start = len_prompt
+            else:
+                tmp_output_ids = output_ids[input_echo_len:]
+                rfind_start = 0
+
+            output = tokenizer.decode(
+                tmp_output_ids,
+                skip_special_tokens=True,
+                spaces_between_special_tokens=False,
+            )
+            # Cut the text at the last occurrence of a stop string, if any.
+            if stop_str:
+                if isinstance(stop_str, str):
+                    pos = output.rfind(stop_str, rfind_start)
+                    if pos != -1:
+                        output = output[:pos]
+                        stopped = True
+                elif isinstance(stop_str, Iterable):
+                    for each_stop in stop_str:
+                        pos = output.rfind(each_stop, rfind_start)
+                        if pos != -1:
+                            output = output[:pos]
+                            stopped = True
+                            break
+                else:
+                    raise ValueError("Invalid stop field type.")
+
+            yield {
+                "text": output,
+                "usage": {
+                    "prompt_tokens": input_echo_len,
+                    "completion_tokens": i,
+                    "total_tokens": input_echo_len + i,
+                },
+                "finish_reason": None,
+            }
+
+        if stopped:
+            break
+
+    # finish stream event, which contains finish reason
+    if i == max_new_tokens - 1:
+        finish_reason = "length"
+    elif stopped:
+        finish_reason = "stop"
+    else:
+        finish_reason = None
+
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+
+    # clean
+    del past_key_values, out
+    gc.collect()
+    torch.cuda.empty_cache()
+
+# For IO presentation
+class ChatIO(abc.ABC):
+    """Abstract interface for prompting a role and presenting model output."""
+
+    @abc.abstractmethod
+    def prompt_for_input(self, role: str) -> str:
+        """Prompt for input from a role."""
+
+    @abc.abstractmethod
+    def prompt_for_output(self, role: str):
+        """Prompt for output from a role."""
+
+    @abc.abstractmethod
+    def stream_output(self, output_stream):
+        """Stream output."""
+    
+    @abc.abstractmethod
+    def return_output(self, output_stream):
+        """Return output."""
+
+class SimpleChatIO(ChatIO):
+    """Console implementation of ChatIO based on ``input`` and ``print``."""
+
+    def prompt_for_input(self, role) -> str:
+        """Read one line from the console, prompting with ``"<role>: "``."""
+        return input(f"{role}: ")
+
+    def prompt_for_output(self, role: str):
+        """Print ``"<role>: "`` without a trailing newline."""
+        print(f"{role}: ", end="", flush=True)
+
+    def stream_output(self, output_stream):
+        """Print the output word by word as it grows and return the final text.
+
+        Each item of ``output_stream`` is a dict whose ``"text"`` holds the
+        output so far. An empty stream prints an empty line and returns ``""``
+        (it used to raise UnboundLocalError).
+        """
+        pre = 0
+        output_text = []
+        for outputs in output_stream:
+            output_text = outputs["text"].strip().split(" ")
+            # The last word may still be incomplete, so it is held back.
+            now = len(output_text) - 1
+            if now > pre:
+                print(" ".join(output_text[pre:now]), end=" ", flush=True)
+                pre = now
+        print(" ".join(output_text[pre:]), flush=True)
+        return " ".join(output_text)
+
+    def return_output(self, output_stream):
+        """Consume the stream silently and return the final text.
+
+        The text of the last item is stripped and returned; an empty stream
+        returns ``""`` (it used to raise UnboundLocalError).
+        """
+        output_text = []
+        for outputs in output_stream:
+            output_text = outputs["text"].strip().split(" ")
+        return " ".join(output_text)
--- a/toolbench/model/init.py
+++ b/toolbench/model/init.py
@ -0,0 +1,5 @@
+from toolbench.model.model_adapter import (
+    load_model,
+    get_conversation_template,
+    add_model_args,
+)
--- a/toolbench/model/apply_delta.py
+++ b/toolbench/model/apply_delta.py
@ -0,0 +1,165 @@
+"""
+Apply the delta weights on top of a base model.
+
+Usage:
+python3 -m toolbench.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta-v1.1
+"""
+import argparse
+import gc
+import glob
+import json
+import os
+import shutil
+import tempfile
+
+from huggingface_hub import snapshot_download
+import torch
+from torch import nn
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+
+
+GB = 1 << 30
+
+
+def _save_shard(shard_state_dict, tmp_path, part):
+    """Write one shard into ``tmp_path`` as ``pytorch_model-{part}.bin``."""
+    torch.save(shard_state_dict, os.path.join(tmp_path, f"pytorch_model-{part}.bin"))
+
+
+def split_files(model_path, tmp_path, split_size):
+    """Re-shard the ``pytorch_model-*.bin`` files of a model into smaller files.
+
+    Args:
+        model_path: Local directory holding the checkpoint shards. If the path
+            does not exist it is passed to ``snapshot_download`` as a repo id.
+        tmp_path: Directory that receives the new shards (created if missing).
+        split_size: Size budget in bytes for one output shard. A single tensor
+            larger than the budget still gets a shard of its own.
+
+    Raises:
+        Exception: Any error is re-raised after ``tmp_path`` has been removed.
+    """
+    if not os.path.exists(model_path):
+        model_path = snapshot_download(repo_id=model_path)
+    if not os.path.exists(tmp_path):
+        os.makedirs(tmp_path)
+
+    files = glob.glob(os.path.join(model_path, "pytorch_model-*.bin"))
+
+    part = 0
+    try:
+        for file_path in tqdm(files):
+            state_dict = torch.load(file_path)
+            new_state_dict = {}
+            current_size = 0
+
+            for name, param in state_dict.items():
+                param_size = param.numel() * param.element_size()
+
+                # Flush only a non-empty shard: an oversized first tensor must
+                # not produce an empty output file.
+                if new_state_dict and current_size + param_size > split_size:
+                    _save_shard(new_state_dict, tmp_path, part)
+                    part += 1
+                    current_size = 0
+                    new_state_dict = {}
+                    gc.collect()
+
+                new_state_dict[name] = param
+                current_size += param_size
+
+            # Flush whatever is left of this input file.
+            if new_state_dict:
+                _save_shard(new_state_dict, tmp_path, part)
+                part += 1
+            new_state_dict = None
+            gc.collect()
+    except Exception as e:
+        print(f"An error occurred during split_files: {e}")
+        shutil.rmtree(tmp_path)
+        raise
+
+
+def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
+    """Add the delta weights to the base weights shard by shard.
+
+    Both checkpoints are first re-split into temporary shards of at most
+    ``4 * GB`` via ``split_files``; the base shards are then loaded one at a
+    time, the matching delta tensors are added in place, and each result is
+    written to ``target_model_path`` together with a
+    ``pytorch_model.bin.index.json`` index, the delta tokenizer and the delta
+    config.
+
+    Args:
+        base_model_path: Location of the base model weights.
+        target_model_path: Output directory. An existing directory at this
+            path is deleted and recreated.
+        delta_path: Location of the delta weights, tokenizer and config.
+    """
+    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
+    delta_config = AutoConfig.from_pretrained(delta_path)
+
+    # Start from an empty target directory.
+    if os.path.exists(target_model_path):
+        shutil.rmtree(target_model_path)
+    os.makedirs(target_model_path)
+
+    split_size = 4 * GB
+
+    with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
+        print(f"Split files for the base model to {tmp_base_path}")
+        split_files(base_model_path, tmp_base_path, split_size)
+        print(f"Split files for the delta weights to {tmp_delta_path}")
+        split_files(delta_path, tmp_delta_path, split_size)
+
+        base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
+        base_files = glob.glob(base_pattern)
+        delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
+        delta_files = glob.glob(delta_pattern)
+        # Only one delta shard is kept in memory at a time; it is swapped below
+        # whenever a parameter is not found in it.
+        # NOTE(review): raises IndexError if no delta shard was produced.
+        delta_state_dict = torch.load(delta_files[0])
+
+        print("Applying the delta")
+        weight_map = {}
+        total_size = 0
+
+        for i, base_file in tqdm(enumerate(base_files)):
+            state_dict = torch.load(base_file)
+            file_name = f"pytorch_model-{i}.bin"
+            for name, param in state_dict.items():
+                if name not in delta_state_dict:
+                    # Scan the delta shards until one contains this parameter.
+                    # If none does, the lookup below raises KeyError.
+                    for delta_file in delta_files:
+                        delta_state_dict = torch.load(delta_file)
+                        gc.collect()
+                        if name in delta_state_dict:
+                            break
+
+                state_dict[name] += delta_state_dict[name]
+                weight_map[name] = file_name
+                total_size += param.numel() * param.element_size()
+                gc.collect()
+            torch.save(state_dict, os.path.join(target_model_path, file_name))
+
+        # Index file mapping every parameter name to the shard that stores it.
+        with open(
+            os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
+        ) as f:
+            json.dump(
+                {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
+            )
+
+    # The weight shards were already written above; only the tokenizer and the
+    # config remain to be saved.
+    print(f"Saving the target model to {target_model_path}")
+    delta_tokenizer.save_pretrained(target_model_path)
+    delta_config.save_pretrained(target_model_path)
+
+
+def apply_delta(base_model_path, target_model_path, delta_path):
+    """Add the delta weights to the base model and save the result.
+
+    Both models are loaded fully (as float16), every base parameter is
+    increased in place by the delta parameter of the same name, and the
+    updated model plus the delta tokenizer are saved to ``target_model_path``.
+
+    Args:
+        base_model_path: Location of the base model.
+        target_model_path: Directory the merged model is written to.
+        delta_path: Location of the delta weights and tokenizer.
+
+    Raises:
+        AssertionError: If a base parameter has no counterpart in the delta.
+    """
+    print(f"Loading the delta weights from {delta_path}")
+    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
+    delta = AutoModelForCausalLM.from_pretrained(
+        delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+
+    print(f"Loading the base model from {base_model_path}")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+
+    print("Applying the delta")
+    # Build the delta state dict once instead of twice per parameter.
+    delta_state_dict = delta.state_dict()
+    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
+        assert name in delta_state_dict, f"{name} is missing from the delta weights"
+        param.data += delta_state_dict[name]
+
+    print(f"Saving the target model to {target_model_path}")
+    base.save_pretrained(target_model_path)
+    delta_tokenizer.save_pretrained(target_model_path)
+
+
+# Command-line entry point: parse the three required paths and pick the
+# in-memory or the low-CPU-memory implementation.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--delta-path", type=str, required=True)
+    parser.add_argument(
+        "--low-cpu-mem",
+        action="store_true",
+        help="Lower the cpu memory usage. This will split large files and use "
+        "disk as swap to reduce the memory usage below 10GB.",
+    )
+    args = parser.parse_args()
+
+    if args.low_cpu_mem:
+        # Shard-by-shard variant that uses temporary files on disk.
+        apply_delta_low_cpu_mem(
+            args.base_model_path, args.target_model_path, args.delta_path
+        )
+    else:
+        # Loads both full models into memory at once.
+        apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
--- a/toolbench/model/compression.py
+++ b/toolbench/model/compression.py
@ -0,0 +1,199 @@
+import dataclasses
+import os
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+
+
+@dataclasses.dataclass
+class CompressionConfig:
+    """Settings for group-wise quantization as used by ``compress``/``decompress``."""
+
+    # Bit width of the quantized values; ``compress`` asserts it is at most 8.
+    num_bits: int
+    # Number of elements that share one scale (one quantization group).
+    group_size: int
+    # Tensor dimension that is split into groups of ``group_size``.
+    group_dim: int
+    # True: scale by the group's max absolute value into int8.
+    # False: shift by the group's min and scale by its range into uint8.
+    symmetric: bool
+    # When False, ``compress`` and ``decompress`` return their input unchanged.
+    enabled: bool = True
+
+
+# Configuration used by ``load_compress_model``: 8-bit symmetric quantization
+# with groups of 256 elements along dimension 1.
+default_compression_config = CompressionConfig(
+    num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
+)
+
+
+class CLinear(nn.Module):
+    """Compressed Linear Layer.
+
+    ``weight`` is stored exactly as given. It may be a plain tensor or the
+    packed tuple produced by ``compress`` (which is what
+    ``load_compress_model`` passes in); a packed weight is decompressed with
+    ``default_compression_config`` on every forward pass.
+
+    Args:
+        weight: Weight tensor or packed weight tuple.
+        bias: Optional bias tensor.
+        device: Accepted for interface compatibility; not used by this layer.
+    """
+
+    def __init__(self, weight=None, bias=None, device=None):
+        super().__init__()
+        self.weight = weight
+        self.bias = bias
+
+    def forward(self, input):
+        """Apply the linear transform in the weight's dtype."""
+        weight = self.weight
+        # A packed weight is a tuple and has no ``dtype``; unpack it first.
+        if isinstance(weight, (tuple, list)):
+            weight = decompress(weight, default_compression_config)
+        bias = self.bias
+        if bias is not None:
+            # Keep input, weight and bias in the same dtype for F.linear.
+            bias = bias.to(weight.dtype)
+        return F.linear(input.to(weight.dtype), weight, bias)
+
+
+def compress_module(module, target_device):
+    """Replace every ``nn.Linear`` below ``module`` with a ``CLinear`` in place.
+
+    Args:
+        module: Root module whose tree is rewritten.
+        target_device: Forwarded to ``CLinear`` as its ``device`` argument.
+    """
+    # Snapshot the children because the loop replaces entries while iterating.
+    for name, child in list(module.named_children()):
+        if isinstance(child, nn.Linear):
+            setattr(
+                module,
+                name,
+                CLinear(child.weight, child.bias, target_device),
+            )
+        else:
+            # Descend into containers; the recursion used to sit inside the
+            # Linear branch, so nested layers were never reached.
+            compress_module(child, target_device)
+
+
+def get_compressed_list(module, prefix=""):
+    """Return the state-dict names of all ``nn.Linear`` weights below ``module``.
+
+    Args:
+        module: Root module to scan.
+        prefix: Dotted name of ``module`` inside its parent ("" for the root).
+
+    Returns:
+        A list such as ``["0.weight", "1.0.weight"]``, in traversal order.
+    """
+    compressed_list = []
+    for name, child in module.named_children():
+        child_prefix = f"{prefix}.{name}" if prefix else name
+        if isinstance(child, nn.Linear):
+            compressed_list.append(f"{child_prefix}.weight")
+        else:
+            # Recurse into containers with the child's own dotted name (no
+            # ".weight" suffix) so nested layers get correct full names.
+            compressed_list.extend(get_compressed_list(child, child_prefix))
+    return compressed_list
+
+
+def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
+    """Swap every ``nn.Linear`` below ``module`` for a ``CLinear`` holding its
+    entry from ``compressed_state_dict``.
+
+    Args:
+        module: Root module whose tree is rewritten in place.
+        compressed_state_dict: Maps full weight names (``"a.b.weight"``) to the
+            weight that the replacement layer should carry.
+        target_device: Forwarded to ``CLinear`` as its ``device`` argument.
+        prefix: Dotted name of ``module`` inside its parent ("" for the root).
+
+    Raises:
+        KeyError: If a linear layer's weight name is missing from
+            ``compressed_state_dict``.
+    """
+    # Snapshot the children because the loop replaces entries while iterating.
+    for name, child in list(module.named_children()):
+        child_prefix = f"{prefix}.{name}" if prefix else name
+        if isinstance(child, nn.Linear):
+            setattr(
+                module,
+                name,
+                CLinear(
+                    compressed_state_dict[f"{child_prefix}.weight"],
+                    child.bias,
+                    target_device,
+                ),
+            )
+        else:
+            # Recurse into containers so nested linear layers are replaced too.
+            apply_compressed_weight(
+                child, compressed_state_dict, target_device, child_prefix
+            )
+
+
+def load_compress_model(model_path, device, torch_dtype):
+    """Load a model whose linear weights are compressed while loading.
+
+    The model skeleton is built from the config, then each
+    ``pytorch_model-*.bin`` shard in ``model_path`` is read in turn: weights of
+    linear layers are moved to ``device``, cast to ``torch_dtype`` and
+    compressed with ``default_compression_config``; all other tensors are only
+    moved to ``device``.
+
+    Args:
+        model_path: Local directory with the tokenizer, config and shards.
+        device: Device the tensors are moved to.
+        torch_dtype: dtype the linear weights are cast to before compression.
+
+    Returns:
+        ``(model, tokenizer)``.
+    """
+    # `glob` is not imported at the top of this module; import it here.
+    import glob
+
+    # partially load model
+    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+    base_pattern = os.path.join(model_path, "pytorch_model-*.bin")
+    files = glob.glob(base_pattern)
+
+    config = AutoConfig.from_pretrained(
+        model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype
+    )
+    model = AutoModelForCausalLM.from_config(config)
+    # A set keeps the per-tensor membership tests below cheap.
+    linear_weights = set(get_compressed_list(model))
+
+    compressed_state_dict = {}
+
+    for filename in files:
+        tmp_state_dict = torch.load(filename)
+        for name in tmp_state_dict:
+            if name in linear_weights:
+                tensor = tmp_state_dict[name].to(device).data.to(torch_dtype)
+                compressed_state_dict[name] = compress(
+                    tensor, default_compression_config
+                )
+            else:
+                compressed_state_dict[name] = tmp_state_dict[name].to(device)
+            # Drop the references early so the uncompressed copy can be freed.
+            tmp_state_dict[name] = None
+            tensor = None
+            torch.cuda.empty_cache()
+
+    # Non-linear parameters are assigned directly; linear layers are replaced
+    # by CLinear modules carrying the compressed weights.
+    for name, param in model.named_parameters():
+        if name not in linear_weights:
+            param.data = compressed_state_dict[name]
+    apply_compressed_weight(model, compressed_state_dict, device)
+
+    model.to(device)
+
+    return model, tokenizer
+
+
+def compress(tensor, config):
+    """Simulate group-wise quantization.
+
+    The dimension ``config.group_dim`` is zero-padded to a multiple of
+    ``config.group_size`` and split into groups; every group is quantized with
+    its own scale.
+
+    Args:
+        tensor: Tensor to quantize.
+        config: Object with ``num_bits``, ``group_size``, ``group_dim``,
+            ``symmetric`` and ``enabled`` attributes.
+
+    Returns:
+        ``tensor`` itself when ``config.enabled`` is false. Otherwise
+        ``(data, scale, original_shape)`` with int8 ``data`` in symmetric mode,
+        or ``(data, mn, scale, original_shape)`` with uint8 ``data`` in
+        asymmetric mode.
+    """
+    if not config.enabled:
+        return tensor
+
+    group_size, num_bits, group_dim, symmetric = (
+        config.group_size,
+        config.num_bits,
+        config.group_dim,
+        config.symmetric,
+    )
+    assert num_bits <= 8
+
+    original_shape = tensor.shape
+    num_groups = (original_shape[group_dim] + group_size - 1) // group_size
+    new_shape = (
+        original_shape[:group_dim]
+        + (num_groups, group_size)
+        + original_shape[group_dim + 1 :]
+    )
+
+    # Pad up to the next multiple of group_size. The trailing modulo makes the
+    # padding 0 (instead of a whole extra group) when the dimension is already
+    # a multiple; otherwise the view below fails.
+    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
+    if pad_len != 0:
+        pad_shape = (
+            original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :]
+        )
+        tensor = torch.cat(
+            [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
+            dim=group_dim,
+        )
+    data = tensor.view(new_shape)
+
+    # Quantize
+    if symmetric:
+        B = 2 ** (num_bits - 1) - 1
+        # NOTE(review): a group consisting only of zeros gives an infinite scale.
+        scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
+        data = data * scale
+        data = data.clamp_(-B, B).round_().to(torch.int8)
+        return data, scale, original_shape
+    else:
+        B = 2**num_bits - 1
+        mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
+        mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
+
+        scale = B / (mx - mn)
+        data = data - mn
+        data *= scale
+
+        data = data.clamp_(0, B).round_().to(torch.uint8)
+        return data, mn, scale, original_shape
+
+
+def decompress(packed_data, config):
+    """Simulate group-wise dequantization.
+
+    Args:
+        packed_data: Value returned by ``compress`` for the same ``config``.
+        config: Object with ``num_bits``, ``group_size``, ``group_dim``,
+            ``symmetric`` and ``enabled`` attributes.
+
+    Returns:
+        ``packed_data`` itself when ``config.enabled`` is false, otherwise the
+        dequantized tensor in its original shape.
+    """
+    if not config.enabled:
+        return packed_data
+
+    group_size, num_bits, group_dim, symmetric = (
+        config.group_size,
+        config.num_bits,
+        config.group_dim,
+        config.symmetric,
+    )
+
+    # Dequantize
+    if symmetric:
+        data, scale, original_shape = packed_data
+        data = data / scale
+    else:
+        data, mn, scale, original_shape = packed_data
+        data = data / scale
+        data += mn
+
+    # Unpad. The trailing modulo gives 0 when the grouped dimension was already
+    # a multiple of group_size, i.e. when nothing was padded.
+    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
+    if pad_len:
+        padded_original_shape = (
+            original_shape[:group_dim]
+            + (original_shape[group_dim] + pad_len,)
+            + original_shape[group_dim + 1 :]
+        )
+        data = data.reshape(padded_original_shape)
+        # Index with a tuple of slices; a list is not a portable basic index.
+        indices = tuple(slice(0, x) for x in original_shape)
+        return data[indices].contiguous()
+    else:
+        return data.view(original_shape)
--- a/toolbench/model/make_delta.py
+++ b/toolbench/model/make_delta.py
@ -0,0 +1,48 @@
+"""
+Make the delta weights by subtracting base weights.
+
+Usage:
+python3 -m toolbench.model.make_delta --base-model-path ~/model_weights/llama-13b --target-model-path ~/model_weights/vicuna-13b --delta-path ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
+"""
+import argparse
+
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id=None):
+    """Compute ``target - base`` for every parameter and save it as a delta.
+
+    Args:
+        base_model_path: Location of the base model.
+        target_model_path: Location of the fine-tuned model and its tokenizer.
+        delta_path: Directory the delta weights and tokenizer are saved to.
+        hub_repo_id: Optional repo id. When given, the delta and the tokenizer
+            are also pushed to the hub under this id.
+
+    Raises:
+        AssertionError: If a target parameter has no counterpart in the base.
+    """
+    print(f"Loading the base model from {base_model_path}")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+
+    print(f"Loading the target model from {target_model_path}")
+    target = AutoModelForCausalLM.from_pretrained(
+        target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)
+
+    print("Calculating the delta")
+    # Build the base state dict once instead of twice per parameter.
+    base_state_dict = base.state_dict()
+    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
+        assert name in base_state_dict, f"{name} is missing from the base model"
+        param.data -= base_state_dict[name]
+
+    print(f"Saving the delta to {delta_path}")
+    # The repo id is now a parameter; the function used to read the
+    # script-level `args`, which does not exist when it is imported and called.
+    if hub_repo_id:
+        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
+    else:
+        kwargs = {}
+    target.save_pretrained(delta_path, **kwargs)
+    target_tokenizer.save_pretrained(delta_path, **kwargs)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--delta-path", type=str, required=True)
+    parser.add_argument("--hub-repo-id", type=str)
+    args = parser.parse_args()
+
+    make_delta(
+        args.base_model_path,
+        args.target_model_path,
+        args.delta_path,
+        hub_repo_id=args.hub_repo_id,
+    )
--- a/toolbench/model/model_adapter.py
+++ b/toolbench/model/model_adapter.py
@ -0,0 +1,293 @@
+"""Model adapter registration."""
+
+import math
+import sys
+from typing import List, Optional
+import warnings
+
+if sys.version_info >= (3, 9):
+    from functools import cache
+else:
+    from functools import lru_cache as cache
+
+import psutil
+import torch
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    LlamaForCausalLM,
+)
+from peft import PeftModel
+
+from toolbench.tool_conversation import Conversation, get_conv_template
+from toolbench.model.compression import load_compress_model
+from toolbench.utils import get_gpu_memory
+
+
+class BaseAdapter:
+    """Default adapter: accepts every model path and loads it with the Auto classes."""
+
+    def match(self, model_path: str):
+        """Accept any ``model_path``."""
+        return True
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the slow tokenizer and the causal LM; return ``(model, tokenizer)``."""
+        loaded_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        loaded_model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        return loaded_model, loaded_tokenizer
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Return the ``"one_shot"`` conversation template."""
+        template = get_conv_template("one_shot")
+        return template
+
+
+# A global registry for all model adapters
+model_adapters: List[BaseAdapter] = []
+
+
+def register_model_adapter(cls):
+    """Instantiate ``cls`` and append the instance to the global adapter registry."""
+    adapter_instance = cls()
+    model_adapters.append(adapter_instance)
+
+
+@cache
+def get_model_adapter(model_path: str) -> BaseAdapter:
+    """Return the first registered adapter whose ``match`` accepts ``model_path``.
+
+    Raises:
+        ValueError: If no registered adapter matches.
+    """
+    matching = (
+        candidate for candidate in model_adapters if candidate.match(model_path)
+    )
+    chosen = next(matching, None)
+    if chosen is None:
+        raise ValueError(f"No valid model adapter for {model_path}")
+    return chosen
+
+
+def raise_warning_for_incompatible_cpu_offloading_configuration(
+    device: str, load_8bit: bool, cpu_offloading: bool
+):
+    """Check whether CPU offloading can be honoured for this configuration.
+
+    Offloading needs 8-bit quantization, a linux platform and a CUDA device,
+    checked in that order. The first unmet requirement triggers a warning.
+
+    Returns:
+        ``cpu_offloading`` unchanged when it is falsy or all requirements
+        hold, otherwise ``False``.
+    """
+    if not cpu_offloading:
+        return cpu_offloading
+
+    if not load_8bit:
+        problem = (
+            "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
+            "Use '--load-8bit' to enable 8-bit-quantization\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+    elif "linux" not in sys.platform:
+        problem = (
+            "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+    elif device != "cuda":
+        problem = (
+            "CPU-offloading is only enabled when using CUDA-devices\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+    else:
+        # Every requirement is satisfied.
+        return cpu_offloading
+
+    warnings.warn(problem)
+    return False
+
+
+def load_model(
+    model_path: str,
+    device: str,
+    num_gpus: int,
+    max_gpu_memory: Optional[str] = None,
+    load_8bit: bool = False,
+    cpu_offloading: bool = False,
+    debug: bool = False,
+    lora: bool = False,
+    lora_base_model: str = "huggyllama/llama-7b",
+):
+    """Load a model from Hugging Face.
+
+    Args:
+        model_path: Local folder or repo id of the weights, or of the LoRA
+            adapter when ``lora`` is true.
+        device: ``"cpu"`` or ``"cuda"``.
+        num_gpus: Number of GPUs to spread the model over.
+        max_gpu_memory: Per-GPU memory limit. When ``None`` and several GPUs
+            are used, 85% of each GPU's available memory is used.
+        load_8bit: Use 8-bit quantization.
+        cpu_offloading: Offload weights to the CPU. Only honoured together
+            with ``load_8bit`` on linux with CUDA.
+        debug: Print the loaded model.
+        lora: Load ``lora_base_model`` and apply the adapter at ``model_path``.
+        lora_base_model: Base model (and tokenizer) used for LoRA loading.
+
+    Returns:
+        ``(model, tokenizer)``.
+
+    Raises:
+        ValueError: For an unsupported ``device``, or when ``lora`` is
+            requested on a device other than ``"cuda"``.
+    """
+
+    # Handle device mapping
+    cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
+        device, load_8bit, cpu_offloading
+    )
+    if device == "cpu":
+        if lora:
+            # The LoRA model is only built in the CUDA branch below; without
+            # this check the function failed later on an unbound `model`.
+            raise ValueError("LoRA loading is only supported with device='cuda'")
+        kwargs = {"torch_dtype": torch.float32}
+    elif device == "cuda":
+        kwargs = {"torch_dtype": torch.float16}
+        if lora:
+            model = LlamaForCausalLM.from_pretrained(
+                lora_base_model,
+                load_in_8bit=load_8bit,
+                torch_dtype=torch.float16,
+                device_map="auto",
+            )
+            model = PeftModel.from_pretrained(
+                model,
+                model_path,
+                torch_dtype=torch.float16,
+            )
+
+        elif num_gpus != 1:
+
+            kwargs["device_map"] = "auto"
+            if max_gpu_memory is None:
+                kwargs[
+                    "device_map"
+                ] = "sequential"  # This is important for not the same VRAM sizes
+                available_gpu_memory = get_gpu_memory(num_gpus)
+                kwargs["max_memory"] = {
+                    i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
+                    for i in range(num_gpus)
+                }
+            else:
+                kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
+    else:
+        raise ValueError(f"Invalid device: {device}")
+
+    if cpu_offloading:
+        # raises an error on incompatible platforms
+        from transformers import BitsAndBytesConfig
+
+        if "max_memory" in kwargs:
+            kwargs["max_memory"]["cpu"] = (
+                str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib"
+            )
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_8bit_fp32_cpu_offload=cpu_offloading
+        )
+        kwargs["load_in_8bit"] = load_8bit
+    elif load_8bit:
+        if num_gpus != 1:
+            warnings.warn(
+                "8-bit quantization is not supported for multi-gpu inference."
+            )
+        else:
+            # NOTE(review): with lora=True this discards the LoRA model built
+            # above and loads `model_path` directly — confirm this is intended.
+            return load_compress_model(
+                model_path=model_path, device=device, torch_dtype=kwargs["torch_dtype"]
+            )
+
+    # Load model
+    if not lora:
+        adapter = get_model_adapter(model_path)
+        model, tokenizer = adapter.load_model(model_path, kwargs)
+    else:
+        # Take the tokenizer from the same base model the LoRA weights were
+        # applied to instead of a hard-coded repo id.
+        tokenizer = AutoTokenizer.from_pretrained(
+            lora_base_model, use_fast=False, model_max_length=8192
+        )
+    if device == "cuda" and num_gpus == 1 and not cpu_offloading:
+        model.to(device)
+
+    if debug:
+        print(model)
+
+    return model, tokenizer
+
+
+def get_conversation_template(model_path: str) -> Conversation:
+    """Return the default conversation template of the adapter matching ``model_path``."""
+    return get_model_adapter(model_path).get_default_conv_template(model_path)
+
+
+def add_model_args(parser):
+    """Register the model-loading command-line options on ``parser``.
+
+    Adds ``--model-path``, ``--device``, ``--gpus``, ``--num-gpus``,
+    ``--max-gpu-memory``, ``--load-8bit`` and ``--cpu-offloading``.
+    """
+    option_table = (
+        (
+            "--model-path",
+            {
+                "type": str,
+                "default": "lmsys/fastchat-t5-3b-v1.0",
+                "help": "The path to the weights. This can be a local folder or a Hugging Face repo ID.",
+            },
+        ),
+        (
+            "--device",
+            {
+                "type": str,
+                "choices": ["cpu", "cuda"],
+                "default": "cuda",
+                "help": "The device type",
+            },
+        ),
+        (
+            "--gpus",
+            {
+                "type": str,
+                "default": None,
+                "help": "A single GPU like 1 or multiple GPUs like 0,2",
+            },
+        ),
+        ("--num-gpus", {"type": int, "default": 1}),
+        (
+            "--max-gpu-memory",
+            {
+                "type": str,
+                "help": "The maximum memory per gpu. Use a string like '13Gib'",
+            },
+        ),
+        ("--load-8bit", {"action": "store_true", "help": "Use 8-bit quantization"}),
+        (
+            "--cpu-offloading",
+            {
+                "action": "store_true",
+                "help": "Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
+            },
+        ),
+    )
+    for flag, options in option_table:
+        parser.add_argument(flag, **options)
+
+
+class VicunaAdapter(BaseAdapter):
+    """Model adapter for vicuna-v1.1."""
+
+    def match(self, model_path: str):
+        """Accept any path that contains ``"vicuna"``."""
+        return "vicuna" in model_path
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the model and tokenizer, warning if old weights are detected."""
+        vicuna_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        vicuna_model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        self.raise_warning_for_old_weights(vicuna_model)
+        return vicuna_model, vicuna_tokenizer
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Return the ``"vicuna-v1.1"`` conversation template."""
+        return get_conv_template("vicuna-v1.1")
+
+    def raise_warning_for_old_weights(self, model):
+        """Warn when a Llama model reports a vocabulary larger than 32000."""
+        if not isinstance(model, LlamaForCausalLM):
+            return
+        if not model.model.vocab_size > 32000:
+            return
+        warnings.warn(
+            "\nYou are probably using the old Vicuna-v0 model, "
+            "which will generate unexpected results with the "
+            "current toolbench.\nYou can try one of the following methods:\n"
+            "1. Upgrade your weights to the new Vicuna-v1.1: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
+            "2. Use the old conversation template by `python3 -m toolbench.serve.cli --model-path /path/to/vicuna-v0 --conv-template conv_one_shot`\n"
+            "3. Downgrade fschat to fschat==0.1.10 (Not recommonded).\n"
+        )
+
+
+class ToolLlamaAdapter(BaseAdapter):
+    """Model adapter for tool-llama."""
+
+    def match(self, model_path: str):
+        """Accept only the exact path ``"tool-llama"``."""
+        return model_path == "tool-llama"
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the slow tokenizer and the causal LM; return ``(model, tokenizer)``."""
+        llama_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        llama_model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return llama_model, llama_tokenizer
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Return the ``"tool-llama"`` conversation template."""
+        return get_conv_template("tool-llama")
+
+
+class ToolLlamaAdapterSingleRound(BaseAdapter):
+    """Model adapter for tool-llama-single-round."""
+
+    def match(self, model_path: str):
+        """Accept only the exact path ``"tool-llama-single-round"``."""
+        return model_path == "tool-llama-single-round"
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Load the causal LM and a slow tokenizer with ``model_max_length=8192``."""
+        long_tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=False, model_max_length=8192
+        )
+        llama_model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return llama_model, long_tokenizer
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Return the ``"tool-llama-single-round"`` conversation template."""
+        return get_conv_template("tool-llama-single-round")
+
+
+# Note: the registration order matters.
+# The one registered earlier has a higher matching priority, because
+# get_model_adapter returns the first adapter whose match() accepts the path.
+register_model_adapter(VicunaAdapter)
+register_model_adapter(ToolLlamaAdapter)
+register_model_adapter(ToolLlamaAdapterSingleRound)
+
+# After all adapters, try the default base adapter (its match() always
+# returns True, so it must be registered last).
+register_model_adapter(BaseAdapter)
--- a/toolbench/retrieval/api_evaluator.py
+++ b/toolbench/retrieval/api_evaluator.py
@ -0,0 +1,250 @@
+from sklearn.metrics import ndcg_score
+import numpy as np
+import logging
+import os
+from typing import List, Dict, Set
+from tqdm import trange
+from tqdm import tqdm
+import torch
+from multiprocessing import Pool
+import heapq
+from sentence_transformers.evaluation import SentenceEvaluator
+from sentence_transformers.util import cos_sim
+import os
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# Configure the module logger
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Create a FileHandler that saves the log to a file
+# (a log file left over from an earlier run is deleted first)
+log_file = "log_file.txt"
+if os.path.exists(log_file):
+    os.remove(log_file)
+file_handler = logging.FileHandler(log_file)
+file_handler.setLevel(logging.INFO)
+
+# Create a StreamHandler that prints the log to the console
+stream_handler = logging.StreamHandler()
+stream_handler.setLevel(logging.INFO)
+
+# Set the log output format
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+file_handler.setFormatter(formatter)
+stream_handler.setFormatter(formatter)
+
+# Attach the FileHandler and the StreamHandler to the logger
+logger.addHandler(file_handler)
+logger.addHandler(stream_handler)
+
+
+def compute_ndcg_for_query(query_tuple):
+    """Compute NDCG@k for a single query.
+
+    Args:
+        query_tuple: ``(query_index, query_id, top_hits, relevant_docs,
+            corpus_ids, k)``. ``top_hits`` is a list of
+            ``{"corpus_id": ..., "score": ...}`` dicts, ``relevant_docs`` maps
+            a query id to its relevant corpus ids, and ``corpus_ids`` lists
+            all corpus ids. The query index is not used.
+
+    Returns:
+        The NDCG@k of the predicted scores against the binary relevance labels.
+    """
+    _, query_id, top_hits, relevant_docs, corpus_ids, k = query_tuple
+    query_relevant_docs = relevant_docs[query_id]
+
+    # Map each corpus id to its first position once, instead of calling
+    # list.index() (a linear scan) for every hit.
+    position_of = {}
+    for position, corpus_id in enumerate(corpus_ids):
+        position_of.setdefault(corpus_id, position)
+
+    # Build the ground truth relevance scores and the model's predicted scores
+    true_relevance = np.zeros(len(corpus_ids))
+    predicted_scores = np.zeros(len(corpus_ids))
+
+    for hit in top_hits:
+        position = position_of[hit["corpus_id"]]
+        predicted_scores[position] = hit["score"]
+        if hit["corpus_id"] in query_relevant_docs:
+            true_relevance[position] = 1
+
+    # `k` must be passed by keyword: it is keyword-only in current scikit-learn.
+    return ndcg_score([true_relevance], [predicted_scores], k=k)
+
+
+class APIEvaluator(SentenceEvaluator):
+    """
+    This class evaluates an Information Retrieval (IR) setting.
+    Given a set of queries and a large corpus set, it scores every corpus
+    document against every query and reports the average NDCG@1, NDCG@3 and
+    NDCG@5 over all queries.
+    """
+
+    def __init__(
+        self,
+        queries: Dict[str, str],  # qid => query
+        corpus: Dict[str, str],  # cid => doc
+        relevant_docs: Dict[str, Set[str]],  # qid => Set[cid]
+        corpus_chunk_size: int = 5,
+        show_progress_bar: bool = True,
+        batch_size: int = 1,
+        write_csv: bool = True,
+        score_function=cos_sim,  # Score function, higher=more similar
+    ):
+        """Store the evaluation data and settings.
+
+        Args:
+            queries: Maps query id to query text.
+            corpus: Maps corpus id to document text.
+            relevant_docs: Maps query id to the set of relevant corpus ids.
+            corpus_chunk_size: Number of documents encoded per chunk.
+            show_progress_bar: Show progress bars during evaluation.
+            batch_size: Batch size passed to ``model.encode``.
+            write_csv: Append the results to a CSV file in ``output_path``.
+            score_function: Similarity function; higher means more similar.
+        """
+        self.queries_id = list(queries.keys())
+        self.queries = [queries[qid] for qid in self.queries_id]
+        self.corpus_ids = list(corpus.keys())
+        self.corpus = [corpus[cid] for cid in self.corpus_ids]
+        self.relevant_docs = relevant_docs
+        self.corpus_chunk_size = corpus_chunk_size
+        self.show_progress_bar = show_progress_bar
+        self.batch_size = batch_size
+        self.write_csv = write_csv
+        self.score_function = score_function
+
+        self.csv_file: str = "Information-Retrieval_evaluation_results.csv"
+        self.csv_headers = [
+            "epoch",
+            "steps",
+            "Average NDCG@1",
+            "Average NDCG@3",
+            "Average NDCG@5",
+        ]
+
+    def __call__(
+        self,
+        model,
+        output_path: str = None,
+        epoch: int = -1,
+        steps: int = -1,
+        *args,
+        **kwargs
+    ) -> float:
+        """Evaluate ``model`` and optionally append the scores to the CSV file.
+
+        Returns:
+            The smallest of the three average NDCG values (k = 1, 3, 5).
+        """
+        if epoch != -1:
+            out_txt = (
+                " after epoch {}:".format(epoch)
+                if steps == -1
+                else " in epoch {} after {} steps:".format(epoch, steps)
+            )
+        else:
+            out_txt = ":"
+        logger.info("Information Retrieval Evaluation" + out_txt)
+
+        avg_ndcg = self.compute_metrices(model)
+
+        # Write results to disc
+        if output_path is not None and self.write_csv:
+            csv_path = os.path.join(output_path, self.csv_file)
+            is_new_file = not os.path.isfile(csv_path)
+            # The context manager closes the file even if a write fails.
+            with open(
+                csv_path, mode="w" if is_new_file else "a", encoding="utf-8"
+            ) as fOut:
+                if is_new_file:
+                    fOut.write(",".join(self.csv_headers))
+                    fOut.write("\n")
+
+                # One column per score so the row matches the five headers
+                # (the whole list used to be written into a single cell).
+                output_data = [epoch, steps]
+                output_data.extend(float(score) for score in avg_ndcg)
+
+                fOut.write(",".join(map(str, output_data)))
+                fOut.write("\n")
+
+        return min(avg_ndcg)
+
+    def compute_metrices(self, model) -> List[float]:
+        """Score all query/document pairs and return the average NDCG values.
+
+        Returns:
+            ``[NDCG@1, NDCG@3, NDCG@5]`` averaged over all queries.
+        """
+        # Compute embedding for the queries
+        query_embeddings = model.encode(
+            self.queries,
+            show_progress_bar=self.show_progress_bar,
+            batch_size=self.batch_size,
+            convert_to_tensor=True,
+        )
+
+        queries_result_list = [[] for _ in range(len(query_embeddings))]
+
+        # Iterate over chunks of the corpus
+        for corpus_start_idx in trange(
+            0,
+            len(self.corpus),
+            self.corpus_chunk_size,
+            desc="Corpus Chunks",
+            disable=not self.show_progress_bar,
+        ):
+            corpus_end_idx = min(
+                corpus_start_idx + self.corpus_chunk_size, len(self.corpus)
+            )
+
+            # Encode chunk of corpus
+            sub_corpus_embeddings = model.encode(
+                self.corpus[corpus_start_idx:corpus_end_idx],
+                show_progress_bar=False,
+                batch_size=self.batch_size,
+                convert_to_tensor=True,
+            )
+
+            # Compute similarities between all queries and this chunk
+            pair_scores = self.score_function(query_embeddings, sub_corpus_embeddings)
+
+            # Convert scores to list
+            pair_scores_list = pair_scores.cpu().tolist()
+
+            for query_itr in range(len(query_embeddings)):
+                for sub_corpus_id, score in enumerate(pair_scores_list[query_itr]):
+                    corpus_id = self.corpus_ids[corpus_start_idx + sub_corpus_id]
+                    queries_result_list[query_itr].append(
+                        {"corpus_id": corpus_id, "score": score}
+                    )
+
+        logger.info("Queries: {}".format(len(self.queries)))
+        logger.info("Corpus: {}\n".format(len(self.corpus)))
+
+        # Compute scores
+        scores = self.compute_metrics(queries_result_list)
+
+        # Output
+        logger.info("Average NDCG@1: {:.2f}".format(scores[0] * 100))
+        logger.info("Average NDCG@3: {:.2f}".format(scores[1] * 100))
+        logger.info("Average NDCG@5: {:.2f}".format(scores[2] * 100))
+        return scores
+
+    def compute_metrics(self, queries_result_list):
+        """Compute the average NDCG@k over all queries for k in (1, 3, 5).
+
+        Args:
+            queries_result_list: For every query (in ``self.queries_id``
+                order) a list of ``{"corpus_id": ..., "score": ...}`` dicts.
+
+        Returns:
+            A list with the mean NDCG@1, NDCG@3 and NDCG@5.
+        """
+        k_list = [1, 3, 5]
+        scores = []
+
+        # The ranking does not depend on k, so sort each result list once.
+        ranked_hits = [
+            sorted(hits, key=lambda hit: hit["score"], reverse=True)
+            for hits in queries_result_list
+        ]
+
+        # Compute scores on results using a pool of workers
+        with Pool() as p:
+            for k in k_list:
+                # One tuple per query with everything a worker needs.
+                query_tuples = [
+                    (
+                        query_itr,
+                        self.queries_id[query_itr],
+                        top_hits,
+                        self.relevant_docs,
+                        self.corpus_ids,
+                        k,
+                    )
+                    for query_itr, top_hits in enumerate(ranked_hits)
+                ]
+
+                ndcg_scores = list(
+                    tqdm(
+                        p.imap(compute_ndcg_for_query, query_tuples),
+                        total=len(query_tuples),
+                        disable=not self.show_progress_bar,
+                    )
+                )
+                scores.append(np.mean(ndcg_scores))
+
+        # Return the average NDCG@k of all queries for each 'k'
+        return scores
--- a/toolbench/retrieval/inference_example.py
+++ b/toolbench/retrieval/inference_example.py
@ -0,0 +1,68 @@
+from sentence_transformers import SentenceTransformer, util
+import json
+import pandas as pd
+from collections import defaultdict
+import torch
+from tqdm import tqdm
+import argparse
+import os
+
+# 创建参数解析器并添加参数
+parser = argparse.ArgumentParser()
+parser.add_argument('model_path', type=str, required=True, help='Your trained model path')
+parser.add_argument('dataset_path', help='The processed dataset files path')
+
+# 解析命令行参数
+args = parser.parse_args()
+
+# Check if a GPU is available and if not, use a CPU
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+model_path = args.model_path
+
+# Load the trained model
+model = SentenceTransformer(model_path).to(device)
+
+# Load test data
+documents_df = pd.read_csv(os.path.join(args.dataset_path, 'corpus.tsv'), sep='\t')
+test_queries_df = pd.read_csv(os.path.join(args.dataset_path, 'test.query.txt'), sep='\t', names=['qid', 'query_text'])
+test_labels_df = pd.read_csv(os.path.join(args.dataset_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
+
+# Create mappings, get 'tool_name' and 'api_name' from the document_content
+ir_corpus = {row.docid: (json.loads(row.document_content)['tool_name'], json.loads(row.document_content)['api_name']) for _, row in documents_df.iterrows()}
+ir_test_queries = {row.qid: row.query_text for _, row in test_queries_df.iterrows()}
+
+# Create query-doc mapping from the test set
+ir_relevant_docs = defaultdict(list)
+for _, row in test_labels_df.iterrows():
+    ir_relevant_docs[row.qid].append(row.docid)
+
+# Convert queries and documents to embeddings
+test_query_embeddings = model.encode(list(ir_test_queries.values()), convert_to_tensor=True).to(device)
+corpus_embeddings = model.encode(list(map(' '.join, ir_corpus.values())), convert_to_tensor=True).to(device)
+
+# Compute cosine similarity between queries and documents
+cos_scores = util.pytorch_cos_sim(test_query_embeddings, corpus_embeddings)
+
+# Get the top_k most similar documents for each query
+top_k = 5
+top_results = {}
+for query_index, (query_id, query) in enumerate(ir_test_queries.items()):
+    relevant_docs_indices = cos_scores[query_index].topk(top_k).indices
+    relevant_docs_scores = cos_scores[query_index].topk(top_k).values
+    relevant_docs = [(list(ir_corpus.keys())[index], list(ir_corpus.values())[index]) for index in relevant_docs_indices]
+    relevant_docs_with_scores = {str((doc_id, tool_name_api_name)): {'score': float(score)} for (doc_id, tool_name_api_name), score in zip(relevant_docs, relevant_docs_scores)}
+
+    # Count the number of successful matches
+    matches = len(set([doc_id for doc_id, _ in relevant_docs]) & set(ir_relevant_docs[query_id]))
+    
+    # Save query, original docs, top 5 docs with scores, and successful match count
+    top_results[query] = {
+        'original_docs': [' '.join(ir_corpus[doc_id]) for doc_id in ir_relevant_docs[query_id]],
+        'top_docs': relevant_docs_with_scores,
+        'successful_matches': matches
+    }
+
+# Save the results to a json file
+with open('top5_results_with_matches.json', 'w') as f:
+    json.dump(top_results, f, indent=4)
--- a/toolbench/retrieval/train.py
+++ b/toolbench/retrieval/train.py
@ -0,0 +1,108 @@
+import logging
+import os
+import json
+import pandas as pd
+from datetime import datetime
+import torch
+import torch.nn as nn
+from sentence_transformers import SentenceTransformer, models, InputExample, losses, LoggingHandler
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+from api_evaluator import APIEvaluator
+import argparse
+import os
+from toolbench.utils import process_retrieval_ducoment
+
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--data_path", default=None, type=str, required=True,
+                    help="The input data dir. Should contain the .tsv files for the task.")
+parser.add_argument("--model_name", default=None, type=str, required=True,
+                    help="The base model name.")
+parser.add_argument("--output_path", default=None, type=str, required=True,
+                    help="The base path where the model output will be saved.")
+parser.add_argument("--num_epochs", default=5, type=int, required=True,
+                    help="Train epochs.")
+parser.add_argument("--train_batch_size", default=32, type=int, required=True,
+                    help="Train batch size.")
+parser.add_argument("--learning_rate", default=2e-5, type=float, required=True,
+                    help="Learning rate.")
+parser.add_argument("--warmup_steps", default=500, type=float, required=True,
+                    help="Warmup steps.")
+parser.add_argument("--max_seq_length", default=256, type=int, required=True,
+                    help="Max sequence length.")
+args = parser.parse_args()
+
+logging.basicConfig(format='%(asctime)s - %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S',
+                    level=logging.INFO,
+                    handlers=[LoggingHandler()])
+logger = logging.getLogger(__name__)
+
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+
+num_epochs = args.num_epochs
+train_batch_size = args.train_batch_size
+lr = args.learning_rate
+warmup_steps = args.warmup_steps
+data_path = args.data_path
+output_path = args.output_path
+os.makedirs(output_path, exist_ok=True)
+
+model_save_path = os.path.join(output_path, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
+os.makedirs(model_save_path, exist_ok=True)
+
+tensorboard_name = 'name_desc'
+logs_writer = SummaryWriter(os.path.join(output_path, 'tensorboard', tensorboard_name))
+
+
+def log_callback_st(train_ix, global_step, training_steps, current_lr, loss_value):
+    logs_writer.add_scalar('train_loss', loss_value, global_step)
+    logs_writer.add_scalar('lr', current_lr[0], global_step)
+
+
+# Model definition
+word_embedding_model = models.Transformer(args.model_name, max_seq_length=args.max_seq_length)
+pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
+ir_train_queries = {}
+ir_test_queries = {}
+ir_relevant_docs = {}
+train_samples = []
+
+documents_df = pd.read_csv(os.path.join(data_path, 'corpus.tsv'), sep='\t')
+ir_corpus, _ = process_retrieval_ducoment(documents_df)
+
+train_queries_df = pd.read_csv(os.path.join(data_path, 'train.query.txt'), sep='\t', names=['qid', 'query'])
+for row in train_queries_df.itertuples():
+    ir_train_queries[row.qid] = row.query
+train_queries_df = pd.read_csv(os.path.join(data_path, 'test.query.txt'), sep='\t', names=['qid', 'query'])
+for row in train_queries_df.itertuples():
+    ir_test_queries[row.qid] = row.query
+
+labels_df = pd.read_csv(os.path.join(data_path, 'qrels.train.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
+for row in labels_df.itertuples():
+    sample = InputExample(texts=[ir_train_queries[row.qid], ir_corpus[row.docid]], label=row.label)
+    train_samples.append(sample)
+labels_df = pd.read_csv(os.path.join(data_path, 'qrels.test.tsv'), sep='\t', names=['qid', 'useless', 'docid', 'label'])
+for row in labels_df.itertuples():
+    ir_relevant_docs.setdefault(row.qid, set()).add(row.docid)
+
+train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, pin_memory=True)
+train_loss = losses.MultipleNegativesRankingLoss(model)
+ir_evaluator = APIEvaluator(ir_test_queries, ir_corpus, ir_relevant_docs)
+
+# You may need to modify the .fit() method to ensure all data is moved to the correct device during parallel computations
+
+model.fit(train_objectives=[(train_dataloader, train_loss)],
+                evaluator=ir_evaluator,
+                epochs=num_epochs,
+                warmup_steps=warmup_steps,
+                optimizer_params={'lr': lr},
+                output_path=model_save_path
+                )
+
+
--- a/toolbench/tool_conversation.py
+++ b/toolbench/tool_conversation.py
@ -0,0 +1,297 @@
+"""
+Tool conversation prompt templates. Basically copy from FastChat.
+"""
+
+import dataclasses
+from enum import auto, Enum
+from typing import List, Any, Dict
+
+
+class SeparatorStyle(Enum):
+    """Separator styles."""
+
+    ADD_COLON_SINGLE = auto()
+    ADD_COLON_TWO = auto()
+    ADD_COLON_SPACE_SINGLE = auto()
+    NO_COLON_SINGLE = auto()
+    ADD_NEW_LINE_SINGLE = auto()
+    DOLLY = auto()
+    RWKV = auto()
+    PHOENIX = auto()
+    ONLY_LAST_ASSISTANT = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+
+    # The name of this template
+    name: str
+    # The System prompt
+    system: str
+    # Two roles
+    roles: List[str]
+    # All messages
+    messages: List[List[str]]
+    # Offset of few shot examples
+    offset: int
+    # Separators
+    sep_style: SeparatorStyle
+    sep: str
+    sep2: str = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: str = None
+    # Stops generation if meeting any token in this list
+    stop_token_ids: List[int] = None
+
+    def get_prompt(self) -> str:
+        """Get the prompt for generation."""
+        if self.sep_style == SeparatorStyle.ONLY_LAST_ASSISTANT:
+            seps = [self.sep, self.sep2]
+            ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if i + 1 == len(self.messages) and message:
+                    ret += role + ": " + str(message) + seps[1]
+                elif message:
+                    ret += role + ": " + str(message) + seps[0]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
+            ret = self.system + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                try:
+                    if message:
+                        ret += role + ": " + message + seps[i % 2]
+                    else:
+                        ret += role + ":"
+                except:
+                    continue
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
+            ret = self.system + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ": "  # must be end with a space
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
+            ret = self.system
+            for role, message in self.messages:
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
+            ret = self.system + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.DOLLY:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ":\n" + message + seps[i % 2]
+                    if i % 2 == 1:
+                        ret += "\n\n"
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.RWKV:
+            ret = self.system
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += (
+                        role
+                        + ": "
+                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
+                    )
+                    ret += "\n\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.PHOENIX:
+            ret = self.system
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + "<s>" + message + "</s>"
+                else:
+                    ret += role + ": " + "<s>"
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+    def append_message(self, role: str, message: str):
+        """Append a new message."""
+        self.messages.append([role, message])
+
+    def to_gradio_chatbot(self):
+        """Convert the history to gradio chatbot format"""
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def to_openai_api_messages(self):
+        """Convert the conversation to OpenAI chat completion format."""
+        ret = [{"role": "system", "content": self.system}]
+
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append({"role": "user", "content": msg})
+            else:
+                if msg is not None:
+                    ret.append({"role": "assistant", "content": msg})
+        return ret
+
+    def copy(self):
+        return Conversation(
+            name=self.name,
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            stop_str=self.stop_str,
+            stop_token_ids=self.stop_token_ids,
+        )
+
+    def dict(self):
+        return {
+            "name": self.name,
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+        }
+
+
+# A global registry for all conversation templates
+conv_templates: Dict[str, Conversation] = {}
+
+
+def register_conv_template(template: Conversation, override: bool = False):
+    """Register a new conversation template."""
+    if not override:
+        assert template.name not in conv_templates, f"{name} has been registered."
+    conv_templates[template.name] = template
+
+
+def get_conv_template(name: str) -> Conversation:
+    """Get a conversation template."""
+    return conv_templates[name].copy()
+
+
+# A template with one conversation example.
+# The two seeded messages form the example exchange; offset=2 makes
+# to_gradio_chatbot() and to_openai_api_messages() skip them.
+register_conv_template(
+    Conversation(
+        name="one_shot",
+        system="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        messages=(
+            (
+                "Human",
+                "What are the key differences between renewable and non-renewable energy sources?",
+            ),
+            (
+                "Assistant",
+                "Renewable energy sources are those that can be replenished naturally in a relatively "
+                "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+                "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+                "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+                "renewable and non-renewable energy sources:\n"
+                "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+                "energy sources are finite and will eventually run out.\n"
+                "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+                "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+                "and other negative effects.\n"
+                "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+                "have lower operational costs than non-renewable sources.\n"
+                "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+                "locations than non-renewable sources.\n"
+                "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+                "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+                "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+                "non-renewable sources are not, and their depletion can lead to economic and social instability.",
+            ),
+        ),
+        offset=2,
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n### ",
+        stop_str="###",
+    )
+)
+
+# Vicuna v1.1 template.
+# Registered under the name "vicuna-v1.1" (with a hyphen); starts with no
+# messages and alternates the separators " " and "</s>" (ADD_COLON_TWO).
+register_conv_template(
+    Conversation(
+        name="vicuna-v1.1",
+        system="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+
+# tool-llama template.
+# Same separator scheme as the Vicuna v1.1 template (ADD_COLON_TWO with " "
+# and "</s>"), but with a tool-use system prompt and Human/Assistant roles.
+register_conv_template(
+    Conversation(
+        name="tool-llama",
+        system="A chat between a curious user and an artificial intelligence assistant who can use external tools and APIs to solve the user's question. "
+        "The assistant gives tools and APIs calling processes or final answer to the human's question.",
+        roles=("Human", "Assistant"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+
+# tool_llama_v2 with openai function template.
+# Uses four roles and the ONLY_LAST_ASSISTANT style: messages are joined with
+# "\n" and only the final message is terminated with "</s>".
+register_conv_template(
+    Conversation(
+        name="tool-llama-single-round",
+        system="", # We put the system message in the specific SFT data. Remember to use the same system message in inference.
+        roles=("System", "User", "Function", "Assistant"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.ONLY_LAST_ASSISTANT,
+        sep="\n",
+        sep2="</s>",
+    )
+)
+
+
+if __name__ == "__main__":
+    conv = get_conv_template("vicuna_v1.1")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.get_prompt())
--- a/toolbench/tooleval/README.md
+++ b/toolbench/tooleval/README.md
@ -0,0 +1,168 @@
+<div align= "center">
+    <h1> 🛠️Tool Eval🤖</h1>
+</div>
+
+By fine-tuning LLaMA on ToolBench, we obtain **ToolLLaMA**. Considering that human evaluation can be time-consuming, we follow [AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/) to develop an efficient machine evaluator **ToolEval**, which incorporates two evaluation metrics:
+ - **Pass Rate**: Calculates the proportion of successfully completing an instruction within limited OpenAI API calls. 
+ - **Preference**: Measured by comparing two answers (action sequences) for a given instruction. We pre-define a set of criteria for a better answer, which are organized as prompts for ChatGPT. We provide the test instruction and two candidate answers to the evaluator and obtain its preference. We evaluate each answer pair multiple times to improve the reliability of our system. Then we calculate the **Win Rate** (percentage of being preferred by the evaluator). More details can be found in our paper.
+
+To validate the reliability of ChatGPT evaluator in both pass rate and win rate, we sample among four different methods (ChatGPT+ReACT, ChatGPT+DFSDT, ToolLLaMA+DFSDT and GPT4+DFSDT) to obtain solution pairs for 300 test instructions for each method. Then we engage humans to annotate the pass rate for ChatGPT+DFSDT, ToolLLaMA+DFSDT and GPT4+DFSDT, and the win rate among ChatGPT+ReACT and ChatGPT+DFSDT.
+Our ChatGPT evaluator demonstrates a high agreement of **87.1%** in pass rate and **80.3%** in win rate with human annotators. This result shows that our evaluator generates highly similar evaluation results to humans and can be viewed as a credible evaluator who simulates human evaluation on pass rate and win rate.
+
+## 🚀Usage
+### Install
+Install Package (python>=3.9)
+```bash
+pip install -r requirements.txt
+```
+
+### Evaluation
+*If you want to reproduce the official results, download the reproduction data `reproduction_data.zip` through [Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J), unzip it and put the `reproduction_data` under `ToolBench/data/`, and skip the data preparation process.*
+- Data preparation. To evaluate your own model and method using ToolEval, you first need to prepare all the model predictions for the six test subsets. Create a directory named after your model and method, e.g. `chatgpt_cot`, then put each test set's predictions under that directory. The file structure of the directory should be:
+```
+├── /chatgpt_cot/
+│  ├── /G1_instruction/
+│  │  ├── /10160_CoT@1.json
+│  │  └── ...
+│  ├── /G1_tool/
+│  │  ├── /10221_CoT@1.json
+│  │  └── ...
+│  ├── ...
+│  ├── /G3_instruction/
+│  │  ├── /10221_CoT@1.json
+│  │  └── ...
+```
+
+Then preprocess the predictions by running the following commands:
+```bash
+export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export MODEL_NAME=chatgpt_cot
+export METHOD=CoT
+mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
+for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction
+do
+    answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
+    output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
+    python convert_to_answer_format.py\
+        --answer_dir ${answer_dir} \
+        --method ${METHOD} \
+        --output ${output_file}
+done
+```
+After that, check if there are preprocessed json files for the test sets under `${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`. If so, you're ready to run the following evaluate process. If not, check if there is anything wrong with the model's predictions.
+
+- OpenAI Key. Prepare your openai key to use our evaluator. The key(s) should be stored in a json file, e.g. `path/to/your/openai_key_json_file.json`:
+```bash
+[
+    {
+        "username": "your_user_name",
+        "passwd": "your_password",
+        "api_key": "your_openai_key",
+        "organization": "your_organization"
+    },
+    ...
+]
+```
+
+- Pass rate:
+```bash
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export SAVE_PATH=pass_rate_results
+export CANDIDATE_MODEL=chatgpt_cot
+export API_POOL_FILE=path/to/your/openai_key_json_file.json
+
+python eval_pass_rate.py \
+    --converted_answer_path ${CONVERTED_ANSWER_PATH} \
+    --save_path ${SAVE_PATH} \
+    --reference_model ${CANDIDATE_MODEL} \
+    --test_ids ../../data/test_ids/ \
+    --max_eval_threads 20 \
+    --evaluate_times 4
+
+```
+The result files will be stored under the ${SAVE_PATH}.
+
+- Win rate. The example below takes ChatGPT-ReACT as the reference model and GPT4-ReACT as the candidate model. Note that you need to get both models' pass rate results first, then run the following commands to evaluate the preference result of GPT4-ReACT:
+```bash
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export SAVE_PATH=preference_results
+export PASS_TARE_PATH=pass_rate_results
+export REFERENCE_MODEL=chatgpt_cot
+export CANDIDATE_MODEL=gpt-4-0613_cot
+export API_POOL_FILE=path/to/your/openai_key_json_file.json
+
+python eval_preference.py \
+    --converted_answer_path ${CONVERTED_ANSWER_PATH} \
+    --reference_model ${REFERENCE_MODEL} \
+    --output_model ${CANDIDATE_MODEL} \
+    --test_ids ../../data/test_ids/ \
+    --save_path ${SAVE_PATH} \
+    --pass_rate_result_path ${PASS_TARE_PATH} \
+    --max_eval_threads 20 \
+    --use_pass_rate true \
+    --evaluate_times 4
+```
+The result files will be stored under the ${SAVE_PATH}.
+
+### Evaluate New Method
+To evaluate with a new method besides ReACT and DFSDT, you should prepare your converted answer for evaluation following the above Data preparation step. The converted answers should be a json file in following format:
+
+```json
+[
+    {
+        "method":"method name",
+        "total_steps": int, // an integer counting the total steps in answer details
+        "final_answer": "final answer from the method",
+        "answer_details":[{
+            "role":"node role, can be system, user, assistant and tool",
+            "message":"message for the node",
+            "next":[//next steps, can have multiple elements if the node has multiple candidates.
+                {
+                    "role":"",
+                    "message":"",
+                    "next":[...]
+                },
+                ...//more candidates
+            ]
+        }]
+    }
+    ... // more answers for the given query in the testdata
+]
+```
+
+### Update the Leaderboard
+
+To update the [ToolEval Leaderboard](https://openbmb.github.io/ToolBench/), you should submit your converted answer file (`${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`) to us (urtoolbench@gmail.com) in above format or open a pull request.
+We will run the evaluation script to get the result and update the leaderboard.     
+
+
+### Create new Automatic Evaluators
+To create new automatic evaluators, you can follow the steps below:
+1. Create a config folder under `toolbench/tooleval/evaluators`, name it with the name of your evaluators.
+Adding a `config.yaml` file (must have) and a `template.txt` file (optional) in the folder.
+You can refer to the `toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized` folder for example. 
+2. Create your own evaluator class and implement the `fn_completions` function in folder `toolbench/tooleval/evaluators/registered_cls` if needed.
+Or you can use the precreated class like `OpenAINormalizedEvaluator`.
+Fill the `registered_cls_name` with class name of the evaluator in your `config.yaml`.
+Here is an example of the evaluator class: 
+```Python
+from evaluators import register_evaluator,BaseEvaluator
+from typing import Dict,List
+
+@register_evaluator
+class MyEvaluator(BaseEvaluator):
+    def __init__(self,config):
+        super().__init__(
+            fn_completions=self.fn_completions,
+        )
+        # set your configures here
+    
+    def fn_completions(self,query:Dict,answers:List[Dict])->int:
+        # implement your evaluator here
+        # return the index of the preferred answer
+        return 0
+```
+The wrapper `register_evaluator` will register your evaluator to the available evaluators.
+
+3. Run the script `evaluators_comparison.py` to test the performance of your evaluators.
--- a/toolbench/tooleval/README_ZH.md
+++ b/toolbench/tooleval/README_ZH.md
@ -0,0 +1,173 @@
+<div align= "center">
+    <h1> 🛠️Tool Eval🤖</h1>
+</div>
+
+通过在ToolBench上对LLaMA进行微调，我们得到了**ToolLLaMA**。考虑到人工评估非常耗时，我们借鉴[AlpacaEval](https://tatsu-lab.github.io/alpaca_eval/)开发了一个高效的机器自动评估**ToolEval**，其中包含两个评估指标：
+
+- **通过率**：计算在有限的OpenAI API调用次数内成功完成指令的比例。
+
+- **偏好**：通过比较给定指令的两个答案（动作序列）来衡量。我们预先定义了一组更好答案的标准，这些标准被组织成ChatGPT的提示。我们向评估器提供测试指令和两个候选答案，并获得其偏好。我们对每个答案对进行多次评估以提高系统的可靠性。然后，我们计算**优胜率**（被评估器选择为更优的百分比）。有关详细信息，请参阅我们的论文。
+
+为了验证ChatGPT评估器在通过率和胜率方面的可靠性，我们从四种不同的方法（ChatGPT+ReACT，ChatGPT+DFSDT，ToolLLaMA+DFSDT和GPT4+DFSDT）中进行采样，为每种方法的300个测试指令获取解决方案对。然后，我们请人类标注ChatGPT+DFSDT，ToolLLaMA+DFSDT和GPT4+DFSDT的通过率，以及ChatGPT+ReACT和ChatGPT+DFSDT之间的胜率。
+
+我们的ChatGPT评估器在通过率方面与人类标注者具有高达**87.1%**的一致性，在胜率方面具有**80.3%**的一致性。这个结果表明，我们的评估器生成的评估结果与人类非常相似，并且可以视为在通过率和胜率上模拟人类评估的可靠评估器。
+有关ToolEval的更多细节，请参阅我们的论文。
+
+## 🚀用法
+
+### Install
+Install Package (python>=3.9)
+```bash
+pip install -r requirements.txt
+```
+
+### Evaluation
+*若要复现结果，直接通过[Google Drive](https://drive.google.com/drive/folders/1yBUQ732mPu-KclJnuQELEhtKakdXFc3J)下载我们的`reproduction_data.zip`，解压后置`reproduction_data`于`ToolBench/data/`下即可，可以跳过数据准备流程。*
+- 数据准备。若要使用 ToolEval 评估您自己的模型和方法，首先需要为六个测试子集准备所有的模型预测。创建一个以您的模型和方法命名的目录，例如 `chatgpt_cot`，然后将每个测试集的预测放在该目录下。目录的文件结构应如下：
+```
+├── /chatgpt_cot/
+│  ├── /G1_instruction/
+│  │  ├── /10160_CoT@1.json
+│  │  └── ...
+│  ├── /G1_tool/
+│  │  ├── /10221_CoT@1.json
+│  │  └── ...
+│  ├── ...
+│  ├── /G3_instruction/
+│  │  ├── /10221_CoT@1.json
+│  │  └── ...
+```
+
+然后对模型预测进行预处理:
+
+```bash
+export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export MODEL_NAME=chatgpt_cot
+export METHOD=CoT
+mkdir ${CONVERTED_ANSWER_PATH}/${MODEL_NAME}
+for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction
+do
+    answer_dir=${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}
+    output_file=${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json
+    python convert_to_answer_format.py\
+        --answer_dir ${answer_dir} \
+        --method ${METHOD} \
+        --output ${output_file}
+done
+```
+之后，检查`${CONVERTED_ANSWER_PATH}/${MODEL_NAME}`下是否有测试集的预处理JSON文件。如果有，你就可以准备运行以下评估过程了。如果没有，请检查模型的预测是否有问题。
+
+- OpenAI Key
+准备您的OpenAI Key来搭建我们的evaluator。Key需要被存储到一个json file中，如`path/to/your/openai_key_json_file.json`：
+```bash
+[
+    {
+        "username": "your_user_name",
+        "passwd": "your_password",
+        "api_key": "your_openai_key",
+        "organization": "your_organization"
+    },
+    ...
+]
+```
+- Pass rate.
+```bash
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export SAVE_PATH=pass_rate_results
+export CANDIDATE_MODEL=chatgpt_cot
+export API_POOL_FILE=path/to/your/openai_key_json_file.json
+
+python eval_pass_rate.py \
+    --converted_answer_path ${CONVERTED_ANSWER_PATH} \
+    --save_path ${SAVE_PATH} \
+    --reference_model ${CANDIDATE_MODEL} \
+    --test_ids ../../data/test_query_ids/ \
+    --max_eval_threads 20 \
+    --evaluate_times 4
+
+```
+
+结果文件会被存储至${SAVE_PATH}中。
+
+- Win rate. 以下示例以ChatGPT-ReACT作为参考模型，GPT4-ReACT作为候选模型。请注意，您首先需要获取两个模型的pass rate结果，然后运行以下命令来评估GPT4-ReACT的win rate结果:
+```bash
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export SAVE_PATH=preference_results
+export PASS_TARE_PATH=pass_rate_results
+export REFERENCE_MODEL=chatgpt_cot
+export CANDIDATE_MODEL=gpt-4-0613_cot
+export API_POOL_FILE=path/to/your/openai_key_json_file.json
+
+python eval_preference.py \
+    --converted_answer_path ${CONVERTED_ANSWER_PATH} \
+    --reference_model ${REFERENCE_MODEL} \
+    --output_model ${CANDIDATE_MODEL} \
+    --test_ids ../../data/test_query_ids/ \
+    --save_path ${SAVE_PATH} \
+    --pass_rate_result_path ${PASS_TARE_PATH} \
+    --max_eval_threads 20 \
+    --use_pass_rate true \
+    --evaluate_times 4
+```
+
+结果文件会被存储至${SAVE_PATH}中。
+
+### 评估新方法
+要评估除了ReACT和DFSDT之外的方法，您需要遵循以上Data preparation的步骤准备您的预处理好的answer数据。预处理好的answer数据需遵循以下json格式:
+
+```json
+[
+    {
+        "method":"method name",
+        "total_steps": int, // an integer counting the total steps in answer details
+        "final_answer": "final answer from the method",
+        "answer_details":[{
+            "role":"node role, can be system, user, assistant and tool",
+            "message":"message for the node",
+            "next":[//next steps, can have multiple elements if the node has multiple candidates.
+                {
+                    "role":"",
+                    "message":"",
+                    "next":[...]
+                },
+                ...//more candidates
+            ]
+        }]
+    }
+    ... // more answers for the given query in the testdata
+]
+```
+
+
+### 更新排行榜
+
+如果您想将您的模型的结果上传到[ToolEval Leaderboard](https://openbmb.github.io/ToolBench/)，请您将您的结果文件整理成上述格式发送给我们（urtoolbench@gmail.com）或者开一个pull request。
+我们将运行评测脚本更新结果并将您的模型添加到排行榜中。
+
+
+### 创建新的自动评估器
+如果您想创建新的自动评估器，您需要按下列步骤进行：
+1. 在路径`toolbench/tooleval/evaluators`下创建一个评测器配置文件目录，命名与你的评测器名一致。在其中添加`config.yaml`文件与`template.txt`文件。具体配置方式可参考`toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized`中的实现。
+2. 创建你的evaluator类并实现`fn_completions`函数在文件夹`toolbench/tooleval/evaluators/registered_cls`中，或者你可以使用我们预先定义好的类例如`OpenAINormalizedEvaluator`。
+完成后将配置文件中`registered_cls_name`字段填写为该类的名称。
+这里给出一个例子：
+```Python
+from evaluators import register_evaluator,BaseEvaluator
+from typing import Dict,List
+
+@register_evaluator
+class MyEvaluator(BaseEvaluator):
+    def __init__(self,config):
+        super().__init__(
+            fn_completions=self.fn_completions,
+        )
+        # set your configures here
+    
+    def fn_completions(self,query:Dict,answers:List[Dict])->int:
+        # implement your evaluator here
+        # return the index of the preferred answer
+        return 0
+```
+其中register_evaluator是一个装饰器，用于注册评估器，BaseEvaluator是一个基类，用于实现评估器的基本功能。
+3. 测试评估器的性能，运行脚本`evaluators_comparison.py`。
--- a/toolbench/tooleval/init.py
+++ b/toolbench/tooleval/init.py
--- a/toolbench/tooleval/automatic_check_query_solved.py
+++ b/toolbench/tooleval/automatic_check_query_solved.py
@ -0,0 +1,137 @@
+import os
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor,as_completed
+from tqdm import tqdm
+import numpy as np
+import argparse
+import random
+from evaluation import UserEvaluation,BaseToolMethod
+from evaluators import load_registered_automatic_evaluator
+from typing import List,Dict,Callable
+import pandas as pd
+
+abs_dir = os.path.split(__file__)[0]
+
+
+def parse_args():
+    """Declare this script's command-line options and parse ``sys.argv``.
+
+    Returns:
+        The namespace produced by ``ArgumentParser.parse_args()``.
+    """
+    dataset_dir = os.path.join(abs_dir,'dataset')
+    # (flag, keyword arguments) pairs, registered in declaration order.
+    option_specs = [
+        ('--output', dict(default=os.path.join(dataset_dir,'test.json'),help='where to store the method output.')),
+        ('--method', dict(default='unknown',help='what the name of the method.')),
+        ('--ref_method', dict(default='gpt-3.5-turbo_CoT',help='what the reference method is')),
+        ('--ref_output', dict(default=os.path.join(dataset_dir,'ref_sample.json'),help='where the reference answer stored')),
+        ('--evaluators_cfg_path', dict(default=os.path.join(abs_dir,'evaluators'),help='where the evaluators config files are stored')),
+        ('--evaluator', dict(default='tooleval_gpt-3.5-turbo_normalized',help='which evaluator to use')),
+        ('--max_eval_threads', dict(default=1,type=int,help='how many threads to use for evaluation')),
+        ('--evalset', dict(default='default_evalset',help='which the evaluation dataset to use')),
+        ('--eval_server_address', dict(default='http://localhost:8000',help='the address of the evaluation server')),
+        ('--use_existed_output', dict(default=False,action='store_true',help='whether to use the existed output')),
+    ]
+    cli = argparse.ArgumentParser()
+    for flag, options in option_specs:
+        cli.add_argument(flag, **options)
+    return cli.parse_args()
+    
+
+## !!define your method here !!
+class SampleMethod(BaseToolMethod):
+    """Placeholder method; replace the two hooks below with your own logic."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
+        """Return an empty result; every argument is ignored by this stub."""
+        return dict()
+
+    def convert_result_to_dict(self,result):
+        """Return a fixed, empty answer record regardless of ``result``."""
+        empty_answer = dict(
+            method='sample',
+            total_steps=0,
+            final_answer='',
+            answer_details=[],
+        )
+        return empty_answer
+def process_answer(answer: Dict):
+    """Post-processing hook for one answer record; currently a no-op.
+
+    The truncation and field-dropping steps kept below are disabled, so the
+    very same object that was passed in is handed back.
+    """
+    # Disabled post-processing steps, kept for reference:
+    #   answer['final_answer'] = answer['final_answer'][:1000]
+    #   answer['answer_details'] = answer['answer_details'][:3000]
+    #   answer.pop('method', None)
+    return answer
+if __name__=='__main__':
+    args = parse_args()
+
+    # This script only scores an already generated output file; it does not run
+    # the method itself. The same file supplies both the queries and the
+    # answers, so it is read exactly once.
+    print('Loading method output for evaluation...')
+    try:
+        with open(args.output) as f:
+            results = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError as well as decoding errors.
+        raise Exception('Cannot load method output from {}\n Please generate it before evaluation!'.format(args.output)) from e
+    output = results
+
+    print('Loading automatic evaluators...')
+    evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)]
+
+    def check_query_solved(qid,query, ans):
+        """Ask a randomly chosen evaluator whether ``ans`` solves ``query``.
+
+        Returns:
+            ``(qid, verdict)`` where ``verdict`` is whatever
+            ``evaluator.check_solve_query`` returned.
+        """
+        evaluator = random.choice(evaluators)
+        is_solved = evaluator.check_solve_query(query, ans)
+        return qid, is_solved
+
+    print('Evaluating...')
+    solved = []
+    data_list = []
+    with ThreadPoolExecutor(args.max_eval_threads) as pool:
+        future = []
+        for qid in output:
+            try:
+                results[qid]['answer'] = process_answer(results[qid]['answer'])
+                future.append(pool.submit(
+                    check_query_solved,
+                    qid,
+                    output[qid]['query'],
+                    results[qid]['answer']['final_answer']
+                ))
+            except KeyError as e:
+                # `e` is the missing field, not the query id, so report both.
+                print('Warning : Missing field {} for query {} in answer file! '.format(e, qid))
+
+        for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+            qid, is_solved = thd.result()
+            solved.append(is_solved)
+            data_list.append({
+                'query':output[qid]['query'],
+                'answer':results[qid]['answer']['final_answer'],
+                'solved':is_solved,
+                'qid':qid})
+
+    # np.mean([]) would emit a RuntimeWarning; print nan directly instead.
+    print(np.mean(solved) if solved else float('nan'))
+
+    # The report is written next to the evaluated file.
+    file_name = os.path.basename(args.output)
+    prefix = os.path.dirname(args.output)
+    with open(os.path.join(prefix, f'{file_name}_check_solved.json'),'w') as f:
+        json.dump(data_list, f, indent=4)
+
+            
+    
+    # df = pd.DataFrame.from_dict([{
+    #     'Method':args.method,
+    #     'Win Rate':prefer.mean(),
+    #     'Std Error':np.std(prefer)/np.sqrt(len(prefer))
+    # }])
+    # print('###### Leaderboard vs {} ######'.format(args.ref_method))
+    # print(df)
+    # save_file = os.path.join(abs_dir,'results',args.evalset,args.method)
+    # os.makedirs(save_file,exist_ok=True)
+    # df.to_csv(os.path.join(save_file,'win.csv'))
--- a/toolbench/tooleval/automatic_eval_sample.py
+++ b/toolbench/tooleval/automatic_eval_sample.py
@ -0,0 +1,131 @@
+import os
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor,as_completed
+from tqdm import tqdm
+import numpy as np
+import argparse
+import random
+from evaluation import UserEvaluation,BaseToolMethod
+from evaluators import load_registered_automatic_evaluator
+from typing import List,Dict,Callable
+import pandas as pd
+
+abs_dir = os.path.split(__file__)[0]
+
+
+def parse_args():
+    """Build the argument parser for the sample evaluation and parse ``sys.argv``.
+
+    Returns:
+        The namespace produced by ``ArgumentParser.parse_args()``.
+    """
+    parser = argparse.ArgumentParser()
+    add = parser.add_argument
+
+    # where the method output and the reference answers live
+    add('--output',
+        default=os.path.join(abs_dir,'dataset','test.json'),
+        help='where to store the method output.')
+    add('--method',
+        default='unknown',
+        help='what the name of the method.')
+    add('--ref_method',
+        default='gpt-3.5-turbo_CoT',
+        help='what the reference method is')
+    add('--ref_output',
+        default=os.path.join(abs_dir,'dataset','ref_sample.json'),
+        help='where the reference answer stored')
+
+    # evaluator selection
+    add('--evaluators_cfg_path',
+        default=os.path.join(abs_dir,'evaluators'),
+        help='where the evaluators config files are stored')
+    add('--evaluator',
+        default='tooleval_gpt-3.5-turbo_normalized',
+        help='which evaluator to use')
+    add('--max_eval_threads',
+        default=1,
+        type=int,
+        help='how many threads to use for evaluation')
+
+    # evaluation set / server
+    add('--evalset',
+        default='default_evalset',
+        help='which the evaluation dataset to use')
+    add('--eval_server_address',
+        default='http://localhost:8000',
+        help='the address of the evaluation server')
+    add('--use_existed_output',
+        default=False,
+        action='store_true',
+        help='whether to use the existed output')
+
+    namespace = parser.parse_args()
+    return namespace
+    
+
+## !!define your method here !!
+class SampleMethod(BaseToolMethod):
+    """Stub method used as a template; swap in your own implementation."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
+        """Ignore all inputs and return an empty dict."""
+        return dict()
+
+    def convert_result_to_dict(self,result):
+        """Return a constant, empty answer record; ``result`` is not inspected."""
+        blank_record = dict(
+            method='sample',
+            total_steps=0,
+            final_answer='',
+            answer_details=[],
+        )
+        return blank_record
+
+if __name__=='__main__':
+    args = parse_args()
+
+    # Decide whether the method outputs must be (re)generated: an existing
+    # file is reused on --use_existed_output, or when the user declines to
+    # overwrite it.
+    exec_generating_method_outputs = True
+    if os.path.exists(args.output):
+        print('Output file {} already exists!'.format(args.output))
+        if args.use_existed_output:
+            exec_generating_method_outputs = False
+        else:
+            print('Overwrite? (y/n)')
+            exec_generating_method_outputs = input()=='y'
+
+    if exec_generating_method_outputs:
+        ## change the SampleMethod to your method
+        usereval = UserEvaluation(SampleMethod(),args.eval_server_address,args.evalset)
+        print('Generating method outputs...')
+        results = usereval.run()
+        print('Saving method outputs...')
+        with open(args.output,'w') as f:
+            json.dump(results,f)
+    else:
+        print('Use existed output.')
+        with open(args.output) as f:
+            results = json.load(f)
+
+    print('Loading reference answer for evaluation...')
+    try:
+        with open(args.ref_output) as f:
+            ref_output = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError; chaining keeps the real cause.
+        raise Exception('Cannot load reference answer from {}\n Please Download before evaluation!'.format(args.ref_output)) from e
+
+    print('Loading automatic evaluators...')
+    evaluators = [load_registered_automatic_evaluator(vars(args)) for _ in range(args.max_eval_threads)]
+
+    def get_preference(qid,query,tools,ref_ans,ans,):
+        """Let a randomly chosen evaluator compare ``[ref_ans, ans]``; return ``(qid, result)``."""
+        evaluator = random.choice(evaluators)
+        ret = evaluator.annotate_preference(
+            query,
+            tools,
+            [ref_ans,ans])
+        return qid,ret
+
+    def get_most_preferred(d:list)->np.ndarray:
+        """Return the most frequent value(s) of ``d`` (all of them on a tie).
+
+        A non-iterable ``d`` is wrapped into a one-element array.
+        """
+        if np.iterable(d):
+            d = np.asanyarray(d)
+            bins = np.bincount(d)
+            max_val = np.max(bins)
+            argmax = np.where(max_val==bins)[0]
+            return argmax
+        else:
+            return np.asarray([d])
+
+    print('Evaluating...')
+    prefer_dict = {}
+    with ThreadPoolExecutor(args.max_eval_threads) as pool:
+        future = []
+        for qid in ref_output:
+            try:
+                future.append(pool.submit(
+                    get_preference,
+                    qid,
+                    ref_output[qid]['query'],
+                    ref_output[qid]['available_tools'],
+                    ref_output[qid]['answer'],
+                    results[qid]['answer']
+                ))
+            except KeyError as e:
+                print('Warning : Missing answer for query {} in answer file! '.format(e))
+
+        for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+            qid,preference = thd.result()
+            # on a tie the smallest index wins (index 0 is the reference answer)
+            prefer_dict[qid] = get_most_preferred(preference)[0]
+
+    prefer = np.array(list(prefer_dict.values()))
+    df = pd.DataFrame.from_dict([{
+        'Method':args.method,
+        'Win Rate':prefer.mean(),
+        'Std Error':np.std(prefer)/np.sqrt(len(prefer))
+    }])
+    print('###### Leaderboard vs {} ######'.format(args.ref_method))
+    print(df)
+    save_file = os.path.join(abs_dir,'results',args.evalset,args.method)
+    os.makedirs(save_file,exist_ok=True)
+    df.to_csv(os.path.join(save_file,'win.csv'))
--- a/toolbench/tooleval/convert_answers.py
+++ b/toolbench/tooleval/convert_answers.py
@ -0,0 +1,34 @@
+from convert_to_answer_format import process_invalid_data,process_valid_data
+import json
+from glob import glob
+import os
+
+# Destination root; one sub-folder per method is created below it.
+save_dir = 'path/to/save/dir'
+
+# Result folders to convert; the last path component is used as the method name.
+groups_dirs = ['path/to/dataset/eval/result/folders']
+
+for groups_dir in groups_dirs:
+    # normpath drops a trailing separator, which would otherwise yield an
+    # empty method name.
+    method = os.path.basename(os.path.normpath(groups_dir))
+    print(method)
+    groups_save_dir = os.path.join(save_dir,method)
+    os.makedirs(groups_save_dir,exist_ok=True)
+    # Each sub-directory is one test group; stray files next to them are
+    # skipped so they do not produce empty `<name>.json` outputs.
+    groups = [os.path.split(g)[1] for g in glob(groups_dir+'/*') if os.path.isdir(g)]
+    for g in groups:
+        print(g)
+        answer_dict = {}
+        files = glob(os.path.join(groups_dir,g,'*.json'))
+        for file in files:
+            # the query id is the part of the file name before the first '_'
+            qid = os.path.split(file)[1].split('_')[0]
+            try:
+                with open(file) as f:
+                    data = json.load(f)
+            except (OSError, ValueError):
+                # unreadable or malformed JSON: report and move on
+                print('Read error: ',file)
+                continue
+            if not data['answer_generation']['valid_data']:
+                answer_dict[qid] = process_invalid_data(method,data)
+            else:
+                answer_dict[qid] = process_valid_data(method,data['answer_generation'])
+        with open(os.path.join(groups_save_dir,f'{g}.json'),'w') as f:
+            json.dump(answer_dict,f)
--- a/toolbench/tooleval/convert_to_answer_format.py
+++ b/toolbench/tooleval/convert_to_answer_format.py
@ -0,0 +1,209 @@
+"""
+Data preprocessing
+"""
+import argparse
+import json
+import os
+from toolbench.tooleval.evaluation import ExecutionGraph,ExecutionNode
+import random
+# Fixed seed so the random.choice() used by process_invalid_data picks the
+# same try on every run. NOTE: this runs at import time and therefore also
+# seeds the global RNG of any module that imports this file.
+random.seed(42)
+# Module-level parser; it is only evaluated in the __main__ block at the
+# bottom of the file.
+parser = argparse.ArgumentParser()
+parser.add_argument('--answer_dir',type=str, required=True,help='where the answers stored.')
+parser.add_argument('--method',type=str,required=True,help='the name of the method.')
+parser.add_argument('--output', type=str, default="converted_answers.json", required=False, help='output path for the converted answer.')
+
+
+def generate_init_message_node(eg:ExecutionGraph,functions,query):
+    """Seed ``eg`` with a system-prompt node followed by the user-query node.
+
+    Args:
+        eg: graph to populate; it is modified in place.
+        functions: tool descriptions, appended to the system prompt via ``str()``.
+        query: the user query, stored as the message of the user node.
+
+    Returns:
+        The user node, i.e. the node later steps should be attached to.
+    """
+    system_prompt = "You are AutoGPT, you can use many tools(functions) to do the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say \"I give up and restart\".\n2.All the thought is short, at most in 5 sentence.\n3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try.\nLet's Begin!\nTask description: You should use functions to help handle the real time user querys. Remember to ALWAYS call \"Finish\" function at the end of the task. And the final answer should contain enough information to show to the user.\nSpecifically, you have access to the following functions: " + str(functions)
+    system_node = ExecutionNode(role='system', message=system_prompt)
+    eg.set_init_node(system_node)
+
+    user_node = ExecutionNode(role='user', message=query)
+    eg.add_node(user_node)
+    # presumably registers the system -> user edge (no payload); see ExecutionGraph
+    eg[system_node,user_node] = None
+    return user_node
+
+
+
+def process_valid_data(method,answer_generation):
+    """Convert one run marked as valid into the answer format.
+
+    The last entry of ``answer_generation['train_messages']`` is walked as a
+    flat conversation. Assistant messages with a ``function_call`` become
+    ``tool`` nodes (paired with the message that follows them), other
+    assistant messages become ``assistant`` nodes, and system/user/function
+    messages are skipped.
+
+    Args:
+        method: method name stored in the returned record.
+        answer_generation: dict providing ``train_messages``, ``function``,
+            ``query`` and ``final_answer``.
+
+    Returns:
+        A dict with ``query``, ``available_tools`` and ``answer``.
+
+    Raises:
+        NotImplementedError: if a message has an unexpected role.
+    """
+    conversation = answer_generation['train_messages'][-1]
+    functions = answer_generation['function']
+    query = answer_generation['query']
+    eg = ExecutionGraph()
+    last_node = generate_init_message_node(eg,functions,query)
+
+    # Start after the first two messages; presumably these are the system
+    # prompt and the user query already added by generate_init_message_node.
+    index = 2
+    total = len(conversation)
+    while index < total:
+        message = conversation[index]
+        role = message['role']
+        if role in ('system', 'user', 'function'):
+            index += 1
+            continue
+        if role != 'assistant':
+            raise NotImplementedError(f'Unkown role {role}')
+
+        if 'function_call' in message:
+            call = message['function_call']
+            # "Finish" has no response. A call that is the very last message
+            # (truncated conversation) has none either; reading index+1 there
+            # used to raise IndexError.
+            if call['name'] == 'Finish' or index + 1 >= total:
+                response = ''
+            else:
+                response = conversation[index+1]['content']
+            node = ExecutionNode(role='tool', message={
+                'name':call['name'],
+                'arguments':call['arguments'],
+                'response':response
+                })
+            # also consume the message that follows the call
+            index += 1
+        else:
+            node = ExecutionNode(role='assistant',
+                                    message=message['content'])
+
+        index += 1
+        eg.add_node(node)
+        eg[last_node,node] = None
+        last_node = node
+
+    eg = eg.reduce_graph_to_sequence()
+
+    return {
+        'query':query,
+        'available_tools':functions,
+        'answer':{
+            'method':method,
+            'total_steps': eg.node_count,
+            'final_answer': answer_generation['final_answer'],
+            'answer_details': eg.convert_to_dict()
+        }
+    }
+def process_invalid_data(method,data_dict):
+    """Convert one run into the answer format from its raw trace.
+
+    Callers in this repository use it when ``valid_data`` is falsy. The trace
+    that is read depends on ``method``:
+
+    * name contains ``CoT``/``cot``: one entry of ``data_dict["trys"]`` is
+      picked at random and its ``chain`` is turned into a node sequence;
+    * name contains ``DFS``/``dfs``: the tree under
+      ``data_dict['tree']['tree']['children']`` is rebuilt as a graph, then
+      ``Action`` nodes are merged with an adjacent ``Action Input`` node.
+
+    Args:
+        method: method name; selects the branch and is stored in the result.
+        data_dict: raw result dict; must contain ``answer_generation`` plus
+            ``trys`` (CoT) or ``tree`` (DFS). NOTE: the DFS branch mutates the
+            tree dicts in place (``children`` is set to ``None``).
+
+    Returns:
+        A dict with ``query``, ``available_tools`` and ``answer``.
+
+    Raises:
+        NotImplementedError: unknown method name or unknown chain node type.
+        Exception: a graph node with an unexpected role (DFS branch).
+    """
+    answer_generation = data_dict['answer_generation']
+    functions = answer_generation['function']
+    query = answer_generation['query']
+    eg = ExecutionGraph()
+    last_node = generate_init_message_node(eg,functions,query)
+    if 'CoT' in method or 'cot' in method:
+        # pick one of the recorded tries at random
+        trail = random.choice(data_dict["trys"])
+
+        
+        index = 0
+        while index < len(trail['chain']):
+            message = trail['chain'][index]
+            if message['node_type'] == 'Action':
+                # An Action is paired with the chain element right after it,
+                # which supplies the arguments and the observation.
+                # NOTE(review): raises IndexError if an Action is the last
+                # element of the chain.
+                node = ExecutionNode(role='tool', message={
+                    'name':message['description'],
+                    'arguments':(trail['chain'][index+1]['description']),
+                    'response':(trail['chain'][index+1]['observation'])})
+            
+                # skip the element consumed above
+                index = index + 1
+            elif message['node_type'] == 'Thought':
+                node = ExecutionNode(role='assistant',
+                                        message=message['description'])
+            else:
+                raise NotImplementedError(f"Unknown node_type: {message['node_type']}")
+            index = index + 1
+
+            # append the new node after the previous one
+            eg.add_node(node)
+            eg[last_node,node] = None
+            last_node = node
+        eg = eg.reduce_graph_to_sequence()
+   
+    elif 'DFS' in method or 'dfs' in method:
+
+        def DFS(root):
+            # Recursively add `root` and its descendants to `eg`; returns the
+            # node created for `root`. The raw tree dict itself is stored as
+            # the node message and rewritten later by purify_graph.
+            if len(root['children']) == 0:
+                node = ExecutionNode(role=root['node_type'],message=root)
+                eg.add_node(node)
+                return node
+            else:
+                child_nodes = [DFS(node) for node in root['children']]
+                # Mutates the input dict; presumably so the stored message
+                # does not carry the whole subtree.
+                root['children'] = None
+                root_node = ExecutionNode(role=root['node_type'],message=root)
+                eg.add_node(root_node)
+                for child_node in child_nodes:
+                    eg.add_edge(root_node,child_node)
+                return root_node
+        # hang every first-level subtree below the user-query node
+        for node in data_dict['tree']['tree']['children']:
+            eg[last_node,DFS(node)] = None
+
+        
+        # purify the graph
+        def purify_graph(node:ExecutionNode):
+            # Rewrite raw tree roles into answer-format roles, starting at
+            # `node` and recursing into its adjacent nodes.
+            if node.role == 'Action':
+                adj_nodes = eg.get_adjacent_node(node)
+                for adj_node in adj_nodes:
+                    adj_node = eg[adj_node]
+                    if adj_node.role == 'Action Input':
+                        # merge the first 'Action Input' neighbour into this
+                        # node, turning it into a 'tool' node
+                        node.role = 'tool'
+                        node.message = {
+                            'name':node.message['description'],
+                            'arguments':(adj_node.message['description']),
+                            'response':(adj_node.message['observation'])
+                            
+                        }
+                        # remove adj_node
+                        adj_node = eg.pop_node(adj_node)
+                        # re-attach the removed node's outgoing edges to `node`
+                        to_nodes = eg.edges.pop(adj_node.node_id,{})
+                        eg.edges[node.node_id].update(to_nodes)
+                        eg.edges[node.node_id].pop(adj_node.node_id)
+                        node.out_degree += len(to_nodes)
+                        break
+            elif node.role == 'Thought':
+                node.role = 'assistant'
+                node.message = node.message['description']
+            elif node.role == 'Action Input':
+                # an 'Action Input' that was not merged into an Action is
+                # only reported, not converted
+                print('Founding Extra Action Input Node')
+                pass
+            elif node.role =='system' or node.role=='user':
+                pass
+            else:
+                raise Exception('Unknown role {}'.format(node.role))
+            # fetch the adjacency again: the merge above may have changed it
+            adj_nodes = eg.get_adjacent_node(node)
+            for adj_node in adj_nodes:
+                purify_graph(eg[adj_node])
+            
+        purify_graph(last_node)
+        eg = eg.reduce_graph_to_sequence()
+    else:
+        raise NotImplementedError(f'Unknown method {method}')
+    return {
+        'query':query,
+        'available_tools':functions,
+        'answer':{
+            'method':method,
+            'total_steps': eg.node_count,
+            'final_answer': answer_generation['final_answer'],
+            'answer_details': eg.convert_to_dict()
+        }
+    }
+             
+                    
+                    
+if __name__=='__main__':
+    args = parser.parse_args()
+    answer_dir = args.answer_dir
+    method = args.method
+    output = args.output
+    # A bare file name (such as the default "converted_answers.json") has an
+    # empty dirname, and os.makedirs('') raises FileNotFoundError, so the
+    # parent is only created when there is one.
+    output_dir = os.path.dirname(output)
+    if output_dir:
+        os.makedirs(output_dir,exist_ok=True)
+    answer_dict = {}
+    print('#'*100)
+
+    # NOTE(review): the number of queries (200) and the result-file suffix
+    # ("DFS_woFilter_w2") are hard-coded for one specific result layout.
+    for i in range(200):
+        # `<i>.json` is an index file naming the latest solve attempt
+        with open(os.path.join(answer_dir,'{}.json'.format(i)), 'r') as f:
+            data = json.load(f)
+        last_solve_time = data['last_solve_time']
+        result_path = os.path.join(answer_dir,'{}_{}_DFS_woFilter_w2.json'.format(i,last_solve_time))
+        with open(result_path, 'r') as f:
+            data_dict = json.load(f)
+        qid = data['query_id']
+        if not data_dict['answer_generation']['valid_data']:
+            answer_dict[qid] = process_invalid_data(method,data_dict)
+        else:
+            answer_dict[qid] = process_valid_data(method,data_dict['answer_generation'])
+
+    with open(output,'w') as f:
+        json.dump(answer_dict,f, indent=4)
--- a/toolbench/tooleval/dataset/init.py
+++ b/toolbench/tooleval/dataset/init.py
--- a/toolbench/tooleval/eval_and_update_leaderboard.py
+++ b/toolbench/tooleval/eval_and_update_leaderboard.py
@ -0,0 +1,170 @@
+# Evaluate a method's outputs in different aspects and update the leaderboard
+# `result_folder` should contain the following 6 json files:
+#   - `G1_category.json`: 
+#           single-tool instruction;
+#           test on unseen tools from unseen categories
+#   - `G1_instruction.json`: 
+#           single-tool instruction; 
+#           test the model's instruction generalization ability
+#   - `G1_tool.json`: 
+#           single-tool instruction; 
+#           test the model's generalization abilities on unseen tools from seen categories
+#   - `G2_category.json`: 
+#           intra-category multi-tool instruction
+#           test on unseen tools from unseen categories
+#   - `G2_instruction.json`: 
+#           intra-category multi-tool instruction
+#           test the model's instruction generalization ability
+#   - `G3_instruction.json`: 
+#           intra-collection multi-tool instruction
+#           test the model's instruction generalization ability
+from glob import glob
+import os
+import argparse
+import json
+import pandas as pd
+import random
+import numpy as np
+from evaluators import load_registered_automatic_evaluator
+from concurrent.futures import ThreadPoolExecutor,as_completed
+from tqdm import tqdm
+abs_dir = os.path.split(__file__)[0]
+
+def parse_args():
+    """Declare the leaderboard-evaluation options and parse ``sys.argv``.
+
+    Returns:
+        The namespace produced by ``ArgumentParser.parse_args()``.
+    """
+    results_dir = os.path.join(abs_dir,'results')
+    cli = argparse.ArgumentParser()
+
+    # leaderboard location
+    cli.add_argument('--leaderboard_folder', default=results_dir)
+
+    # what is evaluated and what it is compared against
+    cli.add_argument('--evalset', default='default_evalset', help='the name of the evalset.')
+    cli.add_argument('--method', default='', help="what' the name of the method.")
+    cli.add_argument('--result_folder', required=True, help='where the method result stored.')
+    cli.add_argument('--ref_method', default='', help='what the reference method is.')
+    cli.add_argument(
+        '--ref_result_folder',
+        default=os.path.join(results_dir,'default_evalset','gpt-3.5-turbo_CoT'),
+        help='where the reference answer stored.',
+    )
+
+    # evaluator configuration
+    cli.add_argument(
+        '--evaluators_cfg_path',
+        default=os.path.join(abs_dir,'evaluators'),
+        help='where the evaluators config files are stored.',
+    )
+    cli.add_argument('--evaluator', default='tooleval_gpt-3.5-turbo_normalized', help='which evaluator to use.')
+    cli.add_argument('--max_eval_threads', default=16, type=int, help='how many threads to use for evaluation.')
+
+    return cli.parse_args()
+
+
+if __name__=='__main__':
+    args = parse_args()
+    # Default the method names to the folder names. normpath strips a trailing
+    # separator, which would otherwise make the derived name an empty string.
+    if args.method =='':
+        args.method = os.path.basename(os.path.normpath(args.result_folder))
+    if args.ref_method =='':
+        args.ref_method = os.path.basename(os.path.normpath(args.ref_result_folder))
+
+    leaderboard_filename = '###'.join(['leaderboard',args.evalset,args.evaluator,args.ref_method]) + '.csv'
+    leaderboard_filepath = os.path.join(args.leaderboard_folder,leaderboard_filename)
+
+    def load_json(path):
+        """Read one JSON file, closing the handle afterwards."""
+        with open(path) as f:
+            return json.load(f)
+
+    # setting up eval set: one subset per reference *.json file, keyed by the
+    # file name without its extension
+    evalset = {
+        (os.path.split(file)[1]).split('.json')[0]:load_json(file)
+        for file in glob(os.path.join(args.ref_result_folder,'*.json'))
+    }
+
+    # read the result: the method must provide a file for every subset
+    result = {
+        subset:load_json(os.path.join(args.result_folder,subset+'.json'))
+        for subset in evalset
+    }
+
+    if os.path.exists(leaderboard_filepath):
+        leaderboard = pd.read_csv(leaderboard_filepath)
+    else:
+        print('File {} not exists. Creating...'.format(leaderboard_filepath))
+        # the folder must exist before the first save below
+        if args.leaderboard_folder:
+            os.makedirs(args.leaderboard_folder,exist_ok=True)
+        leaderboard = pd.DataFrame(columns=['Method','WinRate','StdError',
+                                            *[subset+'_WinRate' for subset in evalset],
+                                            *[subset+'_StdError' for subset in evalset]])
+
+    def print_and_save_leaderboard(leaderboard):
+        """Sort by overall win rate (best first), print, and write the CSV."""
+        leaderboard.sort_values(axis=0,by='WinRate',ascending=False,inplace=True)
+        print('###### Leaderboard vs {} ######'.format(args.ref_method))
+        print(leaderboard)
+        leaderboard.to_csv(leaderboard_filepath,index=False)
+    print_and_save_leaderboard(leaderboard)
+
+    if args.method in leaderboard['Method'].values:
+        print('Warning: The method {} has already been in the leaderboard. Overwrite? (y/n)'.format(args.method))
+        if input()!='y':
+            print('Abort.')
+            raise SystemExit(0)
+        print('Replacing...')
+        # Drop the old row(s) first; appending without this left the method
+        # listed twice. reset_index keeps len(leaderboard) a fresh row label.
+        leaderboard = leaderboard[leaderboard['Method']!=args.method].reset_index(drop=True)
+    # adding the method to the leaderboard
+    leaderboard.loc[len(leaderboard)] = {'Method':args.method}
+    # rows are neither added nor removed below, so the mask stays valid
+    is_method = leaderboard['Method']==args.method
+    print(leaderboard.loc[is_method])
+    # setting up evaluators
+    evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator,evaluators_cfg_path=args.evaluators_cfg_path) for _ in range(args.max_eval_threads)]
+
+    print('#####  Evaluation Info #####')
+    print('Evalset: {}'.format(args.evalset))
+    print('Evalset Subsets: {} '.format(list(evalset.keys())))
+    print('Method: {}'.format(args.method))
+    print('Reference Method: {}'.format(args.ref_method))
+    print('Evaluator: {}'.format(args.evaluator))
+    print('Result Folder: {}'.format(args.result_folder))
+    print('Reference Result Folder: {}'.format(args.ref_result_folder))
+    print('Leaderboard FilePath: {}'.format(leaderboard_filepath))
+    print()
+
+    def get_preference(qid,query,tools,ref_ans,ans,):
+        """Let a randomly chosen evaluator compare ``[ref_ans, ans]``; return ``(qid, result)``."""
+        evaluator = random.choice(evaluators)
+        ret = evaluator.annotate_preference(
+            query,
+            tools,
+            [ref_ans,ans])
+        return qid,ret
+
+    def get_most_preferred(d:list)->np.ndarray:
+        """Return the most frequent value(s) of ``d`` (all of them on a tie).
+
+        A non-iterable ``d`` is wrapped into a one-element array.
+        """
+        if np.iterable(d):
+            d = np.asanyarray(d)
+            bins = np.bincount(d)
+            max_val = np.max(bins)
+            argmax = np.where(max_val==bins)[0]
+            return argmax
+        else:
+            return np.asarray([d])
+
+    pref_dict_filepath = os.path.join(args.leaderboard_folder,'###'.join(['total_pref_dict',args.evalset,args.evaluator,args.ref_method,args.method])+'.npy')
+
+    # evaluate each subset
+    total_pref = []
+    total_pref_dict = {}
+    for subset in evalset:
+        print('Evaluating {}...'.format(subset))
+        # creates the columns when an existing CSV does not have this subset yet
+        leaderboard.loc[is_method,subset+'_WinRate'] = 0
+        leaderboard.loc[is_method,subset+'_StdError'] = 0
+        prefer_dict = {}
+        with ThreadPoolExecutor(args.max_eval_threads) as pool:
+            future = []
+            for qid in evalset[subset]:
+                try:
+                    future.append(pool.submit(
+                        get_preference,
+                        qid,
+                        evalset[subset][qid]['query'],
+                        evalset[subset][qid]['available_tools'],
+                        evalset[subset][qid]['answer'],
+                        result[subset][qid]['answer']
+                    ))
+                except KeyError as e:
+                    print('Warning : Missing answer for query {} in answer file! '.format(e))
+
+            for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+                qid,preference = thd.result()
+                # on a tie the smallest index wins (index 0 is the reference answer)
+                prefer_dict[qid] = get_most_preferred(preference)[0]
+        pref = np.array(list(prefer_dict.values()))
+        # update the leaderboard
+        leaderboard.loc[is_method,subset+'_WinRate'] = np.mean(pref)
+        leaderboard.loc[is_method,subset+'_StdError'] = np.std(pref)/np.sqrt(len(pref))
+        total_pref.extend(pref)
+        total_pref_dict.update(prefer_dict)
+    leaderboard.loc[is_method,'WinRate'] = np.mean(total_pref)
+    leaderboard.loc[is_method,'StdError'] = np.std(total_pref)/np.sqrt(len(total_pref))
+
+    # per-query preferences are stored as a pickled dict inside the .npy file
+    np.save(pref_dict_filepath,total_pref_dict)
+
+    print_and_save_leaderboard(leaderboard)
--- a/toolbench/tooleval/eval_pass_rate.py
+++ b/toolbench/tooleval/eval_pass_rate.py
@ -0,0 +1,186 @@
+from toolbench.tooleval.evaluators import load_registered_automatic_evaluator
+import os
+import json
+import csv
+from toolbench.tooleval.evaluators.registered_cls.rtl import AnswerStatus, TaskStatus, AnswerPass
+import random
+from concurrent.futures import ThreadPoolExecutor,as_completed
+import argparse
+from tqdm import tqdm
+from toolbench.tooleval.utils import test_sets, get_steps
+
+abs_dir = os.path.split(__file__)[0]
+
+def parse_args():
+    """Parse the command-line options of the pass-rate evaluation script.
+
+    Returns:
+        argparse.Namespace with ``converted_answer_path``, ``save_path``,
+        ``reference_model``, ``test_ids``, ``evaluator``,
+        ``max_eval_threads`` and ``evaluate_times``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--converted_answer_path', type=str, default="", required=True, help='converted answer path')
+    parser.add_argument('--save_path', type=str, default="", required=False, help='result save path')
+    parser.add_argument('--reference_model', type=str, default="", required=False, help='model predictions path')
+    # The help text used to be a copy of the --reference_model one; this option
+    # is the directory holding one "<test_set>.json" id file per test set.
+    parser.add_argument('--test_ids', type=str, default="", required=True, help='test query ids path')
+    parser.add_argument('--evaluator', type=str, default="tooleval_gpt-3.5-turbo_default", required=False, help='which evaluator to use.')
+    parser.add_argument('--max_eval_threads', type=int, default=30, required=False, help='max threads nums')
+    parser.add_argument('--evaluate_times', type=int, default=4, required=False, help='how many times to predict with the evaluator for each solution path.')
+    return parser.parse_args()
+
+def write_results(filename: str, reference_model: str, label_cnt: dict) -> None:
+    """Dump the per-query pass-rate records to a tab-separated file.
+
+    Args:
+        filename: path of the TSV file to (over)write.
+        reference_model: model name written into the ``model`` column.
+        label_cnt: mapping query id -> record holding the ``passed`` /
+            ``failed`` vote counts plus the descriptive fields written below.
+
+    The final label of a query is the majority vote; a tie is broken by a
+    coin flip.
+    """
+    header = ["query", "solvable", "available_tools", "model_intermediate_steps", "model_final_step", "model", "query_id", "is_solved", "pass_rate_label", "reason", "not_hallucinate"]
+    with open(filename, 'w', newline='') as file:
+        writer = csv.writer(file, delimiter="\t")
+        writer.writerow(header)
+        for query_id, record in label_cnt.items():
+            n_passed = record["passed"]
+            n_failed = record["failed"]
+            if n_passed > n_failed:
+                final_label = "passed"
+            elif n_passed < n_failed:
+                final_label = "failed"
+            else:
+                # if tie, random choose
+                final_label = "passed" if random.random() < 0.5 else "failed"
+            row = [
+                record["query"],
+                record["task_solvable"],
+                record["tool_names"],
+                record["answer_steps"],
+                record["final_step"],
+                reference_model,
+                query_id,
+                record["is_solved"],
+                final_label,
+                record["reason"],
+                record["not_hallucinate"],
+            ]
+            writer.writerow(row)
+            
+
+def compute_pass_rate(query_id, example):
+    """Run one pass/fail judgement of ``example`` with a randomly picked evaluator.
+
+    Args:
+        query_id: id of the query; returned unchanged so the caller can match
+            results coming back from the thread pool.
+        example: converted answer record; the keys ``query``,
+            ``available_tools`` and ``answer`` are read here.
+
+    Returns:
+        Tuple ``(query_id, task_solvable, is_solved, label, reason,
+        not_hallucinate)`` where ``label`` is ``"passed"`` or ``"failed"``.
+    """
+    global evaluators
+    evaluator = random.choice(evaluators)
+    try:
+        not_hallucinate = evaluator.check_has_hallucination(
+            example['available_tools'],
+            example['answer']
+        )
+    except Exception:
+        # Best-effort check: an evaluator failure must not abort the whole
+        # judgement, but Ctrl-C / SystemExit must still propagate (the former
+        # bare ``except:`` swallowed those as well).
+        not_hallucinate = True
+    _, final_step = get_steps(example)
+
+    # Without a Finish call there is no answer to judge.
+    if "'name': 'Finish'" not in final_step:
+        return query_id, TaskStatus.Solvable, AnswerStatus.Unsolved, "failed", "No answer", not_hallucinate
+
+    task = {
+        'query':example['query'],
+        'available_tools':example['available_tools'],
+    }
+    is_solved, is_solved_reason = evaluator.check_is_solved(
+        task,
+        example['answer'],
+        return_reason=True
+    )
+    # Every status other than Solved (Unsolved, or anything else) counts as "not solved".
+    is_solved_flag = is_solved == AnswerStatus.Solved
+
+    task_solvable, task_solvable_reason = evaluator.check_task_solvable(
+        task,
+        has_been_solved=is_solved_flag,
+        return_reason=True
+    )
+
+    is_passed = evaluator.is_passed(
+        task,
+        example['answer'],
+        answer_status=is_solved,
+        task_status=task_solvable
+    )
+
+    reason = f"Is solved: {is_solved_reason}\nTask solvable: {task_solvable_reason}"
+    if is_passed == AnswerPass.Passed:
+        label = "passed"
+    elif is_passed == AnswerPass.Failed:
+        label = "failed"
+    else:
+        # if unsure, random choose
+        label = "passed" if random.random() < 0.5 else "failed"
+    return query_id, task_solvable, is_solved, label, reason, not_hallucinate
+
+if __name__ == "__main__":
+    # Entry point: for every test set, judge each selected query
+    # ``evaluate_times`` times in a thread pool, checkpoint the vote counts to
+    # JSON, export them as TSV and print the pass rate.
+    args = parse_args()
+    # One evaluator instance per worker thread; compute_pass_rate picks one at random.
+    evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join(abs_dir,'evaluators')) for _ in range(args.max_eval_threads)]
+    os.makedirs(args.save_path, exist_ok=True)
+
+    reference_model = args.reference_model
+    for test_set in test_sets:
+        reference_path = f"{args.converted_answer_path}/{reference_model}/{test_set}.json"
+        if not os.path.exists(reference_path):
+            print(f"Warning: {reference_path} not exists.")
+            continue
+        # Sets give O(1) membership tests in the submission loop below.
+        with open(os.path.join(args.test_ids, test_set+".json"), "r") as f:
+            test_ids = set(json.load(f).keys())
+        with open(reference_path, "r") as f:
+            reference_examples = json.load(f)
+        result_json_path = f"{args.save_path}/{test_set}_{reference_model}.json"
+        if os.path.exists(result_json_path):
+            # Resume from a previous run: parse the checkpoint once.
+            with open(result_json_path, "r") as f:
+                label_cnt = json.load(f)
+        else:
+            label_cnt = {}
+        # Snapshot of the ids already present in the checkpoint; those are not re-evaluated.
+        existed_ids = set(label_cnt)
+
+        with ThreadPoolExecutor(args.max_eval_threads) as pool:
+            future = []
+            for query_id in reference_examples:
+                if str(query_id) not in test_ids:
+                    continue
+                if query_id in existed_ids:
+                    continue
+                example = reference_examples[query_id]
+                for _ in range(args.evaluate_times):
+                    future.append(pool.submit(
+                        compute_pass_rate,
+                        query_id,
+                        example
+                    ))
+            print(len(reference_examples))
+            for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+                query_id, task_solvable, is_solved, machine_label, reason, not_hallucinate = thd.result()
+                example = reference_examples[query_id]
+                tool_names = [tool_dict["name"] for tool_dict in example["available_tools"]]
+                answer_steps, final_step = get_steps(example)
+                if query_id not in label_cnt:
+                    label_cnt[query_id] = {"passed":0, "failed":0}
+                if machine_label == "passed":
+                    label_cnt[query_id]["passed"] += 1
+                else:
+                    label_cnt[query_id]["failed"] += 1
+                # The descriptive fields are overwritten by every vote, so the
+                # stored status/reason are those of the last vote to finish.
+                label_cnt[query_id]["query"] = example["query"]
+                label_cnt[query_id]["task_solvable"] = str(task_solvable)
+                label_cnt[query_id]["tool_names"] = tool_names
+                label_cnt[query_id]["answer_steps"] = answer_steps
+                label_cnt[query_id]["final_step"] = final_step
+                label_cnt[query_id]["is_solved"] = str(is_solved)
+                label_cnt[query_id]["reason"] = reason
+                label_cnt[query_id]["not_hallucinate"] = not_hallucinate
+                # Checkpoint after every result so an interrupted run can be resumed.
+                with open(result_json_path, "w") as f:
+                    json.dump(label_cnt, f, ensure_ascii=False, indent=4)
+        with open(result_json_path, "w") as f:
+            json.dump(label_cnt, f, ensure_ascii=False, indent=4)
+
+        filename = f"{args.save_path}/{test_set}_{reference_model}.csv"
+        write_results(filename, reference_model, label_cnt)
+        if not label_cnt:
+            # Nothing was evaluated for this test set; avoid dividing by zero.
+            print(f"Warning: no evaluated query for test set {test_set}, pass rate skipped.")
+            continue
+        # NOTE(review): ties count as passed here, whereas write_results breaks
+        # ties randomly, so the TSV labels and this figure can disagree.
+        passed_cnt = sum(1 for counts in label_cnt.values() if counts["failed"] <= counts["passed"])
+        pass_rate = passed_cnt / len(label_cnt)
+        print(f"Test set: {test_set}. Model: {reference_model}. Pass rate: {str(pass_rate)}")
+        
+
+        
--- a/toolbench/tooleval/eval_preference.py
+++ b/toolbench/tooleval/eval_preference.py
@ -0,0 +1,265 @@
+# Evaluate a method's outputs in different aspects and update the leaderboard
+# `result_folder` should contain the following 6 json files:
+#   - `G1_category.json`: 
+#           single-tool instruction;
+#           test on unseen tools from unseen categories
+#   - `G1_instruction.json`: 
+#           single-tool instruction; 
+#           test the model's instruction generalization ability
+#   - `G1_tool.json`: 
+#           single-tool instruction; 
+#           test the model's generalization abilities on unseen tools from seen categories
+#   - `G2_category.json`: 
+#           intra-category multi-tool instruction
+#           test on unseen tools from unseen categories
+#   - `G2_instruction.json`: 
+#           intra-category multi-tool instruction
+#           test the model's instruction generalization ability
+#   - `G3_instruction.json`: 
+#           intra-collection multi-tool instruction
+#           test the model's instruction generalization ability
+from glob import glob
+import os
+import argparse
+import json
+import pandas as pd
+import random
+import numpy as np
+from evaluators import load_registered_automatic_evaluator
+from concurrent.futures import ThreadPoolExecutor,as_completed
+from tqdm import tqdm
+from utils import test_sets, get_steps, task_status_mapping, answer_status_mapping
+import csv
+
+abs_dir = os.path.split(__file__)[0]
+
+def parse_args():
+    """Parse the command-line options of the preference evaluation script.
+
+    Returns:
+        argparse.Namespace with ``converted_answer_path``, ``reference_model``,
+        ``output_model``, ``test_ids``, ``save_path``,
+        ``pass_rate_result_path``, ``max_eval_threads``, ``evaluator``,
+        ``use_pass_rate`` (the string ``'true'``/``'false'``) and
+        ``evaluate_times`` (int).
+    """
+    parser = argparse.ArgumentParser()
+    # Help text fixed: this is where the converted answers are read from, not a save path.
+    parser.add_argument('--converted_answer_path', type=str, default="", required=True, help='converted answer path')
+    parser.add_argument('--reference_model', type=str, default="gpt-4-0613_dfs", required=False, help='ref model predictions path')
+    parser.add_argument('--output_model', type=str, default="toolllama-2-0830-thought", required=False, help='output model predictions path')
+    parser.add_argument('--test_ids', type=str, default="", required=True, help='test query ids path')
+    parser.add_argument('--save_path', type=str, default="preference_results", required=False, help='preference results save path')
+    parser.add_argument('--pass_rate_result_path', type=str, default="pass_rate_results", required=False, help='pass rate results save path')
+    parser.add_argument('--max_eval_threads', type=int, default=3, required=False, help='max threads nums')
+    parser.add_argument('--evaluator',default='tooleval_gpt-3.5-turbo_default', required=False, help='which evaluator to use.')
+    parser.add_argument('--use_pass_rate',default='false',help='to use existed pass rate result or compare preference from scratch.')
+    # type=int so a value given on the command line is an int like the default
+    # (callers that still wrap it in int() keep working).
+    parser.add_argument('--evaluate_times',type=int,default=2,help='how many times to predict with the evaluator for each solution path.')
+
+    return parser.parse_args()
+
+def get_pass_rate_results(filename: str) -> dict:
+    """Load a tab-separated pass-rate result file into a dict keyed by query id.
+
+    The first row is the header; columns are located by name, so their order
+    in the file does not matter. Unknown header names are reported on stdout
+    and ignored.
+
+    Args:
+        filename: path of the TSV file to read.
+
+    Returns:
+        Mapping query id -> dict with the keys ``query``, ``solvable``,
+        ``atools``, ``mid_steps``, ``modelname``, ``final_step``,
+        ``is_solved`` and ``machine_label`` (all strings). Empty for an
+        empty file.
+
+    Raises:
+        KeyError: if a data row is present but a required column is missing
+            from the header.
+    """
+    # header name -> key used in the returned per-query record
+    column_to_key = {
+        "query": "query",
+        "solvable": "solvable",
+        "available_tools": "atools",
+        "model_intermediate_steps": "mid_steps",
+        "model": "modelname",
+        "model_final_step": "final_step",
+        "is_solved": "is_solved",
+        "pass_rate_label": "machine_label",
+    }
+    known_columns = set(column_to_key) | {"query_id", "reason", "not_hallucinate"}
+    return_dict = {}
+    with open(filename, newline="") as file:
+        csv_reader = csv.reader(file, delimiter="\t")
+        header = next(csv_reader, None)
+        if header is None:
+            return return_dict
+        index_of = {}
+        for index, item in enumerate(header):
+            if item in known_columns:
+                index_of[item] = index
+            else:
+                print(f"Unrecognized item: {item}")
+        # Only the rows after the header are data; the header row itself used
+        # to be stored as a bogus record keyed by the literal "query_id".
+        for line in csv_reader:
+            if not line:
+                continue
+            return_dict[line[index_of["query_id"]]] = {
+                key: line[index_of[column]] for column, key in column_to_key.items()
+            }
+    return return_dict
+
+def write_results(filename:str, prefer_dict: dict, reference_model: str, output_model: str, reference_examples: dict, output_examples: dict) -> None:
+    """Write the per-query preference outcome to a tab-separated file.
+
+    Args:
+        filename: path of the TSV file to (over)write.
+        prefer_dict: mapping query id -> vote record; the counters stored
+            under the ``reference_model`` and ``output_model`` keys are compared.
+        reference_model: name of the reference model (also a key of each vote record).
+        output_model: name of the candidate model (also a key of each vote record).
+        reference_examples: converted answers of the reference model, by query id.
+        output_examples: converted answers of the candidate model, by query id.
+
+    The ``preference_label`` column is 1 when the reference model has more
+    votes, 2 when the output model has more votes and 3 on a tie.
+    """
+    with open(filename, 'w', newline='') as file:
+        writer = csv.writer(file, delimiter="\t")
+        writer.writerow(["query", "available_tools", "ref_model_intermediate_steps", "ref_model_final_step", "output_model_intermediate_steps", "output_model_final_step", "preference_label", "query_id", "ref_model", "output_model"])
+
+        for query_id, votes in prefer_dict.items():
+            ref_example = reference_examples.get(query_id)
+            output_example = output_examples.get(query_id)
+            if ref_example is None or output_example is None:
+                # The caller also records votes for queries that one model has
+                # no converted answer for; there is nothing to print for them,
+                # and indexing the examples directly raised KeyError here.
+                print(f"Warning: query {query_id} lacks a converted answer, not written to {filename}.")
+                continue
+            tool_names = [tool_dict["name"] for tool_dict in ref_example['available_tools']]
+            ref_steps, ref_final_step = get_steps(ref_example)
+            output_steps, output_final_step = get_steps(output_example)
+
+            if votes[reference_model] > votes[output_model]:
+                preference = 1
+            elif votes[reference_model] < votes[output_model]:
+                preference = 2
+            else:
+                preference = 3
+            writer.writerow([ref_example['query'], str(tool_names), ref_steps, ref_final_step, output_steps, output_final_step, str(preference), query_id, reference_model, output_model])
+    return None
+
+
+if __name__=='__main__':
+    # Entry point: for every test set, compare the reference model's and the
+    # output model's answers ``evaluate_times`` times, checkpoint the vote
+    # counts to JSON, export them as TSV and print win/tie rates.
+    args = parse_args()
+    # One evaluator instance per worker thread; get_preference picks one at random.
+    evaluators = [load_registered_automatic_evaluator(evaluator_name=args.evaluator, evaluators_cfg_path=os.path.join(abs_dir,'evaluators')) for _ in range(args.max_eval_threads)]
+    
+    def get_preference(query_id, task_status, answer_statuss, ref_example, output_example):
+        """Ask a random evaluator which of the two answers it prefers.
+
+        Returns ``(query_id, verdict)`` where verdict is ``"ref"`` when the
+        evaluator returns 0 (first answer), ``"output"`` when it returns 1
+        (second answer) and ``"equal"`` otherwise.
+        """
+        global evaluators
+        evaluator = random.choice(evaluators)
+        
+        preference = evaluator.annotate_preference(
+            ref_example['query'],
+            ref_example['available_tools'],
+            [ref_example['answer'], output_example['answer']],
+            task_status=task_status, answer_statuss=answer_statuss
+        )
+        if preference == 0:
+            return query_id, "ref"
+        elif preference == 1:
+            return query_id, "output"
+        else:
+            return query_id, "equal"
+    
+    reference_model = args.reference_model
+    output_model = args.output_model
+    # The checkpoints and the TSV are written below; make sure the directory exists.
+    os.makedirs(args.save_path, exist_ok=True)
+
+    for test_set in test_sets:
+        with open(os.path.join(f"{args.test_ids}/{test_set}.json"), "r") as f:
+            test_ids = list(json.load(f).keys())
+        reference_path = f"{args.converted_answer_path}/{reference_model}/{test_set}.json"
+        output_path = f"{args.converted_answer_path}/{output_model}/{test_set}.json"
+        with open(reference_path, "r") as f:
+            reference_examples = json.load(f)
+        with open(output_path, "r") as f:
+            output_examples = json.load(f)
+        print('Evaluating {}...'.format(test_set))
+        prefer_json_path = f"{args.save_path}/{test_set}_{reference_model}_{output_model}.json"
+        if os.path.exists(prefer_json_path):
+            # Resume from the checkpoint of a previous run.
+            with open(prefer_json_path, "r") as f:
+                prefer_dict = json.load(f)
+        else:
+            prefer_dict = {}
+        ref_pass_result_file = f"{args.pass_rate_result_path}/{test_set}_{reference_model}.csv"
+        output_pass_result_file = f"{args.pass_rate_result_path}/{test_set}_{output_model}.csv"
+
+        ref_pass_result_dict = get_pass_rate_results(ref_pass_result_file)
+        output_pass_result_dict = get_pass_rate_results(output_pass_result_file)
+        for i in range(int(args.evaluate_times)):
+            round_key = f"round_{i}"
+            # NOTE(review): ``or i >= 0`` makes this condition always true, so
+            # the swapped-order branches below never run; kept as in the
+            # original so the swap can be re-enabled deliberately.
+            keep_order = i % 2 == 0 or i >= 0
+            with ThreadPoolExecutor(args.max_eval_threads) as pool:
+                future = []
+                for qid in test_ids:
+                    if qid not in prefer_dict:
+                        prefer_dict[qid] = {reference_model: 0, output_model: 0, round_key: "incomplete"}
+                    # .get(): a record created in an earlier round has no key
+                    # for this round yet; indexing it raised KeyError from
+                    # round 1 onward.
+                    elif prefer_dict[qid].get(round_key) == "complete":
+                        continue
+                    if qid in ref_pass_result_dict and qid in output_pass_result_dict:
+                        # Shortcut: when exactly one model passed, it wins the
+                        # round without asking the evaluator. The round is
+                        # marked complete so a resumed run does not count it again.
+                        if ref_pass_result_dict[qid]["machine_label"] == "passed" and output_pass_result_dict[qid]["machine_label"] == "failed":
+                            prefer_dict[qid][reference_model] += 1
+                            prefer_dict[qid][round_key] = "complete"
+                            continue
+                        elif ref_pass_result_dict[qid]["machine_label"] == "failed" and output_pass_result_dict[qid]["machine_label"] == "passed":
+                            prefer_dict[qid][output_model] += 1
+                            prefer_dict[qid][round_key] = "complete"
+                            continue
+                    
+                    # A model with no answer for this query loses the round.
+                    if qid not in reference_examples:
+                        prefer_dict[qid][output_model] += 1
+                        prefer_dict[qid][round_key] = "complete"
+                        continue
+                    if qid not in output_examples:
+                        print(f"Query {qid} not in output model converted answers!")
+                        prefer_dict[qid][reference_model] += 1
+                        prefer_dict[qid][round_key] = "complete"
+                        continue
+
+                    ref_example = reference_examples[qid]
+                    output_example = output_examples[qid]
+                    task_status = None
+                    answer_statuss = [None, None]
+                    if args.use_pass_rate == 'true':
+                        try:
+                            task_status = task_status_mapping[ref_pass_result_dict[qid]["solvable"]]
+                            answer_statuss = [answer_status_mapping[ref_pass_result_dict[qid]["is_solved"]],answer_status_mapping[output_pass_result_dict[qid]["is_solved"]]]
+                        except Exception:
+                            # Missing or unmapped pass-rate data: fall back to
+                            # an evaluation without status hints.
+                            task_status = None
+                            answer_statuss = [None, None]
+
+                    if keep_order:
+                        future.append(pool.submit(
+                            get_preference,
+                            qid,
+                            task_status,
+                            answer_statuss,
+                            ref_example,
+                            output_example
+                        ))
+                    else:
+                        # Swapped presentation order (statuses swapped to match).
+                        answer_statuss = answer_statuss[::-1]
+                        future.append(pool.submit(
+                            get_preference,
+                            qid,
+                            task_status,
+                            answer_statuss,
+                            output_example,
+                            ref_example
+                        ))
+                    
+                for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+                    qid, preference = thd.result()
+                    
+                    if keep_order:
+                        if preference == "ref":
+                            prefer_dict[qid][reference_model] += 1
+                        elif preference == "output":
+                            prefer_dict[qid][output_model] += 1
+                    else:
+                        # Answers were presented swapped, so "ref" means the output model.
+                        if preference == "ref":
+                            prefer_dict[qid][output_model] += 1
+                        elif preference == "output":
+                            prefer_dict[qid][reference_model] += 1
+                    prefer_dict[qid][round_key] = "complete"
+                    # Checkpoint after every result so an interrupted run can be resumed.
+                    with open(prefer_json_path, "w") as f:
+                        json.dump(prefer_dict, f, ensure_ascii=False, indent=4)
+        
+        with open(prefer_json_path, "w") as f:
+            json.dump(prefer_dict, f, ensure_ascii=False, indent=4)
+        filename = f"{args.save_path}/{test_set}_{reference_model}_{output_model}.csv"
+        write_results(filename, prefer_dict, reference_model, output_model, reference_examples, output_examples)
+        
+        if not prefer_dict:
+            # No query ids for this test set; avoid dividing by zero.
+            print(f"Warning: no preference results for test set {test_set}, rates skipped.")
+            continue
+        win_rate, lose_rate, tie_rate = 0, 0, 0
+        for query_id in prefer_dict:
+            if prefer_dict[query_id][reference_model] > prefer_dict[query_id][output_model]:
+                lose_rate += 1
+            elif prefer_dict[query_id][reference_model] < prefer_dict[query_id][output_model]:
+                win_rate += 1
+            else:
+                tie_rate += 1
+        win_rate /= len(prefer_dict)
+        lose_rate /= len(prefer_dict)
+        tie_rate /= len(prefer_dict)
+        print(f"Test set: {test_set}. Reference model: {reference_model}, Candidate model: {output_model}. Win rate: {str(win_rate)}, Tie rate: {str(tie_rate)}")
+        
--- a/toolbench/tooleval/evaluation/init.py
+++ b/toolbench/tooleval/evaluation/init.py
@ -0,0 +1,3 @@
+from .usereval import UserEvaluation
+from .methodcls import BaseToolMethod
+from .dataclass import ExecutionGraph,ExecutionNode,DirectedEdge
--- a/toolbench/tooleval/evaluation/dataclass.py
+++ b/toolbench/tooleval/evaluation/dataclass.py
@ -0,0 +1,272 @@
+from pydantic import BaseModel,Field
+from typing import Union, Dict, List, Optional,Any
+import random
+import uuid
+import re
+
+class EvalCompleted(Exception):
+    """Plain ``Exception`` subclass with no extra behaviour.
+
+    Presumably raised to signal that an evaluation has finished; the raising
+    site is not part of this module.
+    """
+
+class Tool(BaseModel): 
+    # Pydantic model describing one tool: an id, a name, a description and a
+    # nested description of its parameters.
+    tid:str
+    name:str
+    description:str
+    class Parameters(BaseModel):
+        # Parameter section of a tool: names of required / optional
+        # parameters, a type string, and one Properties entry per parameter name.
+        required:List[str]
+        optional:Optional[List[str]] = []
+        # Stored as ``type_`` but populated from the input key "type" (alias).
+        type_:str  = Field(alias='type')
+        class Properties(BaseModel):
+            # Description of a single parameter; everything except the type is optional.
+            type_:str  = Field(alias='type')
+            enum:Optional[List[str]] = None
+            description:Optional[str] = None
+            example_value:Optional[Union[str,bool,int,float]] = None
+        # parameter name -> its Properties
+        properties:Dict[str,Properties]
+    parameters:Parameters
+    
+    # meta:ToolMeta #removed 
+
+class Question(BaseModel):
+    # Pydantic model of one evaluation question: its id, the query text and
+    # the list of Tool descriptions attached to it.
+    qid:str
+    query:str
+    available_tools:List[Tool]
+    
+    
+# Graph ids (for nodes and edges) are plain strings.
+GID = str
+
+def assign_gid()->GID:
+    """Return a freshly generated random UUID (version 4) as a string."""
+    fresh_uuid = uuid.uuid4()
+    return str(fresh_uuid)
+
+class ExecutionNode(BaseModel):
+    # One node of an ExecutionGraph. Identity is the auto-generated ``node_id``;
+    # ``in_degree`` / ``out_degree`` are counters maintained by
+    # ExecutionGraph.add_edge.
+    node_id:GID = Field(default_factory=assign_gid)
+    role: Optional[Any] = None # System, User, Assistant, Tool
+    message: Optional[Any] = None
+    in_degree:int = 0
+    out_degree:int = 0
+    
+    def __eq__(self, other) -> bool:
+        """Two nodes are equal when they carry the same ``node_id``."""
+        if isinstance(other,ExecutionNode):
+            return self.node_id == other.node_id
+        # Per the Python data model, an unsupported operand is signalled by
+        # returning NotImplemented, not by raising: ``node == None`` or
+        # ``node in mixed_list`` then evaluates to False instead of blowing up
+        # with NotImplementedError.
+        return NotImplemented
+    
+    def __str__(self) -> str:
+        """Render the node as its id."""
+        return str(self.node_id)
+
+    
+class DirectedEdge(BaseModel):
+    # One edge of an ExecutionGraph; it only carries an auto-generated id
+    # (the endpoints are stored by the graph, not by the edge).
+    edge_id:GID  = Field(default_factory=assign_gid)
+    def __eq__(self, other) -> bool:
+        """Two edges are equal when they carry the same ``edge_id``."""
+        if isinstance(other,DirectedEdge):
+            return self.edge_id == other.edge_id
+        # Per the Python data model, an unsupported operand is signalled by
+        # returning NotImplemented, not by raising: ``edge == None`` then
+        # evaluates to False instead of blowing up with NotImplementedError.
+        return NotImplemented
+    
+    def __str__(self) -> str:
+        """Render the edge as its id."""
+        return str(self.edge_id)
+    
+class ExecutionGraph(BaseModel):
+    # Directed graph of ExecutionNode objects.
+    #   init_node: id of the designated start node (None until set_init_node is called)
+    #   nodes:     node id -> node
+    #   edges:     source node id -> {target node id -> edge}
+    init_node:Optional[GID] = None
+    nodes:Dict[GID,ExecutionNode] = {}
+    edges:Dict[GID,Dict[GID,DirectedEdge]] = {}
+    
+    # Serialise the graph into a list of nested dicts, one per node whose
+    # in_degree is 0. Each dict has 'role', 'message' and 'next' (the children,
+    # same format). The message is blanked to '' for 'system' and 'user' roles.
+    def convert_to_dict(self):
+        data = []
+        all_start_nodes = [node.node_id for node in self.nodes.values() if node.in_degree == 0]
+        # Shared across all start nodes: every node is emitted at most once, so
+        # a node reachable along several paths only appears under the first one.
+        all_visited_nodes = set()
+        for node in all_start_nodes:
+            # Depth-first serialisation; returns None for an already visited node.
+            def dfs(node:ExecutionNode)->Dict[Any,Any]:
+                if node.node_id in all_visited_nodes:
+                    return None
+                all_visited_nodes.add(node.node_id)
+                node_json={
+                    'role':node.role,
+                    'message':node.message if node.role != 'system' and node.role !='user' else '',
+                    'next':[]
+                }
+                for next_node in self.get_adjacent_node(node):
+                    next_node_dict = dfs(self.nodes[next_node])
+                    if next_node_dict is not None:
+                        node_json['next'].append(next_node_dict)
+                return node_json
+            
+            data.append(dfs(self.nodes[node]))
+        
+        return data
+    
+    # Build a new ExecutionGraph holding a single path: start at init_node and
+    # repeatedly step to a randomly chosen successor until a node without
+    # outgoing edges is reached.
+    # NOTE(review): the node objects are shared with this graph (not copied),
+    # so the add_edge calls on the new graph also bump in_degree/out_degree of
+    # the nodes seen by this graph - confirm this is intended.
+    # NOTE(review): there is no visited set, so a cycle could make the walk
+    # run forever - presumably graphs are acyclic; verify.
+    def reduce_graph_to_sequence(self):
+        # random walk to a leaf node
+        eg = ExecutionGraph()
+        node = self.nodes[self.init_node]
+        eg.set_init_node(node)
+        last_node = node
+        adj_nodes = self.get_adjacent_node(node)
+        while len(adj_nodes)>0:
+            node = self.nodes[random.choice(adj_nodes)]
+            adj_nodes = self.get_adjacent_node(node)
+            eg.add_node(node)
+            # Tuple-key assignment goes through __setitem__ -> add_edge with a default edge.
+            eg[last_node,node] = None
+            last_node = node
+        return eg
+    
+    # Render the graph with pygraphviz and return the result of
+    # G.draw(prog='dot', format='jpeg', ...). pygraphviz is imported lazily so
+    # it is only needed when drawing.
+    def draw(self):
+        import pygraphviz as pgv
+        G = pgv.AGraph(directed=True)
+        G.add_nodes_from([str(node) for node in self.nodes.values()])
+        # Graphviz attributes applied per node role.
+        VIS_CONFIG={
+            'system':{'shape':'plaintext'},
+            'user': {'fillcolor':'yellow','style':'filled','shape':'circle'},
+            'tool': {
+                # 'fillcolor':'red','style':'filled',
+                'shape':'diamond'},
+            'assistant': {
+                # 'fillcolor':'green','style':'filled',
+                'shape':'box'}
+        }
+        # Keep at most the first width*5 characters of `text`, insert a newline
+        # after every `width` characters, then drop literal escape sequences
+        # (a backslash followed by n, r, t, a quote or a backslash).
+        def wrap_text(text:str, width=20):
+            wrapped_text = ''
+            for i in range(0, min(width*5,len(text)), width):
+                wrapped_text += text[i:i+width] + '\n'
+            escaped_chars = re.findall(r'\\[nrt\'"\\]', wrapped_text)
+            for escaped_char in escaped_chars:
+                wrapped_text = wrapped_text.replace(escaped_char, '')
+            return wrapped_text
+        
+        # Apply the role-specific attributes and labels to one graphviz node.
+        # A role that is not a key of VIS_CONFIG raises KeyError on the lookup below.
+        def set_node_vis(gnode,node:ExecutionNode):
+            for k,v in VIS_CONFIG[node.role].items():
+                gnode.attr[k] = v
+                
+            if node.role == 'system':
+                gnode.attr['label']='SystemPrompt'
+            elif node.role == 'tool':
+                if node.message['name'] == 'Finish':
+                    # The arguments are inspected as raw text rather than parsed.
+                    # args = json.loads(node.message['arguments'])
+                    args = str(node.message['arguments'])
+                    idx = args.find('return_type')
+                    
+                    # 'give_answer' within 30 characters of 'return_type' marks a
+                    # final answer (green); any other Finish is shown as a red 'restart'.
+                    if 'give_answer' in args[idx:idx+30]:
+                        gnode.attr['fillcolor'] = 'green'
+                        # gnode.attr['xlabel'] = f"{wrap_text(args.get('final_answer',''))}"
+                        gnode.attr['label'] = wrap_text(args[args.find('final_answer'):])
+                    else:
+                        gnode.attr['fillcolor'] = 'red'
+                        gnode.attr['label'] = 'restart'
+                    gnode.attr['style'] = 'filled'
+                    gnode.attr['shape'] = 'ellipse'
+                else:
+                    gnode.attr['label'] = f"tool: {wrap_text(node.message['name'])}"
+                    gnode.attr['xlabel'] = f"{wrap_text(node.message['response'])}"
+            elif node.role =='assistant':
+                gnode.attr['label'] = node.role.upper() +'\n'+ wrap_text(str(node.message))
+            else:
+                gnode.attr['xlabel'] = wrap_text(str(node.message))
+                gnode.attr['label'] = node.role.upper()
+                
+        for node in self.nodes.values():
+            gnode = G.get_node(str(node))
+            set_node_vis(gnode,node)
+            to_nodes = list(self.edges.get(node.node_id,{}).keys())
+            G.add_edges_from([(str(node),str(to_node)) for to_node in to_nodes])
+
+        # return G.draw(prog='neato',format='jpeg',args='-Goverlap=false')
+        return G.draw(prog='dot',format='jpeg',args='-Goverlap=false')
+
+    # Number of nodes stored in the graph.
+    @property
+    def node_count(self):
+        return len(self.nodes.keys())
+    # Total number of edges, summed over all source nodes.
+    @property
+    def edge_count(self):
+        count = 0
+        for k,d in self.edges.items():
+            count += len(d.keys())
+        return count
+    
+    # Set the start node. An ExecutionNode is also added to the graph if it is
+    # missing; a node id must already be in the graph (KeyError otherwise).
+    # Any other argument type raises TypeError.
+    def set_init_node(self,node:Union[GID,ExecutionNode]):
+        if isinstance(node,ExecutionNode):
+            self.init_node = node.node_id
+            if node.node_id not in self.nodes:
+                self.nodes[node.node_id] = node
+        elif isinstance(node,GID):
+            if node not in self.nodes:
+                raise KeyError('node not in graph!')
+            else:
+                self.init_node = node
+        else:
+            raise TypeError('node must be instance of ExecutionNode!')
+        
+    # Return the start node object (KeyError if init_node is not a stored node id).
+    def get_init_node(self):
+        return self.nodes[self.init_node]
+    
+    # Store `node` under its node_id, replacing any node with the same id.
+    def add_node(self,node:ExecutionNode):
+        if isinstance(node,ExecutionNode):
+            self.nodes[node.node_id] = node
+        else:
+            raise TypeError('node must be instance of ExecutionNode!')
+    
+    # Add (or overwrite) the edge from_node -> to_node; nodes may be given as
+    # objects or ids. A fresh DirectedEdge is created when `edge` is None.
+    # NOTE(review): the degree counters are incremented on every call, even
+    # when an existing edge is overwritten, and both endpoints must already be
+    # in self.nodes (KeyError otherwise, raised after the edge was stored).
+    def add_edge(self,from_node:Union[ExecutionNode,GID],to_node:Union[ExecutionNode,GID],edge:DirectedEdge=None):
+        if isinstance(from_node,ExecutionNode):
+            from_node = from_node.node_id
+        if isinstance(to_node,ExecutionNode):
+            to_node = to_node.node_id
+        if from_node not in self.edges:
+            self.edges[from_node] = {}
+        if edge is None:
+            self.edges[from_node][to_node] = DirectedEdge()
+        else:
+            if isinstance(edge,DirectedEdge):
+                self.edges[from_node][to_node] = edge
+            else:
+                raise TypeError('edge must be instance of DirectedEdge!')
+        self.nodes[to_node].in_degree += 1
+        self.nodes[from_node].out_degree +=1
+
+        
+    # Remove and return a node (None if absent).
+    # NOTE(review): edges that reference the node are left untouched.
+    def pop_node(self,node:Union[ExecutionNode,GID])->Union[ExecutionNode,None]:
+        if isinstance(node,ExecutionNode):
+            node = node.node_id
+        return self.nodes.pop(node,None)
+        
+    # Remove and return the edge from_node -> to_node (None if absent).
+    # NOTE(review): in_degree/out_degree of the endpoints are not decremented.
+    def pop_edge(self,from_node:Union[ExecutionNode,GID],to_node:Union[ExecutionNode,GID])->Union[DirectedEdge,None]:
+        if isinstance(from_node,ExecutionNode):
+            from_node = from_node.node_id
+        if isinstance(to_node,ExecutionNode):
+            to_node = to_node.node_id
+        if from_node in self.edges:
+            return self.edges[from_node].pop(to_node,None)
+        return None
+    
+    # Ids of the direct successors of `node` (empty list when it has no outgoing edge).
+    def get_adjacent_node(self,node:Union[ExecutionNode,GID])->List[GID]:
+        if isinstance(node,ExecutionNode):
+            node = node.node_id
+        return list(self.edges.get(node,{}).keys())
+    
+    
+        
+    # graph[node_id] -> node; graph[a, b] -> edge a -> b, where a and b may be
+    # node objects or ids. Raises TypeError for a pair of unsupported key types
+    # and IndexError for any other key shape.
+    def __getitem__(self, item)->Union[ExecutionNode,DirectedEdge]:
+        if isinstance(item, GID):
+            return self.nodes[item]
+        elif isinstance(item, tuple) and len(item) == 2:
+            k1,k2 = item
+            if isinstance(k1,ExecutionNode):
+                k1 = k1.node_id
+            if isinstance(k2,ExecutionNode):
+                k2 = k2.node_id
+            
+            if isinstance(k1,GID) and isinstance(k2,GID):
+                return self.edges[k1][k2]
+            else:
+                raise TypeError('key must be GID or ExecutionNode!')
+        else:
+            raise IndexError("Invalid number of arguments")
+    
+    # graph[''] / graph[()] = node   -> add_node(node) (any key of length 0)
+    # graph[node_id] = node          -> store the node under that id (its node_id is rewritten)
+    # graph[a, b] = edge_or_None     -> add_edge(a, b, edge)
+    def __setitem__(self,key,value):
+        if len(key)==0:
+            self.add_node(value)
+        elif isinstance(key, GID):
+            if isinstance(value,ExecutionNode):
+                value.node_id = key
+                self.nodes[key] = value
+            else:
+                raise TypeError('node must be instance of ExecutionNode!')
+            
+        elif isinstance(key, tuple) and len(key) == 2:
+            self.add_edge(key[0],key[1],value)
+        else:
+            raise IndexError("Invalid number of arguments")
--- a/toolbench/tooleval/evaluation/methodcls.py
+++ b/toolbench/tooleval/evaluation/methodcls.py
@ -0,0 +1,40 @@
+from typing import Dict, List,Callable
+
+class BaseToolMethod:
+    """Base class for a tool-using method under evaluation.
+
+    Subclasses override ``forward`` (run the method) and
+    ``convert_result_to_dict`` (normalise its output); calling the instance
+    chains the two. The base implementations do nothing and return ``None``.
+    """
+    def __init__(self):
+        return None
+    def convert_result_to_dict(self,result):
+        '''Convert the raw output of ``forward`` into the evaluation format.
+
+        Return Format
+        --------
+        {
+            'method': 'method name',
+            'total_steps': int,
+            'final_answer': 'answer',
+            'answer_details': [{
+                "role": "system",
+                "message": "",
+                "next": [
+                    {
+                        "role": "user",
+                        "message": "I am planning ...",
+                        "next": [
+                            {
+                                "role": "tool",
+                                "message": "{'name': 'Finish', 'arguments': '{\\n  \"return_type\": \"give_answer\",\\n  \"final_answer\": \"I encountere...",
+                                "next": []
+                            }
+                        ]
+                    }
+                ]
+            }]
+        }
+        
+        '''
+        return None
+    def forward(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
+        """Run the method on ``query`` with ``tools``; ``tool_func`` executes a tool call."""
+        return None
+    
+    def __call__(self,query:str,tools:List[Dict],tool_func:Callable)->Dict:
+        """Run ``forward`` and return its output passed through ``convert_result_to_dict``."""
+        raw_result = self.forward(query,tools,tool_func)
+        converted = self.convert_result_to_dict(raw_result)
+        return converted
+    
--- a/toolbench/tooleval/evaluation/usereval.py
+++ b/toolbench/tooleval/evaluation/usereval.py
@ -0,0 +1,70 @@
+import requests
+from tqdm import tqdm
+from typing import Union, Dict, List, Optional,Tuple
+from .methodcls import BaseToolMethod
+from .dataclass import *
+import json
+
+class UserEvaluation:
+    """Drive a ``BaseToolMethod`` through an evaluation served over HTTP.
+
+    On construction a new evaluation is registered with the server
+    (``/neweval``); ``run`` then pulls questions one by one, lets the method
+    answer them and collects the answers keyed by question id.
+    """
+    def __init__(self,
+                 method:BaseToolMethod,
+                 eval_server_addr='http://localhost:8000',
+                 evalset='eval20230718'):
+        self.method = method
+        self.evalset = evalset
+        self.eval_server_addr = eval_server_addr
+        response = requests.post(self.eval_server_addr+'/neweval',json=self.evalset)
+        if response.status_code != 200:
+            raise Exception('Failed to obtain new evaluation id! Error: '+response.text)
+        payload = response.json()
+        self.eval_id = payload['evaluation_id']
+        self.len = payload['len']
+
+    def get_new_question(self)->Tuple[str,List]:
+        """Fetch the next question and return ``(query, tools)``.
+
+        The ``tid`` of every tool is removed from the returned tool dicts and
+        remembered in ``self.tool_name_to_id`` so ``tool_func`` can map a tool
+        name back to its id. Raises ``EvalCompleted`` on HTTP 204 and a plain
+        ``Exception`` on any other non-200 status.
+        """
+        response = requests.post(self.eval_server_addr+'/next_question',json=self.eval_id)
+        status = response.status_code
+        if status == 204:
+            raise EvalCompleted()
+        if status != 200:
+            raise Exception('Failed to obtain new question!')
+
+        self.question = Question(**response.json())
+        self.tool_name_to_id = {}
+        tools = []
+        for available in self.question.available_tools:
+            tools.append(available.model_dump())
+        for spec in tools:
+            tool_id = spec.pop('tid')
+            self.tool_name_to_id[spec['name']] = tool_id
+
+        return self.question.query,tools
+    def tool_func(self,tool_name:str,tool_args:str)->requests.Response:
+        """Execute ``tool_name`` with ``tool_args`` through the server's ``/rapidapi`` endpoint."""
+        request_body = {
+            'evaluation_id':self.eval_id,
+            'tool_id':self.tool_name_to_id[tool_name],
+            'tool_args':tool_args
+        }
+        return requests.post(self.eval_server_addr+'/rapidapi',json=request_body)
+    def _forward(self,query:str,tools:List[Dict])->Dict:
+        """Answer one question; returns ``(question id, record)`` for the current question."""
+        answer = self.method(query,tools,self.tool_func)
+        record = {
+            'query':query,
+            'available_tools':tools,
+            'answer':answer
+        }
+        return self.question.qid,record
+
+
+    def run(self)->Dict:
+        """Answer up to ``self.len`` questions and return ``{qid: record}``.
+
+        Stops early, keeping what was collected, when the server reports the
+        evaluation as completed.
+        """
+        results = {}
+        for _ in tqdm(range(self.len),ncols=100):
+            try:
+                qid,record = self._forward(*self.get_new_question())
+            except EvalCompleted:
+                break
+            results[qid] = record
+        return results
--- a/toolbench/tooleval/evaluators/init.py
+++ b/toolbench/tooleval/evaluators/init.py
@ -0,0 +1,18 @@
+from toolbench.tooleval.evaluators.registered_cls import BaseEvaluator,register_evaluator,get_evaluator_cls
+
+__all__=['register_evaluator','get_evaluator_cls','BaseEvaluator','load_registered_automatic_evaluator']
+
+
+
+def load_registered_automatic_evaluator(config:dict=None,evaluator_name=None,evaluators_cfg_path=None)->BaseEvaluator:
+    """Instantiate the registered evaluator described by a config directory.
+
+    Parameters:
+    ----------
+        config : dict, optional
+            Fallback source for `evaluator` and `evaluators_cfg_path` when the
+            corresponding explicit argument is None.
+        evaluator_name : str, optional
+            Name of the evaluator sub-directory.
+        evaluators_cfg_path : str, optional
+            Directory that contains the evaluator sub-directories.
+
+    Returns:
+    -------
+        evaluator : BaseEvaluator
+            Instance of the class named by `registered_cls_name` in
+            `<evaluators_cfg_path>/<evaluator_name>/config.yaml`, constructed
+            with that directory as its only argument.
+
+    Raise:
+    -----
+        KeyError if a needed key is missing from `config` or the yaml file.
+        ModuleNotFoundError if the class name is not registered.
+    """
+    import os
+    import yaml
+    
+    # A fresh dict per call instead of a shared mutable default.
+    config = {} if config is None else config
+    evaluator_name = config['evaluator'] if evaluator_name is None else evaluator_name
+    cfg_path = config['evaluators_cfg_path'] if evaluators_cfg_path is None else evaluators_cfg_path
+    cfg_path = os.path.join(cfg_path,evaluator_name)
+    
+    # Close the config file deterministically instead of leaking the handle.
+    with open(os.path.join(cfg_path,'config.yaml')) as cfg_file:
+        cls_name = yaml.load(cfg_file,Loader=yaml.FullLoader)['registered_cls_name']
+    
+    evaluator_cls = get_evaluator_cls(cls_name)
+    if evaluator_cls is None:
+        # Fail with a readable message rather than "'NoneType' object is not callable".
+        raise ModuleNotFoundError('Cannot find evaluator class {}'.format(cls_name))
+    evaluator:BaseEvaluator = evaluator_cls(cfg_path)
+    return evaluator
--- a/toolbench/tooleval/evaluators/registered_cls/init.py
+++ b/toolbench/tooleval/evaluators/registered_cls/init.py
@ -0,0 +1,20 @@
+from .base import BaseEvaluator
+from .utils import register_evaluator,get_evaluator_cls
+
+__all__ = ['register_evaluator','get_evaluator_cls','BaseEvaluator']
+
+import os
+import importlib
+# Import every sibling module of this package at import time so that classes
+# decorated with `register_evaluator` are registered as a side effect.
+current_dir = os.path.dirname(__file__)
+
+for item in os.listdir(current_dir):
+    item_path = os.path.join(current_dir, item)
+    
+    # Only plain .py files directly in this directory; the package's own
+    # __init__.py is skipped to avoid importing itself.
+    if os.path.isfile(item_path) and item != '__init__.py' and item.endswith('.py'):
+        module_name = item[:-3]
+        
+        full_module_path = f"{__name__}.{module_name}"
+        
+        imported_module = importlib.import_module(full_module_path)
+        
+        # Expose the module as an attribute of this package under its file name.
+        globals()[module_name] = imported_module
--- a/toolbench/tooleval/evaluators/registered_cls/base.py
+++ b/toolbench/tooleval/evaluators/registered_cls/base.py
@ -0,0 +1,124 @@
+import random
+from typing import List, Union, Dict, Any, Callable
+import os
+import yaml
+from .utils import register_evaluator
+
+def process_answer(answer: Dict):
+    """Trim an answer in place before it is sent to the evaluator.
+
+    `final_answer` is cut to its first 1000 items, `answer_details` to its
+    first 3000, and the `method` key is dropped when present. The same dict
+    object is returned.
+    """
+    for field, limit in (('final_answer', 1000), ('answer_details', 3000)):
+        answer[field] = answer[field][:limit]
+    answer.pop('method', None)
+    return answer
+
+
+def process_tools(tools: List[Dict]):
+    """Strip `description` and `parameters` from every tool dict in place and return the list."""
+    for tool in tools:
+        for field in ('description', 'parameters'):
+            tool.pop(field, None)
+    return tools
+
+@register_evaluator
+class BaseEvaluator:
+    """Base class for evaluators.
+    
+    Attributes:
+    ----------
+        fn_completions : Callable
+            The completion function of the evaluator, used to get annotated results.
+            It is called as `fn_completions(task_description, answers, task_status, answer_statuss)`
+            and must return an int: the position, inside the `answers` list it received, of the best answer.
+    
+    Functions:
+    ---------
+        annotate_preference : Callable
+            Annotate and return the index of the preferred answer.
+        
+    """
+    def __init__(self,
+                 fn_completions: Callable[[Dict,List[Dict]],int] = None,
+                 *args,
+                 **kwargs):
+        self.fn_completions = fn_completions
+    def annotate_preference(self,
+                            query: str,
+                            available_tools: List[Dict[Any, Any]],
+                            answers:List[Dict],
+                            multisample=False,
+                            sample_n=4,
+                            task_status=None,
+                            answer_statuss=None) -> Union[List[int], int]:
+        """Annotate and return the index of the preferred answer.
+        
+        For given query, available tools, and answers, return the index of the preferred answer by calling function `fn_completions` of the evaluator.
+        The answers are shown to `fn_completions` in a random order to cancel position bias; the returned index always refers to the caller's `answers` list.
+        
+        Note that `answers` and `available_tools` are trimmed in place (see `process_answer` / `process_tools`).
+        
+        Parameters:
+        ----------
+            query : str
+                The query of the task.
+            available_tools : List[Dict[Any, Any]]
+                The list of available tools for the task. The specific format of the tool is defined in `tooleval/evaluation/dataclass.py`
+            answers : List[Dict]
+                The list of answers for comparison.
+            multisample : bool, optional
+                Whether to use multisample to get the preference. If True, the function will return a list of preferences, otherwise return a single preference.
+            sample_n : int, optional
+                The number of samples to get the preference.
+            task_status : optional
+                Pre-computed status of the task, forwarded unchanged to `fn_completions`.
+            answer_statuss : list, optional
+                Pre-computed status per answer, aligned with `answers`. Defaults to `[None, None]`.
+
+        Returns:
+        -------
+            preference : Union[List[int], int]
+                The index of the preferred answer. If `multisample` is True, return a list of preferences, otherwise return a single preference.
+        
+        Raise:
+        -----
+            ValueError if `fn_completions` returns an index outside the answer list.
+        
+        """
+        if answer_statuss is None:
+            # Fresh list per call instead of a shared mutable default.
+            answer_statuss = [None, None]
+        answers_processed = [process_answer(ans) for ans in answers]
+        available_tools = process_tools(available_tools)
+        
+        def shuffle_run() -> int:
+            indexs = list(range(len(answers_processed)))
+            random.shuffle(indexs)
+            
+            answers_projected = [answers_processed[idx] for idx in indexs]
+            # Statuses must follow their answers through the shuffle, otherwise
+            # fn_completions would attach each status to the wrong answer.
+            if len(answer_statuss) == len(indexs):
+                statuss_projected = [answer_statuss[idx] for idx in indexs]
+            else:
+                statuss_projected = answer_statuss
+            
+            preferred_index = self.fn_completions(
+                {
+                    'query':query,
+                    'available_tools':available_tools,
+                },
+                answers_projected,
+                task_status,
+                statuss_projected
+            )
+            if preferred_index in indexs:
+                # preferred_index is a position in the shuffled list; indexs[...] maps it
+                # back to the original position. (indexs.index(...) would apply the inverse
+                # permutation, which only coincides for two answers.)
+                return indexs[int(preferred_index)]
+            raise ValueError(f'Preferred index {preferred_index} is invalid!')
+        
+        if not multisample:
+            return shuffle_run()
+        else:
+            prefers = [shuffle_run() for _ in range(sample_n)]
+            return prefers
+
+@register_evaluator
+class ToolEvalEvaluator(BaseEvaluator):
+    """ToolEval common evaluator class.
+    
+    Attributes:
+    ----------
+        cfg_path : str
+            A path store the configuration of the evaluator.
+        eval_config : dict
+            Parsed content of `<cfg_path>/config.yaml`.
+        template : str
+            Text of the prompt template file named by `eval_config['prompt_template']`.
+
+        
+    """
+    def __init__(self,
+                 cfg_path: str = None,
+                ):
+        # Read both files through context managers so the handles are closed.
+        with open(os.path.join(cfg_path,'config.yaml')) as config_file:
+            eval_config = yaml.load(config_file,Loader=yaml.FullLoader)
+        with open(os.path.join(cfg_path,eval_config['prompt_template'])) as template_file:
+            template = template_file.read()
+        
+        # `fn_completions` in the config names a method of the concrete subclass.
+        super().__init__(
+            fn_completions=getattr(self,eval_config['fn_completions'])
+            )
+        self.eval_config = eval_config
+        self.template = template
--- a/toolbench/tooleval/evaluators/registered_cls/rtl.py
+++ b/toolbench/tooleval/evaluators/registered_cls/rtl.py
@ -0,0 +1,246 @@
+import json
+import re
+import random
+import math
+from typing import List, Union, Dict, Any, Callable, Optional
+from copy import deepcopy
+from tenacity import retry, stop_after_attempt
+
+
+
+from .utils import register_evaluator,OpenaiPoolRequest
+from .tooleval import OpenAINormalizedEvaluator
+
+from enum import Enum
+
+class AnswerStatus(Enum):
+    """Judgement on whether a single answer solves the query."""
+    Unsure = "Unsure"
+    Unsolved = "Unsolved"
+    Solved = "Solved"
+    Filtered = "Filtered"
+    
+class TaskStatus(Enum):
+    """Judgement on whether a task can be solved at all."""
+    Unsure = "Unsure"
+    Unsolvable = "Unsolvable"
+    Solvable = "Solvable"
+    Filtered = "Filtered"
+    
+class AnswerPass(Enum):
+    """Pass verdict for an answer, combining its AnswerStatus with the TaskStatus (see `is_passed`)."""
+    Unsure = "Unsure"
+    Failed = "Failed"
+    Passed = "Passed"
+    
+
+@register_evaluator
+class ReinforceToolLearningEvaluator(OpenAINormalizedEvaluator):
+    """Evaluator that ranks answers by solved / unsure / unsolved status.
+
+    All judgements are obtained through `self.function_call`, which returns a
+    dict holding the parsed function arguments plus a `tokens` count.
+    """
+    def check_has_hallucination(self,available_tools:List[Dict],answer:Dict[Any,Any])->bool:
+        """Check that every tool node in the answer tree names an available tool.
+
+        NOTE: despite the method name, True means that NO unknown tool name was
+        found; False means at least one tool node uses a name that is not in
+        `available_tools`.
+
+        Dict-valued `message` fields of tool nodes are replaced in place by
+        their `str()` form. Raises IndexError if a tool message contains no
+        `'name': '...'` fragment, ValueError on an unexpected node type.
+        """
+        available_names = {tool['name'] for tool in available_tools}
+        
+        def check_node_valid(node:Dict)->bool:
+            if node['role'] == "tool":
+                if isinstance(node['message'], dict):
+                    node['message'] = str(node['message'])
+                name = re.findall(r"'name':\s*'(.*?)'",node['message'],re.DOTALL)[0]
+                return name in available_names
+            return True            
+        
+        def recurssive_check(nodes:Union[List,Dict])->bool:
+            if isinstance(nodes,Dict):
+                if not check_node_valid(nodes):
+                    return False
+                else:
+                    return recurssive_check(nodes['next'])
+            if isinstance(nodes,List):
+                for node in nodes:
+                    if not recurssive_check(node):
+                        return False
+                return True
+            raise ValueError(f'Unknown node type {type(nodes)}')
+            
+        return recurssive_check(answer['answer_details'])
+    
+    def check_is_solved(self,
+                        task_description:Dict,
+                        answer:Dict[Any,Any],
+                        return_reason=False,
+                        ) -> tuple:
+        """Judge whether `answer` solves the query.
+
+        Returns a 3-tuple `(AnswerStatus, reason, tokens)`; `reason` is an
+        empty string unless `return_reason` is True.
+
+        An empty final answer, or one containing 'give_up_and_restart', is
+        Unsolved without any model call. Otherwise the final answer alone is
+        judged first; only if that is Unsure is the whole answer judged.
+        """
+        
+        # empty situation
+        if answer['final_answer']=='' or 'give_up_and_restart' in  answer['final_answer']:
+            if return_reason:
+                return AnswerStatus.Unsolved, "Empty final answer!",0
+            return AnswerStatus.Unsolved, "", 0
+        
+        ret = self.function_call(
+            'check_answer_status',
+            {
+                'query':task_description['query'],
+                'answer':answer['final_answer']
+            },
+            return_reason=return_reason
+        )
+        answer_status = AnswerStatus(ret['answer_status'])
+        
+        if answer_status == AnswerStatus.Unsure:
+            # detailed check here
+            ret = self.function_call(
+                'parse_answer_status',
+                {
+                    'query':task_description['query'],
+                    'answer':json.dumps(answer)
+                },
+                return_reason=return_reason
+            )
+            # Anything outside the expected labels is treated as Unsure.
+            if ret['answer_status'] not in ['Unsure','Unsolved', 'Solved']:
+                ret['answer_status'] = 'Unsure'
+            answer_status = AnswerStatus(ret['answer_status'])
+
+        # NOTE: reason/tokens come from the last model call only.
+        if return_reason:
+            return answer_status,ret['reason'], ret['tokens']
+        return answer_status, "", ret['tokens']
+    
+    def check_task_solvable(self,
+                            task_description:Dict,
+                            has_been_solved=False,
+                            return_reason=False,
+                            )->tuple:
+        """Judge whether the task is solvable.
+
+        Returns a 3-tuple `(TaskStatus, reason, tokens)`; `reason` is an empty
+        string unless `return_reason` is True. When `has_been_solved` is True
+        the task is reported Solvable without any model call (0 tokens).
+        """
+        if has_been_solved:
+            if return_reason:
+                return TaskStatus.Solvable, 'Task has been solved before.', 0
+            return TaskStatus.Solvable, '', 0
+        
+        ret = self.function_call(
+            'check_task_solvable',
+            {
+                'task':json.dumps(task_description)
+            },
+            return_reason=return_reason
+        )
+        task_status = TaskStatus(ret['task_status'])
+        if return_reason:
+            return task_status, ret['reason'], ret['tokens']
+        return task_status, '', ret['tokens']
+        
+    def is_passed(self,
+                  task_description:Dict,
+                  answer:Dict[Any,Any],
+                  answer_status:AnswerStatus=None,
+                  task_status:TaskStatus=None,
+                  )->tuple:
+        """Decide whether an answer passes.
+
+        Returns `(AnswerPass, tokens)`, where `tokens` counts the tokens of the
+        status checks performed inside this call. Statuses passed in by the
+        caller are used as-is; missing ones are computed.
+
+        A Solved answer passes. Otherwise the verdict depends on the task:
+        on an Unsolvable task the answer passes; on a Solvable task an
+        Unsolved answer fails and an Unsure one is Unsure; on an Unsure task
+        the verdict is Unsure. Any other combination is Failed.
+        """
+        tokens = 0
+        if answer_status is None:
+            answer_status, _, tokens = self.check_is_solved(task_description,answer)
+            
+        if answer_status == AnswerStatus.Solved:
+            return AnswerPass.Passed, tokens
+        else:
+            if task_status is None:
+                # check_task_solvable returns (status, reason, tokens); unpacking it
+                # into two names raised ValueError on every call.
+                task_status, _, task_tokens = self.check_task_solvable(
+                    task_description,
+                    has_been_solved=answer_status==AnswerStatus.Solved)
+                tokens += task_tokens
+            
+            if answer_status == AnswerStatus.Unsolved:
+                if task_status == TaskStatus.Solvable:
+                    return AnswerPass.Failed, tokens
+                if task_status == TaskStatus.Unsure:
+                    return AnswerPass.Unsure, tokens
+                if task_status == TaskStatus.Unsolvable:
+                    return AnswerPass.Passed, tokens
+            elif answer_status == AnswerStatus.Unsure:
+                if task_status == TaskStatus.Solvable:
+                    return AnswerPass.Unsure, tokens
+                if task_status == TaskStatus.Unsure:
+                    return AnswerPass.Unsure, tokens
+                if task_status == TaskStatus.Unsolvable:
+                    return AnswerPass.Passed, tokens
+                                
+        return AnswerPass.Failed, tokens
+    
+    def check_identity_answers(self,
+                       answers:List[Dict[Any,Any]],
+                       )->bool:
+        """Return True when all answers share the first answer's `final_answer` and `str(answer_details)`."""
+        ref_answer = answers[0]
+        for ans in answers[1:]:
+            if ans['final_answer']!=ref_answer['final_answer']:
+                return False
+            if str(ans['answer_details'])!=str(ref_answer['answer_details']):
+                return False
+        return True
+    
+    @retry(stop=stop_after_attempt(3),reraise=True)
+    def select_better_answer(self,
+                           task_description:Dict,
+                           task_status:TaskStatus,
+                           ans_idxs:List[int],
+                           answers:List[Dict[Any,Any]],
+                           answer_status:AnswerStatus,
+                           *,
+                           return_reason=True)->int:
+        """Ask the model which of the first two `answers` is better.
+
+        Returns a member of `ans_idxs`. Identical answers are resolved by a
+        random pick without a model call. The call is retried up to three
+        times; a returned index outside `ans_idxs` raises ValueError.
+
+        NOTE(review): only `answers[0]` and `answers[1]` are shown to the
+        model, and its index is compared against `ans_idxs` directly, which
+        presumably assumes `ans_idxs == [0, 1]` - confirm against the template.
+        """
+        answers = deepcopy(answers)
+        
+        if self.check_identity_answers(answers):
+            return random.choice(ans_idxs)
+        
+        # Currently unused: the prompt argument that consumed it is disabled below.
+        judge_focus = {
+            TaskStatus.Solvable:'Since query is solvable, you should select answer with smaller "total_steps" and informative, accurate "final_answer".',
+            TaskStatus.Unsure:'Since query is unsure, you should select a more comprehensive exploration for possible solutions.',
+            TaskStatus.Unsolvable:'Since query is unsolvable, you should select answer with smaller "total_steps" and detailed reasons for failure.'
+        }
+        
+        ret = self.function_call(
+            'select_better_answer', {
+                'query':task_description['query'],
+                'answer_0':json.dumps(answers[0]),
+                'answer_1':json.dumps(answers[1]),
+                # 'q_status':judge_focus[task_status],
+            },
+            return_reason=return_reason
+        )
+        index = int(ret['index'])
+        if index in ans_idxs:
+            return index
+        else:
+            raise ValueError(f'Index {index} not found!')
+    
+    def normalized_openai_completions(self,task_description:Dict, answers:List[Dict[Any,Any]], task_status:Optional[TaskStatus]=None, answer_statuss:Optional[List]=None)->int:
+        """Return the index in `answers` of the preferred answer.
+
+        Preference order: a Solved answer beats an Unsure one, which beats the
+        rest; ties within a group are broken by `select_better_answer`.
+
+        `answer_statuss`, when given with a non-None first element, is used as
+        the per-answer status list aligned with `answers`; otherwise every
+        answer is checked with `check_is_solved`. `task_status` is computed
+        with `check_task_solvable` only when it is None and a tie-break is needed.
+        """
+        if not answer_statuss or answer_statuss[0] is None:
+            print("comparing from scratch...")
+            status = [self.check_is_solved(task_description,ans)[0] for ans in answers]
+        else:
+            status = answer_statuss
+        # check whether there are answers solve the task
+        solves = [idx for idx,s in enumerate(status) if s==AnswerStatus.Solved]
+        
+        if len(solves)==1:
+            return solves[0]
+        elif len(solves)>1:
+            # pick best one
+            if task_status is None:
+                # [0]: check_task_solvable returns (status, reason, tokens)
+                task_status = self.check_task_solvable(task_description,has_been_solved=True)[0]
+            return self.select_better_answer(task_description,task_status,solves,[answers[idx] for idx in solves],AnswerStatus.Solved)
+        
+        # if no answer solves the task, check whether unsure answer exists
+        unsures = [idx for idx,s in enumerate(status) if s==AnswerStatus.Unsure]
+        
+        if len(unsures) == 1:
+            return unsures[0]
+        elif len(unsures)>1:
+            # pick best one
+            if task_status is None:
+                task_status = self.check_task_solvable(task_description)[0]
+            return self.select_better_answer(task_description,task_status,unsures,[answers[idx] for idx in unsures],AnswerStatus.Unsure)
+        
+        # if all failed
+        # pick best one
+        if task_status is None:
+            task_status = self.check_task_solvable(task_description)[0]
+        return self.select_better_answer(task_description,task_status,list(range(len(answers))),answers,AnswerStatus.Unsolved)
--- a/toolbench/tooleval/evaluators/registered_cls/tooleval.py
+++ b/toolbench/tooleval/evaluators/registered_cls/tooleval.py
@ -0,0 +1,198 @@
+from copy import deepcopy
+import json
+import re
+import random
+import math
+
+
+from .base import ToolEvalEvaluator
+from typing import List, Union, Dict, Any, Callable
+from .utils import register_evaluator,OpenaiPoolRequest
+
+from tenacity import retry, stop_after_attempt
+
+
+@register_evaluator
+class OpenAIEvaluator(ToolEvalEvaluator):
+    """Evaluator that asks the model for a preference in a single templated conversation.
+
+    The prompt template is expected to contain `<message>` blocks, each with a
+    `<role>` and a `<content>` element; they become the conversation sent to
+    the model.
+    """
+    def __init__(self,
+                 cfg_path: str = None,
+                ):
+        super().__init__(cfg_path)
+        self.opr = OpenaiPoolRequest(self.eval_config['apis_json'])
+        
+        self.conversation_template = []
+        for message in re.findall(r"<message>(.*?)</message>", self.template,re.DOTALL):
+            message = {
+                'role':re.findall(r"<role>(.*?)</role>",message,re.DOTALL)[0],
+                'content':re.findall(r"<content>(.*?)</content>",message,re.DOTALL)[0]
+            }
+            self.conversation_template.append(message)
+            
+
+    def openai_completions(self,task_description:Dict,answers:Dict,task_status=None,answer_statuss=None)->int:
+        """Return the preference index chosen by the model.
+
+        Every user message of the template is formatted with the JSON forms of
+        `task_description` and `answers`. One preference is read from each
+        returned choice and one of them is picked at random.
+
+        `task_status` and `answer_statuss` are accepted and ignored so that the
+        method can be used as `fn_completions`, which
+        `BaseEvaluator.annotate_preference` calls with four positional arguments.
+        """
+        conversation = deepcopy(self.conversation_template)
+        for msg in conversation:
+            if msg['role'] == 'user':
+                msg['content'] = msg['content'].format(
+                    task_description=json.dumps(task_description),
+                    answers=json.dumps(answers)
+                    )
+        
+        res = self.opr(messages=conversation,**self.eval_config['completions_kwargs'])
+    
+        prefers = [
+            int(json.loads(choice.message.function_call.arguments)['preference'])
+            for choice in res.choices
+        ]
+            
+        return random.choice(prefers)
+    
+@register_evaluator
+class OpenAINormalizedEvaluator(ToolEvalEvaluator):
+    """Evaluator built on named function calls.
+
+    The prompt template is expected to contain `<function>` blocks, each with
+    a `<name>` and a `<description>` element; the description is the prompt
+    used when that function is called. The function schemas come from
+    `eval_config['completions_kwargs']['functions']`.
+    """
+    def __init__(self,
+                 cfg_path: str = None,
+                ):
+        super().__init__(cfg_path)
+        
+        self.opr = OpenaiPoolRequest(self.eval_config['apis_json'])
+        
+        # setting up the function templates
+        self.parsed_function_templates = {}
+        for function in re.findall(r"<function>(.*?)</function>", self.template,re.DOTALL):
+            name = re.findall(r"<name>(.*?)</name>",function,re.DOTALL)[0]
+            description = re.findall(r"<description>(.*?)</description>",function,re.DOTALL)[0]
+            self.parsed_function_templates[name] = description
+            
+        self.functions = {}
+        for function in self.eval_config['completions_kwargs']['functions']:
+            self.functions[function['name']] = function
+    
+    def function_call(self,
+                      func_name,
+                      func_args:Dict,
+                      *,
+                      return_reason=False,
+                      return_content=False):
+        """Force the model to call `func_name` and return its parsed arguments.
+
+        The prompt is the template of `func_name` formatted with `func_args`.
+        With `return_reason` a required string argument `reason` is added to
+        the schema. The returned dict holds the parsed arguments plus
+        `tokens` (total token usage) and, with `return_content`, `content`.
+
+        Raises KeyError when the reply lacks an argument the schema requires.
+        """
+        completion_kwargs = deepcopy(self.eval_config['completions_kwargs'])
+        func_description = deepcopy(self.functions[func_name])
+        
+        if return_reason:
+            func_description['parameters']['required'].append('reason')
+            func_description['parameters']['properties']['reason'] = {
+                'type':'string',
+                'description':'explain your answer.'
+            }
+        
+        completion_kwargs['function_call'] = {'name':func_name}
+        completion_kwargs['functions'] = [func_description]
+
+        completion_kwargs['messages'] = [{
+            'role':'user',
+            'content':str(self.parsed_function_templates[func_name]).format(**func_args)
+        }]
+                    
+        res = self.opr.request(**completion_kwargs)
+        ret = json.loads(res.choices[0].message.function_call.arguments)
+        
+        # check required items
+        # `parameters` is a dict, so the list must be read with .get();
+        # getattr() on it always returned None and disabled this check.
+        required_args = func_description['parameters'].get('required',None)
+        if required_args is not None:
+            ret_args = set(ret.keys())
+            for arg in required_args:
+                if arg not in ret_args:
+                    raise KeyError(f"Arg {arg} not found in reply!")
+        
+        if return_content:
+            ret['content'] = dict(res.choices[0].message).get('content','')
+        ret['tokens'] = res.usage.total_tokens
+        return ret
+    
+    def select_best_final_answer(self,query,final_answers:List[str])->int:
+        """Return the index in `final_answers` of the best answer.
+
+        When all answers hash equal a random index is returned without a
+        model call.
+
+        NOTE(review): the model is re-asked without limit until it returns an
+        in-range index - consider bounding the number of attempts.
+        """
+        hashed_ans = list(map(hash,final_answers))
+        all_same = all(item == hashed_ans[0] for item in hashed_ans[1:])
+        if all_same:
+            return random.choice(range(len(final_answers)))
+        while True:
+            selected = int(self.function_call('select_best_final_answer',{'query':query,'final_answers':final_answers})['best_answer_index'])
+            if selected<len(final_answers) and selected>=0:
+                break
+        return selected
+    def check_solve_query(self,query,final_answer:str)->bool:
+        """Ask the model whether `final_answer` solves `query`."""
+        return bool(self.function_call('check_solve_query',{'query':query,'final_answer':final_answer})['is_solved'])
+    
+    def compare_answer_details(self,answer:List)->int:
+        """Score each answer by its tool usage and return the index of the best one.
+
+        Score = 10 * succeed_tool_calling + 5 * used_tool_types
+        - 5 * ln(total_steps); an answer with total_steps <= 0 is penalised by
+        1e5 instead. Ties are broken at random.
+        """
+        parsed_answers = []
+        
+        for ans in answer:
+            parsed_ans = self.function_call('parse_answer_details',{'answer_details':ans['answer_details']})
+            parsed_ans['total_steps'] = ans['total_steps']
+            parsed_answers.append(parsed_ans)
+
+        # calculate score and return one with highest score
+        scores = []
+        for ans in parsed_answers:
+            score = 0
+            score += int(ans['succeed_tool_calling'])*10
+            score += int(ans['used_tool_types'])*5
+            # Convert once so that a numeric string works for the log as well.
+            total_steps = int(ans['total_steps'])
+            if total_steps<=0:
+                score -= int(1e5)
+            else:
+                score += -5*math.log(total_steps)
+            scores.append(score)
+        # return index of highest score
+        highest_score = max(scores)
+        highest_idx = [idx for idx,score in enumerate(scores) if score==highest_score]         
+        return random.choice(highest_idx)
+    
+    def normalized_openai_completions(self,task_description:Dict,answers:List[Dict[Any,Any]],task_status=None,answer_statuss=None)->int:
+        """Return the index in `answers` of the preferred answer.
+
+        Non-empty final answers beat empty ones and solved beat unsolved.
+        Among answers that all solve the query the fewest `total_steps` wins,
+        with remaining ties decided by `select_best_final_answer`. When all
+        answers are empty or all fail, `compare_answer_details` decides.
+
+        `task_status` and `answer_statuss` are accepted and ignored so that the
+        method can be used as `fn_completions`, which
+        `BaseEvaluator.annotate_preference` calls with four positional arguments.
+        """
+        
+        is_nonempty = [ans['final_answer']!='' for ans in answers]
+        all_nonempty = all(is_nonempty)
+        all_empty = not any(is_nonempty)
+        if all_nonempty:
+            is_solved = [
+                self.check_solve_query(task_description['query'],ans['final_answer'])
+                for ans in answers
+            ]
+            all_solved = all(is_solved)
+            all_failed = not any(is_solved)
+            
+            if all_solved:
+                steps = [int(ans['total_steps']) for ans in answers]
+                shortest_steps = min(steps)
+                ans_idxs = [idx for idx,step in enumerate(steps) if step==shortest_steps]
+                # return only one idx
+                if len(ans_idxs)>1:
+                    return ans_idxs[self.select_best_final_answer(
+                        task_description['query'],
+                        [answers[idx]['final_answer'] for idx in ans_idxs]
+                        )]
+                else:
+                    return ans_idxs[0]
+                
+            elif all_failed:
+                return self.compare_answer_details(answers)
+            else:
+                return random.choice([index for index,solve in enumerate(is_solved) if solve])
+        elif all_empty:
+            return self.compare_answer_details(answers)
+        else:
+            return random.choice([index for index,nonempty in enumerate(is_nonempty) if nonempty])
--- a/toolbench/tooleval/evaluators/registered_cls/utils.py
+++ b/toolbench/tooleval/evaluators/registered_cls/utils.py
@ -0,0 +1,85 @@
+import os
+import json
+from typing import List,Dict
+import requests
+from tenacity import retry, wait_random_exponential, stop_after_attempt
+
+import openai
+import random
+import time
+import tiktoken
+from openai_utils import *
+from arguments import parse_args
+# NOTE: the command line is parsed as a side effect of importing this module.
+args = parse_args()
+output_dir = args.output_dir
+from config import *
+# `api_type`, `api_key`, `api_version` and `api_base` come from `config`.
+if api_type == "azure":
+    from openai import AzureOpenAI as Client
+    # Azure deployments are addressed by endpoint plus API version.
+    client = Client(
+        api_key=api_key,
+        api_version=api_version,
+        azure_endpoint = api_base
+        )
+else:
+    from openai import OpenAI as Client
+    # The plain OpenAI client accepts neither `api_version` nor `azure_endpoint`;
+    # passing them raises TypeError. An empty `api_base` falls back to the default URL.
+    client = Client(
+        api_key=api_key,
+        base_url=api_base or None
+        )
+
+__registered_evaluators__ = {}
+
+def register_evaluator(cls):
+    """
+    Class decorator: record `cls` in the evaluator registry under its class
+    name and hand the class back unchanged.
+    """
+    __registered_evaluators__.update({cls.__name__: cls})
+    return cls
+
+def get_evaluator_cls(clsname):
+    """
+    Return the evaluator class registered under `clsname`.
+
+    Raises ModuleNotFoundError when no class with that name is registered.
+    """
+    try:
+        # Index instead of .get(): .get() never raises, so an unknown name
+        # used to come back silently as None.
+        return __registered_evaluators__[clsname]
+    except KeyError:
+        raise ModuleNotFoundError('Cannot find evaluator class {}'.format(clsname)) from None
+
+
+class OpenaiPoolRequest:
+    """Holds a pool of API credentials and sends chat-completion requests.
+
+    The pool is read from a JSON file (the `API_POOL_FILE` environment
+    variable overrides the `pool_json_file` argument) and extended with the
+    credentials found in `OPENAI_KEY` / `OPENAI_ORG` / `OPENAI_TYPE` /
+    `OPENAI_VER`. Requests currently go through `call_gpt` and do not use the
+    pool.
+    """
+    def __init__(self, pool_json_file=None):
+        self.pool:List[Dict] = []
+        pool_file = os.environ.get('API_POOL_FILE',None)
+        if pool_file is None:
+            pool_file = pool_json_file
+        # Guard against None: os.path.exists(None) raises TypeError.
+        if pool_file is not None and os.path.exists(pool_file):
+            with open(pool_file) as pool_handle:
+                self.pool = json.load(pool_handle)
+        # Always defined, whether or not a pool file was found.
+        self.now_pos = random.randint(-1, len(self.pool))
+        print(pool_file)
+        if os.environ.get('OPENAI_KEY',None) is not None:
+            self.pool.append({
+                'api_key':os.environ.get('OPENAI_KEY'),
+                'organization':os.environ.get('OPENAI_ORG',None),
+                'api_type':os.environ.get('OPENAI_TYPE',None),
+                'api_version':os.environ.get('OPENAI_VER',None)
+            })
+
+    def request(self,messages,**kwargs):
+        """Send `messages` through `call_gpt` and return its response.
+
+        Any `model` in `kwargs` is replaced by `model_name` from the config.
+        On `openai.APITimeoutError` the call sleeps 40 seconds and re-raises.
+        """
+        # Key rotation over self.pool is disabled; the configured model is always used.
+        kwargs['model'] = model_name
+
+        try:
+            response = call_gpt(messages, **kwargs)
+        except openai.APITimeoutError:
+            time.sleep(40)
+            # Bare raise keeps the original traceback.
+            raise
+        return response
+    
+    def __call__(self,messages,**kwargs):
+        """Alias for `request`."""
+        return self.request(messages,**kwargs)
+   
--- a/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml
+++ b/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/config.yaml
@ -0,0 +1,49 @@
+evaluator_name: "tooleval_gpt-3.5-turbo_default"
+registered_cls_name: "ReinforceToolLearningEvaluator"
+prompt_template: "template.txt"
+fn_completions: "normalized_openai_completions"
+apis_json: "your/path/to/api_pool.json"
+completions_kwargs:
+  model: "gpt-4-turbo"
+  max_tokens: 1000
+  temperature: 0.2
+  timeout: 10
+  functions:
+    - name: "check_answer_status"
+      description: "Parse the json answer with layered nodes and return the answer_status about the answer"
+      parameters:
+        type: "object"
+        properties:
+          answer_status:
+            type: "string"
+            enum: ["Unsure","Unsolved","Solved"]
+        required: ["answer_status"]
+    - name: "parse_answer_status"
+      description: "Parse the json answer with layered nodes and return the answer_status about the answer"
+      parameters:
+        type: "object"
+        properties:
+          answer_status:
+            type: "string"
+            enum: ["Unsure","Unsolved","Solved"]
+        required: ["answer_status"]
+    - name: "check_task_solvable"
+      description: "Parse the task description and return the task_status about the task"
+      parameters:
+        type: "object"
+        properties:
+          task_status:
+            type: "string"
+            enum: ["Unsure","Unsolvable","Solvable"]
+        required: ["task_status"]
+    - name: "select_better_answer"
+      description: "Select the better answer with a comprehensive investigation on given aspects. You should ignore the impact of the order of candidate answers."
+      parameters:
+        type: "object"
+        properties:
+          index:
+            type: "number"
+            description: "The `index` value in the selected better answer."
+        required: ["index"]
+fn_completion_parser: "index_parser"
+batch_size: 1
--- a/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt
+++ b/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_default/template.txt
@ -0,0 +1,80 @@
+<function>
+<name>check_answer_status</name>
+<description>
+Given the query and answer, you need to give the `answer_status` of the answer by following these rules:
+1. If the answer is a sorry message or not a positive/straight response for the given query, return "Unsolved".
+2. If the answer is a positive/straight response for the given query, you have to further check.
+2.1 If the answer is not sufficient to determine whether it solves the query or not, return "Unsure".
+2.2 If you are confident that the answer is sufficient to determine whether it solves the query or not, return "Solved" or "Unsolved".
+
+Query:
+{query}
+Answer:
+{answer}
+
+Now give your reason in "content" and `answer_status` of JSON to `check_answer_status`.
+</description>
+</function>
+
+<function>
+<name>parse_answer_status</name>
+<description>
+Given the query and the corresponding execution details of an answer, you need to give the `answer_status` of the answer by following these rules:
+1. If the messages of all 'tool' nodes indicate that errors happened, return "Unsolved"
+2. If you find the information in the "final_answer" is not true/valid according to the messages in 'tool' nodes, return "Unsolved"
+3. If you are unable to verify the authenticity and validity of the information, return "Unsure"
+4. If there are 'tool' nodes in the chain that contain successful function calls and those calls indeed solve the query, return "Solved"
+
+Query:
+{query}
+Answer:
+{answer}
+
+Now you are requested to give reason in "content" and `answer_status` of JSON to `parse_answer_status`.
+</description>
+</function>
+
+<function>
+<name>check_task_solvable</name>
+<description>
+Please check whether the given task is solvable by following these rules:
+1. If the `query` provides invalid information (e.g. invalid email address or phone number), return "Unsolvable"
+2. If the `query` needs more information to solve (e.g. the target restaurant name in a navigation task), return "Unsolvable"
+3. If you are unable to draw a conclusion, return "Unsure"
+4. If the currently `available_tools` are enough to solve the query, return "Solvable"
+
+Task:
+{task}
+
+Now give your reason in "content" and `task_status` of JSON to `check_task_solvable`.
+</description>
+</function>
+
+
+
+<function>
+<name>select_better_answer</name>
+<description>
+Query:
+{query}
+
+Answer_0:
+{answer_0}
+
+Answer_1:
+{answer_1}
+
+Given the above query and answers in JSON format, you must follow the rules to select the relatively better answer and give the index of the answer **(0 for Answer_0, 1 for Answer_1)**:
+1. Compare the value of "final_answer" in the following aspects:
+- Informative: whether it contains all necessary information to reply to the query.
+- Factuality: whether it accurately describes what has been done, and what failed in the end.
+- Reasoning: If the answer does not solve the query, whether it gives a detailed and accurate reason for the failure.
+2. If you cannot determine yet, compare the value of "answer_details" in the following aspects:
+- Tool calling costs: calculating the percentage of failed and repeated tool calls.
+- Running costs: calculating the total tokens T used in execution.
+- Milestone: calculating the milestones (fixed subtasks) reached in execution.
+- Exploration: whether it tries potentially useful tools in execution. Just count the number of successful tool calls with different tools/arguments in execution.
+
+If you have made your decision, call `select_better_answer`; otherwise, if you cannot determine, select a random answer.
+</description>
+</function>
--- a/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml
+++ b/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/config.yaml
@ -0,0 +1,24 @@
+evaluator_name: "tooleval_gpt-3.5-turbo_fn"
+registered_cls_name: "OpenAIEvaluator"
+prompt_template: "template.txt"
+fn_completions: "openai_completions"
+apis_json: "your/path/to/api_pool.json"
+completions_kwargs:
+  model: "gpt-4-turbo"
+  max_tokens: 100
+  temperature: 0
+  timeout: 10
+  function_call:
+    name: "choose_preference"
+  functions:
+    - name: "choose_preference"
+      description: "Choose the preferred answer for the query within all given answers."
+      parameters:
+        type: "object"
+        properties:
+          preference:
+            type: "number"
+            description: "The index of the preferred answer in all given answers."
+        required: [ "preference" ]
+fn_completion_parser: "index_parser"
+batch_size: 1
--- a/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt
+++ b/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_fn/template.txt
@ -0,0 +1,28 @@
+<message>
+<role>system</role>
+<content>You are a helpful annotator that helps users annotate data.</content>
+</message>
+<message>
+<role>user</role>
+<content>Given the task description and candidate answers, I want you to choose one preferred answer based on the rules. To do so, I will give you the task description that was given to the models, and the candidate answers in a list to choose from. To choose the one preferred answer, you need to first analyse the answers based on the rules, then give the index number of the preferred answer of JSON to `choose_preference`.
+
+Here are the preference rules:
+1. if both answers give a non-empty `final_answer`, check whether the given `final_answer` solves the given query.
+1.1 if both answers solve the query, choose the one with smaller `total_steps`.
+1.1.1 if `total_steps` are the same, choose the answer with better `final_answer` quality.
+1.2 if one answer solves the query while the other does not, choose the answer that solves the query.
+1.3 if both answers failed, check the `answer_details` to choose one, considering the following preferences:
+1.3.1 check `response` and prefer more successful tool calls.
+1.3.2 check `name` and prefer a wider variety of tools used.
+1.3.3 prefer smaller `total_steps`.
+2. if one gives a non-empty `final_answer` while the other does not, choose the one that gives a `final_answer`.
+3. if both failed to give a non-empty `final_answer`, follow 1.3 to choose the one with better `answer_details`.
+
+Here is the task description in JSON format:
+{task_description}
+
+Here are the candidate answers in JSON format:
+{answers}
+
+Now choose the preferred answer by analysing results and the rules given, return the index in range [0,1].</content>
+</message>
--- a/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml
+++ b/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/config.yaml
@ -0,0 +1,43 @@
+evaluator_name: "tooleval_gpt-3.5-turbo_normalized"
+registered_cls_name: "OpenAINormalizedEvaluator"
+prompt_template: "template.txt"
+fn_completions: "normalized_openai_completions"
+apis_json: "your/path/to/api_pool.json"
+completions_kwargs:
+  engine: "gpt-4-turbo"
+  max_tokens: 100
+  temperature: 0
+  timeout: 10
+  functions:
+    - name: "parse_answer_details"
+      description: "Parse the json answer with layered nodes and return the information about the answer"
+      parameters:
+        type: "object"
+        properties:
+          succeed_tool_calling:
+            type: "number"
+            description: "Give the number of times that the 'tool' nodes' message is called successfully without any errors in the response"
+          used_tool_types:
+            type: "number"
+            description: "Give the number of different 'name' in 'tool' nodes' message"
+        required: [ "succeed_tool_calling", "used_tool_types"]
+    - name: "select_best_final_answer"
+      description: "For given query, select the best answer in answers list and return the index of the best answer"
+      parameters:
+        type: "object"
+        properties:
+          best_answer_index:
+            type: "number"
+            description: "The index of the best answer in the answer list, start from 0"
+        required: [ "best_answer_index"]
+    - name: "check_solve_query"
+      description: "Check whether the given answer solve the given query, return true or false"
+      parameters:
+        type: "object"
+        properties:
+          is_solved:
+            type: "boolean"
+            description: "true if solved and false if not"
+        required: ["is_solved"]
+fn_completion_parser: "index_parser"
+batch_size: 1
--- a/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt
+++ b/toolbench/tooleval/evaluators/tooleval_gpt-3.5-turbo_normalized/template.txt
@ -0,0 +1,31 @@
+<function>
+<name>parse_answer_details</name>
+<description>
+Given the answer details in the following JSON format:
+{answer_details}
+
+I want you to parse the answer details and give the information of JSON to `parse_answer_details`. Now parse the answer.
+</description>
+</function>
+<function>
+<name>select_best_final_answer</name>
+<description>
+For query {query}, you have the following answers in JSON format:
+{final_answers}
+
+I want you to select the best answer from the above answers and give the index of the answer of JSON to `select_best_final_answer`. Now select the best answer.
+</description>
+</function>
+<function>
+<name>check_solve_query</name>
+<description>
+Please check whether the answer solves the query or not.
+Query:
+{query}
+
+Answer:
+{final_answer}
+
+Now give your judgment of JSON to `check_solve_query`, and remember not to be too strict.
+</description>
+</function>
--- a/toolbench/tooleval/evaluators_comparison.py
+++ b/toolbench/tooleval/evaluators_comparison.py
@ -0,0 +1,152 @@
+import pandas as pd
+import json
+from concurrent.futures import ThreadPoolExecutor,as_completed
+from tqdm import tqdm
+from evaluators import load_registered_automatic_evaluator
+import os 
+import numpy as np
+import copy
+from typing import List
+from scipy.stats import pearsonr,spearmanr
+import random
+random.seed(42)
+
+abs_dir = os.path.split(__file__)[0]
+annotated_data = json.load(open(os.path.join(abs_dir,'dataset/human_cross_annotated_data.json')))
+NUM_WORKERS=16
+
+def get_most_preferred(d:list)->np.ndarray:
+    if np.iterable(d):
+        d = np.asanyarray(d)
+        bins = np.bincount(d)
+        max_val = np.max(bins)
+        argmax = np.where(max_val==bins)[0]
+        return argmax
+    else:
+        return np.asarray([d])
+    
+def agreement_score(x,ref:list)->float:
+    majority_x = get_most_preferred(x)
+    majority_ref = get_most_preferred(ref)
+    score_unit = 1/len(majority_x)/len(majority_ref)
+    score = 0.0
+    for x in majority_x:
+        if x in majority_ref:
+            score += score_unit
+    return score
+def get_correlation(x,y):
+    x= np.asarray(x)
+    y = np.asarray(y)
+    x = x+1
+    y = y+1
+    if np.var(x)==0 or np.var(y)==0:
+        return float(random.choice(get_most_preferred(x))==random.choice(get_most_preferred(y)))
+    return pearsonr(x,y)[0]
+
+def test_on_annotated_data(evaluator_cfg)->List[List[int]]:
+    evaluators = [load_registered_automatic_evaluator(evaluator_cfg) for _ in range(NUM_WORKERS)]
+    def get_preference(idx):
+        data = annotated_data[idx]
+        def process_tools(tools:list):
+            for tool in tools:
+                tool.pop('description',None)
+                tool.pop('parameters',None)
+            return tools
+
+        tools = process_tools(data['available_tools'])
+        ret = evaluators[idx%NUM_WORKERS].annotate_preference(
+            data['query'],
+            tools,
+            data['answers'],multisample=True)
+        return idx,ret
+    prefer_dict = {}
+    with ThreadPoolExecutor(NUM_WORKERS) as pool:
+        # future = [pool.submit(get_preference,idx) for idx in range(100)]
+        future = [pool.submit(get_preference,idx) for idx in range(len(annotated_data))]
+        for thd in tqdm(as_completed(future),total=len(future),ncols=100):
+            if thd.exception() is not None:
+                pool.shutdown(cancel_futures=True)
+                raise thd.exception()
+                exit(-1)
+            idx,preference = thd.result()
+            prefer_dict[idx] = preference
+    prefer = [prefer_dict[idx] for idx in range(len(future))]
+    return prefer
+
+def get_popped_and_rest(d:list,index:int):
+    l = copy.deepcopy(d)
+    popped = l.pop(index)
+    return popped,l
+
+def calculate_human_performance():
+    human_agreement = []
+    variance = []
+    for data in annotated_data:
+        agreement_scores = [
+            agreement_score(*get_popped_and_rest(data['preference'],idx))
+            for idx in range(len(data['preference']))
+        ]
+        human_agreement.append(np.mean(agreement_scores))
+        variance.append(np.var([1-agreement_scores[idx] for idx in range(len(agreement_scores))]))
+        
+            
+    return {
+        'human_agreement':np.mean(human_agreement),
+        'bias':0,
+        'variance':np.mean(variance)
+    }
+
+        
+    
+def calculate_evaluator_performance(evaluator_preference,human_preference):
+    human_agreement = []
+    bias = []
+    variance = []
+    assert len(evaluator_preference)==len(human_preference),'length of evaluator_preference and human_preference should be the same!'
+    correlation = []
+    for idx in range(len(evaluator_preference)):
+        human_pref = human_preference[idx]
+        evaluator_pref = evaluator_preference[idx]
+        
+        human_agreement.append([
+            agreement_score(pref,human_pref) for pref in evaluator_pref
+        ])
+        bias.append(
+            1 - agreement_score(human_pref,evaluator_pref)
+        )
+        variance.append(
+            np.var([1-score for score in human_agreement[-1]])
+        )
+        correlation.append(get_correlation(human_pref,evaluator_pref))
+        
+    return{
+        'correlation': np.mean(correlation),
+        'human_agreement':np.mean(np.mean(human_agreement,axis=1)),
+        'bias':np.mean(bias),
+        'variance':np.mean(variance)
+    }
+    
+if __name__=='__main__':
+    evaluators = ['tooleval_gpt-3.5-turbo_normalized',]
+    human_perference = [
+        data['preference'] for data in annotated_data
+    ]
+    
+    evaluator_performance = [calculate_human_performance()]
+    for evaluator in evaluators:
+        if not os.path.exists(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy')):
+            evaluator_cfg = {
+                'evaluators_cfg_path':os.path.join(abs_dir,'evaluators'),
+                'evaluator':evaluator
+            }
+            evaluator_perference = test_on_annotated_data(evaluator_cfg)
+            np.save(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),evaluator_perference)
+        
+        evaluator_perference = np.load(os.path.join(abs_dir,'dataset',f'performance_{evaluator}.npy'),allow_pickle=True)
+        performance = calculate_evaluator_performance(evaluator_perference,human_perference)
+        print(performance)
+        evaluator_performance.append(performance)
+    
+    df = pd.DataFrame(evaluator_performance,index=['human']+evaluators)
+    df.to_csv(os.path.join(abs_dir,'dataset','evaluator_performance.csv'))
+    print(df)
--- a/toolbench/tooleval/pass_rate.py
+++ b/toolbench/tooleval/pass_rate.py
@ -0,0 +1,339 @@
+import os
+import re
+import json
+import numpy as np
+import sys
+import argparse
+
+# Command-line interface: the only option is the directory holding answer files.
+parser = argparse.ArgumentParser()
+parser.add_argument('--answer_dir',type=str, required=True,help='where the answers stored.')
+
+if __name__=='__main__':
+    args = parser.parse_args()
+    input_dir = args.answer_dir
+    # NOTE(review): test_count is not referenced anywhere in the visible script.
+    test_count = "100"
+    # method name -> dict of accumulated statistics for that method.
+    method2result = {}
+    # method name -> list of query indices parsed from the file names.
+    method2querycount = {}
+
+    def get_size(node):
+        size = 1
+        if len(node["children"]) == 0:
+            node["size"] = size
+            return size
+        else:
+            for child in node["children"]:
+                size += get_size(child)
+            node["size"] = size
+            return size
+    def get_leaf_node_count(node):
+        '''
+        返回值：叶子节点数，最大Elo积分，总子节点树，Thought节点数, 选择最左数量，选择后几个数量
+        '''
+        thought_count = (1 if node["node_type"] == "Thought" else 0)
+        if len(node["children"]) == 0:
+            return (1 if node["expand_num"] != 0 else 0), node["Elo"], 1, thought_count
+        else:
+            result = 0
+            max_elo = -1e7
+            node_count = 1
+            for child in node["children"]:
+                child_left_node_count, child_max_elo, child_node_count, child_thought_count = get_leaf_node_count(child)
+                result += child_left_node_count
+                node_count += child_node_count
+                thought_count += child_thought_count
+                max_elo = max(max_elo,child_max_elo)
+            return result, max_elo, node_count, thought_count
+
+    def recursive_get_error_code(obj):
+        result = []
+        if type(obj) == dict:
+            for key,value in obj.items():
+                if key == "observation_code":
+                    assert type(value) == int
+                    # assert "observation" in obj.keys()
+                    if "observation" in obj.keys() and "html" in str(obj["observation"]).lower():
+                        result = result + ["html"]
+                    else:
+                        result = result + [value]
+
+                    # if value == -1:
+                    #     print(obj["description"])
+
+                elif key == "description":
+                    if "OpenAI service is unavailable" in value:
+                        result = result + ["openai"]
+                        # print("hello")
+                else:
+                    # print(f"in {key}")
+                    result = result + recursive_get_error_code(value)
+        elif type(obj) == list:
+            for cont in obj:
+                result = result + recursive_get_error_code(cont)
+        return result
+
+
+    def check_real_valid(string):
+        if not isinstance(string, str):
+            string = str(string)
+        fake_true_vocab = ["sorry","apologize","apology","unfortunately","couldn't"]
+        for word in fake_true_vocab:
+            if word in string.lower():
+                return False
+        return True
+
+    for file in os.listdir(input_dir):
+        if "result" in file:
+            continue
+        pattern = r"(\d+)_([^_]+)_(.+)\.json"
+        re_result = re.match(pattern,file)
+        if re_result == None or "DFS" in (re_result.group(3) + "_" + re_result.group(2)):
+            pattern2 = r"(\d+)_(.+)\.json"
+            re_result = re.match(pattern2,file)
+            idx = re_result.group(1)
+            method = re_result.group(2)
+        else:
+            idx = int(re_result.group(1))
+            method = re_result.group(3) + "_" + re_result.group(2)
+
+        if method2result.get(method,-1) == -1:
+            method2result[method] = {
+                "total_count": 0,
+                "pass_at_acc": [0,0.0],
+                "best_answer_acc": [0,0.0],
+                "best_answer_is_real_valid": [],
+                "query_count": [],
+                "average_token_usage": [],
+                "fake_valid": [0,0.0],
+                "thought_node_rate": [],
+                "give_answer_rate": [],
+                "root/max_Elo": [],
+                "valid_per_data": [],
+                "vote_to_the_first_node": [],
+                "hallucination_name": [0,0.0],
+                "hallucination_name_error": [0,0.0],
+                "valid_observation_count": [],
+                "valid_answer_count": [],
+                "leaf_node_count": [],
+                "max_query_count_stopping": [0,0.0],
+                "html_in_response": [0,0.0],
+                "html_in_response_error": [0,0.0],
+                "openai_llm_bug": [0,0.0],
+                "give_up_and_restart": [0,0.0],
+                "\"error\" in response": [0,0.0],
+                "API not working error":[0,0],
+                "unauthorized_error":[0, 0.0],
+                "unsubscribed_error":[0, 0.0],
+                "too many requests error": [0,0.0],
+                "rate limit per minute error": [0, 0.0],
+                "message error":[0,0.0],
+                "request invalid data error": [0,0],
+                "other_error": [0,0.0],
+                "connection_timeout": [0,0.0],
+            }
+        if method2querycount.get(method,-1) == -1:
+            method2querycount[method] = []
+        method2querycount[method].append(idx)
+        
+        reader =  open(os.path.join(input_dir,file),"r")
+        try:
+            json_data = json.load(reader)
+        except:
+            print(file)
+            reader.close()
+            continue
+        reader.close()
+
+        json_data["answer_generation"]["finish_type"] = "give_answer"
+        if "CoT" in method or "Reflexion" in method:
+            flatten_error_codes = recursive_get_error_code(json_data["trys"]) 
+        else:
+            flatten_error_codes = recursive_get_error_code(json_data["tree"])  
+            get_size(json_data["tree"]["tree"])
+
+        if -1 in flatten_error_codes:
+            os.remove(os.path.join(input_dir,file))
+            continue
+
+        method2result[method]["total_count"] += 1
+        method2result[method]["query_count"].append(json_data["answer_generation"]["query_count"]) #
+        if "total_tokens" in json_data["answer_generation"].keys():
+            method2result[method]["average_token_usage"].append(json_data["answer_generation"]["total_tokens"]) #
+
+
+        if "CoT" in method or "Reflexion" in method:
+            method2result[method]["leaf_node_count"].append(json_data["try_count"])
+        else:
+            leaf_node_count, max_elo, node_count, thought_count = get_leaf_node_count(json_data["tree"]["tree"])
+            method2result[method]["leaf_node_count"].append(leaf_node_count)
+            method2result[method]["thought_node_rate"].append(thought_count/node_count)
+            # assert json_data["tree"]["tree"]["Elo"] >= 0, os.path.join(input_dir,file)
+            if max_elo > 0:
+                method2result[method]["root/max_Elo"].append(max_elo)
+            else:
+                method2result[method]["root/max_Elo"].append( max_elo)
+        if 13 in flatten_error_codes:
+            # print(13)
+            method2result[method]["connection_timeout"][0] += 1
+        if 6 in flatten_error_codes:
+            method2result[method]["API not working error"][0] += 1
+        if 7 in flatten_error_codes:
+            method2result[method]["unauthorized_error"][0] += 1
+        if 8 in flatten_error_codes:
+            method2result[method]["unsubscribed_error"][0] += 1
+        if 9 in flatten_error_codes:
+            method2result[method]["too many requests error"][0] += 1
+        if 10 in flatten_error_codes:
+            method2result[method]["rate limit per minute error"][0] += 1
+        if 11 in flatten_error_codes:
+            method2result[method]["message error"][0] += 1
+        if 12 in flatten_error_codes:
+            method2result[method]["request invalid data error"][0] += 1
+            
+        
+
+        if "html" in flatten_error_codes: #html
+            method2result[method]["html_in_response"][0] += 1
+        if 1 in flatten_error_codes:
+            method2result[method]["hallucination_name"][0] += 1
+        if len(json_data["compare_candidates"]) > 0:
+            method2result[method]["valid_answer_count"].append(len(json_data["compare_candidates"])) #
+        
+        if json_data["answer_generation"]["valid_data"] == True:
+            if json_data["answer_generation"]["finish_type"] == "give_answer":
+                method2result[method]["give_answer_rate"].append(1) #
+            else:
+                method2result[method]["give_answer_rate"].append(0) #
+
+        valid = len(json_data["compare_candidates"]) > 0
+        real_valid = False
+        best_answer_real_valid = False
+
+        for instance in json_data["compare_candidates"]: #只要有一个valid answer就算真阳
+            assert instance[-1]["node_type"] == "Action Input", file
+            real_valid = check_real_valid(instance[-1]["description"]) or real_valid #只要一个过，就算过
+        
+        if len(json_data["compare_candidates"]) > 0:
+            best_id = -1
+            max_elo = -1e7
+            for k,cont in enumerate(json_data["compare_candidates"]):
+                if cont[-1]["Elo"] > max_elo:
+                    best_id = k
+                    max_elo = cont[-1]["Elo"]
+
+            best_answer_real_valid = check_real_valid(json_data["compare_candidates"][best_id][-1]["description"])
+        if "ETS" in method:
+            method2result[method]["valid_per_data"].append( 1 if best_answer_real_valid else 0 )
+        else:
+            method2result[method]["valid_per_data"].append( 1 if best_answer_real_valid else 0 )
+        
+        if best_answer_real_valid:
+            method2result[method]["best_answer_acc"][0] += 1
+
+        if valid and real_valid:
+            method2result[method]["pass_at_acc"][0] += 1
+
+            
+            if json_data["answer_generation"]["valid_data"]:
+                observation_length = 0
+                for temp_node in json_data["answer_generation"]["train_messages"]:
+                    assert temp_node[-1]["role"] == "assistant"
+                    if "function_call" in temp_node[-1].keys():
+                        observation_length += 1
+                method2result[method]["valid_observation_count"].append(observation_length) #
+
+        else: #生成失败
+            if valid: # 假阳
+                method2result[method]["fake_valid"][0] += 1
+            # else:
+            #     print('#'*100, file=open('output.txt','a'))
+            #     print(file, file=open('output.txt','a'))
+
+            # print(flatten_error_codes)
+            if 1 in flatten_error_codes:
+                method2result[method]["hallucination_name_error"][0] += 1
+
+            if "forward_args" in json_data.keys() and "max_query_count" in json_data["forward_args"].keys() and json_data["forward_args"]["max_query_count"] <= json_data["answer_generation"]["query_count"]:
+                method2result[method]["max_query_count_stopping"][0] += 1
+            
+            '''
+            按错误的严重程度逐级判断
+            '''
+            if "html" in flatten_error_codes: #html
+                method2result[method]["html_in_response_error"][0] += 1
+            
+            if -1 in flatten_error_codes: #接口挂了
+                method2result[method]["openai_llm_bug"][0] += 1
+            elif 4 in flatten_error_codes: #html
+                method2result[method]["give_up_and_restart"][0] += 1
+            elif 11 in flatten_error_codes: #error in message
+                method2result[method]["\"error\" in response"][0] += 1
+            else:
+                method2result[method]["other_error"][0] += 1
+
+        if valid and real_valid:
+            method2result[method]["best_answer_is_real_valid"].append(1 if best_answer_real_valid else 0)
+
+    for method in method2result.keys():
+        for key,value in method2result[method].items():
+            if key in ["valid_observation_count","query_count","leaf_node_count","thought_node_rate","valid_answer_count","average_token_usage","give_answer_rate","best_answer_is_real_valid","vote_to_the_first_node"]:
+                method2result[method][key] = f"{np.mean(np.array(method2result[method][key])):.02f}"
+            elif type(value) == list and len(value) == 2:
+                method2result[method][key][1] = f"{method2result[method][key][0]*100 / method2result[method]['total_count']:.2f}\%"
+
+    def classify_N(xs,yss,N):
+        zip_value = list(zip(xs,yss[0]))
+        zip_value.sort(key = lambda x: x[0])
+
+        threshold =  []
+        for i in range(N):
+            threshold.append(zip_value[min(((i+1)*len(xs))//(N),len(zip_value)-1)][0])
+
+        bucket = [[] for _ in range(N)]
+        for cont in bucket:
+            for i in range(len(yss)):
+                cont.append([])
+        for k,ys in enumerate(yss):
+            for x,y in zip(xs,ys):
+                for i in range(N):
+                    if x < threshold[i]:
+                        bucket[i][k].append(y)
+                        break
+        for i in range(len(bucket)):
+            for k in range(len(bucket[i])):
+                bucket[i][k] = np.mean(np.array(bucket[i][k]))
+        return bucket
+
+    def print_table(table):
+        methods = list((table.keys()))
+        methods.sort()
+        column_names =  ["method"]+list(table[methods[0]].keys())
+        for key in table.keys():
+            table[key]["method"] = key
+
+        key_length = {}
+        for key in column_names:
+            if key in ["root/max_Elo","valid_per_data"]:
+                continue
+            now_max = len(key)
+            for method in methods:
+                now_max = max(now_max, len(str(table[method][key])))
+            key_length[key] = now_max
+        
+        for key in column_names:
+            if key in ["root/max_Elo","valid_per_data"]:
+                continue
+            # print(key,end=" "*(key_length[key]- len(key))+"|")    
+
+        mode = input_dir[len(input_dir[::-1][input_dir[::-1].find("/"):][::-1]):]
+    
+        for cnt, method in enumerate(methods):
+            for cnt_key, key in enumerate(column_names):
+                if key in ["root/max_Elo","valid_per_data"]:
+                    continue
+                if cnt == 0 and cnt_key == 0:
+                    print(mode + "|" + str(table[method][key]),end=" "*(key_length[key]- len(str(table[method][key])))+"|")    
+                else:
+                    print(key + ' ' + str(table[method][key]),end=" "*(key_length[key]- len(str(table[method][key])))+"|")    
+            print("")
+    print('timeout:', method2result[method]['connection_timeout'][1])
+    print_table(method2result)
--- a/toolbench/tooleval/requirements.txt
+++ b/toolbench/tooleval/requirements.txt
@ -0,0 +1,7 @@
+tqdm
+numpy
+pandas
+pydantic
+tenacity
+openai
+pyyaml
--- a/toolbench/tooleval/run_convert_answer.sh
+++ b/toolbench/tooleval/run_convert_answer.sh
@ -0,0 +1,16 @@
+# Convert the raw predictions of MODEL_NAME with convert_to_answer_format.py,
+# writing one JSON file per test set under CONVERTED_ANSWER_PATH/MODEL_NAME.
+export RAW_ANSWER_PATH=../../data/reproduction_data/model_predictions/
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export MODEL_NAME=chatgpt_cot
+export METHOD=CoT
+# -p creates missing parent directories and does not fail when the script is re-run.
+mkdir -p "${CONVERTED_ANSWER_PATH}/${MODEL_NAME}"
+
+for test_set in G1_instruction G1_category G1_tool G2_category G2_instruction G3_instruction
+do
+    answer_dir="${RAW_ANSWER_PATH}/${MODEL_NAME}/${test_set}"
+    output_file="${CONVERTED_ANSWER_PATH}/${MODEL_NAME}/${test_set}.json"
+
+    python convert_to_answer_format.py \
+        --answer_dir "${answer_dir}" \
+        --method "${METHOD}" \
+        --output "${output_file}"
+done
--- a/toolbench/tooleval/run_pass_rate.sh
+++ b/toolbench/tooleval/run_pass_rate.sh
@ -0,0 +1,19 @@
+# export CONVERTED_ANSWER_PATH=../../result2/test_instruction/
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted
+export SAVE_PATH=../../output/pass_rate_results
+# export CANDIDATE_MODEL=gpt-4-turbo_dfs
+# export CANDIDATE_MODEL=reassign_turbo
+export CANDIDATE_MODEL=chatgpt_dfs
+# export CANDIDATE_MODEL=toolllama_dfs_retriever
+# export CANDIDATE_MODEL=gpt-4-0613_dfs
+export API_POOL_FILE=../../openai_key.json
+for i in 1
+do
+python eval_pass_rate.py \
+    --converted_answer_path ${CONVERTED_ANSWER_PATH} \
+    --save_path ${SAVE_PATH} \
+    --reference_model ${CANDIDATE_MODEL} \
+    --test_ids ../../data/test_query_ids/ \
+    --max_eval_threads 10 \
+    --evaluate_times 7
+done
--- a/toolbench/tooleval/run_preference.sh
+++ b/toolbench/tooleval/run_preference.sh
@ -0,0 +1,17 @@
+export CONVERTED_ANSWER_PATH=../../data/reproduction_data/model_predictions_converted/
+export SAVE_PATH=output/preference_results
+export PASS_TARE_PATH=output/pass_rate_results
+export REFERENCE_MODEL=chatgpt_cot
+export CANDIDATE_MODEL=gpt4_dfs_find_api
+export API_POOL_FILE=../../openai_key.json
+
+python eval_preference.py \
+    --converted_answer_path ${CONVERTED_ANSWER_PATH} \
+    --reference_model ${REFERENCE_MODEL} \
+    --output_model ${CANDIDATE_MODEL} \
+    --test_ids ../../data/test_query_ids/ \
+    --save_path ${SAVE_PATH} \
+    --pass_rate_result_path ${PASS_TARE_PATH} \
+    --max_eval_threads 20 \
+    --use_pass_rate true \
+    --evaluate_times 7
--- a/toolbench/tooleval/utils.py
+++ b/toolbench/tooleval/utils.py
@ -0,0 +1,199 @@
+"""
+Utils for tooleval.
+"""
+from toolbench.tooleval.evaluation import ExecutionGraph,ExecutionNode
+import random
+random.seed(42)
+from toolbench.tooleval.evaluators.registered_cls.rtl import AnswerStatus, TaskStatus
+
+# Lookup from the text "TaskStatus.<member>" to the TaskStatus member itself.
+# NOTE(review): presumably used to parse statuses that were stored as text - confirm against callers.
+task_status_mapping = {
+    "TaskStatus.Solvable": TaskStatus.Solvable,
+    "TaskStatus.Unsolvable": TaskStatus.Unsolvable,
+    "TaskStatus.Unsure": TaskStatus.Unsure
+}
+# Same kind of lookup for AnswerStatus members.
+answer_status_mapping = {
+    "AnswerStatus.Solved": AnswerStatus.Solved,
+    "AnswerStatus.Unsolved": AnswerStatus.Unsolved,
+    "AnswerStatus.Unsure": AnswerStatus.Unsure
+}
+# Names of the test sets; the commented-out line is a single-set alternative.
+# test_sets = ["G1_category"]
+test_sets = ["G1_instruction", "G1_category", "G1_tool", "G2_instruction", "G2_category", "G3_instruction"]
+
+def get_steps(example):
+    answer_details = example["answer"]["answer_details"][0]
+    answer_steps = []
+    step_cnt = 1
+    final_step = ""
+
+    while "next" in answer_details:
+        answer_str = answer_details["message"]
+        role_str = answer_details["role"]
+
+        if answer_str and role_str == "tool":
+            step_text = f"Step {step_cnt}: {answer_str}"
+            answer_steps.append(step_text)
+            final_step = f"Final step: {answer_str}"
+            step_cnt += 1
+
+        if not answer_details["next"]:
+            break
+
+        answer_details = answer_details["next"][0]
+
+    return "\n".join(answer_steps), final_step
+
+
+def generate_init_message_node(eg:ExecutionGraph,functions,query):
+    init_node = ExecutionNode(role='system', message="You are AutoGPT, you can use many tools(functions) to do the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, say \"I give up and restart\".\n2.All the thought is short, at most in 5 sentence.\n3.You can do more then one trys, so if your plan is to continusly try some conditions, you can do one of the conditions per try.\nLet's Begin!\nTask description: You should use functions to help handle the real time user querys. Remember to ALWAYS call \"Finish\" function at the end of the task. And the final answer should contain enough information to show to the user.\nSpecifically, you have access to the following functions: " + str(functions))
+    eg.set_init_node(init_node)
+    
+    node = ExecutionNode(role='user', message=query)
+    eg.add_node(node)
+    eg[init_node,node] = None
+    return node
+
+def process_valid_data(method,answer_generation):
+    conversation = answer_generation['train_messages'][-1]
+    functions = answer_generation['function']
+    query = answer_generation['query']
+    eg = ExecutionGraph()
+    last_node = generate_init_message_node(eg,functions,query)
+    
+    index = 2
+    while index < len(conversation):
+        message = conversation[index]
+        role = message['role']
+        if role == 'system' or role == 'user' or role == 'function':
+            index = index + 1
+            continue
+        elif role == 'assistant':
+            if 'function_call' in message :
+                node = ExecutionNode(role='tool', message={
+                    'name':message['function_call']['name'],
+                    'arguments':message['function_call']['arguments'],
+                    'response':conversation[index+1]['content'] if message['function_call']['name']!='Finish' else ''
+                    })
+                index = index + 1
+            else:
+                node = ExecutionNode(role='assistant',
+                                        message=message['content'])
+                
+        else:
+            raise NotImplementedError(f'Unkown role {role}')
+        
+        index = index + 1
+        eg.add_node(node)
+        eg[last_node,node] = None
+        last_node = node
+    
+    eg = eg.reduce_graph_to_sequence()
+    
+    return {
+        'query':query,
+        'available_tools':functions,
+        'answer':{
+            'method':method,
+            'total_steps': eg.node_count,
+            'final_answer': answer_generation['final_answer'],
+            'answer_details': eg.convert_to_dict()
+        }
+    }
+
+def process_invalid_data(method,data_dict):
+    """Build the evaluation answer dict from raw search data.
+
+    Reads ``data_dict['answer_generation']`` (``function``, ``query``,
+    ``final_answer``) and, depending on ``method``:
+
+    * ``'CoT' in method``: one randomly chosen entry of ``data_dict['trys']``,
+      whose ``'chain'`` list is converted node by node.
+    * ``'DFS' in method``: the tree under ``data_dict['tree']['tree']['children']``.
+
+    Returns a dict with ``query``, ``available_tools`` and ``answer`` (method,
+    total_steps, final_answer, answer_details).
+
+    Raises:
+        NotImplementedError: unknown method, or unknown ``node_type`` in a CoT chain.
+        Exception: unknown node role met while purifying the DFS graph.
+
+    Note: the DFS branch mutates ``data_dict`` (``children`` of every non-leaf
+    tree node is set to ``None``).
+    """
+    answer_generation = data_dict['answer_generation']
+    functions = answer_generation['function']
+    query = answer_generation['query']
+    eg = ExecutionGraph()
+    last_node = generate_init_message_node(eg,functions,query)
+    if 'CoT' in method:
+        # Pick one attempt at random (the module seeds `random` with 42 on import).
+        trail = random.choice(data_dict["trys"])
+        index = 0
+        while index < len(trail['chain']):
+            message = trail['chain'][index]
+            if message['node_type'] == 'Action':
+                # An Action is paired with the chain entry right after it, which
+                # supplies the arguments ('description') and the 'observation'.
+                node = ExecutionNode(role='tool', message={
+                    'name':message['description'],
+                    'arguments':(trail['chain'][index+1]['description']),
+                    'response':(trail['chain'][index+1]['observation'])})
+            
+                # Skip the paired entry that was just consumed.
+                index = index + 1
+            elif message['node_type'] == 'Thought':
+                node = ExecutionNode(role='assistant',
+                                        message=message['description'])
+            else:
+                raise NotImplementedError(f"Unknown node_type: {message['node_type']}")
+            index = index + 1
+
+            # Append the node to the linear chain.
+            eg.add_node(node)
+            eg[last_node,node] = None
+            last_node = node
+        eg = eg.reduce_graph_to_sequence()
+   
+    elif 'DFS' in method:
+
+        def DFS(root):
+            # Recursively add `root` and its descendants to `eg`; returns the
+            # ExecutionNode created for `root`. Children are added before their parent.
+            if len(root['children']) == 0:
+                node = ExecutionNode(role=root['node_type'],message=root)
+                eg.add_node(node)
+                return node
+            else:
+                child_nodes = [DFS(node) for node in root['children']]
+                # Drop the nested children from the dict that becomes the node
+                # message (this mutates the input tree).
+                root['children'] = None
+                root_node = ExecutionNode(role=root['node_type'],message=root)
+                eg.add_node(root_node)
+                for child_node in child_nodes:
+                    eg.add_edge(root_node,child_node)
+                return root_node
+        # Hang every top-level subtree below the user-query node.
+        for node in data_dict['tree']['tree']['children']:
+            eg[last_node,DFS(node)] = None
+
+        
+        # purify the graph
+        def purify_graph(node:ExecutionNode):
+            # Rewrite raw tree roles in place, then recurse into adjacent nodes:
+            # 'Action' + adjacent 'Action Input' -> a single 'tool' node,
+            # 'Thought' -> 'assistant'.
+            if node.role == 'Action':
+                adj_nodes = eg.get_adjacent_node(node)
+                for adj_node in adj_nodes:
+                    # NOTE(review): get_adjacent_node presumably yields node ids
+                    # that eg[...] resolves to nodes - confirm against ExecutionGraph.
+                    adj_node = eg[adj_node]
+                    if adj_node.role == 'Action Input':
+                        node.role = 'tool'
+                        node.message = {
+                            'name':node.message['description'],
+                            'arguments':(adj_node.message['description']),
+                            'response':(adj_node.message['observation'])
+                            
+                        }
+                        # remove adj_node
+                        adj_node = eg.pop_node(adj_node)
+                        # Re-attach the removed node's outgoing edges to `node`
+                        # and drop the edge that pointed at the removed node.
+                        to_nodes = eg.edges.pop(adj_node.node_id,{})
+                        eg.edges[node.node_id].update(to_nodes)
+                        eg.edges[node.node_id].pop(adj_node.node_id)
+                        node.out_degree += len(to_nodes)
+                        # Only the first 'Action Input' neighbour is merged.
+                        break
+            elif node.role == 'Thought':
+                node.role = 'assistant'
+                node.message = node.message['description']
+            elif node.role == 'Action Input':
+                # An 'Action Input' that was not merged into an 'Action'; it is
+                # reported and left unchanged.
+                print('Founding Extra Action Input Node')
+                pass
+            elif node.role =='system' or node.role=='user':
+                pass
+            else:
+                raise Exception('Unknown role {}'.format(node.role))
+            # Adjacency is re-read here because the merge above may have changed it.
+            adj_nodes = eg.get_adjacent_node(node)
+            for adj_node in adj_nodes:
+                purify_graph(eg[adj_node])
+            
+        purify_graph(last_node)
+        eg = eg.reduce_graph_to_sequence()
+    else:
+        raise NotImplementedError(f'Unknown method {method}')
+    return {
+        'query':query,
+        'available_tools':functions,
+        'answer':{
+            'method':method,
+            'total_steps': eg.node_count,
+            'final_answer': answer_generation['final_answer'],
+            'answer_details': eg.convert_to_dict()
+        }
+    }
--- a/toolbench/train/llama_condense_monkey_patch.py
+++ b/toolbench/train/llama_condense_monkey_patch.py
@ -0,0 +1,44 @@
+# code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
+import torch
+import transformers
+import transformers.models.llama.modeling_llama
+
+from functools import partial
+
+class CondenseRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, ratio, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        
+        # Build here to make `torch.jit.trace` work.
+        self.ratio = ratio
+        max_position_embeddings *= ratio
+        print(f"Condensing Positional embeddings from {max_position_embeddings} to {max_position_embeddings // ratio}")
+        self.max_seq_len_cached = max_position_embeddings
+        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / ratio
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        dtype = torch.get_default_dtype()
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.ratio
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False)
+            self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+
+def replace_llama_with_condense(ratio):
+    transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, ratio=ratio)
--- a/toolbench/train/llama_flash_attn_monkey_patch.py
+++ b/toolbench/train/llama_flash_attn_monkey_patch.py
@ -0,0 +1,76 @@
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+
+import transformers
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+import torch
+from torch import nn
+import torch.nn.functional as F
+import math
+import transformers
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+    
+
+def forward_2(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+    assert not output_attentions, "output_attentions is not supported"
+    assert not use_cache, "use_cache is not supported"
+    assert past_key_value is None, "past_key_value is not supported"
+
+
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    past_key_value = (key_states, value_states) if use_cache else None
+    attn_output= F.scaled_dot_product_attention(query_states,key_states,value_states,dropout_p=0.0, is_causal=True)
+    attn_weights = None
+    
+    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+        raise ValueError(
+            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+            f" {attn_output.size()}"
+        )
+
+    attn_output = attn_output.transpose(1, 2)
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+def _prepare_decoder_attention_mask(self, attention_mask, input_shape,
+                                    inputs_embeds, past_key_values_length):
+    # [bsz, seq_len]
+    return attention_mask
+
+def replace_llama_attn_with_flash_attn():
+    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
+    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_2
+    
--- a/Show more
+++ b/Show more