AnyTool/toolbench/inference/LLM/llama_model.py
2024-02-23 15:13:06 +08:00

135 lines
No EOL
6.8 KiB
Python

#!/usr/bin/env python
# coding=utf-8
from typing import Optional, List, Mapping, Any
from transformers import AutoTokenizer, AutoModelForCausalLM
from termcolor import colored
import time
from typing import Optional
from transformers import (
AutoTokenizer,
AutoModelForCausalLM
)
from toolbench.utils import process_system_message
from toolbench.model.model_adapter import get_conversation_template
from toolbench.inference.utils import SimpleChatIO, generate_stream, react_parser
class LlamaModel:
    """Wrap a Hugging Face causal LM so it can be driven with chat-style messages.

    The instance keeps a conversation history (a list of ``{"role", "content"}``
    dicts), renders it to a plain-text prompt using the roles of the selected
    conversation template, generates a completion, and parses the completion
    into a thought plus a function call via ``react_parser``.
    """

    def __init__(self, model_name_or_path: str, template: str = "tool-llama-single-round", device: str = "cuda", cpu_offloading: bool = False, max_sequence_length: int = 2048) -> None:
        """Load the tokenizer and model and prepare an empty conversation.

        Args:
            model_name_or_path: Identifier or path forwarded to ``from_pretrained``.
            template: Conversation template name; ``parse`` supports
                ``"tool-llama"``, ``"tool-llama-single-round"`` and
                ``"tool-llama-multi-rounds"``.
            device: Device used for generation. The model is moved to it when it
                is ``"mps"``, or ``"cuda"`` without ``cpu_offloading``.
            cpu_offloading: When true, a ``"cuda"`` model is not moved to the GPU here.
            max_sequence_length: Used as the tokenizer's ``model_max_length`` and
                passed to the generation helper.
        """
        super().__init__()
        self.model_name = model_name_or_path
        self.template = template
        self.max_sequence_length = max_sequence_length
        # Remembered so that generation targets the device the caller asked for.
        self.device = device
        # Start with an empty history so add_message() works before change_messages().
        self.conversation_history = []
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            use_fast=False,
            model_max_length=self.max_sequence_length,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path, low_cpu_mem_usage=True
        )
        if self.tokenizer.pad_token_id is None:
            # The tokenizer has no pad token: register explicit bos/eos/pad tokens
            # and grow the embedding matrix to match the new vocabulary size.
            self.tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>"})
            self.model.resize_token_embeddings(len(self.tokenizer))
        self.use_gpu = device == "cuda"
        if (device == "cuda" and not cpu_offloading) or device == "mps":
            self.model.to(device)
        self.chatio = SimpleChatIO()

    def prediction(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Generate a completion for ``prompt`` and return it stripped of whitespace.

        Args:
            prompt: Full text prompt handed to the generation helper.
            stop: Accepted for interface compatibility but not used; the stop
                string sent to the generator is always ``"</s>"``.

        Returns:
            The generated text with leading/trailing whitespace removed.
        """
        gen_params = {
            "model": "",
            "prompt": prompt,
            "temperature": 0.5,
            "max_new_tokens": 512,
            "stop": "</s>",
            "stop_token_ids": None,
            "echo": False
        }
        # Use the device chosen at construction time instead of assuming CUDA.
        # getattr keeps instances created without __init__ on the old default.
        device = getattr(self, "device", "cuda")
        output_stream = generate_stream(
            self.model,
            self.tokenizer,
            gen_params,
            device,
            self.max_sequence_length,
            force_generate=True,
        )
        outputs = self.chatio.return_output(output_stream)
        return outputs.strip()

    def add_message(self, message):
        """Append one message dict to the conversation history."""
        self.conversation_history.append(message)

    def change_messages(self, messages):
        """Replace the whole conversation history with ``messages``."""
        self.conversation_history = messages

    def display_conversation(self, detailed=False):
        """Print every message of the history, colored by role.

        ``detailed`` is accepted but not used by this implementation.
        """
        role_to_color = {
            "system": "red",
            "user": "green",
            "assistant": "blue",
            "function": "magenta",
        }
        print("before_print"+"*"*50)
        for message in self.conversation_history:
            print_obj = f"{message['role']}: {message['content']} "
            if "function_call" in message:
                print_obj = print_obj + f"function_call: {message['function_call']}"
            print(
                colored(
                    print_obj,
                    role_to_color[message["role"]],
                )
            )
        print("end_print"+"*"*50)

    def parse(self, functions, process_id=0, **args):
        """Render the history to a prompt, generate, and parse the model's reply.

        Args:
            functions: Function descriptions; when not equal to ``[]`` they are
                merged into the system message via ``process_system_message``.
            process_id: Identifier of the calling worker; only process ``0``
                prints the token count. Defaults to ``0``.
            **args: Ignored extra keyword arguments.

        Returns:
            A tuple ``(message, 0, decoded_token_len)`` where ``message`` is an
            assistant message carrying the parsed thought and function call, and
            ``decoded_token_len`` is the number of token ids the tokenizer
            produces for the generated text. The constant ``0`` is presumably a
            status/error code -- confirm against the caller.

        Raises:
            ValueError: If ``self.template`` is not one of the supported templates.
        """
        conv = get_conversation_template(self.template)
        if self.template == "tool-llama":
            roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
        elif self.template in ("tool-llama-single-round", "tool-llama-multi-rounds"):
            roles = {"system": conv.roles[0], "user": conv.roles[1], "function": conv.roles[2], "assistant": conv.roles[3]}
        else:
            # Previously this fell through and failed later with an unbound `roles`.
            raise ValueError(f"Unsupported conversation template: {self.template!r}")

        self.time = time.time()
        prompt_parts = []
        for message in self.conversation_history:
            role = roles[message['role']]
            content = message['content']
            if role == "System" and functions != []:
                content = process_system_message(content, functions)
            prompt_parts.append(f"{role}: {content}\n")
        prompt_parts.append("Assistant:\n")
        prompt = "".join(prompt_parts)

        predictions = self.prediction(prompt)

        # Count token ids, not the number of fields in the tokenizer's output mapping.
        decoded_token_len = len(self.tokenizer(predictions)["input_ids"])
        if process_id == 0:
            print(f"[process({process_id})]total tokens: {decoded_token_len}")

        thought, action, action_input = react_parser(predictions)
        if len(thought.strip()) > 1:
            print(thought)
        message = {
            "role": "assistant",
            "content": thought,
            "function_call": {
                "name": action,
                "arguments": action_input
            }
        }
        return message, 0, decoded_token_len
if __name__ == "__main__":
    # Manual smoke test: load a LLaMA-family checkpoint, feed it a "24 game"
    # conversation with one callable tool, and print the parsed reply.
    # can accept all huggingface LlamaModel family
    llm = LlamaModel("decapoda-research/llama-7b-hf")
    messages = [
        {'role': 'system', 'content': '''You are AutoGPT, you can use many tools(functions) to do
the following task.\nFirst I will give you the task description, and your task start.\nAt each step, you need to give your thought to analyze the status now and what to do next, with a function call to actually excute your step.\nAfter the call, you will get the call result, and you are now in a new state.\nThen you will analyze your status now, then decide what to do next...\nAfter many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.\nRemember: \n1.the state change is , you can\'t go
back to the former state, if you want to restart the task, say "I give up and restart".\n2.All the thought is short, at most in 5 sentence.\nLet\'s Begin!\nTask description: Use numbers and basic arithmetic operations (+ - * /) to obtain exactly one number=24. Each
step, you are only allowed to choose two of the left numbers to obtain a new number. For example, you can combine [3,13,9,7] as 7*9 - 3*13 = 24.\nRemember:\n1.all of the number must be used , and must be used ONCE. So Only when left numbers is exact 24, you will win. So you don\'t succeed when left number = [24, 5]. You succeed when left number = [24]. \n2.all the try takes exactly 3 steps, look
at the input format'''},
        {'role': 'user', 'content': '\nThe real task input is: [1, 2, 4, 7]\nBegin!\n'}
    ]
    functions = [{'name': 'play_24', 'description': '''make your current conbine with the format "x operation y = z (left: aaa) " like "1+2=3, (left: 3 5 7)", then I will tell you whether you win. This is the ONLY way
to interact with the game, and the total process of a input use 3 steps of call, each step you can only combine 2 of the left numbers, so the count of left numbers decrease from 4 to 1''','parameters':{'type': 'object', 'properties':{}}}]#, 'parameters': {'type': 'object', 'properties': {'input': {'type': 'string', 'description': 'describe what number you want to conbine, and how to conbine.'}}, 'required': ['input']}}]
    llm.change_messages(messages)
    # parse() takes a process id; 0 is the id that prints the token count.
    output = llm.parse(functions=functions, process_id=0)
    print(output)