from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

max_seq_length = 2048

instruction_message = """You are a boolean logic parser for natural language queries in a conversational search recruitment system. Your task is to insert XML tags that capture boolean operators (AND, OR, NOT) without altering the original query's text, spelling, grammar, word order, or formatting. The goal is to tag components that correspond to boolean logic while preserving the original sentence structure.

The rules you must follow are:
1. Use the <AND> tag for items of the same type, such as skills, certifications, or qualifications, when they are listed together.
2. Use the <OR> tag for items of the same type when they are expressed as alternatives.
3. Use the <NOT> tag for negations, and apply it to the entire negated portion. Nested boolean operators inside <NOT> are unnecessary.
4. Avoid tagging unrelated components (e.g., degree and location) together even if they are connected by coordinating conjunctions.
5. Do not nest boolean operators. Each operator (AND, OR, NOT) should be handled separately and should not be placed within another operator's tag. Nested logic is not required.

When the tags are removed, the query must remain exactly as it was originally written. Your role is to accurately capture the underlying boolean logic, not to rewrite or rephrase the query.
"""

# One-shot example kept in the prompt: the assistant turn shows the expected
# tagging of the user query. Tag placement in the assistant turn follows
# rules 1-3 above (OR for alternatives, AND for listed requirements, NOT for
# the negated portion).
conversation_history = [
    {"role": "system", "content": instruction_message},
    {
        "role": "user",
        "content": "Seeking a Cyclist or Supply Technician with business collaboration and product revenue, with no experience at Google.",
    },
    {
        "role": "assistant",
        "content": "Seeking a <OR>Cyclist or Supply Technician</OR> with <AND>business collaboration and product revenue</AND>, <NOT>with no experience at Google</NOT>.",
    },
]


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initializes the EndpointHandler with a specified model and tokenizer.

        Args:
            path (str): The local path or identifier for the model to load.
                This path should contain both the model and tokenizer files.
        """
        self.llm = LLM(
            model=path,
            max_model_len=max_seq_length,
            quantization="awq",
            gpu_memory_utilization=0.8,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: dict) -> str:
        """
        Appends the user's query to the built-in conversation history, renders
        it with the model's chat template, and passes it to the language model
        for generation.

        Args:
            data (dict): A dictionary containing the input data with the key
                `user_string`, the raw user query to be tagged.

        Returns:
            str: The generated output from the model after processing the
                conversation history.
        """
        # Get inputs and preprocess: append the incoming query as a new user
        # turn after the system prompt and one-shot example.
        user_string = data.pop("user_string")
        user_example = {"role": "user", "content": user_string}
        conversation_input = conversation_history.copy()
        conversation_input.append(user_example)
        model_input = self.tokenizer.apply_chat_template(
            conversation_input, tokenize=False, add_generation_prompt=True
        )

        # Set sampling parameters: near-greedy decoding keeps the tagger from
        # rewriting the query instead of only inserting tags.
        sampling_params = SamplingParams(temperature=0.1, min_p=0.6, max_tokens=1024)

        # Generate output and strip the model's end-of-turn marker.
        output = self.llm.generate(model_input, sampling_params)
        generated_text = output[0].outputs[0].text.replace("<|end|>", "").strip()
        return generated_text
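

# Usage sketch (illustrative, not part of the original handler): how a caller
# would exercise EndpointHandler directly. The model path below is a
# hypothetical placeholder; any local directory holding an AWQ-quantized chat
# model plus its tokenizer files would do.
if __name__ == "__main__":
    handler = EndpointHandler(path="./awq-tagger-model")  # hypothetical path
    tagged = handler(
        {"user_string": "Looking for a nurse or paramedic with CPR and first aid training."}
    )
    print(tagged)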