from vllm import LLM, SamplingParams
from transformers import AutoTokenizer 

max_seq_length = 2048
dtype = None
load_in_4bit = True

instruction_message = """You are tasked with extracting relevant information from a given text representing a candidate profile or job requirement. Your goal is to insert specific tags around relevant pieces of information without altering the original text's formatting, punctuation, or word order.
**Tags to use:**
- `<jobtitle>`: Job title of the candidate or job requirement.
- `<skill>`: Specific skills or competencies mentioned.
- `<minyear>`: Minimum number of years of experience.
- `<maxyear>`: Maximum number of years of experience.
- `<language>`: Languages spoken or required.
- `<location>`: Geographical location related to the candidate or job.
- `<degree>`: Academic qualifications or degrees.
- `<certification>`: Professional certifications or accreditations.
- `<institution_education>`: Names of educational institutions related to the degree.
- `<institution_company>`: Names of companies related to employment history.
- `<proximity>`: Distance or location-related preferences (e.g., "within 30 miles").
- `<industry>`: Specific industry or sector experience.
**Guidelines:**
1. **Preserve Original Text**: Do not change the original text's formatting, punctuation, or word order. The output should be identical to the input except for the addition of tags.
2. **Tagging**: Enclose each relevant piece of information with the appropriate tag. Do not include commas or other punctuation inside the tags unless they are part of the tagged item.
3. **Experience Years**: Use `<minyear>` as default if only a single year is given, relating to the experience.
4. **IMPORTANT** You **must not** include any tag that is not in the provided list. You can only tag using the ones provided.
Your role is to accurately tag the text while preserving its original appearance.
"""

conversation_history = [
    {
        "role": "system",
        "content": f"{instruction_message}"
    },
    {
        "role": "user",
        "content": "Financial Analysts located within 50 miles of London, with skills in planning, budgeting, and a Master's Degree in Finance. Previous employment at Goldman Sachs or degree from MIT is a bonus."
    },
    {
        "role": "assistant",
        "content": "<jobtitle>Financial Analysts</jobtitle> located within <proximity>50 miles</proximity> of <location>London</location>, with skills in <skill>planning</skill>, <skill>budgeting</skill>, and a <degree>Master's Degree in Finance</degree>. Previous employment at <institution_company>Goldman Sachs</institution_company> or degree from <institution_education>MIT</institution_education> is a bonus."
    }
]

class EndpointHandler:

    def __init__(self, path=""):
        """
        Initializes the EndpointHandler with a specified model and tokenizer.
    
        Args:
            path (str): The local path or identifier for the model to load. 
                        This path should contain both the model and tokenizer files.
        """
        self.llm = LLM(model=path, max_model_len=2048, quantization='awq', gpu_memory_utilization=0.8)
        self.tokenizer = AutoTokenizer.from_pretrained(path)
    

    def __call__(self, data) -> str:
        """
        Processes the input data by generating a formatted conversation history string 
        and passing it to the language model for generation.
    
        Args:
            data (dict): A dictionary containing the input data with the key `inputs`, 
                         which is a list representing a conversation history. 
                         Each conversation history item should be a dictionary with 'role' 
                         (e.g., "assistant" or "user") and 'content' (the message text).
    
        Returns:
            str: The generated output from the model after processing the conversation history.
        """
        
        # Get inputs and preprocess
        user_string = data.pop("user_string")
        user_example = {"role": "user", "content": user_string}

        conversation_input = conversation_history.copy()
        conversation_input.append(user_example)
        model_input = self.tokenizer.apply_chat_template(conversation_input, tokenize=False, add_generation_prompt=True)
        
        # Set sampling parameters
        sampling_params = SamplingParams(temperature=0.1, min_p=0.6, max_tokens=1024)
        
        # Generate output
        output = self.llm.generate(model_input, sampling_params)
        generated_text = output[0].outputs[0].text

        return generated_text