from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def gptq_model_options():
    """Return the list of supported GPTQ-quantized model repos on the Hugging Face Hub."""
    return [
        "TheBloke/Llama-2-7B-Chat-GPTQ",
        "TheBloke/Llama-2-13B-chat-GPTQ",
        "TheBloke/meditron-7B-GPTQ",
        "TheBloke/meditron-70B-GPTQ",
    ]

def get_llm_response(model_name_or_path, temperature, do_sample, top_p, top_k, max_new_tokens, repetition_penalty, formatted_prompt):
    """Load a GPTQ-quantized model and generate a response to the formatted prompt.

    Loading GPTQ checkpoints through transformers requires the `optimum` and
    `auto-gptq` packages to be installed.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 device_map="auto",
                                                 trust_remote_code=False,
                                                 revision="main")

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
            
    print("Formatted prompt:")
    print(formatted_prompt)
    
    print("\n\n*** Generate:")

    input_ids = tokenizer(formatted_prompt, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, temperature=temperature, do_sample=do_sample, top_p=top_p, top_k=top_k, max_new_tokens=max_new_tokens)
    print(tokenizer.decode(output[0], skip_special_tokens=True)) 

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        top_k=top_p,
        repetition_penalty=repetition_penalty
    )

    response = pipe(formatted_prompt)[0]['generated_text']
    print(response)
    return response
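

# Example usage: a minimal sketch. The [INST] ... [/INST] wrapper below is an
# assumption based on the Llama-2 chat prompt format; other models in
# gptq_model_options() (e.g. the meditron checkpoints) may expect a different
# prompt template, so adjust the formatting to match the model you select.
if __name__ == "__main__":
    question = "What are the common symptoms of iron deficiency?"
    formatted_prompt = f"[INST] {question} [/INST]"
    get_llm_response(
        model_name_or_path=gptq_model_options()[0],
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
        repetition_penalty=1.1,
        formatted_prompt=formatted_prompt,
    )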