Spaces:

Robichh
/

local-llm-2

Paused

File size: 1,575 Bytes

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def get_llm_response(repo, filename, model_type, gpu_layers, system_message, prompt):

    model_name_or_path = "TheBloke/meditron-7B-GPTQ"
    # To use a different branch, change revision
    # For example: revision="gptq-4bit-128g-actorder_True"
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                device_map="auto",
                                                trust_remote_code=False,
                                                revision="main")

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    
    prompt_template=f'''<|im_start|>system
        {system_message}<|im_end|>
        <|im_start|>user
        {prompt}<|im_end|>
        <|im_start|>assistant
        '''
        
    print("Template:")
    print(prompt_template)
    
    print("\n\n*** Generate:")

    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, temperature=0.01, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
    print(tokenizer.decode(output[0])) 

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )

    

    response = pipe(prompt_template)[0]['generated_text']
    print(response)
    return response