def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
    """Generate a response to `prompt` with TheBloke/meditron-7B-GPTQ.

    Note: `repo`, `filename`, `model_type` and `gpu_layers` are currently
    unused; the model name and generation settings are hard-coded below.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    model_name_or_path = "TheBloke/meditron-7B-GPTQ"
    # To use a different branch, change revision
    # For example: revision="gptq-4bit-128g-actorder_True"
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=False,
        revision="main",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    print("\n\n*** Generate:")

    # Direct generation with model.generate() is also possible (note:
    # prompt_template is only built further below in this function):
    #input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    #output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
    #print(tokenizer.decode(output[0]))

    # Inference can also be done using transformers' pipeline:

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )

    system_message = "You are an assistant"
    prompt_template = f'''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

    response = pipe(prompt_template)[0]['generated_text']
    print(response)
    return response
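

# A minimal usage sketch (an assumption, not part of the original file): the
# first four arguments are ignored by get_llm_response, so placeholder values
# are passed. Running this requires downloading TheBloke/meditron-7B-GPTQ and
# a GPU with enough memory for the 4-bit GPTQ weights.
if __name__ == "__main__":
    answer = get_llm_response(
        repo="TheBloke/meditron-7B-GPTQ",  # ignored by the function body
        filename="",                       # ignored by the function body
        model_type="gptq",                 # ignored by the function body
        gpu_layers=0,                      # ignored by the function body
        prompt="What are common symptoms of iron deficiency?",
    )
    print(answer)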