File size: 2,126 Bytes
8e3ee33
4ab0120
8e3ee33
 
 
4b787b5
96ef64b
8e3ee33
6c49eee
4b787b5
 
 
 
 
 
 
 
 
 
 
 
 
8e3ee33
 
 
 
 
 
 
 
 
cc6a446
4b787b5
cc6a446
4b787b5
 
cc6a446
4b787b5
cc6a446
4b787b5
 
cc6a446
4b787b5
2cc37c4
4b787b5
 
2cc37c4
28433a3
4b787b5
28433a3
4b787b5
4ab0120
c0799ff
4b787b5
28433a3
 
 
8e3ee33
6c49eee
36a2111
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
# Install the third-party runtime dependencies before the imports below need
# them. The install is run through the current interpreter ("python -m pip")
# with an argument list and no shell, so the packages land in the environment
# that is actually executing this script rather than whichever `pip` happens
# to be first on PATH.
import subprocess
import sys
subprocess.run(
    [sys.executable, "-m", "pip", "install", "ctransformers", "gradio"],
    check=False,  # best-effort: a real failure surfaces as ImportError below
)
import time
import requests
from tqdm import tqdm
import ctransformers
import gradio as gr

# Fetch the quantized Llama-2 weights on first run; skipped when the file is
# already present in the working directory.
if not os.path.isfile('./llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
    # Stream into a temporary ".part" file and only move it to the final name
    # once the download is known to be complete. Otherwise an interrupted or
    # failed download would leave a truncated file that satisfies the isfile()
    # check above on the next start.
    partial_path = './llama-2-7b.ggmlv3.q4_K_S.bin.part'
    block_size = 1024 * 1024  # 1 MiB chunks; the file is several GB
    try:
        # timeout=(connect, read) so a stalled connection cannot hang forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail on HTTP errors instead of saving an error page as the model.
            response.raise_for_status()
            total_size_in_bytes = int(response.headers.get('content-length', 0))
            bytes_written = 0
            with open(partial_path, 'wb') as file, tqdm(
                total=total_size_in_bytes, unit='iB', unit_scale=True
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    file.write(data)
                    bytes_written += len(data)
                    progress_bar.update(len(data))
        # A size of 0 means the server did not report a length; skip the check.
        if total_size_in_bytes != 0 and bytes_written != total_size_in_bytes:
            raise RuntimeError(
                "ERROR, something went wrong: downloaded "
                f"{bytes_written} of {total_size_in_bytes} bytes"
            )
        os.replace(partial_path, './llama-2-7b.ggmlv3.q4_K_S.bin')
    finally:
        # After a successful os.replace the partial file no longer exists, so
        # this only cleans up after a failure.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Generation settings for the model. Newline is the only configured stop
# sequence; a longer stop list used to be passed here and was overwritten
# on the very next statement, so it never took effect.
configObj = ctransformers.Config(stop=["\n"])
config = ctransformers.AutoConfig(config=configObj, model_type='llama')

# Load the GGML weights from the local file using the config built above.
llm = ctransformers.AutoModelForCausalLM.from_pretrained(
    './llama-2-7b.ggmlv3.q4_K_S.bin',
    config=config,
)
print("Loaded model")

def complete(prompt, stop=("User", "Assistant"), *, max_tokens=None):
    """Generate a continuation of *prompt*, halting at the first stop word.

    The prompt is tokenized with the module-level ``llm``; tokens are then
    generated one at a time, detokenized and appended to the output text.
    Generation ends as soon as any entry of *stop* occurs in the text
    produced so far, when *max_tokens* tokens have been generated, or when
    the model's generator is exhausted.

    Args:
        prompt: Text fed to the model.
        stop: Iterable of substrings that end generation once one of them
            appears in the generated text. The matched stop word is left in
            the returned text.
        max_tokens: Optional upper bound on the number of generated tokens.
            ``None`` (the default) places no limit on generation.

    Returns:
        A ``(output, token_count)`` tuple: the generated text and the number
        of tokens that were generated to produce it.
    """
    from itertools import islice

    tokens = llm.tokenize(prompt)
    token_count = 0
    output = ''
    # islice(..., None) iterates without a limit, so the default keeps the
    # unbounded behaviour while an integer caps the number of tokens drawn.
    for token in islice(llm.generate(tokens), max_tokens):
        token_count += 1
        output += llm.detokenize(token)
        if any(word in output for word in stop):
            print(output, " | ", token_count)
            return output, token_count

    return output, token_count

def greet(question):
    """Answer *question* with the model and return a one-line summary.

    The question is logged, wrapped in a "User: ... Assistant: " prompt and
    passed to ``complete``. The returned string contains the generated text
    together with the number of generated tokens; it is printed before being
    returned.
    """
    print(question)
    prompt = f'User: {question}. Can you please answer this as informatively but concisely as possible.\nAssistant: '
    answer, n_tokens = complete(prompt)
    reply = f"Response: {answer} | Tokens: {n_tokens}"
    print(reply)
    return reply

# Wrap greet() in a Gradio interface with a single text input and a single
# text output, then start serving it with sharing enabled.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch(share=True)