# Llama-2-7B GGML chat demo (HuggingFace Space, ~2 KB script).
# Downloads the quantized model on first run and serves a Gradio Q&A UI.
# Runtime dependency bootstrap + imports.
# NOTE(review): installing packages at import time is fragile — prefer a
# requirements.txt for the Space. Kept for parity with the original setup,
# but invoked via the running interpreter with an argument list instead of
# a raw shell string (no shell injection surface, correct interpreter).
import os
import subprocess
import sys

# check=False mirrors the original os.system call, which ignored failures.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "ctransformers", "gradio"],
    check=False,
)

import time
import requests
from tqdm import tqdm
import ctransformers
import gradio as gr
# One-time model fetch: download the quantized Llama-2 weights from
# HuggingFace unless they are already cached next to the script.
if not os.path.isfile('./llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
    response = requests.get(url, stream=True)
    # Fail fast on HTTP errors; otherwise an error page would be saved as
    # the "model" and the isfile() check above would never retry.
    response.raise_for_status()
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte per chunk
    # Context managers guarantee the file and the progress bar are closed
    # even if the download raises midway.
    with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
        with open('llama-2-7b.ggmlv3.q4_K_S.bin', 'wb') as file:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
        # A short read (server truncated the stream) is detectable when the
        # server advertised a content length.
        if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
            print("ERROR, something went wrong")
# Generation config: stop sequences end generation at a newline or at the
# literal string 'User' (the turn boundary used in greet()'s prompt).
configObj = ctransformers.Config(stop=["\n", 'User'])
config = ctransformers.AutoConfig(config=configObj, model_type='llama')
# NOTE(review): this overwrites the stop list set two lines above to just
# ["\n"], discarding the 'User' stop sequence — confirm which was intended.
config.config.stop = ["\n"]
# Load the quantized GGML weights downloaded above (blocking, memory-heavy).
llm = ctransformers.AutoModelForCausalLM.from_pretrained('./llama-2-7b.ggmlv3.q4_K_S.bin', config=config)
print("Loaded model")
def complete(prompt, stop=("User", "Assistant")):
    """Generate a completion for *prompt*, stopping early on stop markers.

    Parameters
    ----------
    prompt : str
        Full prompt text fed to the model's tokenizer.
    stop : sequence of str, optional
        Substrings that terminate generation as soon as any appears in the
        accumulated output. (Tuple default — avoids the mutable-default
        pitfall of the original ``stop=[...]``.)

    Returns
    -------
    tuple[str, int]
        The generated text (including the stop substring that triggered an
        early return, if any) and the number of tokens produced.
    """
    tokens = llm.tokenize(prompt)
    token_count = 0
    output = ''
    for token in llm.generate(tokens):
        token_count += 1
        output += llm.detokenize(token)
        # Return the moment any stop marker shows up in the output so far.
        if any(word in output for word in stop):
            return output, token_count
    # Model exhausted its generation without hitting a stop marker.
    return output, token_count
def greet(question):
    """Answer *question* with the local model and return a formatted reply.

    Logs the incoming question and the outgoing reply to stdout; the reply
    embeds both the model output and the token count.
    """
    print(question)
    prompt = (
        f'User: {question}. Can you please answer this as informatively '
        'but concisely as possible.\nAssistant: '
    )
    answer, n_tokens = complete(prompt)
    reply = f"Response: {answer} | Tokens: {n_tokens}"
    print(reply)
    return reply
# Launch the single text-in / text-out Gradio UI around greet().
# Guarded by the standard entry-point check so importing this module does
# not start a server; Spaces runs the file as a script, so behavior there
# is unchanged. share=True additionally exposes a public tunnel link.
if __name__ == "__main__":
    iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    iface.launch(share=True)