Spaces:

Studiobotxyz
/

StudioGPT2

Sleeping

File size: 2,005 Bytes

8e3ee33
4ab0120
8e3ee33
 
 
4b787b5
96ef64b
8e3ee33
6c49eee
4b787b5
 
 
 
 
 
 
 
 
 
 
 
 
8e3ee33
 
 
 
 
 
 
 
 
4b787b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ab0120
c0799ff
4b787b5
4ab0120
8e3ee33
6c49eee
36a2111

import os
import subprocess
import sys

# Install the third-party packages this script imports further down.
# Running pip as "<this interpreter> -m pip" with an argument list (no shell)
# guarantees the packages land in the environment that executes this file,
# rather than in whichever "pip" happens to be first on PATH.
# The install stays best-effort: a non-zero exit is reported but does not
# abort, so an environment that already has the packages keeps working.
_pip_status = subprocess.run(
    [sys.executable, "-m", "pip", "install", "ctransformers", "gradio"],
    check=False,
)
if _pip_status.returncode != 0:
    print(f"pip install exited with status {_pip_status.returncode}; continuing with the packages already installed")
import time
import requests
from tqdm import tqdm
import ctransformers
import gradio as gr

# Fetch the quantized model weights on first start. The download is skipped
# when a file with the final name already exists.
if not os.path.isfile('./llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
    # Stream into a sibling ".part" file and rename only after the size has
    # been verified. Otherwise an interrupted or failed download would leave a
    # truncated file under the final name, and the isfile() guard above would
    # accept it on every later start.
    partial_path = './llama-2-7b.ggmlv3.q4_K_S.bin.part'
    block_size = 1024 * 1024  # 1 MiB per chunk
    bytes_written = 0
    # The timeout bounds connection setup and each read, not the whole transfer.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail on 4xx/5xx instead of saving an error page as the model file.
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
        try:
            with open(partial_path, 'wb') as file:
                for data in response.iter_content(block_size):
                    bytes_written += len(data)
                    progress_bar.update(len(data))
                    file.write(data)
        finally:
            progress_bar.close()
    # A content-length of 0 means the server did not report a size, so there
    # is nothing to compare against.
    if total_size_in_bytes != 0 and bytes_written != total_size_in_bytes:
        os.remove(partial_path)
        raise RuntimeError(
            f"Model download incomplete: received {bytes_written} of {total_size_in_bytes} bytes"
        )
    os.replace(partial_path, './llama-2-7b.ggmlv3.q4_K_S.bin')

# Build the generation settings, wrap them together with the model type, and
# then replace the stop list with a single newline before loading the weights.
configObj = ctransformers.Config(
    stop=["\n", 'User'],
)
config = ctransformers.AutoConfig(
    config=configObj,
    model_type='llama',
)
config.config.stop = ["\n"]

# Load the local GGML weights file using the configuration assembled above.
llm = ctransformers.AutoModelForCausalLM.from_pretrained(
    './llama-2-7b.ggmlv3.q4_K_S.bin',
    config=config,
)
print("Loaded model")

def complete(prompt, stop=("User", "Assistant")):
    """Generate a completion for ``prompt`` using the module-level ``llm``.

    Each generated piece of text is echoed to stdout as it is produced.
    Generation ends when ``llm.generate`` stops yielding tokens, or as soon
    as any stop string appears in the accumulated text.

    Args:
        prompt: Text passed to ``llm.tokenize`` to seed generation.
        stop: Iterable of strings that terminate generation. The default is
            an immutable tuple so no mutable object is shared between calls.
            ``None`` and empty strings are ignored (an empty string would
            otherwise match after the very first token).

    Returns:
        A two-item list ``[output, token_count]``. ``output`` is the
        generated text, cut immediately before the earliest stop string when
        one was hit, so the stop marker does not leak into the reply.
        ``token_count`` is the number of tokens taken from the generator,
        including the token that completed the stop string.
    """
    # Materialize once: supports None, one-shot iterables, and drops empties.
    stop_words = [word for word in (stop or ()) if word]
    token_ids = llm.tokenize(prompt)
    token_count = 0
    output = ''
    for token in llm.generate(token_ids):
        token_count += 1
        piece = llm.detokenize(token)
        output += piece
        # Positions of every stop string currently present in the text.
        hits = [pos for pos in (output.find(word) for word in stop_words) if pos != -1]
        if hits:
            print('\n')
            # The piece that completed the stop string is not echoed, and the
            # returned text is trimmed to match.
            return [output[:min(hits)], token_count]
        print(piece, end='', flush=True)

    print('\n')
    return [output, token_count]

def greet(question):
    """Answer ``question`` with the model and format the reply for display.

    The question is logged to stdout, wrapped in a User/Assistant prompt,
    and sent to ``complete``. The returned string carries both the generated
    text and the token count that ``complete`` reported.
    """
    print(question)
    prompt = f'User: {question}. Can you please answer this as informatively but concisely as possible.\nAssistant: '
    answer, tokens_used = complete(prompt)
    return f"Response: {answer} | Tokens: {tokens_used}"

# Expose greet() through a Gradio interface with one text input and one text
# output, then start it with share=True.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch(share=True)