# StudioGPT2 / app.py
# Studiobotxyz's picture
# Update app.py
# 0844557
# raw
# history blame
# 2.7 kB
import os
os.system("pip install flask ctransformers")
import time
import requests
from tqdm import tqdm
from flask import Flask, request, jsonify
import ctransformers
# Flask application object; the /generate route further down is registered on it.
app = Flask(__name__)
# Download the quantized Llama 2 weights on first start.
#
# The data is streamed into a temporary ".part" file and renamed to its final
# name only after the transfer has been verified. An interrupted or truncated
# download therefore never satisfies the isfile() check on the next start.
if not os.path.isfile('llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
    model_path = 'llama-2-7b.ggmlv3.q4_K_S.bin'
    partial_path = model_path + '.part'
    block_size = 1024 * 1024  # 1 MiB chunks; 1 KiB is needlessly small for a multi-GB file
    bytes_written = 0
    # (connect, read) timeouts so a stalled connection cannot hang startup forever.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the model.
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        with open(partial_path, 'wb') as file:
            with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
                    bytes_written += len(data)
    # A content-length of 0 means the server did not report a size, so there is
    # nothing to compare against.
    if total_size_in_bytes != 0 and bytes_written != total_size_in_bytes:
        os.remove(partial_path)
        raise RuntimeError(
            f"Model download incomplete: received {bytes_written} of "
            f"{total_size_in_bytes} bytes"
        )
    # Publish the verified file under its final name.
    os.replace(partial_path, model_path)
# Build the ctransformers configuration and load the model once at import time.
configObj = ctransformers.Config(stop=["\n", 'User'])
config = ctransformers.AutoConfig(config=configObj, model_type='llama')
# NOTE(review): this reassigns the stop list right after it was passed to
# Config() above; presumably config.config is the same object as configObj, in
# which case the 'User' entry is dropped and only "\n" remains. Confirm which
# stop list is intended.
config.config.stop = ["\n"]
# Loads the weights from the local file in the working directory.
llm = ctransformers.AutoModelForCausalLM.from_pretrained('./llama-2-7b.ggmlv3.q4_K_S.bin', config=config)
print("Loaded model")
def time_it(func):
    """Decorator that prints how long each successful call to *func* takes.

    The wrapper forwards all positional and keyword arguments unchanged and
    returns whatever *func* returns. If *func* raises, the exception
    propagates and no timing line is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name, docstring and signature metadata as
        *func*.
    """
    # Local import keeps this block self-contained.
    import functools

    # wraps() preserves __name__/__doc__; without it every decorated function
    # is called "wrapper", which collides when used on several Flask views.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' took {execution_time:.6f} seconds to execute.")
        return result

    return wrapper
def complete(prompt, stop=("User", "Assistant")):
    """Generate a completion for *prompt*, echoing it to stdout as it streams.

    Tokens are pulled from the module-level ``llm`` one at a time. Generation
    ends when the model stops yielding tokens or as soon as any stop word
    appears in the accumulated text.

    Args:
        prompt: Text to feed to the model.
        stop: Iterable of strings that end generation when they show up in
            the output. Empty strings are ignored; ``None`` or an empty
            iterable disables stop-word detection.

    Returns:
        ``[output, token_count]``: the generated text, cut just before the
        first stop word if one was hit, and the number of tokens generated
        (including the token that completed the stop word).
    """
    # Drop empty strings: "" is contained in every string and would end
    # generation on the very first token.
    stop_words = [word for word in (stop or ()) if word]
    tokens = llm.tokenize(prompt)
    token_count = 0
    output = ''
    for token in llm.generate(tokens):
        token_count += 1
        result = llm.detokenize(token)
        output += result
        hits = [idx for idx in (output.find(word) for word in stop_words) if idx != -1]
        if hits:
            # Cut at the earliest stop word so the marker does not leak into
            # the returned text; the console echo already withholds it.
            output = output[:min(hits)]
            break
        print(result, end='', flush=True)
    print('\n')
    return [output, token_count]
@app.route('/generate', methods=['POST'])
def generate_response():
    """Answer the question posted as JSON and report generation statistics.

    Expects a JSON object body with an optional ``question`` key (defaults to
    an empty string).

    Returns:
        A JSON response with ``output``, ``token_count``, ``execution_time``
        (seconds) and ``tokens_per_second``; or a 400 JSON error when the
        body is not a JSON object.
    """
    # silent=True yields None for a missing or invalid JSON body instead of
    # raising; valid JSON that is not an object (null, a list, ...) is
    # rejected too, because it has no .get().
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400
    question = data.get('question', '')
    start_time = time.perf_counter()
    output, token_count = complete(f'User: {question}. Can you please answer this as informatively but concisely as possible.\nAssistant: ')
    execution_time = time.perf_counter() - start_time
    response = {
        'output': output,
        'token_count': token_count,
        'execution_time': execution_time,
        # Guard against a zero elapsed time so the division cannot raise.
        'tokens_per_second': token_count / execution_time if execution_time > 0 else 0.0,
    }
    return jsonify(response)
if __name__ == '__main__':
    # Debug mode is opt-in via FLASK_DEBUG=1. It enables the interactive
    # Werkzeug debugger, which allows arbitrary code execution, and the
    # reloader, which runs this module (and so the model load above) a
    # second time in a child process.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled)