Studiobotxyz committed on
Commit
4b787b5
·
1 Parent(s): 8e3ee33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -43
app.py CHANGED
@@ -1,31 +1,26 @@
1
  import os
2
- os.system('pip install ctransformers')
3
-
4
- import ctransformers
5
  import time
6
  import requests
7
  from tqdm import tqdm
 
 
8
 
9
-
10
- import uuid
11
- #Get the model file - you will need Expandable Storage to make this work
12
 
13
  if not os.path.isfile('llama-2-7b.ggmlv3.q4_K_S.bin'):
14
- print("Downloading Model from HuggingFace")
15
- url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
16
- response = requests.get(url, stream=True)
17
- total_size_in_bytes= int(response.headers.get('content-length', 0))
18
- block_size = 1024 #1 Kibibyte
19
- progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
20
- with open('llama-2-7b.ggmlv3.q4_K_S.bin', 'wb') as file:
21
- for data in response.iter_content(block_size):
22
- progress_bar.update(len(data))
23
- file.write(data)
24
- progress_bar.close()
25
- if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
26
- print("ERROR, something went wrong")
27
-
28
- #Sets up the transformer library and adds in the Llama-2 model
29
 
30
  configObj = ctransformers.Config(stop=["\n", 'User'])
31
  config = ctransformers.AutoConfig(config=configObj, model_type='llama')
@@ -45,27 +40,40 @@ def time_it(func):
45
  return wrapper
46
 
47
  def complete(prompt, stop=["User", "Assistant"]):
48
- tokens = llm.tokenize(prompt)
49
- token_count = 0
50
- output = ''
51
- for token in llm.generate(tokens):
52
- token_count += 1
53
- result = llm.detokenize(token)
54
- output += result
55
- for word in stop:
56
- if word in output:
57
- print('\n')
58
- return [output, token_count]
59
- print(result, end='',flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- print('\n')
62
- return [output, token_count]
63
 
64
- while True:
65
- question = input("\nWhat is your question? > ")
66
- start_time = time.time()
67
- output, token_count = complete(f'User: {question}. Can you please answer this as informative but concisely as possible.\nAssistant: ')
68
- end_time = time.time()
69
- execution_time = end_time - start_time
70
- print(f"{token_count} tokens generated in {execution_time:.6f} seconds.\n{token_count/execution_time} tokens per second")
71
-
 
1
  import os
 
 
 
2
  import time
3
  import requests
4
  from tqdm import tqdm
5
+ from flask import Flask, request, jsonify
6
+ import ctransformers
7
 
8
+ app = Flask(__name__)
 
 
9
 
10
# Download the model weights once; skipped on later runs when the file already exists.
if not os.path.isfile('llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
    # stream=True avoids loading the multi-GB file into memory at once;
    # the timeout prevents an indefinite hang on a stalled connection.
    response = requests.get(url, stream=True, timeout=30)
    # Fail loudly on an HTTP error instead of silently writing an error page to disk.
    response.raise_for_status()
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open('llama-2-7b.ggmlv3.q4_K_S.bin', 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    # A short read (e.g. dropped connection) leaves a truncated file behind — warn about it.
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")
 
 
24
 
25
  configObj = ctransformers.Config(stop=["\n", 'User'])
26
  config = ctransformers.AutoConfig(config=configObj, model_type='llama')
 
40
  return wrapper
41
 
42
def complete(prompt, stop=("User", "Assistant")):
    """Stream-generate a completion for *prompt*, stopping at any stop word.

    Tokens are printed to stdout as they are produced. Returns
    ``[output, token_count]`` where *output* is the accumulated text
    (including the stop word that triggered termination, if any).
    """
    # Tuple default instead of a list: avoids the mutable-default-argument
    # pitfall while remaining iterable exactly like the original.
    tokens = llm.tokenize(prompt)
    token_count = 0
    output = ''
    for token in llm.generate(tokens):
        token_count += 1
        result = llm.detokenize(token)
        output += result
        # Stop as soon as any stop word has appeared anywhere in the output so far.
        for word in stop:
            if word in output:
                print('\n')
                return [output, token_count]
        print(result, end='', flush=True)

    print('\n')
    return [output, token_count]
58
+
59
@app.route('/generate', methods=['POST'])
def generate_response():
    """POST /generate — run the LLM on a JSON-supplied question.

    Expects a JSON body like ``{"question": "..."}`` and returns the model
    output plus timing statistics as JSON.
    """
    # silent=True makes get_json return None (instead of aborting) on a
    # missing or malformed JSON body; fall back to an empty dict so
    # data.get below cannot raise AttributeError.
    data = request.get_json(silent=True) or {}
    question = data.get('question', '')

    start_time = time.time()
    output, token_count = complete(
        f'User: {question}. Can you please answer this as informatively but concisely as possible.\nAssistant: '
    )
    end_time = time.time()
    execution_time = end_time - start_time

    response = {
        'output': output,
        'token_count': token_count,
        'execution_time': execution_time,
        # Guard against division by zero when timer resolution rounds the
        # elapsed time down to 0 seconds.
        'tokens_per_second': token_count / execution_time if execution_time > 0 else 0.0,
    }

    return jsonify(response)
 
77
 
78
if __name__ == '__main__':
    # NOTE(review): debug=True enables Flask's interactive Werkzeug debugger,
    # which permits arbitrary code execution — never expose this publicly.
    app.run(debug=True)