Studiobotxyz committed on
Commit
4b787b5
·
1 Parent(s): 8e3ee33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -43
app.py CHANGED
@@ -1,31 +1,26 @@
1
  import os
2
- os.system('pip install ctransformers')
3
-
4
- import ctransformers
5
  import time
6
  import requests
7
  from tqdm import tqdm
 
 
8
 
9
-
10
- import uuid
11
- #Get the model file - you will need Expandable Storage to make this work
12
 
13
  if not os.path.isfile('llama-2-7b.ggmlv3.q4_K_S.bin'):
14
- print("Downloading Model from HuggingFace")
15
- url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
16
- response = requests.get(url, stream=True)
17
- total_size_in_bytes= int(response.headers.get('content-length', 0))
18
- block_size = 1024 #1 Kibibyte
19
- progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
20
- with open('llama-2-7b.ggmlv3.q4_K_S.bin', 'wb') as file:
21
- for data in response.iter_content(block_size):
22
- progress_bar.update(len(data))
23
- file.write(data)
24
- progress_bar.close()
25
- if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
26
- print("ERROR, something went wrong")
27
-
28
- #Sets up the transformer library and adds in the Llama-2 model
29
 
30
  configObj = ctransformers.Config(stop=["\n", 'User'])
31
  config = ctransformers.AutoConfig(config=configObj, model_type='llama')
@@ -45,27 +40,40 @@ def time_it(func):
45
  return wrapper
46
 
47
  def complete(prompt, stop=["User", "Assistant"]):
48
- tokens = llm.tokenize(prompt)
49
- token_count = 0
50
- output = ''
51
- for token in llm.generate(tokens):
52
- token_count += 1
53
- result = llm.detokenize(token)
54
- output += result
55
- for word in stop:
56
- if word in output:
57
- print('\n')
58
- return [output, token_count]
59
- print(result, end='',flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- print('\n')
62
- return [output, token_count]
63
 
64
- while True:
65
- question = input("\nWhat is your question? > ")
66
- start_time = time.time()
67
- output, token_count = complete(f'User: {question}. Can you please answer this as informative but concisely as possible.\nAssistant: ')
68
- end_time = time.time()
69
- execution_time = end_time - start_time
70
- print(f"{token_count} tokens generated in {execution_time:.6f} seconds.\n{token_count/execution_time} tokens per second")
71
-
 
1
  import os
 
 
 
2
  import time
3
  import requests
4
  from tqdm import tqdm
5
+ from flask import Flask, request, jsonify
6
+ import ctransformers
7
 
8
+ app = Flask(__name__)
 
 
9
 
10
# Download the model weights once; skipped on later runs when the file already exists.
if not os.path.isfile('llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    url = "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin"
    # stream=True avoids loading the multi-GB file into memory at once;
    # the timeout prevents an indefinite hang on a stalled connection.
    response = requests.get(url, stream=True, timeout=30)
    # Fail loudly on an HTTP error instead of silently writing an error page to disk.
    response.raise_for_status()
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open('llama-2-7b.ggmlv3.q4_K_S.bin', 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    # A short read (e.g. dropped connection) leaves a truncated file behind — warn about it.
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")
 
 
24
 
25
  configObj = ctransformers.Config(stop=["\n", 'User'])
26
  config = ctransformers.AutoConfig(config=configObj, model_type='llama')
 
40
  return wrapper
41
 
42
def complete(prompt, stop=("User", "Assistant")):
    """Stream-generate a completion for *prompt*, stopping at any stop word.

    Tokens are printed to stdout as they are produced. Returns
    ``[output, token_count]`` where *output* is the accumulated text
    (including the stop word that triggered termination, if any).
    """
    # Tuple default instead of a list: avoids the mutable-default-argument
    # pitfall while remaining iterable exactly like the original.
    tokens = llm.tokenize(prompt)
    token_count = 0
    output = ''
    for token in llm.generate(tokens):
        token_count += 1
        result = llm.detokenize(token)
        output += result
        # Stop as soon as any stop word has appeared anywhere in the output so far.
        for word in stop:
            if word in output:
                print('\n')
                return [output, token_count]
        print(result, end='', flush=True)

    print('\n')
    return [output, token_count]
58
+
59
@app.route('/generate', methods=['POST'])
def generate_response():
    """POST /generate — run the LLM on a JSON-supplied question.

    Expects a JSON body like ``{"question": "..."}`` and returns the model
    output plus timing statistics as JSON.
    """
    # silent=True makes get_json return None (instead of aborting) on a
    # missing or malformed JSON body; fall back to an empty dict so
    # data.get below cannot raise AttributeError.
    data = request.get_json(silent=True) or {}
    question = data.get('question', '')

    start_time = time.time()
    output, token_count = complete(
        f'User: {question}. Can you please answer this as informatively but concisely as possible.\nAssistant: '
    )
    end_time = time.time()
    execution_time = end_time - start_time

    response = {
        'output': output,
        'token_count': token_count,
        'execution_time': execution_time,
        # Guard against division by zero when timer resolution rounds the
        # elapsed time down to 0 seconds.
        'tokens_per_second': token_count / execution_time if execution_time > 0 else 0.0,
    }

    return jsonify(response)
 
77
 
78
if __name__ == '__main__':
    # NOTE(review): debug=True enables Flask's interactive Werkzeug debugger,
    # which permits arbitrary code execution — never expose this publicly.
    app.run(debug=True)