Studiobotxyz committed on
Commit
fd34502
·
1 Parent(s): d60b634

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import importlib.util
import os
import subprocess
import sys

# Bootstrap the ctransformers dependency in case it is not installed yet.
# The install goes through the running interpreter (`sys.executable -m pip`)
# so the package lands in the environment this script imports from, and the
# command is passed as an argument list so no shell is involved.
if importlib.util.find_spec("ctransformers") is None:
    _pip_result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "ctransformers"],
        check=False,
    )
    if _pip_result.returncode != 0:
        # Stay best-effort like the original call: report the failure and let
        # the subsequent `import ctransformers` raise the real error.
        print(f"pip install ctransformers failed with exit code {_pip_result.returncode}")
3
+
4
+ import ctransformers
5
+ import time
6
+ import requests
7
+ from tqdm import tqdm
8
+
9
+
10
+ import uuid
11
+ #Get the model file - you will need Expandable Storage to make this work
12
+
13
def _download_model(url, destination, chunk_size=1024 * 1024):
    """Stream *url* into the file *destination*, showing a progress bar.

    The payload is written to ``destination + ".part"`` first and renamed to
    *destination* only after the transfer has been verified, so an
    interrupted or truncated download is never picked up as a finished model
    file by the ``os.path.isfile`` check on the next start.

    Args:
        url: HTTP(S) address of the file to fetch.
        destination: path the finished file is stored under.
        chunk_size: number of bytes requested per read from the response.

    Raises:
        requests.HTTPError: the server answered with an error status.
        RuntimeError: the number of bytes received differs from the
            ``Content-Length`` announced by the server.
    """
    partial_path = destination + ".part"
    received = 0
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail early instead of saving an HTML error page as the model.
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        with open(partial_path, 'wb') as file, tqdm(
            # A missing Content-Length means "unknown", not "zero bytes".
            total=total_size_in_bytes or None,
            unit='iB',
            unit_scale=True,
        ) as progress_bar:
            for data in response.iter_content(chunk_size):
                received += len(data)
                progress_bar.update(len(data))
                file.write(data)

    if total_size_in_bytes != 0 and received != total_size_in_bytes:
        os.remove(partial_path)
        raise RuntimeError(
            "ERROR, something went wrong: "
            f"received {received} of {total_size_in_bytes} bytes"
        )
    os.replace(partial_path, destination)


# Fetch the model weights on first start only; later starts reuse the file.
if not os.path.isfile('llama-2-7b.ggmlv3.q4_K_S.bin'):
    print("Downloading Model from HuggingFace")
    _download_model(
        "https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_K_S.bin",
        'llama-2-7b.ggmlv3.q4_K_S.bin',
    )
27
+
28
+ #Sets up the transformer library and adds in the Llama-2 model
29
+
30
# Generation settings for the model. The original code first built the config
# with stop=["\n", 'User'] and then immediately overwrote that list with
# ["\n"] through the same object, so ["\n"] is the value that was actually in
# effect; it is now set directly.
configObj = ctransformers.Config(stop=["\n"])
config = ctransformers.AutoConfig(config=configObj, model_type='llama')

# Load the local GGML weights file with the configuration above.
llm = ctransformers.AutoModelForCausalLM.from_pretrained(
    './llama-2-7b.ggmlv3.q4_K_S.bin',
    config=config,
)
print("Loaded model")
36
+
37
def time_it(func):
    """Decorate *func* so that every call prints how long it took.

    The wrapper forwards all positional and keyword arguments unchanged and
    returns whatever *func* returns. ``functools.wraps`` keeps the wrapped
    function's name and docstring visible on the wrapper.

    Args:
        func: the callable to time.

    Returns:
        A callable with the same call signature as *func*.
    """
    import functools  # local import keeps this definition self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot go negative if
        # the system clock is adjusted while func runs.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' took {execution_time:.6f} seconds to execute.")
        return result

    return wrapper
46
+
47
def complete(prompt, stop=("User", "Assistant"), model=None):
    """Generate a completion for *prompt*, echoing it to stdout as it streams.

    Tokens are produced one at a time; each decoded piece is appended to the
    accumulated output and printed immediately. Generation ends when the
    model stops yielding tokens or as soon as any entry of *stop* occurs in
    the accumulated output. In the latter case the piece that completed the
    stop word is not printed, but it is still part of the returned text.

    Args:
        prompt: text handed to the model's tokenizer.
        stop: strings whose appearance in the generated text ends generation.
            The default is an immutable tuple so it cannot be altered between
            calls; any iterable of strings is accepted.
        model: object providing ``tokenize``, ``generate`` and ``detokenize``.
            Defaults to the module-level ``llm``.

    Returns:
        A two-item list ``[output, token_count]`` holding the generated text
        and the number of tokens consumed from the generator.
    """
    if model is None:
        model = llm

    tokens = model.tokenize(prompt)
    token_count = 0
    output = ''
    for token in model.generate(tokens):
        token_count += 1
        result = model.detokenize(token)
        output += result
        # The whole accumulated text is searched because a stop word may be
        # split across several tokens.
        if any(word in output for word in stop):
            print('\n')
            return [output, token_count]
        print(result, end='', flush=True)

    print('\n')
    return [output, token_count]
63
+
64
# Interactive loop: read a question, stream the model's answer and report the
# generation speed. The loop ends cleanly when stdin is closed or the user
# presses Ctrl-C at the prompt.
while True:
    try:
        question = input("\nWhat is your question? > ")
    except (EOFError, KeyboardInterrupt):
        print()
        break

    # perf_counter is monotonic, which makes it the right clock for intervals.
    start_time = time.perf_counter()
    output, token_count = complete(f'User: {question}. Can you please answer this as informative but concisely as possible.\nAssistant: ')
    execution_time = time.perf_counter() - start_time

    # Guard the rate against a zero-length interval.
    tokens_per_second = token_count / execution_time if execution_time > 0 else float("inf")
    print(f"{token_count} tokens generated in {execution_time:.6f} seconds.\n{tokens_per_second} tokens per second")
71
+