Update app.py
app.py CHANGED
@@ -11,14 +11,33 @@ import nltk
 import scipy.io.wavfile
 import os
 import subprocess
+from huggingface_hub import hf_hub_download
 
 subprocess.run(['bash','llama.sh'])
+from llama_cpp import Llama
 
 os.environ["SAFETENSORS_FAST_GPU"] = "1"
 os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")
 
 from espnet2.bin.tts_inference import Text2Speech
-
+'''
+repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
+filename = "vicuna-7b-v1.5.Q8_0.gguf"
+hf_hub_download(repo_id=repo_id, filename=filename)
+llm = Llama(
+    model_path="./models/7B/llama-model.gguf",
+    n_gpu_layers=-1,  # offload all layers to the GPU
+    # seed=1337,  # uncomment to set a specific seed
+    # n_ctx=2048,  # uncomment to increase the context window
+)
+'''
+llm = Llama.from_pretrained(
+    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
+    filename="vicuna-7b-v1.5.Q8_0.gguf",
+    n_gpu_layers=-1,  # offload all layers to the GPU
+    verbose=False
+)
+
 try:
     nltk.data.find('taggers/averaged_perceptron_tagger_eng')
 except LookupError:
@@ -108,6 +127,7 @@ def process_audio(img, microphone, audio_upload, state, answer_mode): # Added a
         torch.backends.cudnn.deterministic = False
         torch.backends.cudnn.benchmark = True
         torch.set_float32_matmul_precision("highest")
+        '''
         vicuna_output = vicuna_model.generate(
             **vicuna_input,
             max_new_tokens = 512,
@@ -115,6 +135,13 @@ def process_audio(img, microphone, audio_upload, state, answer_mode): # Added a
             do_sample = True,
             low_memory = False
         )
+        '''
+        vicuna_output = llm(
+            **vicuna_input,
+            max_tokens=128,  # generate up to 128 new tokens; set to None to run to the end of the context window
+            stop=["Q:", "\n"],  # stop generating just before the model would produce a new question
+            echo=True  # echo the prompt back in the output
+        )
     if answer_mode == 'medium':
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
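Note on the llama-cpp-python API this commit switches to: Llama.from_pretrained downloads the GGUF file from the Hub and caches it, which is why it replaces the commented-out hf_hub_download plus Llama(model_path=...) pair. Its completion call, however, takes a plain string prompt and returns a dict, rather than the tokenizer-style **vicuna_input keyword arguments used with vicuna_model.generate, so the prompt is normally passed as text along these lines. This is a minimal sketch of that call pattern; the prompt string is a made-up placeholder, not taken from app.py.

from llama_cpp import Llama

# Download the quantized Vicuna GGUF from the Hub (cached locally on later runs).
llm = Llama.from_pretrained(
    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
    filename="vicuna-7b-v1.5.Q8_0.gguf",
    n_gpu_layers=-1,   # offload all layers to the GPU when one is available
    verbose=False,
)

# The completion API expects a plain string prompt, not a transformers tokenizer dict.
prompt = "Q: Describe what is shown in the image. A:"   # hypothetical prompt, for illustration only
output = llm(
    prompt,
    max_tokens=128,      # cap on newly generated tokens; None runs to the end of the context window
    stop=["Q:", "\n"],   # stop before the model starts a new question
    echo=False,          # do not repeat the prompt in the returned text
)
generated_text = output["choices"][0]["text"]
print(generated_text)

The return value mirrors the OpenAI-style completion dict, so downstream code reads the generated text from output["choices"][0]["text"] instead of decoding token IDs as with vicuna_model.generate.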