1inkusFace committed
Commit 03de205 · verified · 1 Parent(s): d20d564

Update app.py

Files changed (1): app.py +28 -1
app.py CHANGED
@@ -11,14 +11,33 @@ import nltk
 import scipy.io.wavfile
 import os
 import subprocess
+from huggingface_hub import hf_hub_download
 
 subprocess.run(['bash','llama.sh'])
+from llama_cpp import Llama
 
 os.environ["SAFETENSORS_FAST_GPU"] = "1"
 os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")
 
 from espnet2.bin.tts_inference import Text2Speech
-
+'''
+repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
+filename = "vicuna-7b-v1.5.Q8_0.gguf"
+hf_hub_download(repo_id=repo_id, filename=filename)
+llm = Llama(
+    model_path="./models/7B/llama-model.gguf",
+    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
+    # seed=1337,      # uncomment to set a specific seed
+    # n_ctx=2048,     # uncomment to enlarge the context window
+)
+'''
+llm = Llama.from_pretrained(
+    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
+    filename="vicuna-7b-v1.5.Q8_0.gguf",
+    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
+    verbose=False
+)
+
 try:
     nltk.data.find('taggers/averaged_perceptron_tagger_eng')
 except LookupError:
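
The adopted `Llama.from_pretrained` call folds the Hub download into model construction. For comparison, a minimal sketch of the two loading paths, assuming llama-cpp-python with Hub integration installed (`pip install llama-cpp-python huggingface-hub`); note that the commented-out variant above discards the cache path returned by `hf_hub_download` and points at a hard-coded `./models/7B/llama-model.gguf` instead:

# Sketch only: both paths end with the same GGUF model loaded; repo and
# file names are taken from the commit.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Two-step path: capture the cached file path and hand it to Llama
# (unlike the commented-out code, which ignores the returned path).
model_path = hf_hub_download(
    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
    filename="vicuna-7b-v1.5.Q8_0.gguf",
)
llm = Llama(model_path=model_path, n_gpu_layers=-1)

# One-step path taken by the commit: from_pretrained downloads (or reuses)
# the file from the Hub and constructs the model in one call.
llm = Llama.from_pretrained(
    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
    filename="vicuna-7b-v1.5.Q8_0.gguf",
    n_gpu_layers=-1,
    verbose=False,
)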
@@ -108,6 +127,7 @@ def process_audio(img, microphone, audio_upload, state, answer_mode): # Added a
     torch.backends.cudnn.deterministic = False
     torch.backends.cudnn.benchmark = True
     torch.set_float32_matmul_precision("highest")
+    '''
     vicuna_output = vicuna_model.generate(
         **vicuna_input,
         max_new_tokens = 512,
@@ -115,6 +135,13 @@ def process_audio(img, microphone, audio_upload, state, answer_mode): # Added a
         do_sample = True,
         low_memory = False
     )
+    '''
+    vicuna_output = llm(
+        **vicuna_input,
+        max_tokens=128,     # generate up to 128 tokens; None runs to the end of the context window
+        stop=["Q:", "\n"],  # stop just before the model would generate a new question
+        echo=True           # echo the prompt back in the output
+    )
     if answer_mode == 'medium':
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
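
This hunk swaps API families: transformers' `generate` consumes the tokenized `vicuna_input` mapping and returns token IDs, whereas `llama_cpp.Llama.__call__` takes a plain prompt string and returns an OpenAI-style completion dict. A hedged sketch of the new call and of reading its result, where `prompt` is an assumed stand-in for the string the app feeds Vicuna (the committed `**vicuna_input` splat only works if that mapping holds keyword arguments `Llama.__call__` accepts, not `input_ids` tensors):

# Sketch only: `llm` is the Llama instance loaded above; `prompt` is assumed.
output = llm(
    prompt,
    max_tokens=128,
    stop=["Q:", "\n"],
    echo=True,
)
# llama_cpp returns a completion dict rather than token IDs, so the
# transformers-style tokenizer.decode() step is no longer needed:
response_text = output["choices"][0]["text"]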
 
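
The context lines around the call also show per-mode precision tuning. A short annotated restatement, assuming a recent PyTorch (2.x) where these switches exist:

import torch

# "highest" forces true float32 matmuls; "high"/"medium" would permit
# TF32 or bfloat16-based fast paths instead.
torch.set_float32_matmul_precision("highest")

# For answer_mode == 'medium' the app relaxes this: TF32 tensor-core
# matmuls are allowed, while bf16 reduced-precision reductions stay off.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False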