1inkusFace committed
Commit 03de205 · verified · 1 Parent(s): d20d564

Update app.py

Files changed (1): app.py +28 -1
app.py CHANGED
@@ -11,14 +11,33 @@ import nltk
 import scipy.io.wavfile
 import os
 import subprocess
+from huggingface_hub import hf_hub_download
 
 subprocess.run(['bash','llama.sh'])
+from llama_cpp import Llama
 
 os.environ["SAFETENSORS_FAST_GPU"] = "1"
 os.putenv("HF_HUB_ENABLE_HF_TRANSFER","1")
 
 from espnet2.bin.tts_inference import Text2Speech
-
+'''
+repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
+filename = "vicuna-7b-v1.5.Q8_0.gguf"
+hf_hub_download(repo_id=repo_id, filename=filename)
+llm = Llama(
+    model_path="./models/7B/llama-model.gguf",
+    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
+    # seed=1337,      # uncomment to set a specific seed
+    # n_ctx=2048,     # uncomment to enlarge the context window
+)
+'''
+llm = Llama.from_pretrained(
+    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
+    filename="vicuna-7b-v1.5.Q8_0.gguf",
+    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
+    verbose=False
+)
+
 try:
     nltk.data.find('taggers/averaged_perceptron_tagger_eng')
 except LookupError:
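
The adopted `Llama.from_pretrained` call folds the Hub download into model construction. For comparison, a minimal sketch of the two loading paths, assuming llama-cpp-python with Hub integration installed (`pip install llama-cpp-python huggingface-hub`); note that the commented-out variant above discards the cache path returned by `hf_hub_download` and points at a hard-coded `./models/7B/llama-model.gguf` instead:

# Sketch only: both paths end with the same GGUF model loaded; repo and
# file names are taken from the commit.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Two-step path: capture the cached file path and hand it to Llama
# (unlike the commented-out code, which ignores the returned path).
model_path = hf_hub_download(
    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
    filename="vicuna-7b-v1.5.Q8_0.gguf",
)
llm = Llama(model_path=model_path, n_gpu_layers=-1)

# One-step path taken by the commit: from_pretrained downloads (or reuses)
# the file from the Hub and constructs the model in one call.
llm = Llama.from_pretrained(
    repo_id="TheBloke/vicuna-7B-v1.5-GGUF",
    filename="vicuna-7b-v1.5.Q8_0.gguf",
    n_gpu_layers=-1,
    verbose=False,
)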
@@ -108,6 +127,7 @@ def process_audio(img, microphone, audio_upload, state, answer_mode): # Added a
     torch.backends.cudnn.deterministic = False
     torch.backends.cudnn.benchmark = True
     torch.set_float32_matmul_precision("highest")
+    '''
     vicuna_output = vicuna_model.generate(
         **vicuna_input,
         max_new_tokens = 512,
@@ -115,6 +135,13 @@ def process_audio(img, microphone, audio_upload, state, answer_mode): # Added a
         do_sample = True,
         low_memory = False
     )
+    '''
+    vicuna_output = llm(
+        **vicuna_input,
+        max_tokens=128,     # generate up to 128 tokens; None runs to the end of the context window
+        stop=["Q:", "\n"],  # stop just before the model would generate a new question
+        echo=True           # echo the prompt back in the output
+    )
     if answer_mode == 'medium':
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
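
This hunk swaps API families: transformers' `generate` consumes the tokenized `vicuna_input` mapping and returns token IDs, whereas `llama_cpp.Llama.__call__` takes a plain prompt string and returns an OpenAI-style completion dict. A hedged sketch of the new call and of reading its result, where `prompt` is an assumed stand-in for the string the app feeds Vicuna (the committed `**vicuna_input` splat only works if that mapping holds keyword arguments `Llama.__call__` accepts, not `input_ids` tensors):

# Sketch only: `llm` is the Llama instance loaded above; `prompt` is assumed.
output = llm(
    prompt,
    max_tokens=128,
    stop=["Q:", "\n"],
    echo=True,
)
# llama_cpp returns a completion dict rather than token IDs, so the
# transformers-style tokenizer.decode() step is no longer needed:
response_text = output["choices"][0]["text"]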
 
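
The context lines around the call also show per-mode precision tuning. A short annotated restatement, assuming a recent PyTorch (2.x) where these switches exist:

import torch

# "highest" forces true float32 matmuls; "high"/"medium" would permit
# TF32 or bfloat16-based fast paths instead.
torch.set_float32_matmul_precision("highest")

# For answer_mode == 'medium' the app relaxes this: TF32 tensor-core
# matmuls are allowed, while bf16 reduced-precision reductions stay off.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False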