John Langley committed on
Commit 8167b16 · 1 Parent(s): ba649da

streaming voice

Files changed (2)
  1. app.py +8 -6
  2. utilsinference.py +16 -23
app.py CHANGED
@@ -7,7 +7,7 @@ from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from faster_whisper import WhisperModel
 
-from utilsinference import get_sentence, tts_interface
+from utilsinference import get_sentence, tts_interface, generate_llm_output
 
 os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
 os.system('python -m unidic download')
@@ -61,14 +61,16 @@ def respond(chat_history, voice):
     if not voice:
         return None, gr.Warning("Please select a voice.")
 
-    for sentence, chatbot_history in get_sentence(chat_history, mistral_llm):
-        print("Inserting sentence to queue")
-        print(sentence)
+    sentence = generate_llm_output(chat_history[-1][0], chat_history[:-1], mistral_llm)
+    audiopb = tts_interface(sentence, voice)
+
+    print("Inserting sentence to queue")
+    print(sentence)
+
 
-        audiopb = tts_interface(sentence, voice)
 
     #history, response = get_sentence(chat_history, mistral_llm)
-    yield chatbot_history, sentence, audiopb
+    yield chat_history, sentence, audiopb
 
 
 #Gradio Interface
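With this change, respond() builds one sentence with generate_llm_output, synthesizes it with tts_interface, and yields (chat_history, sentence, audiopb) to the UI. Below is a minimal sketch of how a generator like this is typically wired into Gradio streaming outputs; the component names and the placeholder body are assumptions for illustration, not code from this repo.

# Minimal sketch, assuming a Gradio Blocks layout similar to this Space.
# Component names (chatbot, sentence_box, audio_out, voice_dd) are hypothetical.
import gradio as gr

def respond(chat_history, voice):
    # Placeholder body: the real respond() calls generate_llm_output() and
    # tts_interface() to produce the sentence and its audio before yielding.
    sentence = f"(demo reply spoken with voice {voice!r})"
    audiopb = None  # stands in for the synthesized audio payload
    yield chat_history, sentence, audiopb

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    sentence_box = gr.Textbox(label="Current sentence")
    audio_out = gr.Audio(autoplay=True)
    voice_dd = gr.Dropdown(choices=["Voice 1", "Voice 2"], label="Voice")
    speak_btn = gr.Button("Respond")
    # Because respond() is a generator, Gradio pushes an update to all three
    # outputs on every yield instead of waiting for the function to finish.
    speak_btn.click(
        respond,
        inputs=[chatbot, voice_dd],
        outputs=[chatbot, sentence_box, audio_out],
    )

if __name__ == "__main__":
    demo.launch()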
utilsinference.py CHANGED
@@ -49,32 +49,25 @@ def generate_llm_output(
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
-    top_p = float(top_p)
+    top_p = float(top_p)
 
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=256,
-        top_p=top_p,
-        repetition_penalty=1.0,
-        do_sample=True,
-        seed=42,
-    )
+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=max_tokens,
+        top_p=top_p,
+        repetition_penalty=1.0,
+        do_sample=True,
+        seed=42,
+    )
 
     formatted_prompt = format_prompt(prompt, history)
-    try:
-        print("LLM Input:", formatted_prompt)
-        # Local GGUF
-        stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-        output = ""
-        for response in stream:
-            output += response.token.text
-            yield output
-        return output
-
-    except Exception as e:
-        print("Unhandled Exception: ", str(e))
-        gr.Warning("Unfortunately Mistral is unable to process")
-        output = "I do not know what happened but I could not understand you ."
+
+    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+    output = ""
+
+    for response in stream:
+        output += response.token.text
+        yield output
     return output
 
 
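generate_llm_output() is now a generator that yields the cumulative text after each token, so a caller reads the reply by iterating it; the trailing return inside a generator only surfaces through StopIteration, not as an ordinary return value. A small consumption sketch follows, assuming the (prompt, history, llm) call order used in app.py and the function's default sampling parameters; the helper name is hypothetical.

# Minimal sketch, not part of the commit: consuming the streaming generator.
def collect_reply(prompt, history, llm):
    final_text = ""
    for partial in generate_llm_output(prompt, history, llm):
        final_text = partial                   # each yield is the text so far
        print(partial, end="\r", flush=True)   # optional live progress
    print()
    return final_text

# Example (hypothetical chat history in Gradio's [[user, assistant], ...] form):
# reply = collect_reply("Hello there", [], mistral_llm)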