John Langley committed on
Commit 8167b16 · 1 Parent(s): ba649da

streaming voice

Files changed (2)
  1. app.py +8 -6
  2. utilsinference.py +16 -23
app.py CHANGED
@@ -7,7 +7,7 @@ from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from faster_whisper import WhisperModel
 
-from utilsinference import get_sentence, tts_interface
+from utilsinference import get_sentence, tts_interface, generate_llm_output
 
 os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
 os.system('python -m unidic download')
@@ -61,14 +61,16 @@ def respond(chat_history, voice):
     if not voice:
         return None, gr.Warning("Please select a voice.")
 
-    for sentence, chatbot_history in get_sentence(chat_history, mistral_llm):
-        print("Inserting sentence to queue")
-        print(sentence)
+    sentence = generate_llm_output(chat_history[-1][0], chat_history[:-1], mistral_llm)
+    audiopb = tts_interface(sentence, voice)
+
+    print("Inserting sentence to queue")
+    print(sentence)
+
 
-        audiopb = tts_interface(sentence, voice)
 
     #history, response = get_sentence(chat_history, mistral_llm)
-    yield chatbot_history, sentence, audiopb
+    yield chat_history, sentence, audiopb
 
 
 #Gradio Interface
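With this change, respond() builds one sentence with generate_llm_output, synthesizes it with tts_interface, and yields (chat_history, sentence, audiopb) to the UI. Below is a minimal sketch of how a generator like this is typically wired into Gradio streaming outputs; the component names and the placeholder body are assumptions for illustration, not code from this repo.

# Minimal sketch, assuming a Gradio Blocks layout similar to this Space.
# Component names (chatbot, sentence_box, audio_out, voice_dd) are hypothetical.
import gradio as gr

def respond(chat_history, voice):
    # Placeholder body: the real respond() calls generate_llm_output() and
    # tts_interface() to produce the sentence and its audio before yielding.
    sentence = f"(demo reply spoken with voice {voice!r})"
    audiopb = None  # stands in for the synthesized audio payload
    yield chat_history, sentence, audiopb

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    sentence_box = gr.Textbox(label="Current sentence")
    audio_out = gr.Audio(autoplay=True)
    voice_dd = gr.Dropdown(choices=["Voice 1", "Voice 2"], label="Voice")
    speak_btn = gr.Button("Respond")
    # Because respond() is a generator, Gradio pushes an update to all three
    # outputs on every yield instead of waiting for the function to finish.
    speak_btn.click(
        respond,
        inputs=[chatbot, voice_dd],
        outputs=[chatbot, sentence_box, audio_out],
    )

if __name__ == "__main__":
    demo.launch()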
utilsinference.py CHANGED
@@ -49,32 +49,25 @@ def generate_llm_output(
     temperature = float(temperature)
     if temperature < 1e-2:
         temperature = 1e-2
-    top_p = float(top_p)
+    top_p = float(top_p)
 
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=256,
-        top_p=top_p,
-        repetition_penalty=1.0,
-        do_sample=True,
-        seed=42,
-    )
+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=max_tokens,
+        top_p=top_p,
+        repetition_penalty=1.0,
+        do_sample=True,
+        seed=42,
+    )
 
     formatted_prompt = format_prompt(prompt, history)
-    try:
-        print("LLM Input:", formatted_prompt)
-        # Local GGUF
-        stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-        output = ""
-        for response in stream:
-            output += response.token.text
-            yield output
-        return output
-
-    except Exception as e:
-        print("Unhandled Exception: ", str(e))
-        gr.Warning("Unfortunately Mistral is unable to process")
-        output = "I do not know what happened but I could not understand you ."
+
+    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+    output = ""
+
+    for response in stream:
+        output += response.token.text
+        yield output
     return output
 
 
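generate_llm_output() is now a generator that yields the cumulative text after each token, so a caller reads the reply by iterating it; the trailing return inside a generator only surfaces through StopIteration, not as an ordinary return value. A small consumption sketch follows, assuming the (prompt, history, llm) call order used in app.py and the function's default sampling parameters; the helper name is hypothetical.

# Minimal sketch, not part of the commit: consuming the streaming generator.
def collect_reply(prompt, history, llm):
    final_text = ""
    for partial in generate_llm_output(prompt, history, llm):
        final_text = partial                   # each yield is the text so far
        print(partial, end="\r", flush=True)   # optional live progress
    print()
    return final_text

# Example (hypothetical chat history in Gradio's [[user, assistant], ...] form):
# reply = collect_reply("Hello there", [], mistral_llm)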