Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,68 +1,260 @@
import gradio as gr
-import
import requests
import tempfile
-import
-import numpy as np
-from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langchain_community.vectorstores import Neo4jVector
-from langchain_community.graphs import Neo4jGraph
-from langchain_experimental.graph_transformers import LLMGraphTransformer
-from langchain_core.prompts import ChatPromptTemplate
import time
-import
-from

-    stopped: bool = False
-    conversation: list = field(default_factory=list)

-def
-#
)

)

# Define the ASR model with Whisper
model_id = 'openai/whisper-large-v3'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -83,13 +275,12 @@ pipe_asr = pipeline(
    return_timestamps=True
)

-#
def auto_reset_state():
-    time.sleep(
    return None, ""  # Reset the state and clear input text


-# Function to process audio input and transcribe it
def transcribe_function(stream, new_chunk):
    try:
        sr, y = new_chunk[0], new_chunk[1]
@@ -97,6 +288,7 @@ def transcribe_function(stream, new_chunk):
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return stream, "", None

    if y is None or len(y) == 0:
        return stream, "", None

@@ -105,168 +297,94 @@ def transcribe_function(stream, new_chunk):
    if max_abs_y > 0:
        y = y / max_abs_y

    if stream is not None and len(stream) > 0:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
    full_text = result.get("text", "")

    threading.Thread(target=auto_reset_state).start()
-    return stream, full_text, full_text
-
-# Function to generate a full-text search query for Neo4j
-#def generate_full_text_query(input: str) -> str:
-    #full_text_query = ""
-    #words = [el for el in input.split() if el]
-    #for word in words[:-1]:
-        #full_text_query += f" {word}~2 AND"
-    #full_text_query += f" {words[-1]}~2"
-    #return full_text_query.strip()
-
-# Function to generate a full-text search query for Neo4j
-def generate_full_text_query(input: str) -> str:
-    # Split the input into words, ignoring any empty strings
-    words = [el for el in input.split() if el]
-
-    # Check if there are no words
-    if not words:
-        return ""  # Return an empty string or a default query if desired
-
-    # Create the full-text query with fuzziness (~2 for proximity search)
-    full_text_query = ""
-    for word in words[:-1]:
-        full_text_query += f" {word}~2 AND"
-    full_text_query += f" {words[-1]}~2"
-    return full_text_query.strip()
-

-# Function to generate audio with Eleven Labs TTS
-def generate_audio_elevenlabs(text):
-    XI_API_KEY = os.environ['ELEVENLABS_API']
-    VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
-    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
-    headers = {
-        "Accept": "application/json",
-        "xi-api-key": XI_API_KEY
-    }
-    data = {
-        "text": str(text),
-        "model_id": "eleven_multilingual_v2",
-        "voice_settings": {
-            "stability": 1.0,
-            "similarity_boost": 0.0,
-            "style": 0.60,
-            "use_speaker_boost": False
-        }
-    }
-    response = requests.post(tts_url, headers=headers, json=data, stream=True)
-    if response.ok:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-            audio_path = f.name
-        return audio_path  # Return audio path for automatic playback
-    else:
-        print(f"Error generating audio: {response.text}")
-        return None

-# Define the template for generating responses based on context
-template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
-Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational and straight-foreward way without any Greet.
-Context:
-{context}

-# Create a prompt object using the template
-prompt = ChatPromptTemplate.from_template(template)

-# Function to generate a response using the prompt and the context
-def generate_response_with_prompt(context, question):
-    formatted_prompt = prompt.format(
-        context=context,
-        question=question
-    )
-    # Use the ChatOpenAI instance to generate a response directly from the formatted prompt
-    llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
-    response = llm(formatted_prompt)
-    return response.content.strip()
-
-# Define the function to generate a hybrid response using Neo4j and other retrieval methods
-def retriever(question: str):
-    # Structured data retrieval from Neo4j
-    structured_query = f"""
-    CALL db.index.fulltext.queryNodes('entity', $query, {{limit: 2}})
-    YIELD node, score
-    RETURN node.id AS entity, node.text AS context, score
-    ORDER BY score DESC
-    LIMIT 2
-    """
-    structured_data = graph.query(structured_query, {"query": generate_full_text_query(question)})
-    structured_response = "\n".join([f"{record['entity']}: {record['context']}" for record in structured_data])
-
-    # Unstructured data retrieval from vector store
-    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
-    unstructured_response = "\n".join(unstructured_data)
-
-    # Combine structured and unstructured responses
-    combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
-
-    # Generate the final response using the prompt template
-    final_response = generate_response_with_prompt(combined_context, question)
-    return final_response

-    return audio_path, state
-
-with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
        with gr.Column():
    )
    )
-            cancel = gr.Button("Stop Conversation", variant="stop")
-            cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
-                         [state, input_audio], cancels=[stream, restart])

import gradio as gr
+import os
+import logging
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_openai import ChatOpenAI
+from langchain_community.graphs import Neo4jGraph
+from typing import List, Tuple
+from pydantic import BaseModel, Field
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import (
+    RunnableBranch,
+    RunnableLambda,
+    RunnablePassthrough,
+    RunnableParallel,
+)
+from langchain_core.prompts.prompt import PromptTemplate
import requests
import tempfile
+from langchain.memory import ConversationBufferWindowMemory
import time
+import logging
+from langchain.chains import ConversationChain
+import torch
+import torchaudio
+from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+import numpy as np
+import threading


+#code for history
+conversational_memory = ConversationBufferWindowMemory(
+    memory_key='chat_history',
+    k=10,
+    return_messages=True
+)

+# Setup Neo4j
+graph = Neo4jGraph(
+    url="neo4j+s://c62d0d35.databases.neo4j.io",
+    username="neo4j",
+    password="_x8f-_aAQvs2NB0x6s0ZHSh3W_y-HrENDbgStvsUCM0"
+)

+# Define entity extraction and retrieval functions
+class Entities(BaseModel):
+    names: List[str] = Field(
+        ..., description="All the person, organization, or business entities that appear in the text"
+    )

+entity_prompt = ChatPromptTemplate.from_messages([
+    ("system", "You are extracting organization and person entities from the text."),
+    ("human", "Use the given format to extract information from the following input: {question}"),
+])

+chat_model = ChatOpenAI(temperature=0, model_name="gpt-4o", api_key=os.environ['OPENAI_API_KEY'])
+entity_chain = entity_prompt | chat_model.with_structured_output(Entities)

+def remove_lucene_chars(input: str) -> str:
+    return input.translate(str.maketrans({
+        "\\": r"\\", "+": r"\+", "-": r"\-", "&": r"\&", "|": r"\|", "!": r"\!",
+        "(": r"\(", ")": r"\)", "{": r"\{", "}": r"\}", "[": r"\[", "]": r"\]",
+        "^": r"\^", "~": r"\~", "*": r"\*", "?": r"\?", ":": r"\:", '"': r'\"',
+        ";": r"\;", " ": r"\ "
+    }))

+def generate_full_text_query(input: str) -> str:
+    full_text_query = ""
+    words = [el for el in remove_lucene_chars(input).split() if el]
+    for word in words[:-1]:
+        full_text_query += f" {word}~2 AND"
+    full_text_query += f" {words[-1]}~2"
+    return full_text_query.strip()

+# Setup logging to a file to capture debug information
+logging.basicConfig(filename='neo4j_retrieval.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def structured_retriever(question: str) -> str:
+    result = ""
+    entities = entity_chain.invoke({"question": question})
+    for entity in entities.names:
+        response = graph.query(
+            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
+            YIELD node,score
+            CALL {
+              WITH node
+              MATCH (node)-[r:!MENTIONS]->(neighbor)
+              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
+              UNION ALL
+              WITH node
+              MATCH (node)<-[r:!MENTIONS]-(neighbor)
+              RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
+            }
+            RETURN output LIMIT 50
+            """,
+            {"query": generate_full_text_query(entity)},
+        )
+        result += "\n".join([el['output'] for el in response])
+    return result
+
+def retriever_neo4j(question: str):
+    structured_data = structured_retriever(question)
+    logging.debug(f"Structured data: {structured_data}")
+    return structured_data
+
+# Setup for condensing the follow-up questions
+_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question,
+in its original language.
+Chat History:
+{chat_history}
+Follow Up Input: {question}
+Standalone question:"""
+
+CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+
+def _format_chat_history(chat_history: list[tuple[str, str]]) -> list:
+    buffer = []
+    for human, ai in chat_history:
+        buffer.append(HumanMessage(content=human))
+        buffer.append(AIMessage(content=ai))
+    return buffer
+
+_search_query = RunnableBranch(
+    (
+        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
+            run_name="HasChatHistoryCheck"
+        ),
+        RunnablePassthrough.assign(
+            chat_history=lambda x: _format_chat_history(x["chat_history"])
+        )
+        | CONDENSE_QUESTION_PROMPT
+        | ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
+        | StrOutputParser(),
+    ),
+    RunnableLambda(lambda x: x["question"]),
)

+
+template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
+Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational way without any Greet.
+{context}
+Question: {question}
+Answer:"""
+
+
+qa_prompt = ChatPromptTemplate.from_template(template)
+
+# Define the chain for Neo4j-based retrieval and response generation
+chain_neo4j = (
+    RunnableParallel(
+        {
+            "context": _search_query | retriever_neo4j,
+            "question": RunnablePassthrough(),
+        }
+    )
+    | qa_prompt
+    | chat_model
+    | StrOutputParser()
)

+# Define the function to get the response
+def get_response(question):
+    try:
+        return chain_neo4j.invoke({"question": question})
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+# Define the function to clear input and output
+def clear_fields():
+    return [], "", None
+
+# Function to generate audio with Eleven Labs TTS
+def generate_audio_elevenlabs(text):
+    XI_API_KEY = os.environ['ELEVENLABS_API']
+    VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
+    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
+    headers = {
+        "Accept": "application/json",
+        "xi-api-key": XI_API_KEY
+    }
+    data = {
+        "text": str(text),
+        "model_id": "eleven_multilingual_v2",
+        "voice_settings": {
+            "stability": 1.0,
+            "similarity_boost": 0.0,
+            "style": 0.60,
+            "use_speaker_boost": False
+        }
+    }
+    response = requests.post(tts_url, headers=headers, json=data, stream=True)
+    if response.ok:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+            audio_path = f.name
+        logging.debug(f"Audio saved to {audio_path}")
+        return audio_path  # Return audio path for automatic playback
+    else:
+        logging.error(f"Error generating audio: {response.text}")
+        return None
+
+
+
+def handle_mode_selection(mode, chat_history, question):
+    if mode == "Normal Chatbot":
+        # Append the user's question to chat history first
+        chat_history.append((question, ""))  # Placeholder for the bot's response
+
+        # Stream the response and update chat history with each chunk
+        for response_chunk in chat_with_bot(chat_history):
+            chat_history[-1] = (question, response_chunk[-1][1])  # Update last entry with streamed response
+            yield chat_history, "", None  # Stream each chunk to display in the chatbot
+        yield chat_history, "", None  # Final yield to complete the response
+
+    elif mode == "Voice to Voice Conversation":
+        # Voice to Voice mode: Stream the response text and then convert it to audio
+        response_text = get_response(question)  # Retrieve response text
+        audio_path = generate_audio_elevenlabs(response_text)  # Convert response to audio
+        yield [], "", audio_path  # Only output the audio response without updating chatbot history
+
+
+# Function to add a user's message to the chat history and clear the input box
+def add_message(history, message):
+    if message.strip():
+        history.append((message, ""))  # Add the user's message to the chat history only if it's not empty
+    return history, ""  # Clear the input box
+
+# Define function to generate a streaming response
+def chat_with_bot(messages):
+    user_message = messages[-1][0]  # Get the last user message (input)
+    messages[-1] = (user_message, "")  # Prepare a placeholder for the bot's response
+
+    response = get_response(user_message)  # Assume `get_response` is a generator function
+
+    # Stream each character in the response and update the history progressively
+    for character in response:
+        messages[-1] = (user_message, messages[-1][1] + character)
+        yield messages  # Stream each updated chunk
+        time.sleep(0.05)  # Adjust delay as needed for real-time effect
+
+    yield messages  # Final yield to complete the response
+
+
+
+# Function to generate audio with Eleven Labs TTS from the last bot response
+def generate_audio_from_last_response(history):
+    # Get the most recent bot response from the chat history
+    if history and len(history) > 0:
+        recent_response = history[-1][1]  # The second item in the tuple is the bot response text
+        if recent_response:
+            return generate_audio_elevenlabs(recent_response)
+    return None
+
+
+
# Define the ASR model with Whisper
model_id = 'openai/whisper-large-v3'
device = "cuda:0" if torch.cuda.is_available() else "cpu"

    return_timestamps=True
)

+# Define the function to reset the state after 10 seconds
def auto_reset_state():
+    time.sleep(5)
    return None, ""  # Reset the state and clear input text


def transcribe_function(stream, new_chunk):
    try:
        sr, y = new_chunk[0], new_chunk[1]

        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return stream, "", None

+    # Ensure y is not empty and is at least 1-dimensional
    if y is None or len(y) == 0:
        return stream, "", None

    if max_abs_y > 0:
        y = y / max_abs_y

+    # Ensure stream is also at least 1-dimensional before concatenation
    if stream is not None and len(stream) > 0:
        stream = np.concatenate([stream, y])
    else:
        stream = y

+    # Process the audio data for transcription
    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
    full_text = result.get("text", "")

+    # Start a thread to reset the state after 10 seconds
    threading.Thread(target=auto_reset_state).start()

+    return stream, full_text, full_text


+# Define the function to clear the state and input text
+def clear_transcription_state():
+    return None, ""


+with gr.Blocks(theme="rawrsor1/Everforest") as demo:
+    # Hide the chatbot component by setting `visible=False`
+    chatbot = gr.Chatbot([], elem_id="RADAR", bubble_full_width=False, visible=False)

+    with gr.Row():
+        with gr.Column():
+            # Hide the "Normal Chatbot" radio button by removing it or setting `visible=False`
+            mode_selection = gr.Radio(
+                choices=["Voice to Voice Conversation"],  # Removed "Normal Chatbot" option
+                label="Mode Selection",
+                value="Voice to Voice Conversation",
+                visible=False  # Hide the mode selection entirely
+            )
+    # Remaining code unchanged
+    with gr.Row():
+        with gr.Column():
+            question_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...", visible=False)
+            audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1, label="Speak to Ask")
+            submit_voice_btn = gr.Button("Submit Voice")

+        with gr.Column():
+            audio_output = gr.Audio(label="Audio", type="filepath", autoplay=True, interactive=False)

    with gr.Row():
+
        with gr.Column():
+            clear_state_btn = gr.Button("Clear State")
+
        with gr.Column():
+            clean_btn = gr.Button("Clean")
+
+
+
+    # Adjust the interactions for the Get Response button
+    submit_voice_btn.click(
+        fn=handle_mode_selection,
+        inputs=[mode_selection, chatbot, question_input],
+        outputs=[chatbot, question_input, audio_output],
+        api_name="api_voice_to_voice_translation"
    )
+
+    # Speech-to-Text functionality
+    state = gr.State()
+    audio_input.stream(
+        transcribe_function,
+        inputs=[state, audio_input],
+        outputs=[state, question_input],
+        api_name="api_voice_to_text"
+    )
+
+
+
+    clean_btn.click(
+        fn=clear_fields,
+        inputs=[],
+        outputs=[chatbot, question_input, audio_output],
+        api_name="api_clear_textbox"
    )

+    # Clear state interaction
+    clear_state_btn.click(
+        fn=clear_transcription_state,
+        outputs=[question_input, state],
+        api_name="api_clean_state_transcription"
+    )

+# Launch the Gradio interface
+demo.launch(show_error=True, share=True)
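For reference, a minimal client-side sketch of how the endpoints registered above (api_voice_to_voice_translation and api_voice_to_text) could be exercised once the Space is running again; this is not part of the commit. The Space id "user/space-name" and the question text are placeholders, and depending on the gradio_client version, client.submit() may be needed instead of client.predict() for the generator-backed endpoint.

# Hypothetical usage sketch, not part of app.py; assumes `pip install gradio_client`
# and that the Space behind the placeholder id below is up and public.
from gradio_client import Client

client = Client("user/space-name")  # placeholder Space id

# Calls the submit_voice_btn.click() endpoint defined above. With multiple output
# components, predict() returns a tuple: (chat history, cleared question text, path to the .mp3).
history, question_text, audio_path = client.predict(
    "Voice to Voice Conversation",                             # mode_selection value
    [],                                                        # hidden chatbot history
    "What events are happening in Birmingham this weekend?",   # example question
    api_name="/api_voice_to_voice_translation",
)
print(audio_path)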