Pijush2023 committed on
Commit b8d3256 · verified · 1 Parent(s): 3595ee8

Update app.py

Files changed (1)
  1. app.py +107 -95
app.py CHANGED
@@ -8,18 +8,42 @@ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
  from langchain_openai import ChatOpenAI, OpenAIEmbeddings
  from langchain_community.vectorstores import Neo4jVector
  from langchain_community.graphs import Neo4jGraph
+ from langchain_experimental.graph_transformers import LLMGraphTransformer
  from langchain_core.prompts import ChatPromptTemplate
  import time
  import os
- from dataclasses import dataclass
 
- # Define AppState to store audio state information
+
+
  @dataclass
  class AppState:
      stream: np.ndarray | None = None
      sampling_rate: int = 0
      pause_detected: bool = False
-     started_talking: bool = False
+     started_talking: bool = False
+     stopped: bool = False
+     conversation: list = field(default_factory=list)
+
+ def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
+     """Take in the stream, determine if a pause happened"""
+
+     temp_audio = audio
+
+     dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
+     duration = len(audio) / sampling_rate
+
+     if dur_vad > 0.5 and not state.started_talking:
+         print("started talking")
+         state.started_talking = True
+         return False
+
+     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+
+     return (duration - dur_vad) > 1
+
+ def start_recording_user(state: AppState):
+     if not state.stopped:
+         return gr.Audio(recording=True)
 
  # Neo4j setup
  graph = Neo4jGraph(
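
The new determine_pause() depends on a run_vad() helper that no hunk in this diff defines or imports. As a rough sketch of the shape that helper would need — a voiced-duration estimate, the voiced samples, and the time spent — a simple energy-gate version could look like the following; the name, signature, and threshold are assumptions for illustration, not part of this commit, and a real implementation would more likely wrap a dedicated VAD model such as Silero VAD.

    # Hypothetical stand-in for the run_vad() call above; not part of this commit.
    import time
    import numpy as np

    def run_vad(audio: np.ndarray, sampling_rate: int, frame_ms: int = 30, threshold: float = 0.01):
        """Return (voiced_duration_s, voiced_audio, elapsed_s) using a simple RMS energy gate."""
        start = time.time()
        frame_len = max(1, int(sampling_rate * frame_ms / 1000))
        voiced = []
        for i in range(0, len(audio), frame_len):
            frame = audio[i:i + frame_len].astype(np.float64)
            if np.sqrt(np.mean(frame ** 2)) > threshold:  # keep frames above the energy gate
                voiced.append(frame)
        voiced_audio = np.concatenate(voiced) if voiced else np.zeros(0)
        return len(voiced_audio) / sampling_rate, voiced_audio, time.time() - start

Only the first and third return values (dur_vad and time_vad) are actually read by determine_pause(), so any detector that reports a voiced duration would slot in.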
@@ -58,79 +82,10 @@ pipe_asr = pipeline(
      return_timestamps=True
  )
 
- # Adjusted function to determine if a pause occurred
- def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-     """Take in the stream, determine if a pause happened."""
-     temp_audio = audio
-     dur_vad = len(temp_audio) / sampling_rate  # Simulating VAD duration for this example
-     duration = len(audio) / sampling_rate
-
-     # Log the duration and VAD result for debugging
-     print(f"Duration after VAD: {dur_vad:.3f} s, Total Duration: {duration:.3f} s")
-
-     # Check if speech has started
-     if dur_vad > 0.5 and not state.started_talking:
-         print("Started talking")
-         state.started_talking = True
-         return False
-
-     # If the difference between total duration and VAD duration is significant, consider it a pause
-     # Adjust the threshold for pause detection (e.g., 0.5 seconds)
-     pause_threshold = 0.5  # This value can be adjusted to be more sensitive
-     if (duration - dur_vad) > pause_threshold and state.started_talking:
-         print("Pause detected")
-         return True
-
-     return False
-
-
- # Function to process audio input, detect pauses, and handle state
- def process_audio(audio: tuple, state: AppState):
-     # Ensure audio input is not None and has valid data
-     if audio is None or audio[1] is None:
-         print("Audio input is None or empty.")
-         return None, state
-
-     if state.stream is None:
-         state.stream = audio[1]
-         state.sampling_rate = audio[0]
-     else:
-         state.stream = np.concatenate((state.stream, audio[1]))
-
-     # Check for a pause in speech
-     pause_detected = determine_pause(state.stream, state.sampling_rate, state)
-     state.pause_detected = pause_detected
-
-     if state.pause_detected and state.started_talking:
-         # Transcribe the audio when a pause is detected
-         _, transcription, _ = transcribe_function(state.stream, (state.sampling_rate, state.stream))
-         print(f"Transcription: {transcription}")
-
-         # Check if transcription is empty
-         if not transcription:
-             print("No transcription available.")
-             return None, state
-
-         # Retrieve hybrid response using Neo4j and other methods
-         response_text = retriever(transcription)
-         print(f"Response: {response_text}")
-
-         # Check if the response is empty before proceeding
-         if not response_text:
-             print("No response generated.")
-             return None, state
-
-         # Generate audio from the response text
-         audio_path = generate_audio_elevenlabs(response_text)
-
-         # Reset state for the next input
-         state.stream = None
-         state.started_talking = False
-         state.pause_detected = False
-
-         return audio_path, state
-
-     return None, state
+ # Function to reset the state after 10 seconds
+ def auto_reset_state():
+     time.sleep(2)
+     return None, ""  # Reset the state and clear input text
 
 
  # Function to process audio input and transcribe it
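
Two imports the new code appears to rely on are not added by any visible hunk: the first hunk removes `from dataclasses import dataclass` even though the new AppState still uses @dataclass and field(), and a later hunk calls threading.Thread() to run auto_reset_state() in the background. If those imports are not already present elsewhere in app.py, the module would also need something along these lines (an inference from the hunks shown here, not a change made by this commit):

    # Assumed prerequisites for the new code paths; verify against the full app.py.
    import threading
    from dataclasses import dataclass, field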
@@ -141,6 +96,7 @@ def transcribe_function(stream, new_chunk):
          print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
          return stream, "", None
 
+     # Ensure y is not empty and is at least 1-dimensional
      if y is None or len(y) == 0:
          return stream, "", None
 
@@ -149,27 +105,51 @@ def transcribe_function(stream, new_chunk):
      if max_abs_y > 0:
          y = y / max_abs_y
 
+     # Ensure stream is also at least 1-dimensional before concatenation
      if stream is not None and len(stream) > 0:
          stream = np.concatenate([stream, y])
      else:
          stream = y
 
+     # Process the audio data for transcription
      result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
      full_text = result.get("text", "")
 
+     # Start a thread to reset the state after 10 seconds
+     threading.Thread(target=auto_reset_state).start()
+
      return stream, full_text, full_text
 
+
+
+ # Function to generate a full-text search query for Neo4j
+ #def generate_full_text_query(input: str) -> str:
+     #full_text_query = ""
+     #words = [el for el in input.split() if el]
+     #for word in words[:-1]:
+         #full_text_query += f" {word}~2 AND"
+     #full_text_query += f" {words[-1]}~2"
+     #return full_text_query.strip()
+
+
  # Function to generate a full-text search query for Neo4j
  def generate_full_text_query(input: str) -> str:
+     # Split the input into words, ignoring any empty strings
      words = [el for el in input.split() if el]
+
+     # Check if there are no words
      if not words:
          return ""  # Return an empty string or a default query if desired
+
+     # Create the full-text query with fuzziness (~2 for proximity search)
      full_text_query = ""
      for word in words[:-1]:
          full_text_query += f" {word}~2 AND"
      full_text_query += f" {words[-1]}~2"
      return full_text_query.strip()
 
+
+
  # Function to generate audio with Eleven Labs TTS
  def generate_audio_elevenlabs(text):
      XI_API_KEY = os.environ['ELEVENLABS_API']
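
For a concrete sense of what generate_full_text_query() sends to the Neo4j full-text index: every word in the question becomes a Lucene fuzzy term (~2 allows up to two edits) and the terms are joined with AND. The input below is made up for illustration:

    # Illustrative input only; not taken from the app.
    query = generate_full_text_query("vegan restaurants downtown")
    print(query)  # vegan~2 AND restaurants~2 AND downtown~2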
@@ -196,14 +176,14 @@ def generate_audio_elevenlabs(text):
                  if chunk:
                      f.write(chunk)
              audio_path = f.name
-         return audio_path
+         return audio_path  # Return audio path for automatic playback
      else:
          print(f"Error generating audio: {response.text}")
          return None
 
  # Define the template for generating responses based on context
  template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
- Ask your question directly, and I'll provide a precise and quick, short and crisp response in a conversational and straightforward way without any Greet.
+ Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational and straight-foreward way without any Greet.
  Context:
  {context}
 
@@ -219,12 +199,14 @@ def generate_response_with_prompt(context, question):
          context=context,
          question=question
      )
+     # Use the ChatOpenAI instance to generate a response directly from the formatted prompt
      llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
      response = llm(formatted_prompt)
      return response.content.strip()
 
  # Define the function to generate a hybrid response using Neo4j and other retrieval methods
  def retriever(question: str):
+     # Structured data retrieval from Neo4j
      structured_query = f"""
      CALL db.index.fulltext.queryNodes('entity', $query, {{limit: 2}})
      YIELD node, score
@@ -235,28 +217,58 @@ def retriever(question: str):
      structured_data = graph.query(structured_query, {"query": generate_full_text_query(question)})
      structured_response = "\n".join([f"{record['entity']}: {record['context']}" for record in structured_data])
 
+     # Unstructured data retrieval from vector store
      unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
      unstructured_response = "\n".join(unstructured_data)
 
+     # Combine structured and unstructured responses
      combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
+
+     # Generate the final response using the prompt template
      final_response = generate_response_with_prompt(combined_context, question)
      return final_response
 
- # Create Gradio interface for audio input and output
- interface = gr.Interface(
-     fn=lambda audio, state: process_audio(audio, state),
-     inputs=[
-         gr.Audio(sources="microphone", type="numpy", streaming=True),
-         gr.State(AppState())
-     ],
-     outputs=[
-         gr.Audio(type="filepath", autoplay=True, interactive=False),
-         gr.State()
-     ],
-     live=True,
-     description="Ask questions via audio and receive audio responses.",
-     allow_flagging="never"
- )
-
- # Launch the Gradio app
- interface.launch()
+
+ # Function to handle the entire audio query and response process
+ def process_audio_query(audio_input):
+     stream = None
+     _, transcription, _ = transcribe_function(stream, audio_input)
+     print(f"Transcription: {transcription}")
+
+     # Retrieve hybrid response using Neo4j and other methods
+     response_text = retriever(transcription)
+     print(f"Response: {response_text}")
+
+     # Generate audio from the response text
+     audio_path = generate_audio_elevenlabs(response_text)
+     return audio_path
+
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
+         with gr.Column():
+             chatbot = gr.Chatbot(label="Conversation", type="messages")
+             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
+     state = gr.State(value=AppState())
+
+     stream = input_audio.stream(
+         process_audio_query,
+         [input_audio, state],
+         [output_audio, state],
+         stream_every=0.50,
+         time_limit=30,
+     )
+     restart = output_audio.stop(
+         start_recording_user,
+         [state],
+         [input_audio]
+     )
+     cancel = gr.Button("Stop Conversation", variant="stop")
+     cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
+                  [state, input_audio], cancels=[stream, restart])
+
+ demo.launch()
+
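
One caveat on the new Blocks wiring: input_audio.stream() is registered with [input_audio, state] as inputs and [output_audio, state] as outputs, yet process_audio_query() accepts a single argument and returns a single value, and the pause-detection fields added to AppState are never consulted there. A sketch of a signature that would match the declared event wiring — an assumption about intent, not the committed code — might be:

    # Sketch only: one parameter per input component, one return value per output component.
    def process_audio_query(audio_input, state: AppState):
        _, transcription, _ = transcribe_function(None, audio_input)
        print(f"Transcription: {transcription}")
        response_text = retriever(transcription)
        print(f"Response: {response_text}")
        audio_path = generate_audio_elevenlabs(response_text)
        return audio_path, state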