Pijush2023 committed
Commit 089a83f · verified · 1 Parent(s): 45f988c

Update app.py

Files changed (1)
  1. app.py +105 -86
app.py CHANGED
@@ -11,42 +11,15 @@ from langchain_community.graphs import Neo4jGraph
 from langchain_core.prompts import ChatPromptTemplate
 import time
 import os
-import io
-from pydub import AudioSegment
-from dataclasses import dataclass,field
-import numpy as np
-
+from dataclasses import dataclass
 
-# Define AppState dataclass for managing the application's state
+# Define AppState to store audio state information
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
     sampling_rate: int = 0
     pause_detected: bool = False
-    stopped: bool = False
-    conversation: list = field(default_factory=list)
-    #conversation: list = []
-
-
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """Take in the stream, determine if a pause happened"""
-
-    temp_audio = audio
-
-    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
-    duration = len(audio) / sampling_rate
-
-    if dur_vad > 0.5 and not state.started_talking:
-        print("started talking")
-        state.started_talking = True
-        return False
-
-    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-
-    return (duration - dur_vad) > 1
-
+    started_talking: bool = False
 
 # Neo4j setup
 graph = Neo4jGraph(
@@ -85,12 +58,23 @@ pipe_asr = pipeline(
     return_timestamps=True
 )
 
-# Function to reset the state after 2 seconds
-def auto_reset_state():
-    time.sleep(2)
-    return AppState()  # Reset the state
+# Function to determine if a pause occurred
+def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
+    """Take in the stream, determine if a pause happened"""
+    temp_audio = audio
+    dur_vad = len(temp_audio) / sampling_rate  # Simulating VAD duration for this example
+    duration = len(audio) / sampling_rate
+
+    if dur_vad > 0.5 and not state.started_talking:
+        print("Started talking")
+        state.started_talking = True
+        return False
+
+    print(f"Duration after VAD: {dur_vad:.3f} s")
+
+    return (duration - dur_vad) > 1  # Adjust the threshold for pause duration as needed
 
-# Function to process audio input and handle pause detection
+# Function to process audio input, detect pauses, and handle state
 def process_audio(audio: tuple, state: AppState):
     if state.stream is None:
         state.stream = audio[1]
@@ -98,52 +82,68 @@ def process_audio(audio: tuple, state: AppState):
     else:
         state.stream = np.concatenate((state.stream, audio[1]))
 
-    # Detect pauses in the audio stream
+    # Check for a pause in speech
    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
    state.pause_detected = pause_detected
 
-    # If a pause is detected and the user has started talking, stop recording
     if state.pause_detected and state.started_talking:
-        return gr.Audio(recording=False), state
+        # Transcribe the audio when a pause is detected
+        _, transcription, _ = transcribe_function(state.stream, (state.sampling_rate, state.stream))
+        print(f"Transcription: {transcription}")
+
+        # Retrieve hybrid response using Neo4j and other methods
+        response_text = retriever(transcription)
+        print(f"Response: {response_text}")
+
+        # Generate audio from the response text
+        audio_path = generate_audio_elevenlabs(response_text)
+
+        # Reset state for the next input
+        state.stream = None
+        state.started_talking = False
+        state.pause_detected = False
+
+        return audio_path, state
+
     return None, state
 
+# Function to process audio input and transcribe it
+def transcribe_function(stream, new_chunk):
+    try:
+        sr, y = new_chunk[0], new_chunk[1]
+    except TypeError:
+        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+        return stream, "", None
+
+    if y is None or len(y) == 0:
+        return stream, "", None
+
+    y = y.astype(np.float32)
+    max_abs_y = np.max(np.abs(y))
+    if max_abs_y > 0:
+        y = y / max_abs_y
+
+    if stream is not None and len(stream) > 0:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+
+    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+    full_text = result.get("text", "")
+
+    return stream, full_text, full_text
 
 # Function to generate a full-text search query for Neo4j
 def generate_full_text_query(input: str) -> str:
-    # Split the input into words, ignoring any empty strings
     words = [el for el in input.split() if el]
-
-    # Check if there are no words
     if not words:
         return ""  # Return an empty string or a default query if desired
-
-    # Create the full-text query with fuzziness (~2 for proximity search)
     full_text_query = ""
     for word in words[:-1]:
         full_text_query += f" {word}~2 AND"
     full_text_query += f" {words[-1]}~2"
     return full_text_query.strip()
 
-
-# Define the template for generating responses based on context
-template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
-Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational and straight-foreward way without any Greet.
-Context:
-{context}
-Question: {question}
-Answer concisely:"""
-
-# Create a prompt object using the template
-prompt = ChatPromptTemplate.from_template(template)
-
-# Function to generate a response using the prompt and the context
-def generate_response_with_prompt(context, question):
-    formatted_prompt = prompt.format(context=context, question=question)
-    llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
-    response = llm(formatted_prompt)
-    return response.content.strip()
-
 # Function to generate audio with Eleven Labs TTS
 def generate_audio_elevenlabs(text):
     XI_API_KEY = os.environ['ELEVENLABS_API']
@@ -170,15 +170,37 @@ def generate_audio_elevenlabs(text):
                 if chunk:
                     f.write(chunk)
             audio_path = f.name
-        return audio_path  # Return audio path for automatic playback
+        return audio_path
     else:
         print(f"Error generating audio: {response.text}")
         return None
 
-# Define the function to retrieve information using Neo4j and the vector store
+# Define the template for generating responses based on context
+template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
+Ask your question directly, and I'll provide a precise and quick, short and crisp response in a conversational and straightforward way without any Greet.
+Context:
+{context}
+
+Question: {question}
+Answer concisely:"""
+
+# Create a prompt object using the template
+prompt = ChatPromptTemplate.from_template(template)
+
+# Function to generate a response using the prompt and the context
+def generate_response_with_prompt(context, question):
+    formatted_prompt = prompt.format(
+        context=context,
+        question=question
+    )
+    llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
+    response = llm(formatted_prompt)
+    return response.content.strip()
+
+# Define the function to generate a hybrid response using Neo4j and other retrieval methods
 def retriever(question: str):
-    structured_query = """
-    CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
+    structured_query = f"""
+    CALL db.index.fulltext.queryNodes('entity', $query, {{limit: 2}})
     YIELD node, score
     RETURN node.id AS entity, node.text AS context, score
     ORDER BY score DESC
@@ -191,27 +213,24 @@ def retriever(question: str):
     unstructured_response = "\n".join(unstructured_data)
 
     combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
-    return generate_response_with_prompt(combined_context, question)
-
-# Function to handle the entire audio query and response process
-def process_audio_query(state: AppState, audio_input):
-    state, _ = process_audio(audio_input, state)
-    if state.pause_detected:
-        # Perform transcription once pause is detected
-        transcription = pipe_asr({"array": state.stream, "sampling_rate": state.sampling_rate}, return_timestamps=False)["text"]
-        response_text = retriever(transcription)
-        audio_path = generate_audio_elevenlabs(response_text)
-        return audio_path, state
-    return None, state
+    final_response = generate_response_with_prompt(combined_context, question)
+    return final_response
 
 # Create Gradio interface for audio input and output
-with gr.Blocks() as interface:
-    audio_input = gr.Audio(sources="microphone", type="numpy", streaming=True, every=0.1)
-    submit_button = gr.Button("Submit")
-    audio_output = gr.Audio(type="filepath", autoplay=True)
-    state = gr.State(AppState())
-
-    submit_button.click(fn=process_audio_query, inputs=[state, audio_input], outputs=[audio_output, state])
+interface = gr.Interface(
+    fn=lambda audio, state: process_audio(audio, state),
+    inputs=[
+        gr.Audio(sources="microphone", type="numpy", streaming=True),
+        gr.State(AppState())
+    ],
+    outputs=[
+        gr.Audio(type="filepath", autoplay=True, interactive=False),
+        gr.State()
+    ],
+    live=True,
+    description="Ask questions via audio and receive audio responses.",
+    allow_flagging="never"
+)
 
 # Launch the Gradio app
-interface.launch(show_error=True)
+interface.launch()
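
For quick reference, below is a minimal standalone sketch of the fuzzy full-text query builder that this commit keeps unchanged; the function body is copied from app.py above, while the sample sentence and the printed check are illustrative additions, not part of the commit.

# Hypothetical check of generate_full_text_query (body copied from app.py):
# each term gets a ~2 fuzziness suffix and terms are joined with AND.
def generate_full_text_query(input: str) -> str:
    words = [el for el in input.split() if el]
    if not words:
        return ""
    full_text_query = ""
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

print(generate_full_text_query("events in Birmingham"))
# -> events~2 AND in~2 AND Birmingham~2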