Pijush2023 committed (verified)
Commit 8527f42 · 1 Parent(s): f26ca23

Update app.py

Files changed (1)
  1. app.py +129 -113
app.py CHANGED
@@ -1,20 +1,40 @@
 import gradio as gr
 import os
 import logging
-import requests
-import tempfile
-import torch
-import numpy as np
-from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
-from langchain_community.graphs import Neo4jGraph
 from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
 from langchain_openai import ChatOpenAI
+from langchain_community.graphs import Neo4jGraph
+from typing import List, Tuple
 from pydantic import BaseModel, Field
-from typing import List
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import (
+    RunnableBranch,
+    RunnableLambda,
+    RunnablePassthrough,
+    RunnableParallel,
+)
+from langchain_core.prompts.prompt import PromptTemplate
+import requests
+import tempfile
+from langchain.memory import ConversationBufferWindowMemory
 import time
+import logging
+from langchain.chains import ConversationChain
+import torch
 import torchaudio
+from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+import numpy as np
+import threading

-# Neo4j Setup
+# Setup conversational memory
+conversational_memory = ConversationBufferWindowMemory(
+    memory_key='chat_history',
+    k=10,
+    return_messages=True
+)
+
+# Setup Neo4j connection
 graph = Neo4jGraph(
     url="neo4j+s://6457770f.databases.neo4j.io",
     username="neo4j",
@@ -51,6 +71,9 @@ def generate_full_text_query(input: str) -> str:
     full_text_query += f" {words[-1]}~2"
     return full_text_query.strip()

+# Setup logging to a file to capture debug information
+logging.basicConfig(filename='neo4j_retrieval.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+
 def structured_retriever(question: str) -> str:
     result = ""
     entities = entity_chain.invoke({"question": question})
@@ -74,9 +97,38 @@ def structured_retriever(question: str) -> str:
         result += "\n".join([el['output'] for el in response])
     return result

+def retriever_neo4j(question: str):
+    structured_data = structured_retriever(question)
+    logging.debug(f"Structured data: {structured_data}")
+    return structured_data
+
+# Define the chain for Neo4j-based retrieval and response generation
+chain_neo4j = (
+    RunnableParallel(
+        {
+            "context": RunnableLambda(lambda x: retriever_neo4j(x["question"])),
+            "question": RunnablePassthrough(),
+        }
+    )
+    | ChatPromptTemplate.from_template("Answer: {context} Question: {question}")
+    | chat_model
+    | StrOutputParser()
+)
+
+# Define the function to get the response
+def get_response(question):
+    try:
+        return chain_neo4j.invoke({"question": question})
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+# Define the function to clear input and output
+def clear_fields():
+    return [], "", None
+
 # Function to generate audio with Eleven Labs TTS
 def generate_audio_elevenlabs(text):
-    XI_API_KEY = os.environ.get('ELEVENLABS_API')
+    XI_API_KEY = os.environ['ELEVENLABS_API']
     VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
     tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
     headers = {
@@ -93,118 +145,82 @@ def generate_audio_elevenlabs(text):
             "use_speaker_boost": False
         }
     }
-
-    try:
-        logging.debug(f"Sending request to Eleven Labs with text: {text[:100]}...")
-        response = requests.post(tts_url, headers=headers, json=data, stream=True)
-
-        if response.ok:
-            logging.debug("Received successful response from Eleven Labs API.")
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
-                for chunk in response.iter_content(chunk_size=1024):
-                    if chunk:
-                        f.write(chunk)
-                audio_path = f.name
-            logging.debug(f"Audio successfully saved to {audio_path}")
-            return audio_path
-        else:
-            logging.error(f"Error generating audio: {response.status_code} - {response.text}")
-            return None
-    except Exception as e:
-        logging.error(f"Exception during audio generation: {str(e)}")
+    response = requests.post(tts_url, headers=headers, json=data, stream=True)
+    if response.ok:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+            audio_path = f.name
+        logging.debug(f"Audio saved to {audio_path}")
+        return audio_path  # Return audio path for automatic playback
+    else:
+        logging.error(f"Error generating audio: {response.text}")
     return None

-# Define the ASR model with Whisper
-model_id = 'openai/whisper-large-v3'
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
-processor = AutoProcessor.from_pretrained(model_id)
-
-pipe_asr = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
-    chunk_length_s=15,
-    batch_size=16,
-    torch_dtype=torch_dtype,
-    device=device,
-    return_timestamps=True
-)
+# Function to handle voice to voice conversation
+def handle_voice_to_voice(chat_history, question):
+    response = get_response(question)
+    audio_path = generate_audio_elevenlabs(response)
+    chat_history.append(("[Voice Input]", "[Voice Response]"))
+    return chat_history, "", audio_path

-# Function to handle audio input, transcribe, fetch from Neo4j, and generate audio response
-def transcribe_and_respond(audio):
-    if audio is None:
-        logging.error("No audio provided.")
-        return None, "No audio provided."
-
-    sr, y = audio
-    y = np.array(y).astype(np.float32)
-
-    # Resample to 16kHz if needed
-    target_sr = 16000
-    if sr != target_sr:
-        logging.debug(f"Resampling audio from {sr} Hz to {target_sr} Hz.")
-        y = torchaudio.functional.resample(torch.tensor(y), orig_freq=sr, new_freq=target_sr).numpy()
-        sr = target_sr
-
-    # Transcribe the audio using Whisper with English language setting
-    result = pipe_asr({"raw": y, "sampling_rate": sr}, return_timestamps=False)
-    question = result.get("text", "")
-
-    # Log the transcribed text for debugging
-    logging.debug(f"Transcribed text: {question}")
-
-    # Retrieve information from Neo4j
-    response_text = structured_retriever(question) if question else "I didn't understand the question."
-
-    # Convert the response to audio using Eleven Labs TTS
-    audio_path = generate_audio_elevenlabs(response_text) if response_text else None
-
-    # Ensure a valid audio path is returned
-    if audio_path and os.path.exists(audio_path):
-        logging.debug(f"Generated audio file path: {audio_path}")
+# Function to transcribe audio input
+def transcribe_function(stream, new_chunk):
+    try:
+        sr, y = new_chunk[0], new_chunk[1]
+    except TypeError:
+        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+        return stream, "", None
+
+    if y is None or len(y) == 0:
+        return stream, "", None
+
+    y = y.astype(np.float32)
+    max_abs_y = np.max(np.abs(y))
+    if max_abs_y > 0:
+        y = y / max_abs_y
+
+    if stream is not None and len(stream) > 0:
+        stream = np.concatenate([stream, y])
     else:
-        logging.error("Failed to generate audio or save audio to file.")
-        audio_path = None
-
-    return audio_path, response_text
+        stream = y
+
+    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+    full_text = result.get("text", "")

-# Function to clear the transcription state
-def clear_transcription_state():
-    return None, None
+    threading.Thread(target=auto_reset_state).start()

-# Define the Gradio interface with only audio input and output
+    return stream, full_text, full_text
+
+# Define the Gradio interface
 with gr.Blocks(theme="rawrsor1/Everforest") as demo:
-    with gr.Row():
-        audio_input = gr.Audio(
-            sources=["microphone"],
-            type='numpy',
-            label="Speak to Ask"
-        )
-        audio_output = gr.Audio(
-            label="Audio Response",
-            type="filepath",
-            autoplay=True,
-            interactive=False
-        )
-
-    # Submit button to process the audio input
-    submit_btn = gr.Button("Submit")
-    submit_btn.click(
-        fn=transcribe_and_respond,
-        inputs=audio_input,
-        outputs=[audio_output, gr.Textbox(label="Transcription")]
+    chatbot = gr.Chatbot([], elem_id="RADAR", bubble_full_width=False)
+    mode_selection = gr.Radio(
+        choices=["Normal Chatbot", "Voice to Voice Conversation"],
+        label="Mode Selection",
+        value="Normal Chatbot"
+    )
+    question_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...")
+    audio_input = gr.Audio(sources=["microphone"], streaming=True, type='numpy', every=0.1, label="Speak to Ask")
+    submit_voice_btn = gr.Button("Submit Voice")
+    audio_output = gr.Audio(label="Audio", type="filepath", autoplay=True, interactive=False)
+
+    # Interactions for Submit Voice Button
+    submit_voice_btn.click(
+        fn=handle_voice_to_voice,
+        inputs=[chatbot, question_input],
+        outputs=[chatbot, question_input, audio_output],
+        api_name="api_voice_to_voice_translation"
     )

-    # Clear state interaction
-    gr.Button("Clear State").click(
-        fn=clear_transcription_state,
-        outputs=[audio_output, gr.Textbox(label="Transcription")],
-        api_name="api_clean_state"
+    # Speech-to-Text functionality
+    state = gr.State()
+    audio_input.stream(
+        transcribe_function,
+        inputs=[state, audio_input],
+        outputs=[state, question_input],
+        api_name="api_voice_to_text"
     )

-    # Launch the Gradio interface
-    demo.launch(show_error=True, share=True)
+demo.launch(show_error=True, share=True)
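Note: the added transcribe_function and chain_neo4j reference pipe_asr, chat_model, and auto_reset_state, none of which appear in the added or context lines above, and this commit removes the previous pipe_asr definition. If they are not defined in unchanged parts of app.py, the new code will raise a NameError at runtime. A minimal sketch of plausible definitions is shown below; the Whisper setup mirrors the code removed by this commit, while the ChatOpenAI model name, the reset flag, and the 5-second delay are assumptions rather than part of the commit.

import time
import torch
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from langchain_openai import ChatOpenAI

# Whisper ASR pipeline, mirroring the definition removed by this commit
model_id = 'openai/whisper-large-v3'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True
)

# LLM consumed by chain_neo4j; the model name here is an assumption
chat_model = ChatOpenAI(temperature=0, model="gpt-4o")

# Hypothetical helper started by transcribe_function: after a short pause,
# set a flag that other code could check to reset the accumulated audio
# stream. Both the delay and the flag are assumptions, not in this commit.
stream_needs_reset = False

def auto_reset_state():
    global stream_needs_reset
    time.sleep(5)  # assumed pause length
    stream_needs_reset = True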