Pijush2023 committed
Commit c1009f8 · verified · 1 Parent(s): 12a16b6

Update app.py

Files changed (1)
app.py +133 -86
app.py CHANGED
@@ -1,75 +1,98 @@
  import gradio as gr
- import os
  import requests
  import tempfile
- import torch
  import numpy as np
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
- from langchain_core.prompts import ChatPromptTemplate
  from langchain_openai import ChatOpenAI
  from langchain_community.graphs import Neo4jGraph

- # Setup Neo4j
  graph = Neo4jGraph(
-     url="neo4j+s://6457770f.databases.neo4j.io",
      username="neo4j",
-     password="Z10duoPkKCtENuOukw3eIlvl0xJWKtrVSr-_hGX1LQ4"
  )

- # Define a concise prompt template for generating responses
- template = """I am a guide for Birmingham, Alabama. I will provide a precise and short response based solely on the provided data.
- Do not include any additional commentary or context.

- Data:
- {context}

- User's question: {question}
- Answer:"""
- qa_prompt = ChatPromptTemplate.from_template(template)

- # Chat model configuration
- chat_model = ChatOpenAI(temperature=0, model_name="gpt-4o", api_key=os.environ['OPENAI_API_KEY'])

- # Function to generate a query for Neo4j and retrieve information
- def generate_full_text_query(input: str) -> str:
-     return " ".join([f"{word}~2" for word in input.split()])

- def retrieve_from_neo4j(question: str) -> str:
-     query = generate_full_text_query(question)
-     response = graph.query(
-         """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
-         YIELD node, score
-         RETURN node.name AS name, node.description AS description LIMIT 5""",
-         {"query": query}
-     )
-     context = "\n".join([f"{el['name']}: {el['description']}" for el in response])
-     return context

- # Function to generate the response using the prompt template and Neo4j data
- def get_response(question):
-     try:
-         context = retrieve_from_neo4j(question)
-         prompt = qa_prompt.format_prompt(context=context, question=question)
-         response = chat_model(prompt.to_string())
-
-         # Filter extraneous content, keeping only the answer part
-         if "Answer:" in response:
-             response = response.split("Answer:")[-1].strip()  # Extract the part after "Answer:" and strip extra spaces
-
-         return response
-     except Exception as e:
-         return f"Error: {str(e)}"

  # Function to generate audio with Eleven Labs TTS
  def generate_audio_elevenlabs(text):
      XI_API_KEY = os.environ['ELEVENLABS_API']
      VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
      tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
-     headers = {"Accept": "application/json", "xi-api-key": XI_API_KEY}
      data = {
          "text": str(text),
          "model_id": "eleven_multilingual_v2",
-         "voice_settings": {"stability": 1.0, "similarity_boost": 0.0}
      }
      response = requests.post(tts_url, headers=headers, json=data, stream=True)
      if response.ok:
@@ -78,52 +101,76 @@ def generate_audio_elevenlabs(text):
                  if chunk:
                      f.write(chunk)
              audio_path = f.name
-         return audio_path
      else:
          return None

- # Define the ASR model with Whisper
- model_id = 'openai/whisper-large-v3'
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
- model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
- processor = AutoProcessor.from_pretrained(model_id)

- pipe_asr = pipeline(
-     "automatic-speech-recognition",
-     model=model,
-     tokenizer=processor.tokenizer,
-     feature_extractor=processor.feature_extractor,
-     max_new_tokens=128,
-     chunk_length_s=15,
-     batch_size=16,
-     torch_dtype=torch_dtype,
-     device=device,
-     return_timestamps=True
- )

- # Define the function to transcribe audio and generate a response
- def transcribe_and_respond(audio):
-     sr, y = audio[0], audio[1]
-     y = y.astype(np.float32)
-     max_abs_y = np.max(np.abs(y))
-     if max_abs_y > 0:
-         y = y / max_abs_y
-     result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
-     text = result.get("text", "")
-     response = get_response(text)
-     audio_path = generate_audio_elevenlabs(response)
-     return audio_path

- with gr.Blocks() as demo:
-     audio_input = gr.Audio(sources=["microphone"], streaming=False, type='numpy', label="Speak to Ask")
-     audio_output = gr.Audio(label="Audio", type="filepath", autoplay=True, interactive=False)
-
-     audio_input.change(
-         fn=transcribe_and_respond,
-         inputs=audio_input,
-         outputs=audio_output,
      )

- # Launch the Gradio interface
- demo.launch(show_error=True, share=True)
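The removed listing ends here; the replacement version follows. One deleted helper, generate_full_text_query, is still called by the new retriever below: it appends a Lucene fuzzy-match operator (~2) to each whitespace-separated word of the question before the full-text index lookup. A minimal sketch of its behaviour, with a sample question that is only illustrative:

def generate_full_text_query(input: str) -> str:
    # Append a Lucene fuzzy operator (~2) to every word of the question.
    return " ".join([f"{word}~2" for word in input.split()])

print(generate_full_text_query("museums in Birmingham"))
# museums~2 in~2 Birmingham~2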
 
  import gradio as gr
+ import torch
  import requests
  import tempfile
+ import threading
  import numpy as np
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
  from langchain_openai import ChatOpenAI
+ from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
  from langchain_community.graphs import Neo4jGraph
+ from langchain_experimental.graph_transformers import LLMGraphTransformer
+ from langchain_core.prompts import ChatPromptTemplate

+ # Neo4j setup
  graph = Neo4jGraph(
+     url="neo4j+s://c62d0d35.databases.neo4j.io",
      username="neo4j",
+     password="_x8f-_aAQvs2NB0x6s0ZHSh3W_y-HrENDbgStvsUCM0"
  )

+ # Define the ASR model with Whisper
+ model_id = 'openai/whisper-large-v3'
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+ processor = AutoProcessor.from_pretrained(model_id)

+ pipe_asr = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     max_new_tokens=128,
+     chunk_length_s=15,
+     batch_size=16,
+     torch_dtype=torch_dtype,
+     device=device,
+     return_timestamps=True
+ )

+ # Function to reset the state after a short delay
+ def auto_reset_state():
+     time.sleep(5)
+     return None, ""  # Reset the state and clear input text

+ # Function to process audio input and transcribe it
+ def transcribe_function(stream, new_chunk):
+     try:
+         sr, y = new_chunk[0], new_chunk[1]
+     except TypeError:
+         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+         return stream, "", None

+     # Ensure y is not empty and is at least 1-dimensional
+     if y is None or len(y) == 0:
+         return stream, "", None

+     y = y.astype(np.float32)
+     max_abs_y = np.max(np.abs(y))
+     if max_abs_y > 0:
+         y = y / max_abs_y

+     # Ensure stream is also at least 1-dimensional before concatenation
+     if stream is not None and len(stream) > 0:
+         stream = np.concatenate([stream, y])
+     else:
+         stream = y
+
+     # Process the audio data for transcription
+     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+     full_text = result.get("text", "")
+
+     # Start a thread to reset the state after a short delay
+     threading.Thread(target=auto_reset_state).start()
+
+     return stream, full_text, full_text

  # Function to generate audio with Eleven Labs TTS
  def generate_audio_elevenlabs(text):
      XI_API_KEY = os.environ['ELEVENLABS_API']
      VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
      tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
+     headers = {
+         "Accept": "application/json",
+         "xi-api-key": XI_API_KEY
+     }
      data = {
          "text": str(text),
          "model_id": "eleven_multilingual_v2",
+         "voice_settings": {
+             "stability": 1.0,
+             "similarity_boost": 0.0,
+             "style": 0.60,
+             "use_speaker_boost": False
+         }
      }
      response = requests.post(tts_url, headers=headers, json=data, stream=True)
      if response.ok:
 
                  if chunk:
                      f.write(chunk)
              audio_path = f.name
+         return audio_path  # Return audio path for automatic playback
      else:
+         print(f"Error generating audio: {response.text}")
          return None

+ # Define the template for generating responses based on context
+ template = """Use the following context to answer the question:
+ Context:
+ {context}

+ Question: {question}
+ Answer concisely:"""

+ # Create a prompt object using the template
+ prompt = ChatPromptTemplate.from_template(template)

+ # Function to generate a response using the prompt and the context
+ def generate_response_with_prompt(context, question):
+     response = prompt.format(
+         context=context,
+         question=question
      )
+     return response
+
+ # Define the function to generate a hybrid response using Neo4j and other retrieval methods
+ def retriever(question: str):
+     # Structured data retrieval from Neo4j
+     structured_query = f"""
+     CALL db.index.fulltext.queryNodes('entity', $query, {{limit: 2}})
+     YIELD node, score
+     RETURN node.id AS entity, node.text AS context, score
+     ORDER BY score DESC
+     LIMIT 2
+     """
+     structured_data = graph.query(structured_query, {"query": generate_full_text_query(question)})
+     structured_response = "\n".join([f"{record['entity']}: {record['context']}" for record in structured_data])
+
+     # Unstructured data retrieval from vector store
+     unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
+     unstructured_response = "\n".join(unstructured_data)
+
+     # Combine structured and unstructured responses
+     combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
+
+     # Generate the final response using the prompt template
+     final_response = generate_response_with_prompt(combined_context, question)
+     return final_response
+
+ # Function to handle the entire audio query and response process
+ def process_audio_query(audio_input):
+     stream = None
+     _, transcription, _ = transcribe_function(stream, audio_input)
+     print(f"Transcription: {transcription}")
+
+     # Retrieve hybrid response using Neo4j and other methods
+     response_text = retriever(transcription)
+     print(f"Response: {response_text}")
+
+     # Generate audio from the response text
+     audio_path = generate_audio_elevenlabs(response_text)
+     return audio_path
+
+ # Create Gradio interface for audio input and output
+ interface = gr.Interface(
+     fn=process_audio_query,
+     inputs=gr.Audio(source="microphone", type="numpy"),
+     outputs="audio",
+     live=True,
+     description="Ask questions via audio and receive audio responses."
+ )

+ # Launch the Gradio app
+ interface.launch()
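As committed, the new app.py still references a few names it never defines or imports: os and time (used by generate_audio_elevenlabs and auto_reset_state), generate_full_text_query (removed by this commit but still called from retriever, reproduced after the removed listing above), and vector_index (used for the unstructured similarity search but never created). A minimal sketch of the missing pieces follows; the OpenAIEmbeddings model and the index name "vector" are assumptions for illustration, not part of the commit.

import os
import time

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Neo4jVector

# Hypothetical vector index over the same Neo4j instance; the embedding
# model and index_name are assumed, so adjust them to the real deployment.
vector_index = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(),
    url="neo4j+s://c62d0d35.databases.neo4j.io",
    username="neo4j",
    password="_x8f-_aAQvs2NB0x6s0ZHSh3W_y-HrENDbgStvsUCM0",
    index_name="vector",  # assumed index name
)

Note also that generate_response_with_prompt returns the formatted prompt string rather than a model answer, so the spoken reply would read the prompt text back instead of answering. If the intent is to keep the GPT-4o call that the removed version made, one hedged variant is:

from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(temperature=0, model_name="gpt-4o", api_key=os.environ['OPENAI_API_KEY'])

def generate_response_with_prompt(context, question):
    # Format the prompt and send it through the chat model instead of
    # returning the raw prompt text.
    formatted = prompt.format(context=context, question=question)
    return chat_model.invoke(formatted).content

Finally, gr.Audio(source="microphone", type="numpy") matches the older Gradio 3.x signature; on Gradio 4.x the equivalent argument is sources=["microphone"], as the removed version used.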