Spaces:
Build error
Build error
File size: 9,209 Bytes
7f3430b c1009f8 9916325 c1009f8 9916325 7a077d7 b88951f 7a077d7 b8d3256 c1009f8 f684fa1 a53d4df 2f82cd7 f4466b2 b8d3256 d5f43a3 b8d3256 9916325 c1009f8 c71d159 c1009f8 c71d159 c1009f8 92b0167 b88951f c8dd9c0 b88951f f684fa1 c1009f8 b370650 c1009f8 8ff2c37 c1009f8 b370650 b8d3256 5f00699 3595ee8 089a83f b8d3256 089a83f c1009f8 089a83f b8d3256 089a83f b8d3256 089a83f b8d3256 089a83f 0e49572 b8d3256 0e49572 b8d3256 0e49572 b8d3256 0e49572 b8d3256 0e49572 b8d3256 7702656 8527f42 7702656 7da87e1 8527f42 7da87e1 b8d3256 8527f42 c1009f8 7702656 089a83f b8d3256 089a83f b8d3256 089a83f c1009f8 b8d3256 089a83f c1009f8 42aa752 c1009f8 b8d3256 c1009f8 b8d3256 c1009f8 b8d3256 089a83f 1e2e0a1 934e44a b8d3256 93f1ece b8d3256 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
import gradio as gr
import torch
import requests
import tempfile
import threading
import numpy as np
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.prompts import ChatPromptTemplate
import time
import os
from dataclasses import dataclass, field
@dataclass
class AppState:
stream: np.ndarray | None = None
sampling_rate: int = 0
pause_detected: bool = False
started_talking: bool = False
stopped: bool = False
conversation: list = field(default_factory=list)
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
    """Report whether the recorded audio ends in a pause.

    Args:
        audio: Samples recorded so far; ``len(audio) / sampling_rate`` is
            taken as the total duration in seconds.
        sampling_rate: Samples per second of ``audio`` (must be non-zero).
        state: Session state; ``started_talking`` is set to True here the
            first time enough speech is detected.

    Returns:
        False on the call that first detects speech; otherwise True when the
        total duration exceeds the VAD-reported duration by more than one
        second.
    """
    # NOTE(review): run_vad is not defined or imported in the visible part of
    # this file - confirm where it comes from. Its first and third results are
    # used as durations in seconds (presumably detected speech and VAD run time).
    speech_seconds, _, vad_seconds = run_vad(audio, sampling_rate)
    total_seconds = len(audio) / sampling_rate

    just_started = speech_seconds > 0.5 and not state.started_talking
    if just_started:
        print("started talking")
        state.started_talking = True
        return False

    print(f"duration_after_vad: {speech_seconds:.3f} s, time_vad: {vad_seconds:.3f} s")
    non_speech_seconds = total_seconds - speech_seconds
    return non_speech_seconds > 1
def start_recording_user(state: AppState):
    """Re-arm the microphone unless the conversation has been stopped.

    Returns:
        A ``gr.Audio`` update with ``recording=True`` while ``state.stopped``
        is False, otherwise None (no update).
    """
    if state.stopped:
        return None
    return gr.Audio(recording=True)
# Neo4j setup
# The password is read from the environment instead of being committed with
# the source. NEO4J_PASSWORD must be set (a missing value raises KeyError at
# start-up); NEO4J_URI and NEO4J_USERNAME fall back to the previous values.
# NOTE(review): the password that used to be hard-coded here has been
# published and should be rotated.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "neo4j+s://c62d0d35.databases.neo4j.io"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ["NEO4J_PASSWORD"],
)
# Build the vector index on top of the nodes already stored in Neo4j.
# Embeddings come from OpenAI; the API key is taken from OPENAI_API_KEY.
embedding_model = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
vector_index = Neo4jVector.from_existing_graph(
    embedding_model,
    graph=graph,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)
# Speech-to-text: load Whisper and wrap it in a transformers ASR pipeline.
model_id = 'openai/whisper-large-v3'

# Use the first GPU with half precision when CUDA is available,
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
processor = AutoProcessor.from_pretrained(model_id)

pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    # Pipeline default; transcribe_function overrides it per call.
    return_timestamps=True,
)
# Function to reset the state after a short delay (2 seconds by default)
def auto_reset_state(delay: float = 2):
    """Wait ``delay`` seconds, then return the values that clear the state.

    Args:
        delay: Seconds to sleep before returning. Defaults to 2, the value
            that used to be hard-coded.

    Returns:
        ``(None, "")`` - an empty stream state and an empty input text.
        When this function is used as a ``threading.Thread`` target the
        return value is discarded.
    """
    time.sleep(delay)
    return None, ""  # Reset the state and clear input text
# Function to process audio input and transcribe it
def transcribe_function(stream, new_chunk):
    """Append a microphone chunk to the running stream and transcribe it all.

    Args:
        stream: Audio accumulated from earlier chunks, or None / empty when
            this is the first chunk.
        new_chunk: ``(sampling_rate, samples)`` pair for the new audio.

    Returns:
        ``(stream, text, text)``: the updated stream and the transcription of
        the whole stream (twice). When the chunk is malformed or empty the
        stream is returned unchanged as ``(stream, "", None)``.
    """
    try:
        sr, y = new_chunk[0], new_chunk[1]
    except (TypeError, IndexError):
        # None, a non-indexable object, or a chunk with fewer than two items.
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return stream, "", None

    # Nothing to transcribe for a missing or empty sample buffer.
    if y is None or len(y) == 0:
        return stream, "", None

    y = np.asarray(y, dtype=np.float32)
    # The ASR pipeline expects mono audio; average the channels of
    # multi-channel input (assumes a (samples, channels) layout - TODO confirm).
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise the chunk; an all-zero chunk is left untouched to avoid
    # dividing by zero.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    if stream is not None and len(stream) > 0:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Transcribe everything received so far, not just the new chunk.
    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
    full_text = result.get("text", "")

    # No background "reset" thread is started here: its return value could
    # never reach the caller, so it only left a sleeping thread per chunk.
    return stream, full_text, full_text
# Function to generate a full-text search query for Neo4j
#def generate_full_text_query(input: str) -> str:
#full_text_query = ""
#words = [el for el in input.split() if el]
#for word in words[:-1]:
#full_text_query += f" {word}~2 AND"
#full_text_query += f" {words[-1]}~2"
#return full_text_query.strip()
# Function to generate a full-text search query for Neo4j
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy Lucene query that requires every word of ``input``.

    Each word gets the ``~2`` suffix (fuzzy match, edit distance up to 2) and
    the words are combined with ``AND``.

    Args:
        input: Free text, e.g. a transcribed question.

    Returns:
        The Lucene query string, or ``""`` when ``input`` contains no words.
    """
    # Characters with a meaning in Lucene query syntax. Left in place they
    # act as operators or wildcards, or make the query unparseable - e.g. the
    # "?" at the end of a spoken question. They are replaced by spaces.
    lucene_special = '+-&|!(){}[]^"~*?:\\/'
    cleaned = input.translate(str.maketrans(lucene_special, " " * len(lucene_special)))

    # str.split() without arguments already drops empty strings.
    words = cleaned.split()
    return " AND ".join(f"{word}~2" for word in words)
# Function to generate audio with Eleven Labs TTS
def generate_audio_elevenlabs(text):
    """Synthesise ``text`` with the ElevenLabs streaming TTS endpoint.

    Args:
        text: Text to speak; converted with ``str()`` before sending.

    Returns:
        Path of a temporary ``.mp3`` file holding the audio, or None when the
        API answers with an error status. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        KeyError: If the ``ELEVENLABS_API`` environment variable is not set.
        requests.RequestException: On connection problems or timeouts.
    """
    XI_API_KEY = os.environ['ELEVENLABS_API']
    VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": XI_API_KEY
    }
    data = {
        "text": str(text),
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 1.0,
            "similarity_boost": 0.0,
            "style": 0.60,
            "use_speaker_boost": False
        }
    }
    # The context manager closes the streamed response (and releases its
    # connection) on every path; the (connect, read) timeout keeps a stalled
    # request from blocking the callback forever.
    with requests.post(
        tts_url, headers=headers, json=data, stream=True, timeout=(10, 60)
    ) as response:
        if not response.ok:
            print(f"Error generating audio: {response.text}")
            return None
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
            audio_path = f.name
    return audio_path  # Return audio path for automatic playback
# Prompt template for answer generation. The {context} and {question}
# placeholders are filled in by generate_response_with_prompt.
template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational and straight-foreward way without any Greet.
Context:
{context}
Question: {question}
Answer concisely:"""
# Create a prompt object using the template
prompt = ChatPromptTemplate.from_template(template)
# Function to generate a response using the prompt and the context
def generate_response_with_prompt(context, question):
    """Ask the chat model to answer ``question`` using ``context``.

    Args:
        context: Retrieved text inserted into the prompt's ``{context}`` slot.
        question: User question inserted into the ``{question}`` slot.

    Returns:
        The model's answer with surrounding whitespace stripped.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    # Render the template into chat messages rather than one flat string, so
    # the model receives properly typed messages.
    messages = prompt.format_messages(context=context, question=question)
    llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
    # invoke() is the supported entry point; calling the model object directly
    # is deprecated in LangChain.
    response = llm.invoke(messages)
    return response.content.strip()
# Define the function to generate a hybrid response using Neo4j and other retrieval methods
def retriever(question: str):
    """Answer ``question`` from Neo4j full-text hits plus vector-store hits.

    Args:
        question: The user's question as plain text.

    Returns:
        The answer text produced by ``generate_response_with_prompt`` from the
        combined structured and unstructured context.
    """
    # Structured data retrieval from Neo4j: the two best matches from the
    # 'entity' full-text index.
    structured_query = """
    CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
    YIELD node, score
    RETURN node.id AS entity, node.text AS context, score
    ORDER BY score DESC
    LIMIT 2
    """
    full_text_query = generate_full_text_query(question)
    if full_text_query:
        structured_data = graph.query(structured_query, {"query": full_text_query})
    else:
        # An empty string is not a usable full-text query, so the index is
        # not queried when the question contains no words.
        structured_data = []
    structured_response = "\n".join(
        f"{record['entity']}: {record['context']}" for record in structured_data
    )

    # Unstructured data retrieval from vector store
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    unstructured_response = "\n".join(unstructured_data)

    # Combine structured and unstructured responses
    combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"

    # Generate the final response using the prompt template
    return generate_response_with_prompt(combined_context, question)
# Function to handle the entire audio query and response process
def process_audio_query(audio_input):
    """Run one spoken query through transcription, retrieval and TTS.

    Args:
        audio_input: ``(sampling_rate, samples)`` pair from the microphone.

    Returns:
        Path of the generated answer audio, or None when nothing was
        transcribed or speech synthesis failed.
    """
    # Each call is transcribed on its own, without a carried-over stream.
    _, transcription, _ = transcribe_function(None, audio_input)
    print(f"Transcription: {transcription}")

    # Silence or a malformed chunk yields no text; skip the database, LLM and
    # TTS calls instead of answering an empty question.
    if not transcription or not transcription.strip():
        return None

    # Retrieve hybrid response using Neo4j and other methods
    response_text = retriever(transcription)
    print(f"Response: {response_text}")

    # Generate audio from the response text
    return generate_audio_elevenlabs(response_text)
# Gradio UI: microphone input on the left, spoken answer on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
        with gr.Column():
            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
    state = gr.State(value=AppState())

    # process_audio_query takes the audio only and returns a single audio
    # path, so the event passes exactly one input and receives one output.
    stream = input_audio.stream(
        process_audio_query,
        [input_audio],
        [output_audio],
        every=0.50
    )
    # When playback of the answer stops, start recording again unless the
    # conversation has been stopped.
    restart = output_audio.stop(
        start_recording_user,
        [state],
        [input_audio]
    )
    # Mark the session as stopped, stop recording and cancel pending events.
    cancel = gr.Button("Stop Conversation", variant="stop")
    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
                 [state, input_audio], cancels=[stream, restart])
demo.launch()
|