Spaces:
Runtime error
Runtime error
File size: 6,075 Bytes
7f3430b c1009f8 9916325 c1009f8 9916325 7a077d7 c1009f8 7a077d7 c1009f8 9916325 c1009f8 c71d159 c1009f8 c71d159 c1009f8 92b0167 c1009f8 b370650 c1009f8 b370650 c1009f8 b370650 c1009f8 f0bef0b c1009f8 f0bef0b c1009f8 b370650 c1009f8 b370650 7702656 8527f42 7702656 c1009f8 165cb65 c1009f8 165cb65 8527f42 c1009f8 8527f42 c1009f8 7702656 c1009f8 7e66356 c1009f8 8527f42 c1009f8 f0bef0b c1009f8 28374c4 c1009f8 934e44a c1009f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import os
import tempfile
import threading
import time

import gradio as gr
import numpy as np
import requests
import torch
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
# Neo4j setup.
# Connection settings can be overridden with the NEO4J_URI, NEO4J_USERNAME and
# NEO4J_PASSWORD environment variables; the literals below are fallbacks that
# keep existing deployments working unchanged.
# SECURITY(review): a database credential is committed in source here. Rotate
# it and supply NEO4J_PASSWORD through the environment / Space secrets, then
# drop the literal fallback.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "neo4j+s://c62d0d35.databases.neo4j.io"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get(
        "NEO4J_PASSWORD", "_x8f-_aAQvs2NB0x6s0ZHSh3W_y-HrENDbgStvsUCM0"
    ),
)
# Speech recognition: a Whisper checkpoint wrapped in a transformers ASR pipeline.
model_id = 'openai/whisper-large-v3'

# Half precision on the first CUDA device when a GPU is visible, otherwise
# full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Load the weights in the chosen precision, then move them to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
model = model.to(device)

# The processor supplies both the tokenizer and the feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # Decoding / batching settings.
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    return_timestamps=True,
    # Placement settings, matching how the model was loaded above.
    torch_dtype=torch_dtype,
    device=device,
)
# Function to produce the "cleared" state values after a short delay
def auto_reset_state(delay_s=5):
    """Block for ``delay_s`` seconds, then return the values that clear the state.

    Args:
        delay_s: Seconds to sleep before returning. Defaults to 5, which is
            the delay this function has always used (the earlier comment
            mentioning 10 seconds did not match the code).

    Returns:
        The tuple ``(None, "")``: an empty stream state and an empty input
        text.

    Note:
        When this function is run as a ``threading.Thread`` target its return
        value is discarded, so the thread by itself does not reset anything.
    """
    time.sleep(delay_s)
    return None, ""  # Reset the state and clear input text
# Function to process audio input and transcribe it
def transcribe_function(stream, new_chunk):
    """Append one audio chunk to ``stream`` and transcribe the whole buffer.

    Args:
        stream: Audio accumulated so far, or ``None`` / empty to start a new
            buffer.
        new_chunk: ``(sampling_rate, samples)`` pair for the newest chunk.

    Returns:
        ``(stream, text, text)`` where ``stream`` is the updated buffer and
        ``text`` is the transcription of that buffer. If the chunk is
        malformed, missing or empty, returns ``(stream, "", None)`` with
        ``stream`` left as it was passed in.
    """
    try:
        sr, y = new_chunk[0], new_chunk[1]
    except (TypeError, IndexError):
        # TypeError: chunk is None / not subscriptable.
        # IndexError: chunk has fewer than two items.
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return stream, "", None

    # Nothing to transcribe when the chunk carries no samples.
    if y is None:
        return stream, "", None
    y = np.array(y, dtype=np.float32)
    if y.size == 0:
        return stream, "", None

    # The ASR pipeline expects single-channel audio, so down-mix by averaging.
    # NOTE(review): assumes a (samples, channels) layout, as Gradio documents
    # for numpy audio -- confirm if another source feeds this function.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise the chunk; an all-zero chunk is left untouched to avoid
    # dividing by zero.
    max_abs_y = np.max(np.abs(y))
    if max_abs_y > 0:
        y = y / max_abs_y

    # Extend the running buffer, or start it with this chunk.
    if stream is not None and len(stream) > 0:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Transcribe the full buffer accumulated so far.
    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
    full_text = result.get("text", "")

    # Fire the delayed-reset helper in the background. Its return value is
    # discarded by Thread, so this does not alter the values returned below.
    threading.Thread(target=auto_reset_state).start()
    return stream, full_text, full_text
# Function to generate audio with Eleven Labs TTS
def generate_audio_elevenlabs(text):
    """Synthesize ``text`` through the ElevenLabs streaming TTS endpoint.

    Args:
        text: Text to speak; it is converted with ``str()`` before sending.

    Returns:
        Path of a temporary ``.mp3`` file containing the audio, or ``None``
        when the request fails or the API answers with an error status. The
        file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        KeyError: If the ``ELEVENLABS_API`` environment variable is not set.
    """
    XI_API_KEY = os.environ['ELEVENLABS_API']
    VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": XI_API_KEY,
    }
    data = {
        "text": str(text),
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 1.0,
            "similarity_boost": 0.0,
            "style": 0.60,
            "use_speaker_boost": False,
        },
    }
    try:
        # (connect, read) timeouts keep a stalled API call from hanging the
        # app forever; the ``with`` releases the streamed connection.
        with requests.post(
            tts_url, headers=headers, json=data, stream=True, timeout=(10, 60)
        ) as response:
            if not response.ok:
                print(f"Error generating audio: {response.text}")
                return None
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                audio_path = f.name
            return audio_path  # Return audio path for automatic playback
    except requests.RequestException as exc:
        # Network failures follow the same "log and return None" path as an
        # error status from the API.
        print(f"Error generating audio: {exc}")
        return None
# Prompt wording: retrieved context first, then the question, then a nudge
# towards a short answer. ``{context}`` and ``{question}`` are the placeholders.
template = (
    "Use the following context to answer the question:\n"
    "Context:\n"
    "{context}\n"
    "Question: {question}\n"
    "Answer concisely:"
)

# Prompt object built from the template text above.
prompt = ChatPromptTemplate.from_template(template)
# Function to fill the module-level prompt with a context and a question
def generate_response_with_prompt(context, question):
    """Render the module-level ``prompt`` with ``context`` and ``question``.

    Args:
        context: Text substituted for the template's ``{context}`` slot.
        question: Text substituted for the template's ``{question}`` slot.

    Returns:
        Whatever ``prompt.format`` returns for those two values.

    NOTE(review): this only formats the prompt; no language model is called
    here, so the "response" is the filled-in prompt itself -- confirm that
    this is intended.
    """
    return prompt.format(context=context, question=question)
# Function that gathers graph and vector-store context for a question
def retriever(question: str):
    """Collect context for ``question`` from Neo4j and the vector index.

    The full-text index ``'entity'`` is queried through ``graph.query`` and
    ``vector_index.similarity_search`` is run on the raw question; both
    results are merged into one context string that is handed to
    ``generate_response_with_prompt``.

    Args:
        question: The user's question.

    Returns:
        The value returned by ``generate_response_with_prompt`` for the
        combined context and the question.

    NOTE(review): ``generate_full_text_query`` and ``vector_index`` are not
    defined in the part of the file visible here -- confirm they exist.
    """
    # Structured lookup: top matches from the full-text index, best first.
    structured_query = (
        "\n"
        "CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})\n"
        "YIELD node, score\n"
        "RETURN node.id AS entity, node.text AS context, score\n"
        "ORDER BY score DESC\n"
        "LIMIT 2\n"
    )
    query_params = {"query": generate_full_text_query(question)}
    rows = graph.query(structured_query, query_params)
    structured_lines = []
    for record in rows:
        structured_lines.append(f"{record['entity']}: {record['context']}")
    structured_response = "\n".join(structured_lines)

    # Unstructured lookup: page contents of the similarity-search hits.
    hits = vector_index.similarity_search(question)
    unstructured_response = "\n".join(el.page_content for el in hits)

    # Merge both sources under labelled headings and fill the prompt.
    combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"
    return generate_response_with_prompt(combined_context, question)
# Function to handle the entire audio query and response process
def process_audio_query(audio_input):
    """Turn one recorded question into a spoken answer.

    The audio is transcribed with ``transcribe_function``, the transcription
    is passed to ``retriever``, and the resulting text is synthesized with
    ``generate_audio_elevenlabs``.

    Args:
        audio_input: ``(sampling_rate, samples)`` pair from the audio
            component, or ``None`` when there is no recording.

    Returns:
        Path of the generated audio file, or ``None`` when there is no audio,
        nothing was transcribed, or speech synthesis failed.
    """
    # No recording: nothing to transcribe, so skip the whole pipeline.
    # NOTE(review): presumably the live interface can invoke this with None
    # (e.g. when the recording is cleared) -- confirm against Gradio's docs.
    if audio_input is None:
        return None

    # Each query starts from a fresh buffer (no previous stream).
    _, transcription, _ = transcribe_function(None, audio_input)
    print(f"Transcription: {transcription}")

    # An empty transcription would only produce an empty retrieval query and
    # a wasted TTS request.
    if not transcription or not transcription.strip():
        return None

    # Retrieve hybrid response using Neo4j and other methods
    response_text = retriever(transcription)
    print(f"Response: {response_text}")

    # Generate audio from the response text
    return generate_audio_elevenlabs(response_text)
# Create Gradio interface for audio input and output.
# Older Gradio releases take ``source="microphone"``; newer ones replaced it
# with ``sources=[...]`` and reject the old keyword with a TypeError. Try the
# original spelling first so behaviour on the older API is unchanged.
try:
    microphone_input = gr.Audio(source="microphone", type="numpy")
except TypeError:
    microphone_input = gr.Audio(sources=["microphone"], type="numpy")

interface = gr.Interface(
    fn=process_audio_query,
    inputs=microphone_input,
    outputs="audio",
    live=True,
    description="Ask questions via audio and receive audio responses.",
)

# Launch the Gradio app
interface.launch()
|