radarbackend11262024v11

Runtime error

radarbackend11262024v11

File size: 9,209 Bytes

import gradio as gr
import torch
import requests
import tempfile
import threading
import numpy as np
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.prompts import ChatPromptTemplate
import time
import os
from dataclasses import dataclass, field



@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    started_talking: bool =  False
    stopped: bool = False
    conversation: list = field(default_factory=list)

def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
    """Report whether the audio seen so far ends in a pause.

    Args:
        audio: Audio samples to analyse.
        sampling_rate: Samples per second of ``audio``.
        state: Session state; ``started_talking`` is set to True the first
            time more than 0.5 s of voiced audio is reported.

    Returns:
        False on the call that first detects speech; otherwise True when the
        total duration exceeds the voiced duration by more than one second.
    """
    # NOTE(review): run_vad is not defined in this file; judging by the log
    # labels below it presumably returns (voiced duration, _, VAD run time).
    voiced_seconds, _, vad_seconds = run_vad(audio, sampling_rate)
    total_seconds = len(audio) / sampling_rate

    speech_just_began = voiced_seconds > 0.5 and not state.started_talking
    if speech_just_began:
        print("started talking")
        state.started_talking = True
        return False

    print(f"duration_after_vad: {voiced_seconds:.3f} s, time_vad: {vad_seconds:.3f} s")

    unvoiced_seconds = total_seconds - voiced_seconds
    return unvoiced_seconds > 1

def start_recording_user(state: AppState):
    """Return an Audio update that turns recording on, unless the session is stopped.

    Returns:
        ``gr.Audio(recording=True)`` while ``state.stopped`` is false,
        otherwise ``None``.
    """
    if state.stopped:
        # The conversation was stopped: produce no update.
        return None
    return gr.Audio(recording=True)

# Neo4j setup
# Connection settings are read from the environment, like the other secrets in
# this file, so that no credential lives in source control.  NEO4J_PASSWORD is
# required; the URI and username fall back to the previously used values.
# NOTE(review): the password that used to be hard-coded here should be rotated.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "neo4j+s://c62d0d35.databases.neo4j.io"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ["NEO4J_PASSWORD"],
)

# Vector index over the existing Neo4j graph: "Document" nodes are searched
# with search_type="hybrid", reading their "text" property and using the
# "embedding" property for the vectors.
openai_embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
vector_index = Neo4jVector.from_existing_graph(
    openai_embeddings,
    graph=graph,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

# Speech recognition: load Whisper and wrap it in a transformers ASR pipeline.
model_id = 'openai/whisper-large-v3'

# Half precision on the first CUDA device when one is available, otherwise
# full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# return_timestamps=True is only the pipeline default here; transcribe_function
# passes return_timestamps=False on each call.
pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=15,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)

# Helper that waits two seconds and then yields "cleared" values
def auto_reset_state():
    """Sleep for 2 seconds, then return ``(None, "")``.

    The pair stands for an empty stream state and an empty input text.
    """
    delay_seconds = 2
    time.sleep(delay_seconds)
    cleared_state, cleared_text = None, ""
    return cleared_state, cleared_text


# Function to process audio input and transcribe it
def transcribe_function(stream, new_chunk):
    """Append one audio chunk to the running stream and transcribe the result.

    Args:
        stream: Samples accumulated so far (NumPy array), or None / empty for
            the first chunk.
        new_chunk: ``(sampling_rate, samples)`` pair.  ``samples`` may be 1-D
            (mono) or 2-D ``(samples, channels)``; multi-channel audio is
            averaged down to mono.

    Returns:
        ``(stream, text, text)`` where ``stream`` is the updated sample buffer
        and ``text`` is the transcription of the whole buffer.  When the chunk
        is malformed or empty, ``(stream, "", None)`` is returned with the
        incoming stream unchanged.
    """
    try:
        sr, y = new_chunk[0], new_chunk[1]
    except (TypeError, IndexError, KeyError):
        # None, a too-short sequence, or a mapping without 0/1 keys.
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return stream, "", None

    if y is None:
        return stream, "", None

    y = np.asarray(y, dtype=np.float32)
    if y.size == 0:
        return stream, "", None

    # The ASR pipeline expects single-channel audio; down-mix
    # (samples, channels) input by averaging the channels.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise the chunk; an all-zero chunk is left as is to avoid
    # dividing by zero.
    max_abs_y = np.max(np.abs(y))
    if max_abs_y > 0:
        y = y / max_abs_y

    if stream is not None and len(stream) > 0:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Transcribe everything accumulated so far.
    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
    full_text = result.get("text", "")

    # Fire-and-forget background call.
    # NOTE(review): the thread's return value is discarded, so this does not
    # actually reset any state -- confirm whether it is still needed.
    threading.Thread(target=auto_reset_state).start()

    return stream, full_text, full_text



# Function to generate a full-text search query for Neo4j
#def generate_full_text_query(input: str) -> str:
    #full_text_query = ""
    #words = [el for el in input.split() if el]
    #for word in words[:-1]:
        #full_text_query += f" {word}~2 AND"
    #full_text_query += f" {words[-1]}~2"
    #return full_text_query.strip()


# Characters with special meaning in the Lucene query syntax.  They are
# replaced by spaces so that punctuation in a transcribed question cannot
# break (or alter) the full-text query.
_LUCENE_SPECIAL_CHARS = '+-&|!(){}[]^"~*?:\\/'
_LUCENE_SPECIAL_TABLE = str.maketrans(
    _LUCENE_SPECIAL_CHARS, " " * len(_LUCENE_SPECIAL_CHARS)
)


# Function to generate a full-text search query for Neo4j
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy, AND-joined Lucene query from free text.

    Lucene special characters are stripped first, then every remaining word
    gets a ``~2`` suffix (fuzzy match, edit distance up to 2) and the words
    are joined with ``AND``.

    Args:
        input: Free-text question.  (The name shadows the builtin but is kept
            so existing keyword callers continue to work.)

    Returns:
        The query string, or ``""`` when no searchable words remain.
    """
    words = input.translate(_LUCENE_SPECIAL_TABLE).split()
    return " AND ".join(f"{word}~2" for word in words)



# Function to generate audio with Eleven Labs TTS
def generate_audio_elevenlabs(text):
    """Synthesise ``text`` with the ElevenLabs streaming TTS endpoint.

    Args:
        text: Text to speak; converted with ``str()`` before sending.

    Returns:
        Path of a temporary ``.mp3`` file holding the audio, or ``None`` when
        there is nothing to speak or the request fails.  The file is not
        deleted automatically; the caller owns it.

    Raises:
        KeyError: if the ``ELEVENLABS_API`` environment variable is not set.
    """
    # Nothing to say: skip the API call (and avoid speaking the word "None").
    if text is None or not str(text).strip():
        return None

    XI_API_KEY = os.environ['ELEVENLABS_API']
    VOICE_ID = 'ehbJzYLQFpwbJmGkqbnW'
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": XI_API_KEY
    }
    data = {
        "text": str(text),
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 1.0,
            "similarity_boost": 0.0,
            "style": 0.60,
            "use_speaker_boost": False
        }
    }

    audio_path = None
    try:
        # (connect, read) timeouts keep a stalled request from hanging the
        # handler forever; the context manager releases the streamed
        # connection on every exit path.
        with requests.post(
            tts_url, headers=headers, json=data, stream=True, timeout=(10, 60)
        ) as response:
            if not response.ok:
                print(f"Error generating audio: {response.text}")
                return None
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
                audio_path = f.name
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
    except requests.RequestException as exc:
        print(f"Error generating audio: {exc}")
        # Do not leave a truncated file behind.
        if audio_path is not None and os.path.exists(audio_path):
            os.remove(audio_path)
        return None

    return audio_path  # Return audio path for automatic playback

# Prompt template for the answer-generation step.  It has two placeholders,
# {context} and {question}, which generate_response_with_prompt() fills in.
template = """I am a guide for Birmingham, Alabama. I can provide recommendations and insights about the city, including events and activities.
Ask your question directly, and I'll provide a precise and quick,short and crisp response in a conversational and straight-foreward way without any Greet.
Context:
{context}

Question: {question}
Answer concisely:"""

# Wrap the template text in a ChatPromptTemplate so it can be formatted with
# keyword arguments.
prompt = ChatPromptTemplate.from_template(template)

# Function to generate a response using the prompt and the context
def generate_response_with_prompt(context, question):
    """Ask the chat model to answer ``question`` given ``context``.

    Args:
        context: Retrieved text inserted into the prompt's ``{context}`` slot.
        question: User question inserted into the ``{question}`` slot.

    Returns:
        The model's reply text with surrounding whitespace removed.

    Raises:
        KeyError: if the ``OPENAI_API_KEY`` environment variable is not set.
    """
    # Build real chat messages rather than a flattened string: calling the
    # model object directly is deprecated and expects a list of messages.
    messages = prompt.format_messages(context=context, question=question)
    llm = ChatOpenAI(temperature=0, api_key=os.environ['OPENAI_API_KEY'])
    response = llm.invoke(messages)
    return response.content.strip()

# Define the function to generate a hybrid response using Neo4j and other retrieval methods
def retriever(question: str):
    """Answer ``question`` from full-text and vector retrieval over Neo4j.

    The two best hits of the ``entity`` full-text index and the results of a
    similarity search on the vector index are combined into one context
    string, which is passed to ``generate_response_with_prompt``.

    Args:
        question: The user's question.

    Returns:
        The generated answer text.
    """
    # Structured data retrieval from Neo4j
    structured_query = """
    CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
    YIELD node, score
    RETURN node.id AS entity, node.text AS context, score
    ORDER BY score DESC
    LIMIT 2
    """
    full_text_query = generate_full_text_query(question)
    if full_text_query:
        structured_data = graph.query(structured_query, {"query": full_text_query})
    else:
        # An empty string is not a valid Lucene query, so skip the index
        # lookup when the question contains no searchable words.
        structured_data = []
    structured_response = "\n".join(
        f"{record['entity']}: {record['context']}" for record in structured_data
    )

    # Unstructured data retrieval from vector store
    unstructured_response = "\n".join(
        el.page_content for el in vector_index.similarity_search(question)
    )

    # Combine structured and unstructured responses
    combined_context = f"Structured data:\n{structured_response}\n\nUnstructured data:\n{unstructured_response}"

    # Generate the final response using the prompt template
    return generate_response_with_prompt(combined_context, question)


# Function to handle the entire audio query and response process
def process_audio_query(audio_input, state=None):
    """Transcribe an audio chunk, answer it, and synthesise the spoken answer.

    Args:
        audio_input: ``(sampling_rate, samples)`` pair from the microphone.
        state: Optional session state.  The UI wires this handler with two
            inputs and two outputs, so when a state is supplied it is handed
            back unchanged next to the audio path.

    Returns:
        The path of the generated audio file (``None`` when nothing was
        transcribed or synthesis failed).  When ``state`` is given the result
        is the tuple ``(audio_path, state)``.
    """
    def _result(audio_path):
        # Match the number of output components of the caller.
        return audio_path if state is None else (audio_path, state)

    _, transcription, _ = transcribe_function(None, audio_input)
    print(f"Transcription: {transcription}")

    # Silence or an unusable chunk: skip retrieval, the LLM call and TTS.
    if not transcription or not transcription.strip():
        return _result(None)

    # Retrieve hybrid response using Neo4j and other methods
    response_text = retriever(transcription)
    print(f"Response: {response_text}")

    # Generate audio from the response text
    return _result(generate_audio_elevenlabs(response_text))



# Gradio UI: microphone input on the left, spoken answer on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Microphone input delivered to handlers as NumPy data.
            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
        with gr.Column():
            # Answer audio; starts playing automatically.
            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
    # Session state, initialised with a default AppState.
    state = gr.State(value=AppState())

    # Feed microphone audio to process_audio_query while recording.
    # NOTE(review): two inputs and two outputs are wired here, so the handler
    # must accept (audio, state) and return (audio, state) -- confirm.
    stream = input_audio.stream(
        process_audio_query,
        [input_audio, state],
        [output_audio, state],
        every=0.50
    )
    # When the answer audio stops, start_recording_user decides whether to
    # turn the microphone back on.
    restart = output_audio.stop(
        start_recording_user,
        [state],
        [input_audio]
    )
    # "Stop Conversation": install a stopped state, stop recording, and cancel
    # the two events registered above.
    cancel = gr.Button("Stop Conversation", variant="stop")
    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
                 [state, input_audio], cancels=[stream, restart])

    # NOTE(review): launch() is called inside the `with` block -- confirm this
    # is intended rather than launching after the block closes.
    demo.launch()