import gradio as gr
from pydub import AudioSegment
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import whisper

# Load the Whisper model (choose "tiny", "base", "small", "medium", or "large")
audio_model = whisper.load_model("large", device="cpu")

# Load the sentence-embedding model
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# DataFrame mapping chunk paths to transcriptions and precomputed embeddings;
# replace with your actual data
df_mapping = pd.read_csv("audio_chunk_mapping_with_transcription_embeddings.csv")


# Process the recorded audio and retrieve the most similar database chunk
def process_and_find_audio(audio_file):
    # Gradio delivers numpy-typed audio as a (sample_rate, samples) tuple
    sample_rate, audio_np = audio_file

    # Wrap the raw samples in an AudioSegment so they can be saved to disk
    # for Whisper
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,  # assumes 16-bit samples (adjust if necessary)
        channels=1,      # assumes a mono channel (adjust if necessary)
    )

    # Save the recording to a temporary file
    audio_path = "./temp_audio.wav"
    audio_segment.export(audio_path, format="wav")

    # Transcribe (translating to English) with Whisper
    transcription = audio_model.transcribe(audio_path, task="translate")["text"]

    # Embeddings were precomputed offline: columns 4 onward hold the embedding
    # dimensions. (To recompute them instead:
    # sentence_model.encode(df_mapping["transcription"].tolist()))
    embeddings = df_mapping.iloc[:, 4:].to_numpy().astype("float32")
    embedding_query = sentence_model.encode(transcription)

    # Find the most similar transcription
    similarities = sentence_model.similarity(embeddings, embedding_query)
    index_of_most_similar_item = int(similarities.argmax())

    # Retrieve the matching audio chunk path and transcription
    matched_chunk_path = df_mapping.loc[index_of_most_similar_item, "chunk_path"]
    matched_chunk_text = df_mapping.loc[index_of_most_similar_item, "transcription"]
    print(matched_chunk_path, matched_chunk_text)

    # Return the text and the file path of the matched audio
    return matched_chunk_text, matched_chunk_path


# Set up the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### Record or upload audio and retrieve the most similar database audio.")

    mic = gr.Audio(type="numpy", label="Record Your Audio")
    output_text = gr.Textbox(label="Matched Transcription")
    output_audio = gr.Audio(label="Matched Audio Playback")

    # Link the function to the Gradio input and outputs
    mic.change(process_and_find_audio, inputs=mic, outputs=[output_text, output_audio])

# Launch the app
demo.launch(share=True)
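# ---------------------------------------------------------------------------
# Offline indexing sketch (assumption: the script that produced the CSV read
# above is not shown; this is one plausible way to build it). Each pre-cut
# chunk is transcribed with the same Whisper model, its transcription is
# embedded with the same SentenceTransformer, and one row per chunk is
# written out. Note the query-time code slices `iloc[:, 4:]`, which implies
# four leading metadata columns in the real file; this sketch writes only
# `chunk_path` and `transcription`, so adjust the slice (or add columns) to
# match your actual layout. `build_chunk_index` is a hypothetical helper.
# ---------------------------------------------------------------------------
def build_chunk_index(chunk_paths,
                      csv_path="audio_chunk_mapping_with_transcription_embeddings.csv"):
    rows = [{"chunk_path": p,
             "transcription": audio_model.transcribe(p, task="translate")["text"]}
            for p in chunk_paths]
    index_df = pd.DataFrame(rows)
    # Batch-encode all transcriptions; all-MiniLM-L6-v2 yields 384-dim vectors
    vectors = sentence_model.encode(index_df["transcription"].tolist())
    emb_df = pd.DataFrame(vectors,
                          columns=[f"emb_{i}" for i in range(vectors.shape[1])])
    pd.concat([index_df, emb_df], axis=1).to_csv(csv_path, index=False)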
# ---------------------------------------------------------------------------
# Earlier prototype, kept for reference: it echoed a fixed "database" file
# back to the user instead of doing any retrieval.
# ---------------------------------------------------------------------------
# import gradio as gr
# from pydub import AudioSegment
# import numpy as np
# from io import BytesIO
#
# # Simulated function to fetch audio from a "database"
# def get_audio_from_database():
#     # Replace with actual database retrieval logic
#     return "/home/shashank/Desktop/ml_fiest/Dataset/SandalWoodNewsStories_2.mp3"  # example path to an audio file
#
# # Take the user-recorded audio and return database audio
# def process_audio(user_audio):
#     # The user audio is just passed through, unused, for demonstration purposes
#
#     # Get the database audio
#     db_audio_path = get_audio_from_database()
#     db_audio_segment = AudioSegment.from_file(db_audio_path)
#
#     # Convert the database audio to a numpy array plus sample rate
#     buffer = BytesIO()
#     db_audio_segment.export(buffer, format="wav")
#     buffer.seek(0)
#     db_audio = np.frombuffer(buffer.read(), dtype=np.int16)
#
#     # Return the database audio as the response
#     return (db_audio_segment.frame_rate, db_audio)
#
# # Set up the Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("### Record your audio and play a sample from the database.")
#
#     # Microphone input and audio output (no `source` argument)
#     mic = gr.Audio(type="numpy", label="Record Your Audio")
#     output_audio = gr.Audio(label="Database Audio Response")
#
#     # Connect the function with the Gradio components
#     mic.change(process_audio, inputs=mic, outputs=output_audio)
#
# # Launch the app
# demo.launch(share=True)
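# ---------------------------------------------------------------------------
# Chunking sketch (assumption: the files referenced by `chunk_path` are
# fixed-length slices of longer source recordings, such as the dataset file
# in the prototype above; the actual chunking script is not shown, and
# `split_into_chunks` is a hypothetical helper). pydub's AudioSegment
# supports millisecond-based slicing, so a long recording can be split into
# 30-second WAV chunks like this:
# ---------------------------------------------------------------------------
import os

def split_into_chunks(source_path, out_dir="chunks", chunk_ms=30_000):
    os.makedirs(out_dir, exist_ok=True)
    audio = AudioSegment.from_file(source_path)
    paths = []
    for i, start in enumerate(range(0, len(audio), chunk_ms)):
        path = os.path.join(out_dir, f"chunk_{i:04d}.wav")
        audio[start:start + chunk_ms].export(path, format="wav")  # slice is in ms
        paths.append(path)
    return paths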