import gradio as gr
from pydub import AudioSegment
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import whisper

# Load the Whisper model (choose "tiny", "base", "small", "medium", or "large";
# larger models are more accurate but noticeably slower on CPU)
audio_model = whisper.load_model("large", device="cpu")

# Load the sentence-embedding model used for transcription matching
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# DataFrame mapping each audio chunk path to its transcription and the
# precomputed embedding of that transcription; replace with your actual data
df_mapping = pd.read_csv('audio_chunk_mapping_with_transcription_embeddings.csv')
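
# A minimal sketch of how the CSV above might have been produced offline. It
# assumes the source table's first four columns are chunk metadata (including
# the "chunk_path" and "transcription" columns used below), so the embedding
# dimensions start at column 4 to match the iloc[:, 4:] slice; the input file
# name "audio_chunk_mapping_with_transcription.csv" is hypothetical:
#
# chunks = pd.read_csv("audio_chunk_mapping_with_transcription.csv")
# emb = sentence_model.encode(chunks["transcription"].tolist())  # 384-dim vectors
# emb_df = pd.DataFrame(emb, columns=[f"emb_{i}" for i in range(emb.shape[1])])
# pd.concat([chunks, emb_df], axis=1).to_csv(
#     "audio_chunk_mapping_with_transcription_embeddings.csv", index=False
# )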

# Process the recorded audio and retrieve the most similar database chunk
def process_and_find_audio(audio_file):
    # Gradio delivers the recording as a (sample_rate, numpy array) tuple
    audio_path = "./temp_audio.wav"  # temporary file handed to Whisper
    sample_rate, audio_np = audio_file
    # Collapse a stereo recording to mono so the channels=1 below holds
    if audio_np.ndim == 2:
        audio_np = audio_np.mean(axis=1).astype(audio_np.dtype)

    # Wrap the raw samples so pydub can write them out as a WAV file
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,  # Gradio's numpy audio is 16-bit PCM
        channels=1,
    )

    # Save the audio to a temporary file
    audio_segment.export(audio_path, format="wav")
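
    # Whisper can also take a raw float32 waveform resampled to 16 kHz, but
    # handing it a file path lets its ffmpeg loader do the resampling for us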
    # Transcribe the clip and translate it to English
    transcription = audio_model.transcribe(audio_path, task="translate")["text"]

    # Load the precomputed transcription embeddings (one vector per row,
    # stored from column 4 onward) and embed the user's transcription
    embeddings = df_mapping.iloc[:, 4:].to_numpy().astype("float32")
    embedding_query = sentence_model.encode(transcription)

    # Find the most similar stored transcription
    similarities = sentence_model.similarity(embeddings, embedding_query)
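    # (SentenceTransformer.similarity defaults to cosine similarity in
    # sentence-transformers >= 3.0; util.cos_sim(embeddings, embedding_query)
    # with `from sentence_transformers import util` is an equivalent form)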
    index_of_most_similar_item = int(similarities.argmax())

    # Retrieve the matching audio chunk path and transcription
    matched_chunk_path = df_mapping.loc[index_of_most_similar_item, "chunk_path"]
    matched_chunk_text = df_mapping.loc[index_of_most_similar_item, "transcription"]
    print(matched_chunk_path, matched_chunk_text)

    # Return the matched text and the file path for gr.Audio to play back
    return matched_chunk_text, matched_chunk_path

# Set up the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### Record an audio clip and retrieve the most similar database audio.")

    mic = gr.Audio(type="numpy", label="Record Your Audio")
    output_text = gr.Textbox(label="Matched Transcription")
    output_audio = gr.Audio(label="Matched Audio Playback")

    # Run the retrieval whenever a new recording arrives
    mic.change(process_and_find_audio, inputs=mic, outputs=[output_text, output_audio])

# Launch the app (share=True also serves a public link)
demo.launch(share=True)

# Earlier stand-alone demo, kept for reference:
#
# import gradio as gr
# from pydub import AudioSegment
# import numpy as np
#
# # Simulated function to fetch audio from a "database"
# def get_audio_from_database():
#     # Replace with actual database retrieval logic
#     return "/home/shashank/Desktop/ml_fiest/Dataset/SandalWoodNewsStories_2.mp3"  # Example path to an audio file
#
# # Take the user-recorded audio and return database audio
# def process_audio(user_audio):
#     # The user audio is passed straight through for demonstration purposes
#     db_audio_path = get_audio_from_database()
#     db_audio_segment = AudioSegment.from_file(db_audio_path)
#
#     # Convert the db audio to a numpy sample array (pydub's accessor avoids
#     # reading WAV header bytes as samples, which np.frombuffer on an
#     # exported buffer would do)
#     db_audio = np.array(db_audio_segment.get_array_of_samples(), dtype=np.int16)
#
#     # Return the database audio as a (sample_rate, samples) tuple
#     return (db_audio_segment.frame_rate, db_audio)
#
# # Set up the Gradio interface
# with gr.Blocks() as demo:
#     gr.Markdown("### Record your audio and play a sample from the database.")
#
#     # Microphone input and audio output (no `source` argument)
#     mic = gr.Audio(type="numpy", label="Record Your Audio")
#     output_audio = gr.Audio(label="Database Audio Response")
#
#     # Connect the function with the Gradio components
#     mic.change(process_audio, inputs=mic, outputs=output_audio)
#
# # Launch the app
# demo.launch(share=True)