"""Gradio demo: record audio, transcribe it with Whisper, and return the most
similar pre-transcribed audio chunk via sentence-embedding similarity."""
import gradio as gr
import numpy as np
import pandas as pd
import whisper
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer

# Load the Whisper model (choose "tiny", "base", "small", "medium", or "large";
# "large" is the most accurate but slow on CPU)
audio_model = whisper.load_model("large", device='cpu')

# Load the sentence-embedding model used to compare transcriptions
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Mapping of audio chunk paths to their transcriptions, with precomputed
# sentence embeddings. Replace with your actual data; the code below assumes
# columns 0-3 hold metadata (including "chunk_path" and "transcription") and
# columns 4 onward hold the embedding dimensions.
df_mapping = pd.read_csv('audio_chunk_mapping_with_transcription_embeddings.csv')
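# Sanity check (our addition, based on the assumed CSV layout above; the
# original script does not enforce this):
assert {"chunk_path", "transcription"}.issubset(df_mapping.columns), \
    "mapping CSV must contain 'chunk_path' and 'transcription' columns"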

# Process the recorded audio: transcribe it, embed the transcription, and
# return the most similar audio chunk from the database.
def process_and_find_audio(audio_file):
    # Gradio delivers numpy audio as a (sample_rate, samples) tuple.
    sample_rate, audio_np = audio_file

    # Wrap the raw samples so they can be written to disk for Whisper.
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,  # assuming 16-bit samples (adjust if necessary)
        channels=1,      # assuming mono input (adjust if necessary)
    )

    # Save the audio to a temporary file and transcribe it (translated to English).
    audio_path = "./temp_audio.wav"
    audio_segment.export(audio_path, format="wav")
    transcription = audio_model.transcribe(audio_path, task="translate")['text']

    # Embeddings for the database transcriptions are precomputed in columns 4+
    # of the mapping CSV; embed the new transcription with the same model.
    embeddings = df_mapping.iloc[:, 4:].to_numpy().astype('float32')
    embedding_query = sentence_model.encode(transcription)

    # Find the most similar transcription (model.similarity() requires
    # sentence-transformers >= 3.0; see the fallback sketch after this function).
    similarities = sentence_model.similarity(embeddings, embedding_query)
    index_of_most_similar_item = int(similarities.argmax())

    # Retrieve the matching audio chunk path and transcription.
    matched_chunk_path = df_mapping.loc[index_of_most_similar_item, "chunk_path"]
    matched_chunk_text = df_mapping.loc[index_of_most_similar_item, "transcription"]
    return matched_chunk_text, matched_chunk_path
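# Note: SentenceTransformer.similarity() was added in sentence-transformers 3.0.
# On older versions, the same ranking can be reproduced with util.cos_sim,
# the default cosine-similarity metric behind .similarity(). A minimal fallback
# sketch; the function name is ours and it is not wired into the app:
def find_most_similar(embeddings, embedding_query):
    """Return the row index of the stored embedding closest to the query."""
    from sentence_transformers import util
    similarities = util.cos_sim(embeddings, embedding_query)  # cosine similarity
    return int(similarities.argmax())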

# Set up the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("### Record an audio clip and retrieve the most similar database audio.")

    mic = gr.Audio(type="numpy", label="Record Your Audio")
    output_text = gr.Textbox(label="Matched Transcription")
    output_audio = gr.Audio(label="Matched Audio Playback")

    # Run retrieval whenever the recording changes.
    mic.change(process_and_find_audio, inputs=mic, outputs=[output_text, output_audio])

# Launch the app (share=True also exposes a public link).
demo.launch(share=True)
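
# Reference sketch (an assumption, not the original preprocessing script): one
# way the mapping CSV read above could be built offline. The chunk directory,
# column names, and timestamp fields are hypothetical; it keeps four metadata
# columns so the embeddings start at column 4, as the app expects.
#
# import glob
# rows = []
# for chunk_path in sorted(glob.glob("chunks/*.wav")):
#     text = audio_model.transcribe(chunk_path, task="translate")["text"]
#     rows.append({"chunk_path": chunk_path, "transcription": text,
#                  "start_s": 0.0, "end_s": 0.0})  # hypothetical timestamps
# df = pd.DataFrame(rows)
# emb = sentence_model.encode(df["transcription"].tolist())
# df = pd.concat([df, pd.DataFrame(emb)], axis=1)
# df.to_csv("audio_chunk_mapping_with_transcription_embeddings.csv", index=False)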