ShashankSS1205 committed
Commit 076147f · 1 Parent(s): da72d94

app.py added

app.py ADDED
@@ -0,0 +1,125 @@
+ import gradio as gr
+ from pydub import AudioSegment
+ import numpy as np
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer
+ import whisper
+
+ # Load the Whisper model (choose "tiny", "base", "small", "medium", or "large")
+ audio_model = whisper.load_model("large", device="cpu")
+
+ # Load the sentence-embedding model
+ sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+ # Mapping of chunk paths and transcriptions, with precomputed embeddings
+ df_mapping = pd.read_csv("audio_chunk_mapping_with_transcription_embeddings.csv")
+
+ # Transcribe the input audio and retrieve the most similar database chunk
+ def process_and_find_audio(audio_file):
+     if audio_file is None:  # change also fires when the recording is cleared
+         return "", None
+
+     # gr.Audio(type="numpy") delivers a (sample_rate, numpy array) tuple
+     sample_rate, audio_np = audio_file
+
+     # Whisper's transcribe() takes a file path, so write the array to a
+     # temporary WAV file first
+     audio_path = "temp_audio.wav"
+     audio_segment = AudioSegment(
+         audio_np.tobytes(),
+         frame_rate=sample_rate,
+         sample_width=audio_np.dtype.itemsize,  # 2 bytes for 16-bit samples
+         channels=1,  # assumes a mono recording; adjust if necessary
+     )
+     audio_segment.export(audio_path, format="wav")
+
+     # Translate the speech to English text
+     transcription = audio_model.transcribe(audio_path, task="translate")["text"]
+
+     # Embeddings are precomputed: columns 4 onward of the CSV hold the vectors
+     embeddings = df_mapping.iloc[:, 4:].to_numpy().astype("float32")
+     embedding_query = sentence_model.encode(transcription)
+
+     # Find the most similar transcription (cosine similarity by default)
+     similarities = sentence_model.similarity(embeddings, embedding_query)
+     index_of_most_similar_item = int(similarities.argmax())
+
+     # Retrieve the matching audio chunk path and transcription
+     matched_chunk_path = df_mapping.loc[index_of_most_similar_item, "chunk_path"]
+     matched_chunk_text = df_mapping.loc[index_of_most_similar_item, "transcription"]
+     print(matched_chunk_path, matched_chunk_text)
+
+     # Return the text and the chunk path; gr.Audio can play a filepath directly
+     return matched_chunk_text, matched_chunk_path
+
+ # Set up the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("### Record or upload audio and retrieve the most similar database audio.")
+
+     mic = gr.Audio(type="numpy", label="Record Your Audio")
+     output_text = gr.Textbox(label="Matched Transcription")
+     output_audio = gr.Audio(label="Matched Audio Playback")
+
+     # Run the lookup whenever a new recording arrives
+     mic.change(process_and_find_audio, inputs=mic, outputs=[output_text, output_audio])
+
+ # Launch the app
+ demo.launch(share=True)
+
+ # Earlier prototype, kept for reference: returns a fixed file from the
+ # "database" instead of doing a similarity search.
+ # import gradio as gr
+ # from pydub import AudioSegment
+ # import numpy as np
+ # from io import BytesIO
+
+ # # Simulated function to fetch audio from a "database"
+ # def get_audio_from_database():
+ #     # Replace with actual database retrieval logic
+ #     return "/home/shashank/Desktop/ml_fiest/Dataset/SandalWoodNewsStories_2.mp3"  # Example path to an audio file
+
+ # # Define the function that takes the user-recorded audio and returns database audio
+ # def process_audio(user_audio):
+ #     # Process the user audio if needed
+ #     # Here we're just passing it through without saving, for demonstration purposes
+
+ #     # Get the database audio
+ #     db_audio_path = get_audio_from_database()
+ #     db_audio_segment = AudioSegment.from_file(db_audio_path)
+
+ #     # Convert the database audio to a numpy array and sample rate
+ #     buffer = BytesIO()
+ #     db_audio_segment.export(buffer, format="wav")
+ #     buffer.seek(0)
+ #     db_audio = np.frombuffer(buffer.read(), dtype=np.int16)
+
+ #     # Return the database audio as the response
+ #     return (db_audio_segment.frame_rate, db_audio)
+
+ # # Set up the Gradio interface
+ # with gr.Blocks() as demo:
+ #     gr.Markdown("### Record your audio and play a sample from the database.")
+
+ #     # Define microphone input and audio output without the `source` argument
+ #     mic = gr.Audio(type="numpy", label="Record Your Audio")
+ #     output_audio = gr.Audio(label="Database Audio Response")
+
+ #     # Connect the function with the Gradio components
+ #     mic.change(process_audio, inputs=mic, outputs=output_audio)
+
+ # # Launch the app
+ # demo.launch(share=True)
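One subtlety in app.py's retrieval step: SentenceTransformer.similarity (a method the code relies on, available in sentence-transformers 3.x) computes pairwise cosine similarity, so with a single query vector the result has shape (N, 1) and argmax() over it yields the matching row index. A dependency-light sketch of the equivalent lookup in plain NumPy, for readers on older sentence-transformers versions:

# Equivalent nearest-transcription lookup with plain NumPy cosine
# similarity, matching sentence_model.similarity(...).argmax() above.
import numpy as np

def most_similar_index(embeddings: np.ndarray, query: np.ndarray) -> int:
    # Normalize the database rows and the query, then take dot products
    emb_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    q_norm = query / np.linalg.norm(query)
    return int((emb_norm @ q_norm).argmax())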
audio_chunk_mapping_with_transcription_embeddings.csv ADDED
The diff for this file is too large to render. See raw diff
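The CSV itself is too large to render, but app.py fixes its shape: it must contain chunk_path and transcription columns, with the all-MiniLM-L6-v2 embedding vectors (384 dimensions) starting at column index 4, since the lookup slices df_mapping.iloc[:, 4:]. Below is a minimal sketch of how such a file could be built; the two extra metadata columns (chunk_id, duration_ms) are hypothetical placeholders so the embeddings start at column 4.

# Sketch: building audio_chunk_mapping_with_transcription_embeddings.csv.
# chunk_path and transcription match app.py; chunk_id and duration_ms are
# hypothetical stand-ins for the first four metadata columns.
import pandas as pd
import whisper
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer

audio_model = whisper.load_model("large", device="cpu")
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

chunk_paths = []  # fill in with paths to the pre-split audio chunks

rows = []
for i, path in enumerate(chunk_paths):
    text = audio_model.transcribe(path, task="translate")["text"]
    rows.append({
        "chunk_id": i,
        "chunk_path": path,
        "transcription": text,
        "duration_ms": len(AudioSegment.from_file(path)),  # pydub length is in ms
    })

df = pd.DataFrame(rows, columns=["chunk_id", "chunk_path", "transcription", "duration_ms"])

# One 384-dimensional embedding per transcription, appended after the four
# metadata columns so that app.py's iloc[:, 4:] recovers exactly the vectors
embeddings = sentence_model.encode(df["transcription"].tolist())
emb_df = pd.DataFrame(embeddings, index=df.index)
pd.concat([df, emb_df], axis=1).to_csv(
    "audio_chunk_mapping_with_transcription_embeddings.csv", index=False)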
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pydub
+ gradio
+ numpy
+ pandas
+ sentence-transformers
+ openai-whisper  # PyPI name for OpenAI's Whisper; the "whisper" package is unrelated
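
One caveat (an assumption about the deployment target, not part of this commit): both pydub and Whisper shell out to the ffmpeg binary, which pip cannot install. On a Hugging Face Space that typically means adding a packages.txt file containing the single line ffmpeg so the system dependency is installed at build time.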