AyeshaAmeen committed on
Commit e839bd5
1 Parent(s): d4e84f9

Upload 3 files

Files changed (3)
  1. app.py +234 -0
  2. download_spacy_model.py +11 -0
  3. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,234 @@
+ # -*- coding: utf-8 -*-
+ """app.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1a3BQS9Nu4qUbxFVu7gP9XtVhZA0c-ldN
+ """
+
+ import assemblyai as aai
+ from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
+ from deep_translator import GoogleTranslator
+ import spacy
+ import gradio as gr
+ from pydub import AudioSegment
+ import os
+ from resemblyzer import VoiceEncoder, preprocess_wav
+ from pathlib import Path
+ import torch
+ import numpy as np
+ import requests
+ from tempfile import NamedTemporaryFile
+ from yt_dlp import YoutubeDL
+ from urllib.parse import urlparse
+ from sklearn.cluster import AgglomerativeClustering
+
+ # Step 1: Set AssemblyAI API Key
+ aai.settings.api_key = "your_assemblyai_api_key"
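+ # Note: replace the placeholder above with a real AssemblyAI API key, ideally read from an environment variable rather than hard-coded.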
+ transcriber = aai.Transcriber()
+
+ def transcribe_audio(audio_file_path):
+     transcript = transcriber.transcribe(audio_file_path)
+     transcription_text = transcript.text if hasattr(transcript, 'text') else ""
+     transcription_words = transcript.words if hasattr(transcript, 'words') else []
+     return transcription_text, transcription_words
+
+ # Step 2: Language Translation (English and Urdu) with chunking
+ def translate_text(text, target_language):
+     translator = GoogleTranslator(source='auto', target=target_language)
+     chunk_size = 4999  # Ensure we do not exceed the limit
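+     # deep_translator's GoogleTranslator rejects payloads longer than roughly 5,000 characters, so long transcripts are translated chunk by chunk.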
+     translated_chunks = []
+     for i in range(0, len(text), chunk_size):
+         chunk = text[i:i + chunk_size]
+         translated_chunk = translator.translate(chunk)
+         translated_chunks.append(translated_chunk)
+     translated_text = " ".join(translated_chunks)
+     return translated_text
+
+ # Step 3: Summarization with T5 Model
+ tokenizer = T5Tokenizer.from_pretrained('t5-base')
+ model_t5 = T5ForConditionalGeneration.from_pretrained('t5-base')
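+ # Note: the encoder input below is truncated to 512 tokens, so only the opening portion of a very long transcript contributes to the summary.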
+
+ def summarize_text(text, source_language, target_language):
+     if source_language == 'urdu':
+         text = translate_text(text, 'en')  # Translate to English for summarization
+     inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
+     summary_ids = model_t5.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
+     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     if source_language == 'urdu':
+         summary = translate_text(summary, target_language)  # Translate back to Urdu
+     return summary
+
+ # Step 4: Key Points Extraction with spaCy
+ nlp = spacy.load("en_core_web_sm")
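+ # Note: en_core_web_sm ships no "TASK" entity label, so in practice extract_key_points below returns DATE, PERSON and ORG mentions.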
+
+ def extract_key_points(text):
+     doc = nlp(text)
+     tasks = []
+     for ent in doc.ents:
+         if ent.label_ in ["TASK", "DATE", "PERSON", "ORG"]:
+             tasks.append(ent.text)
+     return tasks
+
+ # Step 5: Speaker Identification using silero and resemblyzer
+ def identify_speakers(audio_file_path):
+     wav_fpath = Path(audio_file_path)
+     wav = preprocess_wav(wav_fpath)
+
+     # Load the silero VAD model and utilities
+     vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True)
+     (get_speech_timestamps, _, _, _, _) = utils
+     sampling_rate = 16000  # Set the sampling rate
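+     # preprocess_wav resamples the audio to 16 kHz, so this value matches the waveform handed to the VAD below.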
+
+     # Get speech timestamps using silero VAD
+     speech_timestamps = get_speech_timestamps(wav, vad_model, sampling_rate=sampling_rate)
+
+     encoder = VoiceEncoder()
+     speaker_segments = []
+
+     for ts in speech_timestamps:
+         start, end = ts['start'], ts['end']
+         segment = wav[start:end]
+         speaker_embeds = encoder.embed_utterance(segment)
+         speaker_segments.append((start / sampling_rate, end / sampling_rate, speaker_embeds))
+
+     # Use AgglomerativeClustering to cluster the speakers
+     embeddings = np.vstack([seg[2] for seg in speaker_segments])
+     clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.75).fit(embeddings)
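+     # Note: the 0.75 distance threshold is presumably an empirical choice; raising it merges segments into fewer speakers, lowering it splits them into more.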
+     speaker_labels = clustering.labels_
+
+     # Merge adjacent segments identified as the same speaker
+     merged_segments = []
+     for i, (start_time, end_time, _) in enumerate(speaker_segments):
+         label = speaker_labels[i]
+         if merged_segments and merged_segments[-1][0] == label:
+             merged_segments[-1] = (label, merged_segments[-1][1], end_time)
+         else:
+             merged_segments.append((label, start_time, end_time))
+
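+     # Each merged segment is (speaker_label, start_seconds, end_seconds); the second return value below is the number of distinct speakers found.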
+     return merged_segments, len(np.unique(speaker_labels))
+
+ # Step 6: Sentiment Analysis using transformers
+ model_sentiment = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
+ tokenizer_sentiment = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
+
+ def analyze_sentiment(text):
+     max_length = 512  # Set the maximum length for the tokenizer
+     inputs = tokenizer_sentiment(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
+     outputs = model_sentiment(**inputs)
+     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+     sentiment = torch.argmax(probs, dim=1).item()
+     sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
+     return sentiment_map[sentiment]
+
+ # Ensure the directory exists
+ output_dir = "/content"
+ os.makedirs(output_dir, exist_ok=True)
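+ # Note: "/content" is Colab's working directory; any writable path works here, since it is created above if missing.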
+
+ # Step 7: Download audio from YouTube using yt-dlp
+ def download_audio_from_youtube(url):
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'wav',
+             'preferredquality': '192',
+         }],
+         'outtmpl': '/tmp/%(id)s.%(ext)s',
+         'quiet': True
+     }
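+     # Both the FFmpegExtractAudio post-processor here and pydub's format conversion in process_meeting assume ffmpeg is installed on the host.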
+     with YoutubeDL(ydl_opts) as ydl:
+         info_dict = ydl.extract_info(url, download=True)
+         audio_file = ydl.prepare_filename(info_dict)
+         base, ext = os.path.splitext(audio_file)
+         audio_file = base + '.wav'
+     return audio_file
+
+ # Step 8: Gradio Interface Setup
+ def process_meeting(file, url, language):
+     audio_path = None
+     if file is not None:
+         file_path = file.name
+         audio_path = os.path.join(output_dir, "uploaded_audio.wav")
+
+         # Convert video to audio if necessary
+         if file_path.endswith(('.mp4', '.avi', '.mov', '.mkv')):
+             video = AudioSegment.from_file(file_path)
+             video.export(audio_path, format="wav")
+         else:
+             audio_path = file_path
+     elif url:  # an empty URL textbox arrives as "" rather than None
+         parsed_url = urlparse(url)
+         if "youtube.com" in parsed_url.netloc or "youtu.be" in parsed_url.netloc:
+             audio_path = download_audio_from_youtube(url)
+         else:
+             response = requests.get(url)
+             with NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
+                 temp_file.write(response.content)
+                 audio_path = temp_file.name
+
+     if audio_path is None:
+         return ("Please provide either a file or a URL.",) * 6  # one message per declared output
+
+     transcription, words = transcribe_audio(audio_path)
+
+     # Step 2: Translation based on user-selected language
+     if language == "urdu":
+         translated_text = translate_text(transcription, 'ur')
+     else:  # default to English
+         translated_text = transcription
+
+     # Step 3: Summarization and Key Points Extraction
+     summary = summarize_text(translated_text, language, 'ur')
+     key_points = extract_key_points(translated_text)
+
+     # Step 4: Speaker Identification
+     speakers, num_speakers = identify_speakers(audio_path)
+
+     # Map speakers to their spoken text
+     speaker_transcripts = {i: [] for i in range(num_speakers)}
+
+     for label, start_time, end_time in speakers:
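+         # AssemblyAI word timestamps are in milliseconds, hence the division by 1000 to compare against the VAD times in seconds.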
+         segment = [word.text for word in words if start_time <= word.start / 1000 <= end_time]
+         text_segment = " ".join(segment)
+         speaker_transcripts[label].append(text_segment)
+
+     speaker_details = ""
+     for label, segments in speaker_transcripts.items():
+         speaker_name = f"Speaker {label + 1}"
+         speaker_details += f"{speaker_name}:\n"
+         speaker_details += "\n".join(segments) + "\n\n"
+
+     # Step 5: Sentiment Analysis
+     sentiment = analyze_sentiment(transcription)
+
+     speaker_details = f"Total number of speakers: {num_speakers}\n" + speaker_details
+
+     return transcription, translated_text, key_points, summary, speaker_details, sentiment
+
+ # Step 9: Launch Gradio Interface with Scrollbars
+ iface = gr.Interface(
+     fn=process_meeting,
+     inputs=[
+         gr.File(label="Upload Meeting Recording"),
+         gr.Textbox(label="Enter Meeting URL"),
+         gr.Radio(["english", "urdu"], label="Select Summary Language")
+     ],
+     outputs=[
+         gr.Textbox(label="Transcription", lines=20),
+         gr.Textbox(label="Translated Text", lines=20),
+         gr.Textbox(label="Key Points", lines=20),
+         gr.Textbox(label="Summary", lines=20),
+         gr.Textbox(label="Speakers", lines=20),
+         gr.Textbox(label="Sentiment", lines=1)
+     ],
+     title="Smart AI Meeting Assistant",
+     description="""
+     <div style='text-align: center;'>by Ayesha Ameen & Sana Sadiq</div>
+     <br>Upload your meeting recording or enter a publicly accessible URL and choose the summary language (English or Urdu).
+     """,
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=True, debug=True)
download_spacy_model.py ADDED
@@ -0,0 +1,11 @@
+ # -*- coding: utf-8 -*-
+ """download_spacy_model.py.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1a3BQS9Nu4qUbxFVu7gP9XtVhZA0c-ldN
+ """
+
+ import spacy
+ spacy.cli.download("en_core_web_sm")
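+ # Run this once (e.g. at build or startup time) so that spacy.load("en_core_web_sm") in app.py succeeds.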
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio
+ transformers
+ assemblyai
+ deep-translator
+ spacy
+ pydub
+ torch
+ resemblyzer
+ yt-dlp
+ scikit-learn
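+ sentencepiece  # likely required by the T5Tokenizer used in app.py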