Update app.py
Browse files
app.py
CHANGED
@@ -2,10 +2,8 @@ import gradio as gr
|
|
2 |
import whisper
|
3 |
import os
|
4 |
import re
|
5 |
-
|
6 |
-
import
|
7 |
-
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
8 |
-
from transformers import pipeline
|
9 |
|
10 |
# Load models
|
11 |
whisper_model = whisper.load_model("base")
|
@@ -15,37 +13,38 @@ summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(multilingual_model)
|
|
15 |
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
|
16 |
|
17 |
SUPPORTED_LANGUAGES = {
|
18 |
-
"bn": "Bengali",
|
19 |
-
"
|
20 |
-
"
|
21 |
-
"hi": "Hindi",
|
22 |
-
"kn": "Kannada",
|
23 |
-
"ml": "Malayalam",
|
24 |
-
"mr": "Marathi",
|
25 |
-
"ta": "Tamil",
|
26 |
-
"te": "Telugu",
|
27 |
-
"ur": "Urdu"
|
28 |
}
|
29 |
|
30 |
def download_audio(youtube_url):
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
return filename
|
36 |
|
37 |
def extract_thumbnail(youtube_url):
|
38 |
-
|
39 |
-
match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", youtube_url)
|
40 |
if match:
|
41 |
video_id = match.group(1)
|
42 |
return f"https://img.youtube.com/vi/{video_id}/0.jpg"
|
43 |
return ""
|
44 |
|
45 |
-
def summarize_text(text
|
46 |
input_text = f"summarize: {text}"
|
47 |
inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
|
48 |
-
summary_ids = summarizer_model.generate(inputs, max_length=150, min_length=30, num_beams=4
|
49 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
50 |
|
51 |
def transcribe_and_summarize(youtube_url, translate_to_english):
|
@@ -60,7 +59,7 @@ def transcribe_and_summarize(youtube_url, translate_to_english):
|
|
60 |
if lang_code not in SUPPORTED_LANGUAGES:
|
61 |
return None, f"β Language '{lang_code}' not supported.", "", "", None
|
62 |
|
63 |
-
summary = summarize_text(transcript
|
64 |
|
65 |
if translate_to_english and lang_code != "en":
|
66 |
translated_summary = translator(summary)[0]["translation_text"]
|
@@ -79,10 +78,9 @@ def transcribe_and_summarize(youtube_url, translate_to_english):
|
|
79 |
except Exception as e:
|
80 |
return None, f"β Error: {str(e)}", "", "", None
|
81 |
|
82 |
-
# Gradio UI
|
83 |
with gr.Blocks(css="style.css") as demo:
|
84 |
gr.Markdown("<h1 style='text-align: center;'>π¬ Multilingual YouTube Summarizer</h1>")
|
85 |
-
gr.Markdown("
|
86 |
|
87 |
with gr.Row():
|
88 |
youtube_url = gr.Textbox(label="YouTube Video URL")
|
|
|
2 |
import whisper
|
3 |
import os
|
4 |
import re
|
5 |
+
import yt_dlp
|
6 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
|
|
|
|
7 |
|
8 |
# Load models
|
9 |
whisper_model = whisper.load_model("base")
|
|
|
13 |
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
|
14 |
|
15 |
SUPPORTED_LANGUAGES = {
|
16 |
+
"bn": "Bengali", "en": "English", "gu": "Gujarati", "hi": "Hindi",
|
17 |
+
"kn": "Kannada", "ml": "Malayalam", "mr": "Marathi", "ta": "Tamil",
|
18 |
+
"te": "Telugu", "ur": "Urdu"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
}
|
20 |
|
21 |
def download_audio(youtube_url):
    """Download the audio of a YouTube video and return the MP3 file path.

    yt-dlp fetches the best available audio stream and its
    ``FFmpegExtractAudio`` post-processor converts it to MP3.

    Args:
        youtube_url: URL of the video whose audio should be downloaded.

    Returns:
        Path of the MP3 file produced by the post-processor.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }],
        # Cookie file looked up relative to the working directory.
        'cookiefile': 'cookies.txt',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(youtube_url, download=True)
        downloaded_path = ydl.prepare_filename(info_dict)

    # prepare_filename() names the container that was downloaded (webm, m4a,
    # opus, mp4, ...), while the post-processor leaves an .mp3 next to it.
    # Swap whatever extension is present instead of special-casing two of them.
    stem, _ = os.path.splitext(downloaded_path)
    return f"{stem}.mp3"
|
36 |
|
37 |
def extract_thumbnail(youtube_url):
    """Return the thumbnail URL for a YouTube video link.

    The video ID is taken as the 11-character run of ID characters that
    follows ``v=`` or a ``/``. The run must end there: a longer token (for
    example a channel ID or a path segment such as ``attribution_link``) is
    not a video ID and is skipped.

    Args:
        youtube_url: Link to a YouTube video; may be empty or ``None``.

    Returns:
        ``https://img.youtube.com/vi/<id>/0.jpg`` when an ID is found,
        otherwise an empty string.
    """
    if not youtube_url:
        return ""

    # The negative lookahead rejects matches that are merely the first 11
    # characters of a longer identifier.
    match = re.search(
        r"(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])",
        youtube_url,
    )
    if match:
        video_id = match.group(1)
        return f"https://img.youtube.com/vi/{video_id}/0.jpg"
    return ""
|
43 |
|
44 |
+
def summarize_text(text):
    """Summarize *text* with the module-level seq2seq model.

    The prompt is tokenized with truncation at 512 tokens, so only the
    beginning of a long transcript contributes to the summary.

    Args:
        text: Transcript to summarize; may be empty or ``None``.

    Returns:
        The decoded summary, or an empty string when there is nothing to
        summarize.
    """
    # With min_length=30 the model would be forced to invent a summary for an
    # empty transcript; return early instead.
    if not text or not text.strip():
        return ""

    input_text = f"summarize: {text}"
    # `tokenizer` and `summarizer_model` are expected to be defined at module
    # level alongside the other loaded models.
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = summarizer_model.generate(inputs, max_length=150, min_length=30, num_beams=4)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
49 |
|
50 |
def transcribe_and_summarize(youtube_url, translate_to_english):
|
|
|
59 |
if lang_code not in SUPPORTED_LANGUAGES:
|
60 |
return None, f"β Language '{lang_code}' not supported.", "", "", None
|
61 |
|
62 |
+
summary = summarize_text(transcript)
|
63 |
|
64 |
if translate_to_english and lang_code != "en":
|
65 |
translated_summary = translator(summary)[0]["translation_text"]
|
|
|
78 |
except Exception as e:
|
79 |
return None, f"β Error: {str(e)}", "", "", None
|
80 |
|
|
|
81 |
with gr.Blocks(css="style.css") as demo:
|
82 |
gr.Markdown("<h1 style='text-align: center;'>π¬ Multilingual YouTube Summarizer</h1>")
|
83 |
+
gr.Markdown("Paste any YouTube video link, and get transcript + summary. Works for Hindi, Bengali, Tamil, Urdu, and more!")
|
84 |
|
85 |
with gr.Row():
|
86 |
youtube_url = gr.Textbox(label="YouTube Video URL")
|