GPTTUBE / app.py
Dhruv Pai Dukle
Add application file
1947bbe
import html
import os
import shutil
import subprocess

import streamlit as st
import torch
import whisper
from pydub import AudioSegment
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Streamlit re-executes this script from the top on every widget interaction.
# Loading the models through a cached function keeps a single copy alive for
# the lifetime of the server process instead of reloading them on each rerun.
# `st.cache_resource` is only present in newer Streamlit releases; when it is
# missing we fall back to a no-op decorator (i.e. the original uncached load).
_cache_resource = getattr(st, "cache_resource", lambda func: func)

HEADLINE_MODEL_NAME = "Michau/t5-base-en-generate-headline"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@_cache_resource
def _load_headline_model():
    """Return the T5 headline generator (moved to `device`) and its tokenizer."""
    generator = T5ForConditionalGeneration.from_pretrained(HEADLINE_MODEL_NAME)
    headline_tokenizer = T5Tokenizer.from_pretrained(HEADLINE_MODEL_NAME)
    return generator.to(device), headline_tokenizer


@_cache_resource
def _load_whisper_model():
    """Return the Whisper "base" speech-to-text model."""
    return whisper.load_model("base")


# NOTE: the names are easy to confuse but are referenced further down the
# script, so they are kept as-is:
#   models -> T5 headline generator, model -> Whisper transcriber.
models, tokenizer = _load_headline_model()
model = _load_whisper_model()
# Page header
st.title(body="Audio Analysis")

# Arguments input: the video to analyse and the output file for the result
st.subheader(body="Enter YouTube link and file name:")
url = st.text_input(label="YouTube link")
name = st.text_input(label="File name")
# Process audio and generate headings
AUDIO_PATH = "audio.mp3"
SEGMENTS_DIR = "segments"
SEGMENT_LENGTH_MS = 30 * 1000  # each segment file holds 30 seconds of audio


def _download_audio(video_url):
    """Download the thumbnail (best effort) and the audio track of *video_url*.

    Any previous ``audio.mp3`` is removed first. Returns True only when
    yt-dlp exited successfully and ``audio.mp3`` exists afterwards.
    """
    if os.path.exists(AUDIO_PATH):
        os.remove(AUDIO_PATH)

    # Commands are passed as argument lists (no shell), so the user-supplied
    # URL cannot inject shell syntax; "--" stops it being parsed as an option.
    try:
        subprocess.run(
            ["youtube-dl", "--write-thumbnail", "--skip-download",
             "-o", "logo.png", "--", video_url],
            check=False,
        )
    except OSError:
        # The thumbnail is optional: a missing youtube-dl binary must not
        # abort the run (os.system would only have returned non-zero here).
        pass

    try:
        completed = subprocess.run(
            ["yt-dlp", "-f", "140", "-o", AUDIO_PATH, "--", video_url],
            check=False,
        )
    except OSError:
        return False
    # subprocess.run blocks until yt-dlp exits, so there is nothing to wait
    # for: either the file is there now or the download failed.
    return completed.returncode == 0 and os.path.exists(AUDIO_PATH)


def _split_audio(audio_path):
    """Split the audio file into 30 s MP3 files and return their paths in order."""
    # Start from an empty directory so segments of an earlier run are not reused.
    shutil.rmtree(SEGMENTS_DIR, ignore_errors=True)
    os.makedirs(SEGMENTS_DIR, exist_ok=True)

    audio = AudioSegment.from_file(audio_path)
    segment_paths = []
    for index, segment in enumerate(audio[::SEGMENT_LENGTH_MS]):
        segment_path = os.path.join(SEGMENTS_DIR, f"{index}.mp3")
        segment.export(segment_path, format="mp3")
        segment_paths.append(segment_path)
    return segment_paths


def _transcribe(segment_path):
    """Return the Whisper transcription of one segment file."""
    audio = whisper.pad_or_trim(whisper.load_audio(segment_path))
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(fp16=False)
    return whisper.decode(model, mel, options).text


def _generate_heading(text):
    """Return a headline for *text* generated by the T5 model."""
    encoding = tokenizer.encode_plus("headline: " + text, return_tensors="pt")
    beam_outputs = models.generate(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        max_length=64,
        num_beams=3,
        early_stopping=True,
    )
    # Drop special tokens so they do not end up in the visible heading.
    return tokenizer.decode(beam_outputs[0], skip_special_tokens=True)


if st.button("Process"):
    video_url = url.strip()
    output_name = name.strip()
    if not video_url or not output_name:
        st.error("Please enter both a YouTube link and a file name.")
        st.stop()

    if not _download_audio(video_url):
        st.error("Could not download the audio track. Please check the link.")
        st.stop()

    segment_paths = _split_audio(AUDIO_PATH)

    headings = []
    original_texts = []
    for number, segment_path in enumerate(segment_paths, start=1):
        st.write(f"Processing segment {number}/{len(segment_paths)}")
        segment_text = _transcribe(segment_path)
        headings.append(_generate_heading(segment_text))
        original_texts.append(segment_text)

    # The output file is an HTML fragment, so model/transcript text is escaped
    # before being placed between tags.
    original_text = "".join(
        f"\n<h3>{html.escape(heading, quote=False)}</h3>"
        f"\n<p>{html.escape(text, quote=False)}</p>"
        for heading, text in zip(headings, original_texts)
    )
    with open(output_name, "w", encoding="utf-8") as f:
        f.write(original_text)
    st.success("Audio processing completed!")

    # Display results
    st.subheader("Generated Headings and Text:")
    for number, (heading, text) in enumerate(zip(headings, original_texts), start=1):
        st.write(f"Segment {number}:")
        st.write("Heading:", heading)
        st.write("Text:", text)
        st.write("-----------")