import os
import shutil

import streamlit as st
import torch
import whisper
from pydub import AudioSegment
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# T5 model fine-tuned for headline generation
t5_model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
t5_model = t5_model.to(device)

# Whisper model for speech-to-text
whisper_model = whisper.load_model("base")

st.title("Audio Analysis")

# Arguments input
st.subheader("Enter YouTube link and file name:")
url = st.text_input("YouTube link")
name = st.text_input("File name")

# Process audio and generate headings
if st.button("Process"):
    if os.path.exists("audio.mp3"):
        os.remove("audio.mp3")

    # Download the video thumbnail and the audio track (format 140 = m4a audio)
    os.system(f'youtube-dl --write-thumbnail --skip-download "{url}" -o logo.png')
    os.system(f'yt-dlp -f 140 -o audio.mp3 "{url}"')

    # os.system blocks until yt-dlp exits, so if the file is still missing the
    # download failed; bail out instead of busy-waiting forever
    if not os.path.exists("audio.mp3"):
        st.error("Audio download failed. Check the YouTube link and try again.")
        st.stop()

    # Split the audio into 30-second segments
    if os.path.exists("segments"):
        shutil.rmtree("segments")
    os.makedirs("segments")

    audio = AudioSegment.from_file("audio.mp3")
    segment_length = 30 * 1000  # milliseconds
    for i, segment in enumerate(audio[::segment_length]):
        segment.export(f"segments/{i}.mp3", format="mp3")

    original_text = ""
    num_segments = len(os.listdir("segments"))
    headings = []
    original_texts = []
    dataForWeb = {}

    for i in range(num_segments):
        st.write(f"Processing segment {i + 1}/{num_segments}")

        # Transcribe the segment with Whisper
        segment_audio = whisper.load_audio(f"segments/{i}.mp3")
        segment_audio = whisper.pad_or_trim(segment_audio)
        mel = whisper.log_mel_spectrogram(segment_audio).to(whisper_model.device)
        _, probs = whisper_model.detect_language(mel)  # language probabilities (unused below)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(whisper_model, mel, options)

        # Generate a headline for the transcript with T5 beam search
        text = "headline: " + result.text
        encoding = tokenizer.encode_plus(
            text, max_length=256, truncation=True, return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_masks = encoding["attention_mask"].to(device)
        beam_outputs = t5_model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=3,
            early_stopping=True,
        )
        generated_heading = tokenizer.decode(beam_outputs[0], skip_special_tokens=True)

        headings.append(generated_heading)
        original_texts.append(result.text)
        dataForWeb[i] = {"heading": generated_heading, "text": result.text}

        # Append the heading and transcript to the output text
        original_text += "\n" + generated_heading + "\n\n"
        original_text += result.text + "\n"

    with open(name, "w") as f:
        f.write(original_text)

    st.success("Audio processing completed!")

    # Display results
    st.subheader("Generated Headings and Text:")
    for i, heading in enumerate(headings):
        st.write(f"Segment {i + 1}:")
        st.write("Heading:", heading)
        st.write("Text:", original_texts[i])
        st.write("-----------")