# summary_tube / app.py
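"""Gradio app that summarizes a YouTube video.

It downloads the video's audio with pytube, converts it to 16 kHz WAV with
ffmpeg, transcribes it with a wav2vec2 model via huggingsound, and condenses
the transcript with a transformers summarization pipeline.
"""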
import gradio as gr
from pytube import YouTube
import subprocess
import torch
from huggingsound import SpeechRecognitionModel
import librosa
import soundfile as sf
from transformers import pipeline
def summarize_video(youtube_link):
    # Download YouTube video's audio
    yt = YouTube(youtube_link)
    yt.streams.filter(only_audio=True, file_extension='mp4').first().download(filename='ytaudio.mp4')
    # Convert to 16 kHz WAV format (-y overwrites output left over from a previous run)
    subprocess.run(['ffmpeg', '-y', '-i', 'ytaudio.mp4', '-acodec', 'pcm_s16le', '-ar', '16000', 'ytaudio.wav'], check=True)
    # Initialize speech recognition model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=device)
    # Process audio file and transcribe
    input_file = 'ytaudio.wav'
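    # Each streamed block spans frame_length + (block_length - 1) * hop_length
    # = 480,000 samples, i.e. about 30 seconds at the 16 kHz rate produced above,
    # keeping every chunk small enough for the wav2vec2 model.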
    stream = librosa.stream(input_file, block_length=30, frame_length=16000, hop_length=16000)
    full_transcript = ''
    for i, speech in enumerate(stream):
        sf.write(f'{i}.wav', speech, 16000)
        transcription = model.transcribe([f'{i}.wav'])
        # Append a trailing space so words from adjacent chunks don't run together
        full_transcript += ' '.join([item['transcription'] for item in transcription]) + ' '
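    # Note: the default summarization model accepts a limited number of input
    # tokens, so very long transcripts may be truncated or rejected.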
    # Summarize the transcript
    summarizer = pipeline('summarization')
    summarized_text = summarizer(full_transcript, max_length=130, min_length=30, do_sample=False)
    return summarized_text[0]['summary_text']
# Set up the Gradio interface
iface = gr.Interface(fn=summarize_video,
                     inputs=gr.Textbox(lines=2, placeholder="Enter YouTube Video Link Here..."),
                     outputs="text",
                     title="YouTube Video Text Summarizer",
                     description="This tool summarizes the text extracted from a given YouTube video. Please enter the video link below.")
if __name__ == "__main__":
    iface.launch()