# Page header captured with this file: author cfc-tech, "app", commit ba86223 (verified), 1.32 kB.
import os
import subprocess
import tempfile

import gradio as gr
import librosa
import soundfile as sf
import torch
from huggingsound import SpeechRecognitionModel
from pytube import YouTube
from transformers import pipeline
def process_video(video_url):
    """Download a YouTube video's audio, transcribe it, and return a summary.

    The audio-only MP4 stream is fetched with pytube, converted by ffmpeg to
    16 kHz PCM WAV, split into 30-second chunks, transcribed chunk by chunk
    with a wav2vec2 English model, and the joined transcript is passed to the
    default transformers summarization pipeline.

    All intermediate files live in a per-call temporary directory, so
    repeated or concurrent calls do not overwrite each other's audio and
    nothing is left behind in the working directory.

    Args:
        video_url: URL of the YouTube video to summarize.

    Returns:
        The summary text, or an empty string when no speech was transcribed.

    Raises:
        ValueError: If the video exposes no audio-only MP4 stream.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Rate ffmpeg resamples to; the chunk sizes passed to librosa below are
    # expressed in samples and therefore depend on it.
    sample_rate = 16000

    yt = YouTube(video_url)
    audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
    if audio_stream is None:
        raise ValueError(f'No audio-only mp4 stream available for {video_url!r}')

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=device)

    transcripts = []
    with tempfile.TemporaryDirectory() as workdir:
        source_path = audio_stream.download(output_path=workdir, filename='ytaudio.mp4')
        wav_path = os.path.join(workdir, 'ytaudio.wav')
        # -y: never block on an interactive overwrite prompt.
        # check=True: fail loudly instead of transcribing a missing/stale file.
        subprocess.run(
            ['ffmpeg', '-y', '-i', source_path,
             '-acodec', 'pcm_s16le', '-ar', str(sample_rate), wav_path],
            check=True,
        )

        # 30 non-overlapping frames of one second each => 30-second blocks.
        blocks = librosa.stream(
            wav_path,
            block_length=30,
            frame_length=sample_rate,
            hop_length=sample_rate,
        )
        for i, speech in enumerate(blocks):
            chunk_path = os.path.join(workdir, f'{i}.wav')
            sf.write(chunk_path, speech, sample_rate)
            transcripts.append(model.transcribe([chunk_path])[0]['transcription'])

    full_transcript = ' '.join(text for text in transcripts if text)
    if not full_transcript:
        # Nothing was recognised; there is nothing meaningful to summarize.
        return ''

    summarization = pipeline('summarization')
    # truncation=True keeps transcripts longer than the model's input window
    # from failing; only the leading portion that fits is summarized.
    summarized_text = summarization(
        full_transcript,
        max_length=130,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    return summarized_text[0]['summary_text']
# Single text box in (the video URL), single text box out (the summary).
iface = gr.Interface(
    fn=process_video,
    inputs="text",
    outputs="text",
    title="YouTube Video Summarizer",
)
# Start the Gradio server as soon as the module is executed.
iface.launch()