# Imports
from pytube import YouTube
from huggingsound import SpeechRecognitionModel
import torch
from transformers import pipeline
import os
import gradio as gr
# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
SAMPLING_RATE = 16000  # Hz; matches both Silero VAD and the wav2vec2 model
USE_ONNX = False  # set True to run Silero VAD on ONNX Runtime instead of PyTorch
torch.set_num_threads(1)  # single-threaded Torch, as in the Silero VAD examples


def transcribeVideo(VIDEO_URL):
    # Download the audio-only '.mp4' stream of the video
    ytVideo = YouTube(VIDEO_URL)
    ytVideo.streams \
        .filter(only_audio=True, file_extension='mp4') \
        .first() \
        .download(filename='ytaudio.mp4')
    # Convert it to a 16 kHz, 16-bit PCM '.wav' file
    os.system("ffmpeg -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")
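    # Note: this shells out to ffmpeg, which must be installed and on PATH. A
    # sketch of a more defensive equivalent using the standard library:
    #   import subprocess
    #   subprocess.check_call(['ffmpeg', '-y', '-i', 'ytaudio.mp4',
    #                          '-acodec', 'pcm_s16le', '-ar', str(SAMPLING_RATE),
    #                          'ytaudio.wav'])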

    # Audio chunking with Silero VAD
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=True,
                                  onnx=USE_ONNX)
    (get_speech_timestamps,
     save_audio,
     read_audio,
     VADIterator,
     collect_chunks) = utils
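    # Of the five helpers returned with the model, only get_speech_timestamps,
    # save_audio and read_audio are used below.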

    # Read the '.wav' audio file and locate the speech segments in it
    audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)
    speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)
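    # Each timestamp is a dict of sample indices: {'start': ..., 'end': ...}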

    # Save each speech chunk as a separate '.wav' file and collect the paths
    audioChunksPath = []
    for index, timestamp in enumerate(speechTimestamps):
        chunkPath = f'speech-{index}.wav'
        save_audio(chunkPath, audioFile[timestamp['start']:timestamp['end']], sampling_rate=SAMPLING_RATE)
        audioChunksPath.append(chunkPath)

    # Transcribe the chunks and concatenate the individual transcriptions
    transcriptions = MODEL.transcribe(audioChunksPath)
    fullTranscript = ' '.join(transcript['transcription'] for transcript in transcriptions)
    return fullTranscript
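
# A quick sanity check of the transcription step outside the UI (hypothetical URL):
#   print(transcribeVideo('https://www.youtube.com/watch?v=...'))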


def summarizeTranscription(VIDEO_URL):
    fullTranscript = transcribeVideo(VIDEO_URL)
    # Generate a summary from the full transcript
    summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
    return summarizedText[0]['summary_text']
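
# Note: the default summarization checkpoint (DistilBART-CNN) accepts roughly
# 1024 tokens, so very long transcripts may be truncated. A minimal sketch of
# chunked summarization (the word-based split is an assumption, not part of the
# pipeline's API):
#   def summarizeLongTranscript(transcript, chunkSize=700):
#       words = transcript.split()
#       chunks = [' '.join(words[i:i + chunkSize]) for i in range(0, len(words), chunkSize)]
#       summaries = summarizationPipeline(chunks, max_length=300, min_length=75, do_sample=False)
#       return ' '.join(s['summary_text'] for s in summaries)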

iface = gr.Interface(fn=summarizeTranscription, inputs=["text"], outputs=["textbox"], title='YouTube Video Summarizer')
iface.launch(inline=False)
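# launch(inline=False) skips inline rendering in notebook environments; passing
# share=True to launch() would also create a temporary public Gradio link.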