# File size: 4,985 bytes
# Revisions: ee54bd3, c94ca86, f5fe020
from pytubefix import YouTube
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import whisper
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import gradio as gr
import ast
from IPython.display import Audio, display
# Whisper "base" checkpoint, loaded at module level; extract_yt_audio calls
# model.transcribe() on it.
model = whisper.load_model("base")
def extract_yt_audio(video_url):
    """
    Load audio from a YouTube URL or a local media file and transcribe it.

    When video_url contains "youtube.com" or "youtu.be", the first audio-only
    stream is downloaded with pytubefix; otherwise video_url is opened directly
    as a file path. The audio is then exported to 'audio.wav' and transcribed
    with the module-level Whisper model.

    Args:
        video_url: YouTube URL or path of a local audio/video file (string).

    Returns:
        A 4-tuple in the order expected by the "Get the Transcript" button:
        an update showing the transcript text, an update making the chunking
        button visible, the list of transcription segments, and an update
        pointing the audio player at the exported wav file.

    Raises:
        ValueError: if the YouTube video exposes no audio-only stream.
    """
    if "youtube.com" in video_url or "youtu.be" in video_url:
        yt = YouTube(video_url, use_oauth=True)
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            # .first() yields None on an empty query; fail with a clear message
            # instead of an AttributeError on None.download().
            raise ValueError(f"No audio-only stream available for {video_url!r}")
        audio_file = audio_stream.download()
        sample = AudioSegment.from_file(audio_file)
    else:
        sample = AudioSegment.from_file(video_url)

    audio_path = 'audio.wav'
    # Write the wav file first: displaying it before the export would preview
    # a missing file or a stale one left over from a previous run.
    sample.export(audio_path, format="wav")
    display(Audio(audio_path))

    # Announce before the (slow) transcription call so the message is accurate.
    print("Transcription started \nTranscript:\n")
    result = model.transcribe(audio_path)
    print(result['text'], '\n')

    return (
        gr.update(visible=True, value=result['text']),
        gr.update(visible=True),
        result['segments'],
        gr.update(visible=True, value=audio_path),
    )
def semantic_chunks(segs, max_chunk_length=15.0):
    """
    Group transcription segments into sentence-aligned chunks of bounded length.

    Each segment's text is split into sentences; the segment's duration is
    shared equally among its sentences, and consecutive sentences are packed
    into a chunk until adding the next one would exceed max_chunk_length.
    A single sentence longer than max_chunk_length becomes a chunk on its own.

    Args:
        segs: the segments, either as a list of dicts with 'start', 'end' and
            'text' keys, or as the string repr of such a list (which is what
            the hidden Textbox carrying the segments delivers).
        max_chunk_length: maximum chunk duration in seconds.

    Returns:
        A Gradio update that makes the target component visible and sets its
        value to a DataFrame with the columns 'chunk_id',
        'chunk_length (secs)', 'semantic_chunk', 'start_time (secs)' and
        'end_time (secs)'.
    """
    if isinstance(segs, str):
        # An empty string means nothing has been transcribed yet.
        segs = ast.literal_eval(segs) if segs.strip() else []

    chunks = []
    current_chunk = []
    chunk_start_time = None
    chunk_end_time = None
    chunk_duration = 0.0

    for segment in segs or []:
        start = segment['start']
        end = segment['end']
        # Sentence-tokenize each segment to keep chunk borders on sentence
        # boundaries.
        sentences = sent_tokenize(segment['text'])
        if not sentences:
            continue

        # Per-sentence timing is an estimate: the segment is divided evenly.
        sentence_duration = (end - start) / len(sentences)
        last_index = len(sentences) - 1

        for index, sentence in enumerate(sentences):
            sentence_start = start + index * sentence_duration
            # Use the exact segment end for the last sentence to avoid float drift.
            if index == last_index:
                sentence_end = end
            else:
                sentence_end = sentence_start + sentence_duration

            # Close the running chunk only if it holds something; otherwise an
            # over-long first sentence would emit an empty chunk.
            if current_chunk and chunk_duration + sentence_duration > max_chunk_length:
                chunks.append(_chunk_record(
                    len(chunks) + 1, current_chunk, chunk_duration,
                    chunk_start_time, chunk_end_time))
                current_chunk = []
                chunk_duration = 0.0

            if not current_chunk:
                chunk_start_time = sentence_start
            current_chunk.append(sentence)
            chunk_duration += sentence_duration
            # Track the sentence's own end so neighbouring chunks do not
            # overlap when one segment is split across two chunks.
            chunk_end_time = sentence_end

    # Finalize the last chunk if it exists.
    if current_chunk:
        chunks.append(_chunk_record(
            len(chunks) + 1, current_chunk, chunk_duration,
            chunk_start_time, chunk_end_time))

    return gr.update(visible=True, value=pd.DataFrame(chunks))


def _chunk_record(chunk_id, sentences, duration, start_time, end_time):
    """Build one row of the chunk table from the accumulated sentences."""
    return {
        'chunk_id': chunk_id,
        'chunk_length (secs)': duration,
        'semantic_chunk': ' '.join(sentences),
        'start_time (secs)': start_time,
        'end_time (secs)': end_time,
    }
def clear_all():
    """
    Build the updates that reset the interface for the "Clear" button.

    Returns a 5-tuple: two updates that keep a component visible with an empty
    value, followed by three updates that hide a component. The "Clear" button
    wires them, in order, to the URL box, the transcript box, the chunking
    button, the chunk table and the audio player.
    """
    blank_url = gr.update(visible=True, value="")
    blank_transcript = gr.update(visible=True, value="")
    hidden = [gr.update(visible=False) for _ in range(3)]
    return (blank_url, blank_transcript, *hidden)
# Gradio interface: URL/path input -> transcript + audio preview -> chunk table.
# Components render in the order they are created below.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Extract audio from video, get the transcript and then get the semantic chunk information.
        """)
    # The default is a local file; a YouTube URL such as
    # https://www.youtube.com/watch?v=ug5e4JfC3oo is accepted as well.
    input_url = gr.Textbox(label="Type-in the URL or File Location of the Video", value='sample.wav')
    # Hidden box that carries the transcription segments from the first
    # callback to the second one.
    segments = gr.Textbox(visible=False)
    submit_btn_1 = gr.Button("Get the Transcript", visible=True)
    audio = gr.Audio(visible=True, type="filepath", label='Play Audio')
    transcript = gr.Textbox(visible=True, label='Transcript')
    # Stays hidden until a transcript exists (extract_yt_audio reveals it).
    submit_btn_2 = gr.Button("Get the semantically Chunked Segments", visible=False)
    chunks = gr.Dataframe(visible=False, label = 'semantic Chunks')
    clear_btn = gr.Button("Clear")

    # The order of each outputs list must match the order of the values the
    # corresponding callback returns.
    submit_btn_1.click(fn=extract_yt_audio, inputs=[input_url], outputs=[transcript, submit_btn_2, segments, audio])
    submit_btn_2.click(fn=semantic_chunks, inputs=[segments], outputs=[chunks])
    clear_btn.click(fn=clear_all, outputs=[input_url, transcript, submit_btn_2, chunks, audio])

demo.launch(debug=True)