# -*- coding: utf-8 -*-
"""pod_to_sum_v3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rbZ98r1Z_IM0Z3VDuNQObxpuZf5KUgmL

### Initialization
"""

import os
save_dir = os.path.join('./', 'docs')
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

transcription_model = "openai/whisper-base"
llm_model = "gmurro/bart-large-finetuned-filtered-spotify-podcast-summ"

import pytube
from pytube import YouTube
from transformers import pipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

"""### Define how to get transcript of the YT video"""

def get_transcript(url):
  yt_video = YouTube(str(url))
  yt_audio = yt_video.streams.filter(only_audio=True, file_extension='mp4').first() # get 1st available audio stream
  out_file = yt_audio.download(filename="audio.mp4", output_path = save_dir)

  asr = pipeline("automatic-speech-recognition", model=transcription_model, device=device)

  import librosa
  speech_array, sampling_rate = librosa.load(out_file, sr=16000) # load the audio as a 16 kHz waveform array (the rate Whisper expects)

  audio_text = asr(
    speech_array,
    max_new_tokens=256,
    generate_kwargs={"task": "transcribe"},
    chunk_length_s=30,
    batch_size=8) # calling whisper model

  del asr
  torch.cuda.empty_cache() # free GPU memory used by the ASR pipeline

  return audio_text['text']
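
# Example usage (the URL is a placeholder, shown purely for illustration):
#   transcript = get_transcript("https://www.youtube.com/watch?v=<VIDEO_ID>")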

"""### Define functions to generate summary"""

def clean_sent(sent_list):
    # drop consecutive duplicate sentences from the transcript
    new_sent_list = []
    for sent in sent_list:
        if not new_sent_list or sent != new_sent_list[-1]:
            new_sent_list.append(sent)
    return new_sent_list

import nltk
nltk.download('punkt')

def get_chunks(audio_text, sent_overlap, max_token, tokenizer):
  # pre-process text: split into sentences and drop consecutive duplicates
  sentences = nltk.tokenize.sent_tokenize(audio_text)
  sentences = clean_sent(sentences)

  first_sentence = 0
  last_sentence = 0
  chunks = []
  while last_sentence <= len(sentences) - 1:
    last_sentence = first_sentence
    chunk_parts = []
    chunk_size = 0
    for sentence in sentences[first_sentence:]:
      sentence_sz = len(tokenizer.tokenize(sentence))
      if chunk_size + sentence_sz > max_token:
        break

      chunk_parts.append(sentence)
      chunk_size += sentence_sz
      last_sentence += 1

    chunks.append(" ".join(chunk_parts))
    # keep `sent_overlap` sentences of overlap with the previous chunk, but always
    # advance by at least one sentence so the loop cannot stall on an over-long sentence
    first_sentence = max(last_sentence - sent_overlap, first_sentence + 1)
  return chunks
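
# Illustrative behaviour (not called by the app): with sent_overlap=3 and a small
# token budget, consecutive chunks share their boundary sentences, e.g.
#   chunk 1: s1 s2 s3 s4 s5
#   chunk 2: s3 s4 s5 s6 s7 ...
# The overlap preserves context across chunk boundaries when each chunk is
# summarized independently.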

"""### Define how to get summary of the transcript"""

def get_summary(audio_text):
  import re
  audio_text = re.sub(r'\b(\w+) \1\b', r'\1', audio_text, flags=re.IGNORECASE) # collapse immediately repeated words

  from transformers import AutoTokenizer
  tokenizer = AutoTokenizer.from_pretrained(llm_model) # set tokenizer

  from transformers import pipeline
  summarizer = pipeline("summarization", model=llm_model) # set summarizer

  model_max_tokens = tokenizer.model_max_length # max tokens the model can process
  text_tokens = len(tokenizer.tokenize(audio_text)) # number of tokens in the transcript

  def get_map_summary(chunk_text, summarizer):
      max_token = model_max_tokens - 2 # reserve room for the special tokens added around the text
      sent_overlap = 3 # overlapping sentences between two consecutive chunks
      sent_chunks = get_chunks(audio_text=chunk_text, sent_overlap=sent_overlap, max_token=max_token, tokenizer=tokenizer) # split into chunks
      chunk_summary_list = summarizer(sent_chunks, min_length=50, max_length=200, batch_size=8) # summarize each chunk

      grouped_summary = ""
      for c in chunk_summary_list:
          grouped_summary += c['summary_text'] + " "

      return grouped_summary

  # map step: while the text is longer than the model's context, summarize
  # overlapping chunks and concatenate the chunk summaries
  map_text = audio_text
  long_summary = ""

  while text_tokens > model_max_tokens:
    map_summary = get_map_summary(chunk_text=map_text, summarizer=summarizer)
    text_tokens = len(tokenizer.tokenize(map_summary))
    long_summary = map_summary
    map_text = map_summary

  # reduce step: summarize the (now short enough) text into the final summary
  max_token = max(round(text_tokens * 0.3), 40) # target ~30% of the input length, but keep max_length above min_length
  final_summary = summarizer(map_text, min_length=35, max_length=max_token)
  final_summary = final_summary[0]["summary_text"]

  if long_summary == "":
    long_summary = "The video is too short to produce a descriptive summary"

  del tokenizer, summarizer
  torch.cuda.empty_cache() # free GPU memory used by the summarization pipeline

  return final_summary, long_summary
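
# Rough shape of the map-reduce flow above (token counts are illustrative only,
# assuming the summarizer's ~1024-token BART context):
#   transcript (~5,000 tokens) -> concatenated chunk summaries (~1,500 tokens)
#   -> chunk summaries again (~600 tokens) -> final "reduce" summary
# long_summary is the output of the last map pass; final_summary is the reduce output.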


"""### Defining Gradio App"""

import gradio as gr

def get_youtube_title(url):
    yt = YouTube(str(url))
    return yt.title

def get_video(url):
    vid_id = pytube.extract.video_id(url)
    embed_html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/{}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'.format(vid_id)
    return embed_html

def summarize_youtube_video(url):
    print("URL:",url)
    text = get_transcript(url)
    print("Transcript:",text[:500])
    short_summary, long_summary = get_summary(text)
    print("Short Summary:",short_summary)
    print("Long Summary:",long_summary)
    return text, short_summary, long_summary

html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'

# Defining the structure of the UI
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# Summarize a Long YouTube Video")

    with gr.Row():
        with gr.Column(scale=4):
            url = gr.Textbox(label="Enter YouTube video link here:", placeholder="Paste the YouTube link here...")
        with gr.Column(scale=1):
            sum_btn = gr.Button("Summarize!")

    gr.Markdown("# Results")

    title = gr.Textbox(label="Video Title", placeholder="title...")

    with gr.Row():
        with gr.Column(scale=4):
          video = gr.HTML(html, scale=1)
        with gr.Column():
          with gr.Row():
            short_summary = gr.Textbox(label="Gist", placeholder="short summary...", scale=1)
          with gr.Row():
            long_summary = gr.Textbox(label="Summary", placeholder="long summary...", scale=2)


    with gr.Row():
        with gr.Group():
            text = gr.Textbox(label="Full Transcript", placeholder="transcript...", show_label=True)

    with gr.Accordion("Credits and Notes", open=False):
        gr.Markdown("""
                    1. The transcript is generated by the openai/whisper-base model from the downloaded YouTube audio.\n
                    2. The summary is generated by gmurro/bart-large-finetuned-filtered-spotify-podcast-summ.\n
                    3. The app is built with Hugging Face Transformers and Gradio.\n
                    """)

    # Defining the functions to call on clicking the button
    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title", queue=False)
    sum_btn.click(fn=summarize_youtube_video, inputs=url, outputs=[text, short_summary, long_summary], api_name="summarize_youtube_video", queue=True)
    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video", queue=False)

demo.queue()
demo.launch(share=False)