import gradio as gr
import requests
import re
from transformers import AutoTokenizer, pipeline
from youtube_transcript_api._transcripts import TranscriptListFetcher
# Load the fine-tuned sponsor-segment tagger from a local checkpoint.
tagger = pipeline(
    "token-classification",
    "./checkpoint-6000",
    aggregation_strategy="first",
)
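# Note: aggregation_strategy="first" makes the pipeline merge sub-word tokens
# back into word-level entity groups, so predictions come out as whole phrases
# rather than individual word pieces.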
tokenizer = AutoTokenizer.from_pretrained("./checkpoint-6000")
max_size = 512  # token budget for each chunk fed to the model
classes = [False, True]
# Matches the common YouTube URL shapes and captures the 11-character video
# id. re.IGNORECASE is needed so lowercase host prefixes such as "www."
# match the [0-9A-Z-] character class.
pattern = re.compile(
    r"(?:https?:\/\/)?(?:[0-9A-Z-]+\.)?(?:youtube|youtu|youtube-nocookie)\.(?:com|be)\/(?:watch\?v=|watch\?.+&v=|embed\/|v\/|.+\?v=)?([^&=\n%\?]{11})",
    re.IGNORECASE,
)
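# Illustrative URL forms the pattern should accept (hypothetical examples):
#   https://www.youtube.com/watch?v=xsLJZyih3Ac
#   https://youtu.be/xsLJZyih3Ac
#   https://www.youtube.com/embed/xsLJZyih3Ac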
def video_id(url):
    # Return the captured 11-character video id, or None if the URL does not parse.
    p = pattern.match(url)
    return p.group(1) if p else None
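# `process` flattens YouTube's json3 caption payload into word-level timing
# spans. The input shape (inferred from the keys the function reads) is roughly:
#   {"events": [{"tStartMs": 0,
#                "segs": [{"utf8": "word", "tOffsetMs": 0}, ...]}, ...]}
# Output: a list of {"w": word, "s": start_ms, "e": end_ms} dicts.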
def process(obj):
    o = obj["events"]
    new_l = []
    start_dur = None
    for line in o:
        if "segs" in line:
            # A lone "\n" segment marks the end of a caption line; close out
            # the pending word using the next line's start time.
            if len(line["segs"]) == 1 and line["segs"][0]["utf8"] == "\n":
                if start_dur is not None:
                    new_l.append(
                        {
                            "w": prev["utf8"],
                            "s": start_dur + prev["tOffsetMs"],
                            "e": line["tStartMs"],
                        }
                    )
                continue
            start_dur = line["tStartMs"]
            prev = line["segs"][0]
            prev["tOffsetMs"] = 0
            for word in line["segs"][1:]:
                try:
                    # Each word ends where the next one starts.
                    new_l.append(
                        {
                            "w": prev["utf8"],
                            "s": start_dur + prev["tOffsetMs"],
                            "e": start_dur + word["tOffsetMs"],
                        }
                    )
                    prev = word
                except KeyError:
                    # Segments without a tOffsetMs are skipped.
                    pass
    return new_l
def get_transcript(video_id, session):
    fetcher = TranscriptListFetcher(session)
    _json = fetcher._extract_captions_json(
        fetcher._fetch_video_html(video_id), video_id
    )
    # Pick the English caption track and request it in json3 format.
    captionTracks = _json["captionTracks"]
    transcript_track_url = ""
    for track in captionTracks:
        if track["languageCode"] == "en":
            transcript_track_url = track["baseUrl"] + "&fmt=json3"
    if not transcript_track_url:
        return None
    obj = session.get(transcript_track_url)
    p = process(obj.json())
    return p
def transcript(url):
    i = video_id(url)
    if i:
        t = get_transcript(i, requests.Session())
        if t is None:
            # get_transcript returns None when no English track exists.
            return "ERROR: No English captions found for this video..."
        return " ".join(l["w"].strip() for l in t)
    else:
        return "ERROR: Failed to load transcript (is the link a valid youtube url?)..."
def inference(transcript):
    # Tokenize word by word so chunks can be split on word boundaries.
    tokens = tokenizer(transcript.split(" "))["input_ids"]
    current_length = 0
    current_word_length = 0
    batches = []
    for i, w in enumerate(tokens):
        # Strip the tokenizer's special tokens when counting length (the
        # first and last words each keep one boundary token).
        word = w[:-1] if i == 0 else w[1:] if i == (len(tokens) - 1) else w[1:-1]
        if (current_length + len(word)) > max_size:
            # Adding this word would exceed the model's budget: flush the
            # accumulated words as one chunk of plain text.
            batch = " ".join(
                tokenizer.batch_decode(
                    [
                        tok[1:-1]
                        for tok in tokens[max(0, i - current_word_length - 1) : i]
                    ]
                )
            )
            batches.append(batch)
            current_word_length = 0
            current_length = 0
            continue
        current_length += len(word)
        current_word_length += 1
    # Flush any remaining words as the final chunk.
    if current_length > 0:
        batches.append(
            " ".join(
                tokenizer.batch_decode(
                    [tok[1:-1] for tok in tokens[i - current_word_length :]]
                )
            )
        )
    # Tag each chunk; LABEL_1 marks sponsor spans.
    results = []
    for split in batches:
        values = tagger(split)
        results.extend(
            {
                "sponsor": v["entity_group"] == "LABEL_1",
                "phrase": v["word"],
            }
            for v in values
        )
    return results
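# Hypothetical example of the result shape:
#   inference("this video is sponsored by example corp ...")
#   -> [{"sponsor": False, "phrase": "this video is"},
#       {"sponsor": True, "phrase": "sponsored by example corp"}, ...]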
def predict(transcript):
    return [
        (span["phrase"], "Sponsor" if span["sponsor"] else None)
        for span in inference(transcript)
    ]
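# gr.HighlightedText consumes (text, label) pairs; a None label leaves the
# span unhighlighted, so only predicted sponsor phrases get marked.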
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(label="Video URL", placeholder="Video URL", lines=1, max_lines=1)
            btn = gr.Button("Fetch Transcript")
            gr.Examples(["youtu.be/xsLJZyih3Ac"], [inp])
            text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
            btn.click(fn=transcript, inputs=inp, outputs=text)
        with gr.Column():
            p = gr.Button("Predict Sponsors")
            highlight = gr.HighlightedText()
            p.click(fn=predict, inputs=text, outputs=highlight)

demo.launch()