Spaces:
Running
Running
File size: 6,020 Bytes
d1a5021 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 52085a2 3456a58 e5ec50a 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 9150552 3456a58 b158abb 3456a58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# (C) Glue Labs Private Limited 2023 (gluelabs.com)
# All rights reserved
import argparse
import datetime
import os
import gradio as gr
from signal import SIGINT, signal
from utils.log import debug, info, logger, breakPoint as bc
import requests
from constants import *
# Module-level default settings.
# NOTE(review): gradio_run() receives the video id, chunk size and image flag
# as arguments on every call — confirm which of these defaults are still read.
# Default chunk size (presumably the subtitle chunk length — TODO confirm units).
CHUNK_SIZE = 512
# No video is selected by default.
VIDEO_ID = ""
# Default output path; PPTX_DEST is not defined in this file (presumably it
# comes from the `constants` star import).
OUT_PPT_NAME= PPTX_DEST
# Images are kept by default.
NO_IMAGES = False
# NOTE(review): not referenced anywhere else in the visible code.
QUESTIONS = 5
def gradio_run(
        video_id, chunk_size: int,
        no_images: bool, no_chapters: bool, out_type="pdf"):
    """Build a slide deck that summarizes a video and return the output path.

    The video is downloaded, its subtitles are chunked (by fixed size or by
    chapter), every chunk is summarized with the language model, the result
    is written as Marp markdown to ``MD_DEST`` and finally rendered with
    ``generate_ppt``.

    Args:
        video_id: Identifier of the video; used for the download, the
            subtitle and chapter lookups, and in the output file names.
        chunk_size: Chunk size handed to the subtitle chunkers. Coerced to
            ``int`` in case the caller passes a float.
        no_images: When true, no video frame is attached to the summary
            slides. The title slide of the fixed-size mode always gets one.
        no_chapters: When true, use fixed-size chunking instead of
            chapter-based chunking.
        out_type: File extension of the generated deck. Falls back to
            ``"pdf"`` when it is empty or ``None``.

    Returns:
        Absolute path of the generated file.

    Raises:
        gr.Error: If the video has no subtitles.
    """
    chunk_size = int(chunk_size)
    # A caller that passes None explicitly (e.g. an unselected UI dropdown)
    # bypasses the signature default, so apply the default here as well.
    out_type = out_type or "pdf"
    out_path = f"{OUTDIR}/gradio-out{video_id}.{out_type}"

    info("Loading modules..")
    # Imported inside the function, i.e. only when a request actually runs.
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from rich.progress import track
    import utils.markdown as md
    from models.lamini import lamini as model
    from utils.marp_wrapper import marp
    from utils.ppt import generate_ppt
    from utils.subtitles import subs
    from utils.video import video
    from utils.chunk import ChunkByChapters

    # Initialize the Marp markdown writer.
    out = marp(MD_DEST)
    out.add_header(config=MARP_GAIA)

    # Initialize and download the video.
    vid = video(video_id, f"{OUTDIR}/vid-{video_id}")
    vid.download()

    # Initialize the model.
    llm_model = model
    llm = llm_model.load_model(
        max_length=400,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15
    )

    info(f"Getting subtitles {video_id}..")
    raw_subs = vid.getSubtitles()
    if raw_subs is None:
        logger.critical("No subtitles found, aborting..")
        # exit() raises SystemExit, which is not an ordinary request error;
        # gr.Error is the Gradio way to report a failure to the user.
        raise gr.Error("No subtitles found for this video.")
    info(f"got {len(raw_subs)} length subtitles")

    if no_chapters:
        # Fixed-size chunking: each chunk is indexed as (text, seconds).
        chunker = subs(video_id)
        chunks = chunker.getSubsList(size=chunk_size)
        model_tmplts = llm_model.templates()
        summarizer = model_tmplts.summarize
        title_gen = model_tmplts.generate_title

        # Title slide: video id plus the frame at the first chunk's timestamp.
        first_pic = str(datetime.timedelta(seconds=chunks[0][1]))
        img_name = f"vid-{video_id}_{first_pic}.png"
        img_path = f"{PNG_DEST}/{img_name}"
        vid.getframe(first_pic, img_path)
        out.add_page(md.h1(video_id), md.image(url=img_name))
        out.marp_end()

        for chunk in track(chunks, description="(processing chunks) Summarizing.."):
            summary = summarizer(chunk[0])[0]["generated_text"].replace("-", "\n-")
            title = title_gen(chunk[0])[0]["generated_text"]
            # Long titles get a smaller heading level.
            heading = md.h2 if len(title) < 40 else md.h3
            out.add_page(heading(title), summary)
            # Long slides get no image, to leave room for the text.
            if not no_images and len(summary + title) < 270:
                timestamp = str(datetime.timedelta(seconds=chunk[1]))
                frame_name = f"vid-{video_id}_{timestamp}.png"
                frame_path = f"{PNG_DEST}/{frame_name}"
                vid.getframe(timestamp, frame_path)
                out.add_body(md.image(frame_name, align="left", setAsBackground=True))
            out.marp_end()
    else:
        # Chapter-based chunking: one slide per chapter title.
        raw_chapters = vid.getChapters(f"{YT_CHAPTER_ENDPOINT}{video_id}")
        chunk_dict = ChunkByChapters(raw_chapters, raw_subs, chunk_size)
        chain = load_summarize_chain(llm, chain_type="stuff")
        # TODO: ( use refine chain type to summarize all chapters )
        for title, subchunks in track(chunk_dict.items(), description="(processing chunks) Summarizing.."):
            # Convert the chapter's sub-chunks to Documents and summarize
            # them with the "stuff" chain.
            debug(subchunks)
            docs = [Document(page_content=t[0]) for t in subchunks[0]]
            summary = chain.run(docs)

            # Honor the "no images" flag here too; it used to be ignored in
            # chapter mode.
            if not no_images:
                ts = str(datetime.timedelta(seconds=subchunks[0][1][0]))
                img_path = f"{PNG_DEST}/vid-{video_id}_{ts}.png"
                vid.getframe(ts, img_path)
                # If the summary is long, skip the image for a better page
                # without clipping.
                if os.path.exists(img_path) and len(summary + title) < 270:
                    out.add_body(md.image(
                        img_path.replace(f"{OUTEXTRA}/", ""),
                        align="left",
                        setAsBackground=True
                    ))
            out.add_page(md.h2(title), summary)
            out.marp_end()

    info(f"Generating {out_path}..")
    out.close_file()
    generate_ppt(MD_DEST, out_path)
    print(f"Done! {out_path}")
    return os.path.abspath(out_path)
def gradio_Interface():
    """Build the Gradio UI around ``gradio_run`` and launch it.

    The input components are listed in the same order as the parameters of
    ``gradio_run``: video id, chunk size, no-images flag, no-chapters flag
    and output file format. The handler's return value is served as a file.
    """
    app = gr.Interface(
        fn=gradio_run,
        inputs=[
            "text",
            gr.Slider(
                minimum=1,
                maximum=2000,
                # The third positional argument of gr.Slider is the initial
                # value, not the step, so the slider used to start at 1.
                # Start from the module default and move in whole numbers.
                value=CHUNK_SIZE,
                step=1,
                label="Chunk Size",
                info="More chunk size = longer text & shorter numbber of slides",
            ),
            gr.Checkbox(label="No Images", info="Don't keep images in output ( gives more spaces for larger text)"),
            gr.Checkbox(label="No Chapters", info="Don't use chapter based chunking"),
            gr.Dropdown(
                ["pptx", "pdf", "html"],
                # Preselect the handler's default format; an unselected
                # dropdown would otherwise hand None to gradio_run.
                value="pdf",
                label="file format",
                info="which file format to generte.",
            ),
        ],
        outputs="file"
    )
    app.launch()
if __name__ == "__main__":
    logger.info("Starting gradio interface..")
    # makedirs(exist_ok=True) is idempotent and creates missing parents, so
    # it covers every combination of OUTDIR / OUTEXTRA already existing or
    # not, without the check-then-create sequence that could raise
    # FileExistsError or FileNotFoundError.
    for directory in (OUTDIR, OUTEXTRA):
        os.makedirs(directory, exist_ok=True)
    gradio_Interface()
|