dhairyashah's picture
update
2c8b539
raw
history blame
3.22 kB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tqdm
from PIL import Image
import torch
import fitz
import gradio as gr
import spaces
import os
from transformers import AutoModel
from transformers import AutoTokenizer
import numpy as np
# Module-level setup: everything below runs once at import time.

# Working directory created next to the process' CWD if it does not exist.
# NOTE(review): `cache_dir` is not referenced again in the visible file.
cache_dir = 'pdf_cache'
os.makedirs(cache_dir, exist_ok=True)

# Both models below are moved to this device.
device = 'cuda'

# --- Embedding model -------------------------------------------------------
# NOTE(review): `tokenizer` and `model` are not used by any handler visible
# in this file; confirm whether loading them is still required.
print("Embedding model loading...")
model_path = 'RhapsodyAI/minicpm-visual-embedding-v0'
# trust_remote_code=True lets transformers execute the custom modelling code
# shipped inside the model repository.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
model.eval()
model.to(device)
print("Embedding model loaded successfully!")

# --- Generation model ------------------------------------------------------
# `gen_model` / `gen_tokenizer` are the objects `answer_question` calls into.
print("Generation model loading...")
gen_model_path = 'openbmb/MiniCPM-V-2_6'
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_path, trust_remote_code=True)
# Loaded in bfloat16 with the SDPA attention implementation.
gen_model = AutoModel.from_pretrained(gen_model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
gen_model.eval()
gen_model.to(device)
print("Generation model loaded successfully!")
@spaces.GPU(duration=100)
def process_pdf(pdf_file, max_pages, progress=gr.Progress()):
    """Render the leading pages of an in-memory PDF as RGB PIL images.

    Args:
        pdf_file: Raw bytes of the PDF document.
        max_pages: Upper bound on the number of pages to render. Floats are
            accepted and truncated to an int, because a numeric UI field may
            deliver one.
        progress: Gradio progress tracker used to report per-page progress.

    Returns:
        A list of ``PIL.Image.Image`` objects (mode ``"RGB"``), one per
        rendered page, in page order. At most ``max_pages`` pages and never
        more than the document contains.

    Raises:
        gr.Error: If no PDF bytes were supplied or ``max_pages`` is missing.
    """
    if not pdf_file:
        raise gr.Error("Please upload a PDF file first.")
    if max_pages is None:
        raise gr.Error("Please set the maximum number of pages to process.")

    # range() rejects floats, so normalise the limit before using it.
    page_limit = int(max_pages)

    images = []
    # The context manager closes the document even if rendering fails midway.
    with fitz.open(stream=pdf_file, filetype="pdf") as doc:
        num_pages = max(0, min(page_limit, len(doc)))
        for page_num in progress.tqdm(range(num_pages)):
            pix = doc[page_num].get_pixmap(dpi=200)
            # Image.frombytes copies the pixel data, so the images remain
            # valid after the document has been closed.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def _as_rgb_image(item):
    """Return one gallery entry as an RGB ``PIL.Image``.

    Accepts a PIL image, a NumPy array, a file path, or any of those wrapped
    in an ``(image, caption)`` pair.
    """
    # Recent Gradio releases hand Gallery values to callbacks as
    # (image, caption) pairs; keep only the image part.
    if isinstance(item, (tuple, list)):
        item = item[0]
    # NOTE(review): older Gradio releases appear to deliver file dicts keyed
    # by "name" (or "path") -- confirm against the installed version.
    if isinstance(item, dict):
        item = item.get("name") or item.get("path")
    if isinstance(item, (str, os.PathLike)):
        item = Image.open(item)
    elif isinstance(item, np.ndarray):
        item = Image.fromarray(item)
    return item.convert('RGB')


@spaces.GPU(duration=50)
def answer_question(images, question):
    """Ask the generation model a question about a set of page images.

    Args:
        images: Page images as delivered by the gallery component. Each entry
            may be a PIL image, a NumPy array, a file path, or an
            ``(image, caption)`` pair wrapping one of those.
        question: The user's question as plain text.

    Returns:
        Whatever ``gen_model.chat`` returns for the single-turn message built
        from the question followed by all page images.

    Raises:
        gr.Error: If there are no images to reason over.
    """
    if not images:
        raise gr.Error("Please process a PDF before asking a question.")

    rgb_images = [_as_rgb_image(item) for item in images]
    # Single user turn: the question text first, then every page image.
    msgs = [{'role': 'user', 'content': [question, *rgb_images]}]
    answer = gen_model.chat(
        image=None,
        msgs=msgs,
        tokenizer=gen_tokenizer,
    )
    print(answer)
    return answer
# Gradio UI: upload a PDF, render its pages into a gallery, then ask the
# generation model a question about those pages.
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with Vision Language Model")
    gr.Markdown("""
This application uses a Vision Language Model to answer questions about PDF documents.
1. Upload a PDF file
2. Set the maximum number of pages to process
3. Click "Process PDF" to extract the pages
4. Enter your question about the PDF content
5. Click "Answer Question" to get the model's response
""")

    with gr.Row():
        # type="binary" hands the callback the uploaded file's raw bytes.
        file_input = gr.File(type="binary", label="Upload PDF")
        # precision=0 makes Gradio round the value and pass it as an int.
        max_pages = gr.Number(
            value=10,
            minimum=1,
            maximum=50,
            step=1,
            precision=0,
            label="Maximum number of pages to process",
        )
        process_button = gr.Button("Process PDF")

    with gr.Row():
        query_input = gr.Text(label="Your Question")
        answer_button = gr.Button("Answer Question")

    # Hidden until a PDF has been processed; also the image source for Q&A.
    images_output = gr.Gallery(label="Processed PDF Pages", visible=False)
    gen_model_response = gr.Textbox(label="Model's Answer")

    def process_and_show(pdf_file, page_limit):
        """Render the PDF pages and reveal the gallery showing them."""
        images = process_pdf(pdf_file, page_limit)
        # gr.update works on Gradio 3.x and 4.x alike; the per-component
        # Gallery.update classmethod was removed in Gradio 4.
        return gr.update(value=images, visible=True)

    process_button.click(
        process_and_show,
        inputs=[file_input, max_pages],
        outputs=images_output,
    )
    answer_button.click(
        answer_question,
        inputs=[images_output, query_input],
        outputs=gen_model_response,
    )

app.launch()