Spaces:
Runtime error
Runtime error
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import tqdm | |
from PIL import Image | |
import torch | |
import fitz | |
import gradio as gr | |
import spaces | |
import os | |
from transformers import AutoModel | |
from transformers import AutoTokenizer | |
import numpy as np | |
# Directory for cached PDF data; created on startup if it does not exist.
# NOTE(review): nothing in the visible part of this file reads `cache_dir`.
cache_dir = 'pdf_cache'
os.makedirs(cache_dir, exist_ok=True)

# Both models below are moved to this device right after loading.
device = 'cuda'

# --- Embedding model -------------------------------------------------------
# NOTE(review): `model` / `tokenizer` are not referenced elsewhere in the
# visible part of this file -- confirm they are still needed before paying
# for the load and the accelerator memory.
print("Embedding model loading...")
model_path = 'RhapsodyAI/minicpm-visual-embedding-v0'
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
)
model.eval()  # inference only
model.to(device)
print("Embedding model loaded successfully!")

# --- Generation model ------------------------------------------------------
# Loaded in bfloat16 with SDPA attention; used by `answer_question`.
print("Generation model loading...")
gen_model_path = 'openbmb/MiniCPM-V-2_6'
gen_tokenizer = AutoTokenizer.from_pretrained(
    gen_model_path,
    trust_remote_code=True,
)
gen_model = AutoModel.from_pretrained(
    gen_model_path,
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
)
gen_model.eval()  # inference only
gen_model.to(device)
print("Generation model loaded successfully!")
def process_pdf(pdf_file, max_pages, progress=gr.Progress()):
    """Render the leading pages of an in-memory PDF to PIL images.

    Args:
        pdf_file: Raw bytes of the PDF document.
        max_pages: Upper bound on the number of pages to render. Coerced to
            ``int`` because numeric UI inputs can arrive as floats, which
            ``range`` rejects.
        progress: Gradio progress tracker used to report per-page progress.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per rendered page, in
        page order.

    Raises:
        gr.Error: If no PDF data was supplied.
    """
    if not pdf_file:
        raise gr.Error("Please upload a PDF file first.")

    page_limit = int(max_pages)

    # The context manager closes the document even if rendering fails.
    with fitz.open(stream=pdf_file, filetype="pdf") as doc:
        num_pages = min(page_limit, len(doc))
        images = []
        for page_num in progress.tqdm(range(num_pages)):
            pix = doc[page_num].get_pixmap(dpi=200)
            # frombytes copies the pixel buffer, so the images stay valid
            # after the document is closed.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def _as_rgb_image(item):
    """Coerce one gallery entry to an RGB PIL image.

    Accepts a PIL image, a NumPy array, a file path, an ``(image, caption)``
    pair, or a file-descriptor dict carrying a ``"path"`` or ``"name"`` key.
    """
    # NOTE(review): the exact shape a Gallery hands to a handler depends on
    # the installed Gradio version; the cases below cover the documented ones.
    if isinstance(item, (tuple, list)):
        item = item[0]  # drop the caption
    if isinstance(item, dict):
        item = item.get("path") or item.get("name")
    if isinstance(item, np.ndarray):
        item = Image.fromarray(item)
    if isinstance(item, (str, os.PathLike)):
        # convert() returns a fully loaded copy, so the file can be closed.
        with Image.open(item) as opened:
            return opened.convert('RGB')
    return item.convert('RGB')


def answer_question(images, question):
    """Ask the generation model a question about a set of page images.

    Args:
        images: Page images, either as PIL images or in the form a Gradio
            gallery passes to an event handler.
        question: The user's question as plain text.

    Returns:
        The value returned by ``gen_model.chat`` for the question.

    Raises:
        gr.Error: If there are no images to ask about.
    """
    if not images:
        raise gr.Error("Please process a PDF before asking a question.")

    images_ = [_as_rgb_image(img) for img in images]
    # All images travel inside the message content, hence image=None.
    msgs = [{'role': 'user', 'content': [question, *images_]}]
    answer = gen_model.chat(
        image=None,
        msgs=msgs,
        tokenizer=gen_tokenizer,
    )
    print(answer)
    return answer
# Gradio UI: upload a PDF, render its pages into a gallery, then ask the
# generation model a question about those pages.
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with Vision Language Model")
    gr.Markdown("""
    This application uses a Vision Language Model to answer questions about PDF documents.
    1. Upload a PDF file
    2. Set the maximum number of pages to process
    3. Click "Process PDF" to extract the pages
    4. Enter your question about the PDF content
    5. Click "Answer Question" to get the model's response
    """)

    with gr.Row():
        file_input = gr.File(type="binary", label="Upload PDF")
        # precision=0 makes the component deliver an integer page count.
        max_pages = gr.Number(
            value=10,
            minimum=1,
            maximum=50,
            step=1,
            precision=0,
            label="Maximum number of pages to process",
        )
        process_button = gr.Button("Process PDF")

    with gr.Row():
        query_input = gr.Text(label="Your Question")
        answer_button = gr.Button("Answer Question")

    # Hidden until a PDF has been processed.
    images_output = gr.Gallery(label="Processed PDF Pages", visible=False)
    gen_model_response = gr.Textbox(label="Model's Answer")

    def process_and_show(pdf_file, page_limit, progress=gr.Progress()):
        """Render the uploaded PDF and reveal the gallery with its pages.

        The ``progress`` parameter is declared on the handler itself so that
        Gradio tracks it; it is then forwarded to ``process_pdf``.
        """
        images = process_pdf(pdf_file, page_limit, progress)
        # gr.update replaces the per-component ``Gallery.update`` helper,
        # which newer Gradio releases no longer provide.
        return gr.update(value=images, visible=True)

    process_button.click(
        process_and_show,
        inputs=[file_input, max_pages],
        outputs=images_output,
    )
    answer_button.click(
        answer_question,
        inputs=[images_output, query_input],
        outputs=gen_model_response,
    )

app.launch()