# NOTE(review): the lines below this header were page-scrape residue
# ("Spaces:", runtime-error banners, file size, commit hashes, a column of
# line numbers) that made the module a SyntaxError. Preserved as comments:
# Spaces: Runtime error / Runtime error / File size: 3,223 Bytes
# 2c8b539 a0e2927 2c8b539 a0e2927 2c8b539 a0e2927 2c8b539
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tqdm
from PIL import Image
import torch
import fitz
import gradio as gr
import spaces
import os
from transformers import AutoModel
from transformers import AutoTokenizer
import numpy as np
# Directory for caching rendered PDF pages between runs.
cache_dir = 'pdf_cache'
os.makedirs(cache_dir, exist_ok=True)

# Fall back to CPU when no GPU is present so the script still starts on
# CPU-only machines. (Original hardcoded 'cuda', which raises at .to(device).)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Embedding model (visual page embeddings) ---
print("Embedding model loading...")
model_path = 'RhapsodyAI/minicpm-visual-embedding-v0'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
model.eval()
model.to(device)
print("Embedding model loaded successfully!")

# --- Generation model (answers questions about page images) ---
print("Generation model loading...")
gen_model_path = 'openbmb/MiniCPM-V-2_6'
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_path, trust_remote_code=True)
# sdpa attention + bfloat16 keep memory use down on the GPU.
gen_model = AutoModel.from_pretrained(gen_model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
gen_model.eval()
gen_model.to(device)
print("Generation model loaded successfully!")
@spaces.GPU(duration=100)
def process_pdf(pdf_file, max_pages, progress=gr.Progress()):
    """Render the first pages of an uploaded PDF as PIL images.

    Args:
        pdf_file: Raw PDF bytes (from gr.File with type="binary").
        max_pages: Upper bound on pages to render. gr.Number may deliver
            this as a float, so it is coerced to int before use.
        progress: Gradio progress tracker (renders a progress bar in the UI).

    Returns:
        list[PIL.Image.Image]: one RGB image per rendered page.
    """
    doc = fitz.open("pdf", pdf_file)
    try:
        # int(): gr.Number can hand back a float and range() needs an int.
        num_pages = min(int(max_pages), len(doc))
        images = []
        for page_num in progress.tqdm(range(num_pages)):
            # 200 dpi: text stays legible without producing huge images.
            pix = doc[page_num].get_pixmap(dpi=200)
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
        return images
    finally:
        # Original leaked the document handle; always release it.
        doc.close()
@spaces.GPU(duration=50)
def answer_question(images, question):
    """Ask the generation model a question about a list of page images.

    The question text and every page image are packed into a single user
    turn, which is the message shape MiniCPM-V's chat API expects.
    Returns the model's answer string (also printed for server-side logs).
    """
    global gen_model, gen_tokenizer
    rgb_pages = [page.convert('RGB') for page in images]
    conversation = [{'role': 'user', 'content': [question, *rgb_pages]}]
    response = gen_model.chat(image=None, msgs=conversation, tokenizer=gen_tokenizer)
    print(response)
    return response
# --- Gradio UI: upload a PDF, render its pages, then ask questions ---
with gr.Blocks() as app:
    gr.Markdown("# PDF Question Answering with Vision Language Model")
    gr.Markdown("""
This application uses a Vision Language Model to answer questions about PDF documents.
1. Upload a PDF file
2. Set the maximum number of pages to process
3. Click "Process PDF" to extract the pages
4. Enter your question about the PDF content
5. Click "Answer Question" to get the model's response
""")
    with gr.Row():
        file_input = gr.File(type="binary", label="Upload PDF")
        max_pages = gr.Number(value=10, minimum=1, maximum=50, step=1, label="Maximum number of pages to process")
        process_button = gr.Button("Process PDF")
    with gr.Row():
        query_input = gr.Text(label="Your Question")
        answer_button = gr.Button("Answer Question")
    # Gallery starts hidden; it is revealed once pages have been rendered.
    images_output = gr.Gallery(label="Processed PDF Pages", visible=False)
    gen_model_response = gr.Textbox(label="Model's Answer")

    def process_and_show(pdf_file, max_pages):
        """Render the PDF's pages and reveal the gallery with the results."""
        images = process_pdf(pdf_file, max_pages)
        # gr.update() works on Gradio 3.x and 4.x; the component classmethod
        # gr.Gallery.update used by the original was removed in Gradio 4.
        return gr.update(value=images, visible=True)

    process_button.click(
        process_and_show,
        inputs=[file_input, max_pages],
        outputs=images_output,
    )
    answer_button.click(
        answer_question,
        inputs=[images_output, query_input],
        outputs=gen_model_response,
    )

# NOTE(review): the original line ended in a stray " |" (scrape residue),
# which was a SyntaxError; it has been removed.
app.launch()