Spaces:

dhairyashah
/

langchain-que-gen

Runtime error

App Files Files Community

langchain-que-gen / app.py

dhairyashah

update

2c8b539 11 months ago

raw

history blame

3.22 kB

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import tqdm
	from PIL import Image
	import torch
	import fitz
	import gradio as gr
	import spaces
	import os
	from transformers import AutoModel
	from transformers import AutoTokenizer
	import numpy as np

	# Every model below is moved onto this torch device.
	device = 'cuda'

	# Make sure the local cache directory exists (no error if it already does).
	cache_dir = 'pdf_cache'
	os.makedirs(cache_dir, exist_ok=True)

	# --- Embedding model -------------------------------------------------
	print("Embedding model loading...")
	model_path = 'RhapsodyAI/minicpm-visual-embedding-v0'
	tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
	# Load, switch to inference mode, then move to the target device.
	model = AutoModel.from_pretrained(
	    model_path,
	    trust_remote_code=True,
	).eval().to(device)
	print("Embedding model loaded successfully!")

	# --- Generation model ------------------------------------------------
	print("Generation model loading...")
	gen_model_path = 'openbmb/MiniCPM-V-2_6'
	gen_tokenizer = AutoTokenizer.from_pretrained(
	    gen_model_path,
	    trust_remote_code=True,
	)
	# bfloat16 weights with PyTorch scaled-dot-product attention.
	gen_model = AutoModel.from_pretrained(
	    gen_model_path,
	    trust_remote_code=True,
	    attn_implementation='sdpa',
	    torch_dtype=torch.bfloat16,
	).eval().to(device)
	print("Generation model loaded successfully!")

	@spaces.GPU(duration=100)
	def process_pdf(pdf_file, max_pages, progress=gr.Progress()):
	    """Render the leading pages of an in-memory PDF to RGB PIL images.

	    Args:
	        pdf_file: Raw bytes of the PDF document.
	        max_pages: Upper bound on the number of pages to render. Coerced
	            to ``int`` because a numeric widget may deliver a float.
	        progress: Gradio progress tracker used to report per-page progress.

	    Returns:
	        A list of ``PIL.Image.Image`` objects, one per rendered page, in
	        page order.

	    Raises:
	        gr.Error: If no PDF data or no page limit was supplied.
	    """
	    if not pdf_file:
	        raise gr.Error("Please upload a PDF file first.")
	    if max_pages is None:
	        raise gr.Error("Please set the maximum number of pages to process.")

	    images = []
	    # The context manager closes the document even if rendering fails.
	    with fitz.open("pdf", pdf_file) as doc:
	        # range() rejects floats, so normalise the page limit first.
	        num_pages = min(int(max_pages), len(doc))
	        for page_num in progress.tqdm(range(num_pages)):
	            pix = doc[page_num].get_pixmap(dpi=200)
	            # frombytes copies the pixel data, so the images stay valid
	            # after the document is closed.
	            images.append(
	                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	            )

	    return images

	def _gallery_item_to_rgb(item):
	    """Convert one gallery entry to an RGB PIL image.

	    Accepts a PIL image, a numpy array, a file path, or an
	    ``(image, caption)`` pair wrapping any of those.
	    """
	    # A gallery used as an input may hand back (image, caption) pairs.
	    if isinstance(item, (tuple, list)):
	        item = item[0]
	    if isinstance(item, (str, os.PathLike)):
	        # convert() loads the pixel data before the file handle is closed.
	        with Image.open(item) as opened:
	            return opened.convert('RGB')
	    if isinstance(item, np.ndarray):
	        item = Image.fromarray(item)
	    return item.convert('RGB')


	@spaces.GPU(duration=50)
	def answer_question(images, question):
	    """Ask the generation model a question about a set of page images.

	    Args:
	        images: Iterable of page images as delivered by the gallery
	            component (PIL images, numpy arrays, file paths, or
	            ``(image, caption)`` pairs). ``None`` is treated as empty.
	        question: The user's question text.

	    Returns:
	        Whatever ``gen_model.chat`` returns for the assembled message;
	        it is also printed to stdout.
	    """
	    images_ = [_gallery_item_to_rgb(img) for img in (images or [])]
	    # Single user turn: the question followed by every page image.
	    msgs = [{'role': 'user', 'content': [question, *images_]}]
	    answer = gen_model.chat(
	        image=None,
	        msgs=msgs,
	        tokenizer=gen_tokenizer,
	    )
	    print(answer)
	    return answer

	# Build the Gradio interface: upload + page limit, question box, gallery
	# of rendered pages, and the model's answer.
	with gr.Blocks() as app:
	    gr.Markdown("# PDF Question Answering with Vision Language Model")

	    gr.Markdown("""
	    This application uses a Vision Language Model to answer questions about PDF documents.

	    1. Upload a PDF file
	    2. Set the maximum number of pages to process
	    3. Click "Process PDF" to extract the pages
	    4. Enter your question about the PDF content
	    5. Click "Answer Question" to get the model's response
	    """)

	    with gr.Row():
	        file_input = gr.File(type="binary", label="Upload PDF")
	        max_pages = gr.Number(value=10, minimum=1, maximum=50, step=1, label="Maximum number of pages to process")
	        process_button = gr.Button("Process PDF")

	    with gr.Row():
	        query_input = gr.Text(label="Your Question")
	        answer_button = gr.Button("Answer Question")

	    # Hidden until a PDF has been processed.
	    images_output = gr.Gallery(label="Processed PDF Pages", visible=False)
	    gen_model_response = gr.Textbox(label="Model's Answer")

	    def process_and_show(pdf_file, max_pages):
	        """Render the PDF pages and reveal the gallery holding them."""
	        images = process_pdf(pdf_file, max_pages)
	        # gr.update is the version-independent way to change component
	        # properties; the per-component Gallery.update classmethod was
	        # removed in Gradio 4.
	        return gr.update(value=images, visible=True)

	    process_button.click(
	        process_and_show,
	        inputs=[file_input, max_pages],
	        outputs=images_output,
	    )

	    answer_button.click(
	        answer_question,
	        inputs=[images_output, query_input],
	        outputs=gen_model_response,
	    )

	app.launch()