""" | |
app.py | |
This demo builds a Multimodal OCR Granite Vision interface using: | |
- @rag: retrieval‐augmented generation for PDF and image documents (via LightRAG) | |
- @granite: image understanding with Granite Vision | |
- @video-infer: video understanding by downsampling frames and processing each with Granite Vision | |
Make sure the required Granite models and dependencies (Gradio, Transformers, etc.) are installed. | |
""" | |
import cv2
import numpy as np
import torch
from PIL import Image
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Import the LightRAG class (which internally uses Granite embedding and generation models)
from sandbox.light_rag.light_rag import LightRAG
# ------------------------------
# Utility and device setup
# ------------------------------
def get_device():
    if torch.backends.mps.is_available():
        return "mps"  # macOS GPU
    elif torch.cuda.is_available():
        return "cuda"
    else:
        return "cpu"


device = get_device()
# ------------------------------
# Generation parameter constants
# ------------------------------
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.7
TOP_P = 0.85
TOP_K = 50
REPETITION_PENALTY = 1.05
# ------------------------------
# Load Granite Vision model for image processing (@granite and @video-infer)
# ------------------------------
VISION_MODEL_ID = "ibm-granite/granite-vision-3.2-2b"
vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID)
# Place the model with a single .to() call. Combining device_map="auto" with an explicit
# .to(device) conflicts once accelerate has dispatched the weights, so only one is used here.
vision_model = AutoModelForVision2Seq.from_pretrained(VISION_MODEL_ID).to(device)
# ------------------------------
# Initialize the LightRAG pipeline for text-only or document (PDF/image) RAG (@rag)
# ------------------------------
rag_config = {
    "embedding_model_id": "ibm-granite/granite-embedding-125m-english",
    "generation_model_id": "ibm-granite/granite-3.1-8b-instruct",
    "milvus_collection_name": "granite_vision_text_milvus",
    "milvus_db_path": "milvus.db",  # adjust this path as needed
}
light_rag = LightRAG(rag_config)
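
# NOTE (assumption): the LightRAG wrapper from sandbox.light_rag is expected to expose
#   light_rag.search(query, top_n=...)  -> list of retrieved context passages
#   light_rag.generate(query, context)  -> (answer, prompt) tuple
# as used below. If your local LightRAG version differs, adapt process_rag() and the
# text-only fallback in generate_response() accordingly.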
# ------------------------------
# Video downsampling helper
# ------------------------------
def downsample_video(video_path):
    """
    Downsamples the video to 10 evenly spaced frames.
    Returns a list of tuples: (PIL image, timestamp in seconds).
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    # Sample 10 evenly spaced frame indices
    frame_indices = np.linspace(0, max(total_frames - 1, 0), 10, dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        success, frame = vidcap.read()
        if success:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame)
            # Guard against videos whose FPS metadata is missing or zero
            timestamp = round(i / fps, 2) if fps else 0.0
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
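
# Illustrative only: for a ~12 s clip at 24 fps, downsample_video("clip.mp4") returns roughly
# [(<PIL.Image>, 0.0), (<PIL.Image>, 1.33), ..., (<PIL.Image>, 11.96)] -- ten (frame, timestamp)
# pairs that process_video() below feeds to the vision model one by one.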
# ------------------------------
# Command processing functions
# ------------------------------
def process_rag(query, file_path=None):
    """
    Process the @rag command using the LightRAG pipeline.
    The optional file_path (e.g. a PDF or image) is currently not ingested here;
    only the query is used for retrieval-augmented generation.
    """
    context = light_rag.search(query, top_n=5)
    answer, prompt = light_rag.generate(query, context)
    return answer
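
# Possible extension (not implemented): when file_path points to a PDF, extract its text
# (e.g. with pypdf) and index it into the Milvus collection before calling search(), so the
# uploaded document itself becomes part of the retrieval context rather than being ignored.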
def process_granite(query, image: Image.Image):
    """
    Process the @granite command:
    Build a chat turn containing the image and the query, then run the Granite Vision model.
    """
    # The image must be part of the conversation, otherwise the model never sees it.
    # Passing a PIL image under the "image" key requires a recent transformers release.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": query},
            ],
        }
    ]
    inputs = vision_processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(device)
    generate_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": True,
        "top_p": TOP_P,
        "top_k": TOP_K,
        "temperature": TEMPERATURE,
        "repetition_penalty": REPETITION_PENALTY,
    }
    output = vision_model.generate(**inputs, **generate_kwargs)
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    result = vision_processor.decode(new_tokens, skip_special_tokens=True)
    return result.strip()
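
# Example usage (hypothetical file name):
#   process_granite("What does the chart show?", load_image("report_page.png"))
# returns a free-text answer grounded in the supplied image.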
def process_video(query, video_path):
    """
    Process the @video-infer command:
    Downsample the video, process each frame with the Granite Vision model, and combine the results.
    """
    frames = downsample_video(video_path)
    descriptions = []
    for image, timestamp in frames:
        desc = process_granite(query, image)
        descriptions.append(f"At {timestamp}s: {desc}")
    return "\n".join(descriptions)
# ------------------------------
# Main function to handle input and dispatch based on command
# ------------------------------
def generate_response(input_dict, chat_history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """
    Based on the query prefix, this function calls:
      - process_rag for @rag
      - process_granite for @granite
      - process_video for @video-infer
    If no special command is provided, it defaults to text-only generation via LightRAG.

    The slider values are received from the UI for interface compatibility; the helper
    functions above currently use the module-level generation constants.
    """
    text = input_dict["text"]
    files = input_dict.get("files", [])
    lower_text = text.strip().lower()
    if lower_text.startswith("@rag"):
        query = text[len("@rag"):].strip()
        file_path = files[0] if files else None  # Optionally process the provided file
        answer = process_rag(query, file_path)
        return answer
    elif lower_text.startswith("@granite"):
        query = text[len("@granite"):].strip()
        if files:
            # Assume the first file is an image
            image = load_image(files[0])
            result = process_granite(query, image)
            return result
        else:
            return "No image file provided for the @granite command."
    elif lower_text.startswith("@video-infer"):
        query = text[len("@video-infer"):].strip()
        if files:
            video_path = files[0]  # Assume the first file is a video
            result = process_video(query, video_path)
            return result
        else:
            return "No video file provided for the @video-infer command."
    else:
        # Default: text-only generation using LightRAG
        answer, prompt = light_rag.generate(text, context=[])
        return answer
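
# With a multimodal ChatInterface, Gradio delivers each user turn to generate_response above as a
# dict shaped like {"text": "@granite Describe this", "files": ["/tmp/gradio/.../photo.png"]},
# where "files" holds local paths to the uploaded attachments (an empty list if none were attached).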
# ------------------------------
# Build the Gradio interface using a multimodal textbox
# ------------------------------
demo = gr.ChatInterface(
    fn=generate_response,
    multimodal=True,  # required so the fn receives {"text": ..., "files": [...]} dicts
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=MAX_NEW_TOKENS),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=TEMPERATURE),
        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=TOP_P),
        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=TOP_K),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=REPETITION_PENALTY),
    ],
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", ".pdf", "video"],  # file_types accepts type groups or extensions
        file_count="multiple",
        placeholder="Enter your query starting with @rag, @granite, or @video-infer",
    ),
    examples=[
        [{"text": "@rag What was the revenue growth in 2020?"}],
        [{"text": "@granite Describe the content of this image", "files": ["example_image.png"]}],
        [{"text": "@video-infer Summarize the event shown in the video", "files": ["example_video.mp4"]}],
    ],
    cache_examples=False,
    type="messages",
    description=(
        "### Multimodal OCR Granite Vision\n"
        "Use **@rag** for PDF/image RAG, **@granite** for image questions, and **@video-infer** for video understanding."
    ),
    fill_height=True,
    stop_btn="Stop Generation",
    theme="default",
)
if __name__ == "__main__":
    demo.queue(max_size=20).launch()