Spaces:

aabdullah27
/

SmolDocling-OCR-App

Running

Muhammad Abdullah

Upload 2 files

80e8620 verified 4 months ago

22.6 kB

	import streamlit as st
	import os
	import time
	import torch
	import tempfile
	from PIL import Image
	from dotenv import load_dotenv
	import logging
	from datetime import datetime

	# Logging: timestamped INFO-level records for the whole app.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	)
	logger = logging.getLogger(__name__)

	# Configuration: values come from the environment, optionally seeded by a .env file.
	load_dotenv()
	HF_TOKEN = os.environ.get("HF_TOKEN")
	CACHE_DIR = os.environ.get(
	    "CACHE_DIR",
	    os.path.join(tempfile.gettempdir(), "smoldocling_cache"),
	)

	# The model cache directory must exist before anything is downloaded into it.
	os.makedirs(CACHE_DIR, exist_ok=True)

	# Optional dependency: Transformers + Hugging Face Hub (model loading and login).
	try:
	    from transformers import AutoProcessor, AutoModelForVision2Seq
	    from huggingface_hub import login
	except ImportError:
	    transformers_available = False
	else:
	    transformers_available = True

	# Optional dependency: docling-core (DocTags parsing and document export).
	try:
	    from docling_core.types.doc import DoclingDocument
	    from docling_core.types.doc.document import DocTagsDocument
	except ImportError:
	    docling_available = False
	else:
	    docling_available = True

	# Module-level slots for the loaded processor/model; filled in on first use.
	processor = model = None

	def check_dependencies():
	    """Return the pip package names of optional dependencies that failed to import.

	    Reads the module-level ``transformers_available`` and ``docling_available``
	    flags. An empty list means nothing is missing.
	    """
	    requirements = (
	        (transformers_available, "transformers huggingface_hub"),
	        (docling_available, "docling-core"),
	    )
	    return [packages for available, packages in requirements if not available]

	def get_available_devices():
	    """List the devices that can be offered for inference, as display labels.

	    "cpu" always comes first. When CUDA is available, each device follows
	    as "cuda:<index> (<device name>)".
	    """
	    if not torch.cuda.is_available():
	        return ["cpu"]

	    gpu_labels = [
	        f"cuda:{index} ({torch.cuda.get_device_name(index)})"
	        for index in range(torch.cuda.device_count())
	    ]
	    return ["cpu", *gpu_labels]

	def get_device_from_selection(selection):
	    """Map a device label from the selector back to a torch device string.

	    Labels beginning with "cuda:" are cut at the first space, which drops
	    the " (<device name>)" suffix. Every other label resolves to "cpu".
	    """
	    if not selection.startswith("cuda:"):
	        return "cpu"
	    device_id, _, _ = selection.partition(" ")
	    return device_id

	@st.cache_resource
	def _load_model_for_device(device):
	    """Load the SmolDocling processor and model onto ``device`` (cached per device).

	    The parameter name has no leading underscore on purpose: Streamlit
	    leaves underscore-prefixed parameters out of the cache key, which would
	    make every device selection share whichever model was loaded first.

	    Raises:
	        Exception: Whatever the download/load/device transfer raises; the
	            error is logged and re-raised unchanged.
	    """
	    # Authenticate with Hugging Face when a token is configured.
	    if HF_TOKEN:
	        login(token=HF_TOKEN)

	    try:
	        logger.info(f"Loading SmolDocling model on {device}...")
	        loaded_processor = AutoProcessor.from_pretrained(
	            "ds4sd/SmolDocling-256M-preview",
	            cache_dir=CACHE_DIR
	        )
	        loaded_model = AutoModelForVision2Seq.from_pretrained(
	            "ds4sd/SmolDocling-256M-preview",
	            # Half precision on GPU, full precision on CPU.
	            torch_dtype=torch.float16 if device.startswith("cuda") else torch.float32,
	            cache_dir=CACHE_DIR
	        ).to(device)
	        logger.info("Model loaded successfully")
	        return loaded_processor, loaded_model
	    except Exception as e:
	        logger.error(f"Error loading model: {str(e)}")
	        raise


	def load_model(_device):
	    """Return the ``(processor, model)`` pair for ``_device``, loading it on first use.

	    Args:
	        _device: Torch device string such as "cpu" or "cuda:0".

	    Returns:
	        Tuple ``(processor, model)``. The module-level ``processor`` and
	        ``model`` globals are updated to the same objects.

	    Raises:
	        Exception: Propagated from the underlying load.
	    """
	    global processor, model
	    processor, model = _load_model_for_device(_device)
	    return processor, model


	# The previously decorated function exposed ``.clear()``; keep that hook available.
	load_model.clear = _load_model_for_device.clear

	def optimize_image(image, max_size=1600):
	    """Downscale ``image`` so that its longer side is at most ``max_size`` pixels.

	    Args:
	        image: Object exposing ``size`` as ``(width, height)`` and a
	            ``resize`` method (a PIL image in this app).
	        max_size: Upper bound, in pixels, for the longer side.

	    Returns:
	        ``image`` itself when it already fits, otherwise the result of
	        ``image.resize`` with LANCZOS resampling and the aspect ratio kept.
	    """
	    width, height = image.size
	    longest_side = max(width, height)
	    if longest_side <= max_size:
	        return image

	    scale = max_size / longest_side
	    if width > height:
	        new_width = max_size
	        new_height = int(height * scale)
	    else:
	        new_height = max_size
	        new_width = int(width * scale)

	    # int() truncates, so a very elongated image could end up with a
	    # zero-pixel side, which resize() rejects; keep at least one pixel.
	    new_size = (max(1, new_width), max(1, new_height))
	    return image.resize(new_size, Image.LANCZOS)

	def process_single_image(image, prompt_text="Convert this page to docling.", device="cpu", show_progress=None):
	    """Run SmolDocling on one image and return its output in several formats.

	    Args:
	        image: Page image; it is passed through ``optimize_image`` first.
	        prompt_text: Instruction sent to the model together with the image.
	        device: Torch device string used for the model and its inputs.
	        show_progress: Accepted but not used by this function.

	    Returns:
	        dict with the keys "doctags", "markdown", "html", "text" and
	        "processing_time" (seconds). Timing starts after resizing, so it
	        includes model loading when the model is not cached yet.
	    """
	    global processor, model

	    image = optimize_image(image)
	    started_at = time.time()

	    # Cached after the first call; also refreshes the module-level globals.
	    processor, model = load_model(device)

	    # A single user turn holding an image placeholder followed by the instruction.
	    user_turn = {
	        "role": "user",
	        "content": [
	            {"type": "image"},
	            {"type": "text", "text": prompt_text},
	        ],
	    }
	    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
	    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt").to(device)

	    generation_options = {
	        "max_new_tokens": 1500,
	        "do_sample": False,   # deterministic decoding
	        "num_beams": 1,       # a single beam, i.e. greedy decoding
	        "temperature": 1.0,   # no temperature scaling
	    }
	    with torch.no_grad():  # inference only, no gradients needed
	        output_ids = model.generate(**model_inputs, **generation_options)

	    # Drop the echoed prompt tokens so that only newly generated text is decoded.
	    prompt_token_count = model_inputs.input_ids.shape[1]
	    new_token_ids = output_ids[:, prompt_token_count:]
	    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=False)
	    doctags = decoded[0].lstrip().replace("<end_of_utterance>", "").strip()

	    # Build a Docling document from the DocTags and the (resized) page image.
	    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
	    doc = DoclingDocument(name="Document")
	    doc.load_from_doctags(doctags_doc)

	    outputs = {
	        "doctags": doctags,
	        "markdown": doc.export_to_markdown(),
	        "html": doc.export_to_html(),
	        "text": doc.export_to_text(),
	    }
	    outputs["processing_time"] = time.time() - started_at
	    return outputs

	def process_batch(images, prompt_text, device, progress_bar=None):
	    """Process images one after another, reporting progress when a bar is given.

	    Args:
	        images: Sized collection of images for ``process_single_image``.
	        prompt_text: Instruction forwarded for every image.
	        device: Torch device string forwarded for every image.
	        progress_bar: Optional object with a ``progress(value, text=...)``
	            method; it is updated before and after each image.

	    Returns:
	        List of per-image result dicts, in input order.
	    """
	    total = len(images)
	    results = []

	    for position, image in enumerate(images, start=1):
	        if progress_bar:
	            progress_bar.progress(
	                (position - 1) / total,
	                text=f"Processing image {position}/{total}",
	            )

	        results.append(process_single_image(image, prompt_text, device))

	        if progress_bar:
	            progress_bar.progress(
	                position / total,
	                text=f"Processed {position}/{total} images",
	            )

	    return results

	def save_session_history(results):
	    """Append one history entry per result to ``st.session_state.history``.

	    The history list is created if absent. All entries added by one call
	    share a timestamp; ids continue from the current history length and
	    "type" numbers the results within this call, starting at 1.
	    """
	    if 'history' not in st.session_state:
	        st.session_state.history = []
	    history = st.session_state.history

	    recorded_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	    for position, result in enumerate(results, start=1):
	        entry = {
	            "id": len(history) + 1,
	            "timestamp": recorded_at,
	            "type": "Image " + str(position),
	            "processing_time": result["processing_time"],
	            "result": result,
	        }
	        history.append(entry)

	def display_history():
	    """Render every entry of ``st.session_state.history``, newest first.

	    Each entry gets an expander with Markdown / Text / DocTags / HTML tabs
	    and a download button per format. When there is no history, an info
	    message is shown instead.

	    Every text area and download button gets a key derived from the entry
	    id. Without one, two entries holding identical text would produce the
	    same auto-generated element ID and Streamlit would raise a
	    duplicate-element error.
	    """
	    if 'history' not in st.session_state or not st.session_state.history:
	        st.info("No processing history available")
	        return

	    st.subheader("Processing History")

	    for item in reversed(st.session_state.history):
	        entry_id = item['id']
	        result = item['result']

	        with st.expander(f"#{entry_id} - {item['type']} ({item['timestamp']})"):
	            st.write(f"Processing time: {item['processing_time']:.2f} seconds")
	            md_tab, text_tab, doctags_tab, html_tab = st.tabs(["Markdown", "Text", "DocTags", "HTML"])

	            with md_tab:
	                st.markdown(result['markdown'])
	                st.download_button(
	                    "Download Markdown",
	                    result['markdown'],
	                    file_name=f"output_{entry_id}.md",
	                    key=f"history_{entry_id}_markdown_download",
	                )

	            with text_tab:
	                st.text_area(
	                    "Plain Text",
	                    result['text'],
	                    height=200,
	                    key=f"history_{entry_id}_text_area",
	                )
	                st.download_button(
	                    "Download Text",
	                    result['text'],
	                    file_name=f"output_{entry_id}.txt",
	                    key=f"history_{entry_id}_text_download",
	                )

	            with doctags_tab:
	                st.text_area(
	                    "DocTags",
	                    result['doctags'],
	                    height=200,
	                    key=f"history_{entry_id}_doctags_area",
	                )
	                st.download_button(
	                    "Download DocTags",
	                    result['doctags'],
	                    file_name=f"output_{entry_id}.dt",
	                    key=f"history_{entry_id}_doctags_download",
	                )

	            with html_tab:
	                st.code(result['html'], language="html")
	                st.download_button(
	                    "Download HTML",
	                    result['html'],
	                    file_name=f"output_{entry_id}.html",
	                    key=f"history_{entry_id}_html_download",
	                )

	def main():
	    """Build the Streamlit page: sidebar configuration, upload, processing, results.

	    Flow of one script run:
	      1. Configure the page and stop early if optional dependencies are missing.
	      2. Sidebar: device selection, GPU memory info, task/prompt options,
	         image upload, the process button and the history button.
	      3. Main area: either the processing history, the result of a single
	         image, or the results of a batch of images.
	    """
	    # App configuration
	    st.set_page_config(
	        page_title="SmolDocling OCR App",
	        page_icon="📄",
	        layout="wide",
	        initial_sidebar_state="expanded"
	    )

	    # Custom theme
	    st.markdown("""
	<style>
	.main-header {
	font-size: 2.5rem;
	margin-bottom: 0.5rem;
	}
	.sub-header {
	font-size: 1.2rem;
	color: #666;
	margin-bottom: 2rem;
	}
	.stTabs [data-baseweb="tab-list"] {
	gap: 2px;
	}
	.stTabs [data-baseweb="tab"] {
	padding: 10px 16px;
	background-color: #f0f2f6;
	}
	.stTabs [aria-selected="true"] {
	background-color: #e6f0ff;
	}
	</style>
	""", unsafe_allow_html=True)

	    # App header
	    st.markdown('<p class="main-header">SmolDocling OCR App</p>', unsafe_allow_html=True)
	    st.markdown('<p class="sub-header">Extract text from images using SmolDocling AI</p>', unsafe_allow_html=True)

	    # Check dependencies
	    missing_deps = check_dependencies()
	    if missing_deps:
	        st.error(f"Missing dependencies: {', '.join(missing_deps)}. Please install them to use this app.")
	        st.info("Install with: pip install " + " ".join(missing_deps))
	        st.stop()

	    # Initialize session state
	    if 'results' not in st.session_state:
	        st.session_state.results = []

	    # Upload state for this run. Defined up front so later code can test the
	    # values directly instead of probing locals() for names that may not exist.
	    uploaded_file = None
	    uploaded_files = None
	    image = None
	    process_button = False

	    # Create sidebar
	    with st.sidebar:
	        st.header("Configuration")

	        # Device selection
	        st.subheader("Processing Device")
	        available_devices = get_available_devices()
	        selected_device = st.selectbox(
	            "Select processing device",
	            available_devices,
	            index=0 if len(available_devices) == 1 else 1,  # Default to CUDA if available
	            help="Choose the device for model inference. GPU (CUDA) is recommended for faster processing."
	        )
	        device = get_device_from_selection(selected_device)

	        # Model info
	        st.info(f"Selected device: {selected_device}")

	        if device == "cpu":
	            st.warning("⚠️ CPU processing may be slow. Select a GPU device if available for faster performance.")

	        # Memory management
	        if device.startswith("cuda"):
	            with st.expander("GPU Memory Management"):
	                st.write("Current GPU Memory Usage:")
	                if torch.cuda.is_available():
	                    gpu_idx = int(device.split(":")[1]) if ":" in device else 0
	                    gib = 1024 ** 3
	                    allocated = torch.cuda.memory_allocated(gpu_idx) / gib
	                    reserved = torch.cuda.memory_reserved(gpu_idx) / gib
	                    total_memory = torch.cuda.get_device_properties(gpu_idx).total_memory / gib
	                    st.progress(allocated / total_memory)
	                    st.write(f"Allocated: {allocated:.2f} GB")
	                    st.write(f"Reserved: {reserved:.2f} GB")

	                if st.button("Clear GPU Cache"):
	                    torch.cuda.empty_cache()
	                    st.success("GPU cache cleared")

	        # Upload options
	        st.subheader("Upload Options")
	        upload_option = st.radio("Choose upload option:", ["Single Image", "Multiple Images"])

	        # Advanced options
	        with st.expander("Advanced Options"):
	            task_type = st.selectbox(
	                "Select task type",
	                [
	                    "Convert this page to docling.",
	                    "Convert this table to OTSL.",
	                    "Convert code to text.",
	                    "Convert formula to latex.",
	                    "Convert chart to OTSL.",
	                    "Extract all section header elements on the page."
	                ]
	            )

	            custom_prompt = st.text_area(
	                "Custom prompt (optional)",
	                value="",
	                help="Provide a custom prompt if needed. Leave empty to use the selected task type."
	            )

	            max_image_size = st.slider(
	                "Max image dimension (pixels)",
	                min_value=800,
	                max_value=3200,
	                value=1600,
	                step=100,
	                help="Larger values may improve OCR quality but use more memory"
	            )

	        # A non-empty custom prompt overrides the selected task type.
	        final_prompt = custom_prompt if custom_prompt else task_type

	        # Upload controls
	        st.subheader("Upload Image(s)")
	        if upload_option == "Single Image":
	            # PDF is not offered: the file is decoded with PIL's Image.open,
	            # which cannot read PDFs, so such uploads could only ever fail.
	            uploaded_file = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"])

	            if uploaded_file is not None:
	                try:
	                    image = Image.open(uploaded_file).convert("RGB")
	                    st.image(image, caption="Uploaded Image", width=250)
	                except Exception as e:
	                    image = None
	                    st.error(f"Error loading image: {str(e)}")
	        else:
	            uploaded_files = st.file_uploader(
	                "Upload multiple images",
	                type=["jpg", "jpeg", "png"],
	                accept_multiple_files=True
	            )

	            if uploaded_files:
	                st.success(f"{len(uploaded_files)} images uploaded")

	        # Process button: offered only when there is something that can be processed.
	        single_ready = upload_option == "Single Image" and image is not None
	        batch_ready = upload_option == "Multiple Images" and bool(uploaded_files)
	        if single_ready or batch_ready:
	            process_button = st.button("Process Image(s)", type="primary")

	        # History button
	        st.subheader("History")
	        if st.button("Show Processing History"):
	            st.session_state.show_history = True

	        # About section
	        with st.expander("About SmolDocling OCR"):
	            st.write("""
	This app uses SmolDocling, a powerful OCR model for document understanding from Hugging Face Hub.

	The app extracts DocTags format and converts it to Markdown, HTML, and plain text for easy reading.

	Available tasks:
	- Convert pages to DocTags (general OCR)
	- Convert tables to OTSL
	- Convert code snippets to text
	- Convert formulas to LaTeX
	- Convert charts to OTSL
	- Extract section headers
	""")

	    # Main content area
	    if 'show_history' in st.session_state and st.session_state.show_history:
	        display_history()
	        # One-shot flag: the history is shown for this run only.
	        st.session_state.show_history = False
	    elif single_ready and process_button:
	        with st.spinner("Processing image..."):
	            try:
	                progress_bar = st.progress(0, text="Preparing to process...")

	                # Apply the slider value as optimize_image's default max_size.
	                # (`func_defaults` was the Python 2 name and is ignored by
	                # Python 3; `__defaults__` is the attribute that takes effect.)
	                # NOTE: this changes the default for the whole process.
	                optimize_image.__defaults__ = (max_image_size,)

	                result = process_single_image(image, final_prompt, device)
	                st.session_state.results = [result]

	                # Save to history
	                save_session_history(st.session_state.results)

	                progress_bar.progress(1.0, text="Processing complete!")

	                # Display results
	                tabs = st.tabs(["Markdown", "Text", "DocTags", "HTML"])

	                with tabs[0]:
	                    st.subheader("Markdown Output")
	                    st.markdown(result["markdown"])
	                    st.download_button(
	                        "Download Markdown",
	                        result["markdown"],
	                        file_name="output.md"
	                    )

	                with tabs[1]:
	                    st.subheader("Plain Text Output")
	                    st.text_area("Extracted Text", result["text"], height=300)
	                    st.download_button(
	                        "Download Text",
	                        result["text"],
	                        file_name="output.txt"
	                    )

	                with tabs[2]:
	                    st.subheader("DocTags Output")
	                    st.text_area("DocTags", result["doctags"], height=300)
	                    st.download_button(
	                        "Download DocTags",
	                        result["doctags"],
	                        file_name="output.dt"
	                    )

	                with tabs[3]:
	                    st.subheader("HTML Output")
	                    st.code(result["html"], language="html")
	                    st.download_button(
	                        "Download HTML",
	                        result["html"],
	                        file_name="output.html"
	                    )

	                st.success(f"Processing completed in {result['processing_time']:.2f} seconds on {selected_device}")
	            except Exception as e:
	                st.error(f"Error processing image: {str(e)}")
	                logger.error(f"Error processing image: {str(e)}", exc_info=True)

	    elif batch_ready and process_button:
	        try:
	            images = [Image.open(file).convert("RGB") for file in uploaded_files]

	            with st.spinner(f"Processing {len(images)} images..."):
	                progress_bar = st.progress(0, text="Preparing to process...")

	                # Apply the slider value as optimize_image's default max_size
	                # (see the single-image branch for why `__defaults__` is used).
	                optimize_image.__defaults__ = (max_image_size,)

	                results = process_batch(images, final_prompt, device, progress_bar)
	                st.session_state.results = results

	                # Save to history
	                save_session_history(results)

	                progress_bar.progress(1.0, text="Processing complete!")

	                # Display results
	                st.subheader("Processing Results")

	                total_time = sum(result["processing_time"] for result in results)
	                avg_time = total_time / len(results)

	                st.write(f"Total processing time: {total_time:.2f} seconds on {selected_device}")
	                st.write(f"Average processing time: {avg_time:.2f} seconds per image")

	                # One expander per image. Widgets are keyed by image number:
	                # two images with identical output would otherwise produce
	                # identical auto-generated element IDs and a duplicate error.
	                for number, (result, image) in enumerate(zip(results, images), start=1):
	                    with st.expander(f"Image {number} Results"):
	                        col1, col2 = st.columns([1, 2])

	                        with col1:
	                            st.image(image, caption=f"Image {number}", width=250)
	                            st.write(f"Processing time: {result['processing_time']:.2f} seconds")

	                        with col2:
	                            inner_tabs = st.tabs(["Markdown", "Text", "DocTags", "HTML"])

	                            with inner_tabs[0]:
	                                st.markdown(result["markdown"])
	                                st.download_button(
	                                    "Download Markdown",
	                                    result["markdown"],
	                                    file_name=f"output_{number}.md",
	                                    key=f"batch_{number}_markdown_download",
	                                )

	                            with inner_tabs[1]:
	                                st.text_area(
	                                    "Plain Text",
	                                    result["text"],
	                                    height=200,
	                                    key=f"batch_{number}_text_area",
	                                )
	                                st.download_button(
	                                    "Download Text",
	                                    result["text"],
	                                    file_name=f"output_{number}.txt",
	                                    key=f"batch_{number}_text_download",
	                                )

	                            with inner_tabs[2]:
	                                st.text_area(
	                                    "DocTags",
	                                    result["doctags"],
	                                    height=200,
	                                    key=f"batch_{number}_doctags_area",
	                                )
	                                st.download_button(
	                                    "Download DocTags",
	                                    result["doctags"],
	                                    file_name=f"output_{number}.dt",
	                                    key=f"batch_{number}_doctags_download",
	                                )

	                            with inner_tabs[3]:
	                                st.code(result["html"], language="html")
	                                st.download_button(
	                                    "Download HTML",
	                                    result["html"],
	                                    file_name=f"output_{number}.html",
	                                    key=f"batch_{number}_html_download",
	                                )

	                st.success("All images processed successfully")
	        except Exception as e:
	            st.error(f"Error processing images: {str(e)}")
	            logger.error(f"Error processing images: {str(e)}", exc_info=True)

	    # Display a welcome message if no image has been uploaded
	    if uploaded_file is None and not uploaded_files:
	        st.info("👈 Upload an image using the sidebar to get started")

	# Build the page only when this file is run as the main script, not when imported.
	if __name__ == "__main__":
	    main()