import gradio as gr
import random
import os
from datetime import datetime
from huggingface_hub import HfApi
from typing import Optional
from PIL import Image  # Needed for working with PIL images
import datasets

# The list of sentences from our previous conversation.
sentences = [
    "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
    "When applied to handwriting, OCR faces additional challenges because of the natural variability in individual penmanship.",
    "Over the last century, advances in computer vision and machine learning have transformed handwriting OCR from bulky, specialized hardware into highly accurate, software-driven systems.",
    "The origins of OCR date back to the early 20th century.",
    "Early pioneers explored how machines might read text.",
    "In the 1920s, inventors such as Emanuel Goldberg developed early devices that could capture printed characters by converting them into telegraph codes.",
    "Around the same time, Gustav Tauschek created the Reading Machine using template-matching methods to detect letters in images.",
    "These devices were designed for printed text and depended on fixed, machine-friendly fonts rather than natural handwriting.",
    "In the 1950s, systems like David Shepard's GISMO emerged to begin automating the conversion of paper records into digital form.",
    "Although these early OCR systems were limited in scope and accuracy, they laid the groundwork for later innovations.",
    "The 1960s saw OCR technology being applied to real-world tasks.",
    "In 1965, American inventor Jacob Rabinow developed an OCR machine specifically aimed at sorting mail by reading addresses.",
    "This was a critical step for the U.S. Postal Service.",
    "Soon after, research groups, including those at IBM, began developing machines such as the IBM 1287, which was capable of reading handprinted numbers on envelopes to facilitate automated mail processing.",
    "These systems marked the first attempts to apply computer vision to handwritten data on a large scale.",
    "By the late 1980s and early 1990s, researchers such as Yann LeCun and his colleagues developed neural network architectures to recognize handwritten digits.",
    "Their work, initially applied to reading ZIP codes on mail, demonstrated that carefully designed, constrained neural networks could achieve error rates as low as about 1% on USPS data.",
    "Sargur Srihari and his team at the Center of Excellence for Document Analysis and Recognition extended these ideas to develop complete handwritten address interpretation systems.",
    "These systems, deployed by the USPS and postal agencies worldwide, helped automate the routing of mail and revolutionized the sorting process.",
    "The development and evaluation of handwriting OCR have been driven in part by standard benchmark datasets.",
    "The MNIST dataset, introduced in the 1990s, consists of 70,000 images of handwritten digits and became the de facto benchmark for handwritten digit recognition.",
    "Complementing MNIST is the USPS dataset, which provides images of handwritten digits derived from actual envelopes and captures real-world variability.",
    "Handwriting OCR entered a new era with the introduction of neural network models.",
    "In 1989, LeCun et al. applied backpropagation to a convolutional neural network tailored for handwritten digit recognition, an innovation that evolved into the LeNet series.",
    "By automatically learning features rather than relying on hand-designed templates, these networks drastically improved recognition performance.",
    "As computational power increased and large labeled datasets became available, deep learning models, particularly convolutional neural networks and recurrent neural networks, pushed the accuracy of handwriting OCR to near-human levels.",
    "Modern systems can handle both printed and cursive text, automatically segmenting and recognizing characters in complex handwritten documents.",
    "Cursive handwriting presents a classic challenge known as Sayre's paradox, where word recognition requires letter segmentation and letter segmentation requires word recognition.",
    "Contemporary approaches use implicit segmentation methods, often combined with hidden Markov models or end-to-end neural networks, to circumvent this paradox.",
    "Today's handwriting OCR systems are highly accurate and widely deployed.",
    "Modern systems combine OCR with artificial intelligence to not only recognize text but also extract meaning, verify data, and integrate into larger enterprise workflows.",
    "Projects such as In Codice Ratio use deep convolutional networks to transcribe historical handwritten documents, further expanding OCR applications.",
    "Despite impressive advances, handwriting OCR continues to face challenges with highly variable or degraded handwriting.",
    "Ongoing research aims to improve recognition accuracy, particularly for cursive and unconstrained handwriting, and to extend support across languages and historical scripts.",
    "With improvements in deep learning architectures, increased computing power, and large annotated datasets, future OCR systems are expected to become even more robust, handling real-world handwriting in diverse applications from postal services to archival digitization.",
    "Today's research in handwriting OCR benefits from a wide array of well-established datasets and ongoing evaluation challenges.",
    "These resources help drive the development of increasingly robust systems for both digit and full-text recognition.",
    "For handwritten digit recognition, the MNIST dataset remains the most widely used benchmark thanks to its simplicity and broad adoption.",
    "Complementing MNIST is the USPS dataset, which is derived from actual mail envelopes and provides additional challenges with real-world variability.",
    "The IAM Handwriting Database is one of the most popular datasets for unconstrained offline handwriting recognition and includes scanned pages of handwritten English text with corresponding transcriptions.",
    "It is frequently used to train and evaluate models that work on full-line or full-page recognition tasks.",
    "For systems designed to capture the dynamic aspects of handwriting, such as pen stroke trajectories, the IAM On-Line Handwriting Database offers valuable data.",
    "The CVL dataset provides multi-writer handwritten texts with a range of writing styles, making it useful for assessing the generalization capabilities of OCR systems across diverse handwriting samples.",
    "The RIMES dataset, developed for French handwriting recognition, contains scanned documents and is a key resource for evaluating systems in multilingual settings.",
    "Various ICDAR competitions, such as ICDAR 2013 and ICDAR 2017, have released datasets that reflect the complexities of real-world handwriting, including historical documents and unconstrained writing.",
    "For Arabic handwriting recognition, the KHATT dataset offers a collection of handwritten texts that capture the unique challenges of cursive and context-dependent scripts.",
    "These datasets, along with continual evaluation efforts through competitions hosted at ICDAR and ICFHR, ensure that the field keeps pushing toward higher accuracy, better robustness, and broader language coverage.",
    "Emerging benchmarks, often tailored to specific scripts, historical documents, or noisy real-world data, will further refine the state-of-the-art in handwriting OCR.",
    "This array of resources continues to shape the development of handwriting OCR systems today.",
    "This additional section outlines today's most influential datasets and benchmarks, highlighting how they continue to shape the development of handwriting OCR systems."
]

class OCRDataCollector:
    def __init__(self):
        self.collected_pairs = []
        self.current_text_block = self.get_random_text_block()
        self.hf_api = HfApi()

    def get_random_text_block(self):
        # Pick 1-5 consecutive sentences starting at a random position.
        block_length = random.randint(1, 5)
        start_index = random.randint(0, len(sentences) - block_length)
        block = " ".join(sentences[start_index:start_index + block_length])
        return block

    def submit_image(self, image, text_block, username: Optional[str] = None):
        if image is not None and username:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.collected_pairs.append({
                "text": text_block,
                "image": image,
                "timestamp": timestamp,
                "username": username
            })
        # Always hand the user a fresh block, even if the submission was empty.
        return self.get_random_text_block()

    def skip_text(self, text_block, username: Optional[str] = None):
        return self.get_random_text_block()

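# Hypothetical usage sketch (not executed by the app): sampling a text block
# without launching the UI.
#
#   collector = OCRDataCollector()
#   print(collector.get_random_text_block())  # 1-5 consecutive sentences
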
def strip_metadata(image: Image.Image) -> Image.Image:
    """
    Helper function to strip all metadata from the provided PIL Image.
    This creates a new image with the same pixel data but no additional info.
    """
    data = list(image.getdata())
    stripped_image = Image.new(image.mode, image.size)
    stripped_image.putdata(data)
    return stripped_image

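# A hedged sanity check for strip_metadata (hypothetical helper, not wired into
# the app): confirms the pixels survive the round trip while the ancillary
# metadata dict (PIL's Image.info) comes back empty.
def _check_strip_metadata(image: Image.Image) -> bool:
    clean = strip_metadata(image)
    return clean.info == {} and clean.tobytes() == image.tobytes()
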
def create_gradio_interface():
    collector = OCRDataCollector()

    with gr.Blocks() as demo:
        gr.Markdown("## Crowdsourcing Handwriting OCR Dataset")
        gr.LoginButton()
        user_info = gr.Markdown()
        # Hidden state to hold the user's OAuth profile as JSON.
        profile_state = gr.JSON(visible=False)
        gr.Markdown(
            "You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. "
            "If you wish to skip the current text, click 'Skip'."
        )
        text_box = gr.Textbox(value=collector.current_text_block, label="Text to Handwrite", interactive=False, visible=False)
        image_input = gr.Image(type="pil", label="Upload Handwritten Image", sources=["upload"], visible=False)
        # Toggle (using a radio button) for dataset choice.
        dataset_radio = gr.Radio(choices=["Private", "Public"], label="Select Dataset", value="Private", visible=False)

        with gr.Row(visible=False) as button_row:
            submit_btn = gr.Button("Submit")
            skip_btn = gr.Button("Skip")

        def update_user_info(profile: gr.OAuthProfile | None):
            # Gradio injects the OAuth profile automatically because the
            # parameter is annotated with gr.OAuthProfile.
            if profile is None:
                return "Please log in with your Hugging Face account to contribute to the dataset.", {}
            # Use the username provided by the profile (from the "profile" scope).
            return f"Logged in as: {profile.username}", {"username": profile.username}

        def handle_submit(profile, dataset_choice, image, text):
            if not profile or "username" not in profile:
                raise gr.Error("Please log in to use this application")
            if image is None:
                raise gr.Error("Please upload an image before submitting")
            username = profile["username"]

            if dataset_choice == "Private":
                repo_id = f"{username}/handwriting-ocr-private"
                # Remove all metadata for privacy.
                stripped_image = strip_metadata(image)
                # Check if the dataset exists; if not, create it as private.
                try:
                    collector.hf_api.dataset_info(repo_id)
                except Exception:
                    collector.hf_api.create_repo(repo_id, repo_type="dataset", private=True)
                # Save the stripped image to a temporary file.
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f"{timestamp}.png"
                temp_dir = "temp"
                os.makedirs(temp_dir, exist_ok=True)
                temp_path = os.path.join(temp_dir, filename)
                stripped_image.save(temp_path)
                # Build a single-row dataset from the image-text pair.
                features = datasets.Features({
                    'text': datasets.Value('string'),
                    'image': datasets.Image(),
                    'timestamp': datasets.Value('string')
                })
                dataset_dict = {
                    'text': [text],
                    'image': [temp_path],
                    'timestamp': [timestamp]
                }
                dataset = datasets.Dataset.from_dict(dataset_dict, features=features)
                # push_to_hub overwrites the split, so append the new row to any
                # existing data first; on the very first submission the load
                # fails and the single new row is pushed on its own.
                try:
                    existing = datasets.load_dataset(repo_id, split="train")
                    dataset = datasets.concatenate_datasets([existing, dataset])
                except Exception:
                    pass
                dataset.push_to_hub(repo_id)
                # Remove the temporary file.
                os.remove(temp_path)
                # Log the submission locally.
                collector.collected_pairs.append({
                    "text": text,
                    "image": image,
                    "timestamp": timestamp,
                    "username": username,
                    "dataset": "private"
                })
                new_text = collector.get_random_text_block()
                return None, new_text
            else:
                # Fallback to public submission (kept in memory only).
                new_text = collector.submit_image(image, text, username)
                return None, new_text

        def handle_skip(profile, text):
            if not profile or "username" not in profile:
                raise gr.Error("Please log in to use this application")
            return collector.skip_text(text, profile["username"])

        def update_visibility(profile: gr.OAuthProfile | None):
            is_visible = profile is not None
            # Update the visibility of text_box, image_input, button_row, and dataset_radio.
            return [
                gr.update(visible=is_visible),
                gr.update(visible=is_visible),
                gr.update(visible=is_visible),
                gr.update(visible=is_visible)
            ]

        # On load, update both the display message and the hidden profile state.
        demo.load(update_user_info, inputs=None, outputs=[user_info, profile_state])
        demo.load(update_visibility, inputs=None, outputs=[text_box, image_input, button_row, dataset_radio])

        # Bind the submit and skip actions with updated inputs.
        submit_btn.click(handle_submit, inputs=[profile_state, dataset_radio, image_input, text_box], outputs=[image_input, text_box])
        skip_btn.click(handle_skip, inputs=[profile_state, text_box], outputs=text_box)

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()
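
# A minimal sketch (assuming the private repo exists and the environment holds
# a Hugging Face token with read access) of loading the collected pairs back
# for training or inspection; "your-username" is a placeholder:
#
#   from datasets import load_dataset
#   ds = load_dataset("your-username/handwriting-ocr-private", split="train")
#   print(ds[0]["text"])   # the prompted sentence block
#   ds[0]["image"].show()  # the decoded PIL image of the handwriting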