import gradio as gr
import random
import os
from datetime import datetime
from huggingface_hub import HfApi
from typing import Optional
from PIL import Image  # Needed for working with PIL images
import datasets

# The list of sentences from our previous conversation.
sentences = [
    "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
    "When applied to handwriting, OCR faces additional challenges because of the natural variability in individual penmanship.",
    "Over the last century, advances in computer vision and machine learning have transformed handwriting OCR from bulky, specialized hardware into highly accurate, software-driven systems.",
    "The origins of OCR date back to the early 20th century.",
    "Early pioneers explored how machines might read text.",
    "In the 1920s, inventors such as Emanuel Goldberg developed early devices that could capture printed characters by converting them into telegraph codes.",
    "Around the same time, Gustav Tauschek created the Reading Machine using template-matching methods to detect letters in images.",
    "These devices were designed for printed text and depended on fixed, machine-friendly fonts rather than natural handwriting.",
    "In the 1950s, systems like David Shepard's GISMO emerged to begin automating the conversion of paper records into digital form.",
    "Although these early OCR systems were limited in scope and accuracy, they laid the groundwork for later innovations.",
    "The 1960s saw OCR technology being applied to real-world tasks.",
    "In 1965, American inventor Jacob Rabinow developed an OCR machine specifically aimed at sorting mail by reading addresses.",
    "This was a critical step for the U.S. Postal Service.",
    "Soon after, research groups, including those at IBM, began developing machines such as the IBM 1287, which was capable of reading handprinted numbers on envelopes to facilitate automated mail processing.",
    "These systems marked the first attempts to apply computer vision to handwritten data on a large scale.",
    "By the late 1980s and early 1990s, researchers such as Yann LeCun and his colleagues developed neural network architectures to recognize handwritten digits.",
    "Their work, initially applied to reading ZIP codes on mail, demonstrated that carefully designed, constrained neural networks could achieve error rates as low as about 1% on USPS data.",
    "Sargur Srihari and his team at the Center of Excellence for Document Analysis and Recognition extended these ideas to develop complete handwritten address interpretation systems.",
    "These systems, deployed by the USPS and postal agencies worldwide, helped automate the routing of mail and revolutionized the sorting process.",
    "The development and evaluation of handwriting OCR have been driven in part by standard benchmark datasets.",
    "The MNIST dataset, introduced in the 1990s, consists of 70,000 images of handwritten digits and became the de facto benchmark for handwritten digit recognition.",
    "Complementing MNIST is the USPS dataset, which provides images of handwritten digits derived from actual envelopes and captures real-world variability.",
    "Handwriting OCR entered a new era with the introduction of neural network models.",
    "In 1989, LeCun et al. applied backpropagation to a convolutional neural network tailored for handwritten digit recognition, an innovation that evolved into the LeNet series.",
    "By automatically learning features rather than relying on hand-designed templates, these networks drastically improved recognition performance.",
    "As computational power increased and large labeled datasets became available, deep learning models, particularly convolutional neural networks and recurrent neural networks, pushed the accuracy of handwriting OCR to near-human levels.",
    "Modern systems can handle both printed and cursive text, automatically segmenting and recognizing characters in complex handwritten documents.",
    "Cursive handwriting presents a classic challenge known as Sayre's paradox, where word recognition requires letter segmentation and letter segmentation requires word recognition.",
    "Contemporary approaches use implicit segmentation methods, often combined with hidden Markov models or end-to-end neural networks, to circumvent this paradox.",
    "Today's handwriting OCR systems are highly accurate and widely deployed.",
    "Modern systems combine OCR with artificial intelligence to not only recognize text but also extract meaning, verify data, and integrate into larger enterprise workflows.",
    "Projects such as In Codice Ratio use deep convolutional networks to transcribe historical handwritten documents, further expanding OCR applications.",
    "Despite impressive advances, handwriting OCR continues to face challenges with highly variable or degraded handwriting.",
    "Ongoing research aims to improve recognition accuracy, particularly for cursive and unconstrained handwriting, and to extend support across languages and historical scripts.",
    "With improvements in deep learning architectures, increased computing power, and large annotated datasets, future OCR systems are expected to become even more robust, handling real-world handwriting in diverse applications from postal services to archival digitization.",
    "Today's research in handwriting OCR benefits from a wide array of well-established datasets and ongoing evaluation challenges.",
    "These resources help drive the development of increasingly robust systems for both digit and full-text recognition.",
    "For handwritten digit recognition, the MNIST dataset remains the most widely used benchmark thanks to its simplicity and broad adoption.",
    "Complementing MNIST is the USPS dataset, which is derived from actual mail envelopes and provides additional challenges with real-world variability.",
    "The IAM Handwriting Database is one of the most popular datasets for unconstrained offline handwriting recognition and includes scanned pages of handwritten English text with corresponding transcriptions.",
    "It is frequently used to train and evaluate models that work on full-line or full-page recognition tasks.",
    "For systems designed to capture the dynamic aspects of handwriting, such as pen stroke trajectories, the IAM On-Line Handwriting Database offers valuable data.",
    "The CVL dataset provides multi-writer handwritten texts with a range of writing styles, making it useful for assessing the generalization capabilities of OCR systems across diverse handwriting samples.",
    "The RIMES dataset, developed for French handwriting recognition, contains scanned documents and is a key resource for evaluating systems in multilingual settings.",
    "Various ICDAR competitions, such as ICDAR 2013 and ICDAR 2017, have released datasets that reflect the complexities of real-world handwriting, including historical documents and unconstrained writing.",
    "For Arabic handwriting recognition, the KHATT dataset offers a collection of handwritten texts that capture the unique challenges of cursive and context-dependent scripts.",
    "These datasets, along with continual evaluation efforts through competitions hosted at ICDAR and ICFHR, ensure that the field keeps pushing toward higher accuracy, better robustness, and broader language coverage.",
    "Emerging benchmarks, often tailored to specific scripts, historical documents, or noisy real-world data, will further refine the state-of-the-art in handwriting OCR.",
    "This array of resources continues to shape the development of handwriting OCR systems today.",
    "This additional section outlines today's most influential datasets and benchmarks, highlighting how they continue to shape the development of handwriting OCR systems."
]

class OCRDataCollector:
    def __init__(self):
        self.collected_pairs = []
        self.current_text_block = self.get_random_text_block()
        self.hf_api = HfApi()

    def get_random_text_block(self):
        # Pick 1-5 consecutive sentences starting at a random position.
        block_length = random.randint(1, 5)
        start_index = random.randint(0, len(sentences) - block_length)
        block = " ".join(sentences[start_index:start_index + block_length])
        return block

    def submit_image(self, image, text_block, username: Optional[str] = None):
        if image is not None and username:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.collected_pairs.append({
                "text": text_block,
                "image": image,
                "timestamp": timestamp,
                "username": username
            })
        # Always hand the user a fresh block, even if the submission was empty.
        return self.get_random_text_block()

    def skip_text(self, text_block, username: Optional[str] = None):
        return self.get_random_text_block()

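# Hypothetical usage sketch (not executed by the app): sampling a text block
# without launching the UI.
#
#   collector = OCRDataCollector()
#   print(collector.get_random_text_block())  # 1-5 consecutive sentences
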
def strip_metadata(image: Image.Image) -> Image.Image:
    """
    Helper function to strip all metadata from the provided PIL Image.
    This creates a new image with the same pixel data but no additional info.
    """
    data = list(image.getdata())
    stripped_image = Image.new(image.mode, image.size)
    stripped_image.putdata(data)
    return stripped_image

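# A hedged sanity check for strip_metadata (hypothetical helper, not wired into
# the app): confirms the pixels survive the round trip while the ancillary
# metadata dict (PIL's Image.info) comes back empty.
def _check_strip_metadata(image: Image.Image) -> bool:
    clean = strip_metadata(image)
    return clean.info == {} and clean.tobytes() == image.tobytes()
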
def create_gradio_interface():
    collector = OCRDataCollector()

    with gr.Blocks() as demo:
        gr.Markdown("## Crowdsourcing Handwriting OCR Dataset")
        gr.LoginButton()
        user_info = gr.Markdown()
        # Hidden state to hold the user's OAuth profile as JSON.
        profile_state = gr.JSON(visible=False)
        gr.Markdown(
            "You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. "
            "If you wish to skip the current text, click 'Skip'."
        )
        text_box = gr.Textbox(value=collector.current_text_block, label="Text to Handwrite", interactive=False, visible=False)
        image_input = gr.Image(type="pil", label="Upload Handwritten Image", sources=["upload"], visible=False)
        # Toggle (using a radio button) for dataset choice.
        dataset_radio = gr.Radio(choices=["Private", "Public"], label="Select Dataset", value="Private", visible=False)

        with gr.Row(visible=False) as button_row:
            submit_btn = gr.Button("Submit")
            skip_btn = gr.Button("Skip")

        def update_user_info(profile: gr.OAuthProfile | None):
            # Gradio injects the OAuth profile automatically because the
            # parameter is annotated with gr.OAuthProfile.
            if profile is None:
                return "Please log in with your Hugging Face account to contribute to the dataset.", {}
            # Use the username provided by the profile (from the "profile" scope).
            return f"Logged in as: {profile.username}", {"username": profile.username}

        def handle_submit(profile, dataset_choice, image, text):
            if not profile or "username" not in profile:
                raise gr.Error("Please log in to use this application")
            if image is None:
                raise gr.Error("Please upload an image before submitting")
            username = profile["username"]

            if dataset_choice == "Private":
                repo_id = f"{username}/handwriting-ocr-private"
                # Remove all metadata for privacy.
                stripped_image = strip_metadata(image)
                # Check if the dataset exists; if not, create it as private.
                try:
                    collector.hf_api.dataset_info(repo_id)
                except Exception:
                    collector.hf_api.create_repo(repo_id, repo_type="dataset", private=True)
                # Save the stripped image to a temporary file.
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f"{timestamp}.png"
                temp_dir = "temp"
                os.makedirs(temp_dir, exist_ok=True)
                temp_path = os.path.join(temp_dir, filename)
                stripped_image.save(temp_path)
                # Build a single-row dataset from the image-text pair.
                features = datasets.Features({
                    'text': datasets.Value('string'),
                    'image': datasets.Image(),
                    'timestamp': datasets.Value('string')
                })
                dataset_dict = {
                    'text': [text],
                    'image': [temp_path],
                    'timestamp': [timestamp]
                }
                dataset = datasets.Dataset.from_dict(dataset_dict, features=features)
                # push_to_hub overwrites the split, so append the new row to any
                # existing data first; on the very first submission the load
                # fails and the single new row is pushed on its own.
                try:
                    existing = datasets.load_dataset(repo_id, split="train")
                    dataset = datasets.concatenate_datasets([existing, dataset])
                except Exception:
                    pass
                dataset.push_to_hub(repo_id)
                # Remove the temporary file.
                os.remove(temp_path)
                # Log the submission locally.
                collector.collected_pairs.append({
                    "text": text,
                    "image": image,
                    "timestamp": timestamp,
                    "username": username,
                    "dataset": "private"
                })
                new_text = collector.get_random_text_block()
                return None, new_text
            else:
                # Fallback to public submission (kept in memory only).
                new_text = collector.submit_image(image, text, username)
                return None, new_text

        def handle_skip(profile, text):
            if not profile or "username" not in profile:
                raise gr.Error("Please log in to use this application")
            return collector.skip_text(text, profile["username"])

        def update_visibility(profile: gr.OAuthProfile | None):
            is_visible = profile is not None
            # Update the visibility of text_box, image_input, button_row, and dataset_radio.
            return [
                gr.update(visible=is_visible),
                gr.update(visible=is_visible),
                gr.update(visible=is_visible),
                gr.update(visible=is_visible)
            ]

        # On load, update both the display message and the hidden profile state.
        demo.load(update_user_info, inputs=None, outputs=[user_info, profile_state])
        demo.load(update_visibility, inputs=None, outputs=[text_box, image_input, button_row, dataset_radio])

        # Bind the submit and skip actions with updated inputs.
        submit_btn.click(handle_submit, inputs=[profile_state, dataset_radio, image_input, text_box], outputs=[image_input, text_box])
        skip_btn.click(handle_skip, inputs=[profile_state, text_box], outputs=text_box)

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()
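
# A minimal sketch (assuming the private repo exists and the environment holds
# a Hugging Face token with read access) of loading the collected pairs back
# for training or inspection; "your-username" is a placeholder:
#
#   from datasets import load_dataset
#   ds = load_dataset("your-username/handwriting-ocr-private", split="train")
#   print(ds[0]["text"])   # the prompted sentence block
#   ds[0]["image"].show()  # the decoded PIL image of the handwriting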