Spaces:
Sleeping
Sleeping
import torch | |
import cv2 | |
import json | |
import xml.etree.ElementTree as ET | |
import gradio as gr | |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, GPT2LMHeadModel, GPT2Tokenizer | |
# Checkpoint identifiers for the two pretrained models used by this app.
TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
GPT2_CHECKPOINT = "gpt2"

# TrOCR: the processor prepares pixel values and decodes token ids; the
# vision-encoder-decoder model generates the token ids from the image.
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

# GPT-2: tokenizer and language model used to continue the recognised text.
GPT2_tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
GPT2_model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)
# Image preprocessing | |
def preprocess_image(image_path):
    """Load an image from disk and convert it to grayscale.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The grayscale image produced by ``cv2.cvtColor``.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, so
    # fail here with a readable message instead of an opaque cvtColor error.
    if image is None:
        raise ValueError(f"Could not read image from {image_path!r}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Extract text using TrOCR (instead of Tesseract) | |
def extract_text(image_path):
    """Run TrOCR on the image at ``image_path`` and return the decoded text.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The first decoded string from the model output, with special
        tokens removed.
    """
    gray = preprocess_image(image_path)
    # preprocess_image returns a 2-D grayscale array with no channel axis,
    # while the TrOCR processor is documented with RGB input. Expand the
    # grayscale data back to three channels before handing it over.
    rgb = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
    pixel_values = processor(rgb, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]
# Generate structured format (JSON/XML) | |
def generate_machine_readable_format(text, format_type='json'):
    """Wrap ``text`` in a minimal JSON or XML envelope.

    Args:
        text: The content to wrap.
        format_type: ``'json'`` for a ``{"content": ...}`` object, ``'xml'``
            for a ``<Document><Content>...</Content></Document>`` tree.

    Returns:
        The serialised string, or ``text`` unchanged when ``format_type``
        is neither ``'json'`` nor ``'xml'``.
    """
    if format_type == 'xml':
        document = ET.Element("Document")
        ET.SubElement(document, "Content").text = text
        return ET.tostring(document, encoding='unicode')

    if format_type == 'json':
        payload = {"content": text}
        return json.dumps(payload)

    # Unrecognised format names fall through to the raw text.
    return text
# GPT-2 for structured output | |
def generate_structured_output(text):
    """Continue ``text`` with GPT-2 and return the decoded result.

    Args:
        text: Prompt text, here the OCR output.

    Returns:
        The decoded generation (prompt plus continuation) with special
        tokens removed, or an empty string when ``text`` is empty.
    """
    # An empty prompt tokenises to zero input ids, leaving generate() with
    # nothing to continue from; return early instead of failing.
    if not text:
        return ""

    encoded = GPT2_tokenizer(text, return_tensors="pt")
    outputs = GPT2_model.generate(
        encoded.input_ids,
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=encoded.attention_mask,
        max_length=500,
        # GPT-2 defines no pad token; reuse EOS so generate() does not have
        # to fall back to it with a warning.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )
    # Drop markers such as the end-of-text token, matching how extract_text
    # decodes the OCR output.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Convert document | |
def convert_document(image, output_format='json'):
    """Run the OCR -> GPT-2 -> serialisation pipeline on one image.

    Args:
        image: Path of the image file (the UI supplies a filepath).
        output_format: ``'json'`` or ``'xml'``; ``None`` is treated as
            ``'json'``.

    Returns:
        The serialised string from ``generate_machine_readable_format``.

    Raises:
        ValueError: If no image was supplied.
    """
    # The UI can submit without an uploaded image, which arrives as None.
    if image is None:
        raise ValueError("No image provided.")
    # A Radio with nothing selected is expected to arrive as None; without
    # this it would silently fall through to raw text instead of the
    # declared 'json' default.
    if output_format is None:
        output_format = 'json'

    text = extract_text(image)
    structured_output = generate_structured_output(text)
    return generate_machine_readable_format(
        structured_output, format_type=output_format
    )
# Gradio UI | |
# Build the web UI: an image upload plus an output-format selector, wired to
# convert_document, with the result shown as plain text.
iface = gr.Interface(
    fn=convert_document,
    inputs=[
        gr.Image(type="filepath"),
        # Preselect "json" so the handler receives a format even when the
        # user never touches the selector.
        gr.Radio(["json", "xml"], value="json", label="Output Format"),
    ],
    outputs="text",
    title="Document OCR and Conversion",
    description="Extracts text from images and converts it into structured JSON/XML format.",
)
iface.launch()