Spaces:

DeepDiveDev
/

TransformoDocs-Demo

Sleeping

File size: 2,257 Bytes

300310b
 
 
 
ddf7acc
 
300310b
ddf7acc
300310b
 
 
ddf7acc
 
 
 
 
300310b
 
 
 
 
ddf7acc
300310b
 
ddf7acc
 
 
300310b
 
ddf7acc
300310b
 
 
 
 
 
 
 
 
 
ddf7acc
300310b
 
 
 
 
ddf7acc
 
 
300310b
 
 
 
ddf7acc

import torch
import cv2
import json
import xml.etree.ElementTree as ET
import gradio as gr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, GPT2LMHeadModel, GPT2Tokenizer

# Load OCR model (TrOCR).
# The processor and the vision encoder-decoder model are both created from the
# "microsoft/trocr-base-handwritten" checkpoint. They are built once, when this
# module is executed, and reused by extract_text() for every request.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Load GPT-2 model and its tokenizer from the "gpt2" checkpoint.
# Both are module-level objects used by generate_structured_output().
GPT2_model = GPT2LMHeadModel.from_pretrained("gpt2")
GPT2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Image preprocessing
def preprocess_image(image_path):
    """Load an image from disk and convert it to grayscale.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The grayscale image produced by ``cv2.cvtColor`` with
        ``cv2.COLOR_BGR2GRAY``.

    Raises:
        ValueError: If no path is given, or if OpenCV cannot read the file.
    """
    if not image_path:
        raise ValueError("No image path was provided.")

    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; check
    # here so the caller gets a clear message instead of an opaque cvtColor
    # error.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path!r}")

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Extract text using TrOCR (instead of Tesseract)
def extract_text(image_path):
    """Run TrOCR on the image at *image_path* and return the decoded text.

    Args:
        image_path: Filesystem path of the image to recognise.

    Returns:
        The first decoded sequence from the TrOCR model, with special tokens
        removed.
    """
    image = preprocess_image(image_path)

    # preprocess_image returns a single-channel grayscale array, but the TrOCR
    # processor is documented for three-channel RGB input. Replicate the gray
    # channel so the channel layout is unambiguous.
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Generate structured format (JSON/XML)
def generate_machine_readable_format(text, format_type='json'):
    """Wrap *text* in a minimal JSON or XML envelope.

    Args:
        text: The string to embed.
        format_type: ``'json'`` for ``{"content": text}`` serialised as JSON,
            ``'xml'`` for ``<Document><Content>text</Content></Document>``.
            Any other value returns *text* unchanged.

    Returns:
        The serialised document as a string, or *text* itself when
        *format_type* is not recognised.
    """
    if format_type == 'xml':
        document = ET.Element("Document")
        ET.SubElement(document, "Content").text = text
        return ET.tostring(document, encoding='unicode')

    if format_type == 'json':
        payload = {"content": text}
        return json.dumps(payload)

    # Unrecognised format: hand the text back untouched.
    return text

# GPT-2 for structured output
def generate_structured_output(text):
    """Feed *text* to GPT-2 as a prompt and return the decoded generation.

    Args:
        text: Prompt text (here, the OCR result).

    Returns:
        The decoded output sequence with special tokens removed. Empty input
        is returned unchanged without calling the model.
    """
    # An empty prompt produces no tokens for generate() to continue from.
    if not text:
        return text

    max_total_tokens = 500

    # Truncate the prompt to leave room for at least one generated token;
    # a prompt that already reaches max_length cannot be extended.
    encoded = GPT2_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_total_tokens - 1,
    )

    outputs = GPT2_model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_total_tokens,
        # GPT-2 defines no pad token; reuse EOS so generate() has an explicit
        # padding id instead of guessing one.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )

    # Drop markers such as <|endoftext|> so they do not leak into the output.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Convert document
def convert_document(image, output_format='json'):
    """Gradio callback: OCR an image and return the result as JSON or XML.

    Args:
        image: Path of the uploaded image, or ``None`` when nothing has been
            uploaded.
        output_format: ``'json'`` or ``'xml'``. A falsy value (for example
            ``None`` from an unselected radio input) falls back to ``'json'``.

    Returns:
        The serialised document string produced by
        ``generate_machine_readable_format``.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable
            message instead of an internal traceback.
    """
    if image is None:
        raise gr.Error("Please upload an image before converting.")

    # Without this fallback a missing selection would skip JSON/XML wrapping
    # entirely and return the raw text.
    output_format = output_format or 'json'

    text = extract_text(image)
    structured_output = generate_structured_output(text)
    return generate_machine_readable_format(structured_output, format_type=output_format)

# Gradio UI: one image upload plus a format selector, wired to convert_document.
iface = gr.Interface(
    fn=convert_document,
    inputs=[
        gr.Image(type="filepath"),
        # Preselect "json" so the callback receives a valid format even when
        # the user never touches the radio buttons.
        gr.Radio(["json", "xml"], value="json", label="Output Format"),
    ],
    outputs="text",
    title="Document OCR and Conversion",
    description="Extracts text from images and converts it into structured JSON/XML format.",
)

iface.launch()