# Source: Hugging Face Space file "app.py" (uploaded by DeepDiveDev)
# Commit: ddf7acc (verified) - "Update app.py"
# File size: 2.26 kB
import torch
import cv2
import json
import xml.etree.ElementTree as ET
import gradio as gr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, GPT2LMHeadModel, GPT2Tokenizer
# Load the OCR components (TrOCR): the processor and the encoder-decoder model
# both come from the "microsoft/trocr-base-handwritten" checkpoint. These
# statements run at module import time; extract_text uses both objects.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
# Load the GPT-2 language model and its tokenizer from the "gpt2" checkpoint;
# generate_structured_output uses both objects.
GPT2_model = GPT2LMHeadModel.from_pretrained("gpt2")
GPT2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Image preprocessing
def preprocess_image(image_path):
    """Load an image from disk and return it converted to grayscale.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The result of converting the loaded BGR image with
        ``cv2.COLOR_BGR2GRAY``.

    Raises:
        ValueError: If no path is given, or OpenCV cannot decode the file.
        FileNotFoundError: If ``image_path`` does not point to a file.
    """
    import os  # local import: the file's import header does not provide os

    if not image_path:
        raise ValueError("No image path was provided.")
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not decode image file: {image_path}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Extract text using TrOCR (instead of Tesseract)
def extract_text(image_path):
    """Run TrOCR on the image at *image_path* and return the decoded text.

    Args:
        image_path: Filesystem path of the image to recognise.

    Returns:
        The first decoded sequence produced by the TrOCR model, with
        special tokens removed.
    """
    image = preprocess_image(image_path)
    # preprocess_image returns a single-channel (2-D) array, but the TrOCR
    # image processor expects 3-channel input, so replicate the gray channel.
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]
# Generate structured format (JSON/XML)
def generate_machine_readable_format(text, format_type='json'):
    """Wrap *text* in a minimal JSON or XML envelope.

    Args:
        text: The content to wrap.
        format_type: ``'json'`` for a one-key JSON object, ``'xml'`` for a
            ``Document``/``Content`` element pair. Any other value leaves
            the text unwrapped.

    Returns:
        The serialised document as a string, or *text* itself when
        *format_type* is not recognised.
    """
    if format_type == 'json':
        payload = {"content": text}
        return json.dumps(payload)

    if format_type != 'xml':
        # Unknown format: hand the text back untouched.
        return text

    document = ET.Element("Document")
    ET.SubElement(document, "Content").text = text
    return ET.tostring(document, encoding='unicode')
# GPT-2 for structured output
def generate_structured_output(text):
    """Feed *text* to GPT-2 as a prompt and return the generated string.

    Args:
        text: Prompt text (here, the OCR result).

    Returns:
        The decoded GPT-2 output (prompt plus continuation) with special
        tokens removed, or an empty string when *text* is empty or only
        whitespace.
    """
    # An empty prompt tokenises to zero tokens, which gives generate()
    # nothing to condition on; skip the model in that case.
    if not text or not text.strip():
        return ""

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs to treat padding correctly.
    encoded = GPT2_tokenizer(text, return_tensors="pt")
    outputs = GPT2_model.generate(
        **encoded,
        max_length=500,
        # GPT-2 defines no pad token; reuse EOS so generate() can pad.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )
    # Drop markers such as "<|endoftext|>" so they do not leak into the
    # JSON/XML output; this matches how extract_text decodes.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Convert document
def convert_document(image, output_format='json'):
    """Run OCR on an image and return the result as JSON or XML text.

    The image is read with extract_text, the recognised text is passed
    through generate_structured_output, and the result is wrapped by
    generate_machine_readable_format.

    Args:
        image: Filesystem path of the uploaded image.
        output_format: ``'json'`` or ``'xml'``. ``None`` is treated as
            ``'json'``.

    Returns:
        The serialised document as a string.

    Raises:
        gr.Error: If no image was supplied.
    """
    if not image:
        # Report a readable message in the UI instead of failing inside OpenCV.
        raise gr.Error("Please upload an image before converting.")
    # The UI passes None when no radio option is selected, which bypasses the
    # parameter default; fall back to JSON explicitly.
    if output_format is None:
        output_format = 'json'

    text = extract_text(image)
    structured_output = generate_structured_output(text)
    return generate_machine_readable_format(
        structured_output, format_type=output_format
    )
# Gradio UI
# Expose convert_document through a web UI: an image input (handed to the
# callback as a file path) plus an output-format selector, with the result
# shown as plain text.
iface = gr.Interface(
    fn=convert_document,
    inputs=[
        gr.Image(type="filepath"),
        # Preselect "json" so the callback receives a real format rather than
        # None when the user never touches the selector.
        gr.Radio(["json", "xml"], value="json", label="Output Format"),
    ],
    outputs="text",
    title="Document OCR and Conversion",
    description="Extracts text from images and converts it into structured JSON/XML format.",
)
iface.launch()