Spaces:
Sleeping
Sleeping
File size: 2,257 Bytes
300310b ddf7acc 300310b ddf7acc 300310b ddf7acc 300310b ddf7acc 300310b ddf7acc 300310b ddf7acc 300310b ddf7acc 300310b ddf7acc 300310b ddf7acc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import torch
import cv2
import json
import xml.etree.ElementTree as ET
import gradio as gr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, GPT2LMHeadModel, GPT2Tokenizer
# Checkpoint identifiers, named once so the processor/model pair cannot drift apart.
_TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
_GPT2_CHECKPOINT = "gpt2"

# TrOCR: the processor prepares pixel inputs and decodes token ids; the
# encoder-decoder model generates those ids.
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)

# GPT-2: language model plus its tokenizer, used for text continuation.
GPT2_model = GPT2LMHeadModel.from_pretrained(_GPT2_CHECKPOINT)
GPT2_tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_CHECKPOINT)
# Image preprocessing
def preprocess_image(image_path):
    """Load an image from disk and convert it to grayscale.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The image returned by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Raises:
        ValueError: If ``image_path`` is empty/``None`` or the file cannot be
            decoded as an image.
    """
    if not image_path:
        raise ValueError("No image path provided")

    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the problem would surface as an opaque error inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Extract text using TrOCR (instead of Tesseract)
def extract_text(image_path):
    """Recognise the text in the image at ``image_path`` using TrOCR.

    Args:
        image_path: Filesystem path of the image; forwarded to
            ``preprocess_image``.

    Returns:
        The first string decoded from the generated token ids, with special
        tokens skipped.
    """
    image = preprocess_image(image_path)
    # preprocess_image yields a 2-D grayscale array, while the TrOCR image
    # processor is documented for 3-channel RGB input. Replicate the single
    # channel so the array has the (height, width, 3) layout it expects.
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# Generate structured format (JSON/XML)
def generate_machine_readable_format(text, format_type='json'):
    """Wrap ``text`` in a minimal JSON or XML envelope.

    Args:
        text: The string to wrap.
        format_type: ``'json'`` yields a JSON object with a single
            ``"content"`` key; ``'xml'`` yields a ``<Document>`` element
            holding one ``<Content>`` child. Any other value leaves the
            text untouched.

    Returns:
        The serialized string, or ``text`` itself for an unrecognised
        ``format_type``.
    """
    if format_type == 'xml':
        document = ET.Element("Document")
        ET.SubElement(document, "Content").text = text
        return ET.tostring(document, encoding='unicode')

    if format_type == 'json':
        payload = {"content": text}
        return json.dumps(payload)

    # Unknown format: pass the text through unchanged.
    return text
# GPT-2 for structured output
def generate_structured_output(text):
    """Continue ``text`` with GPT-2 and return the decoded sequence.

    Args:
        text: Prompt text (here, the OCR result).

    Returns:
        The decoded prompt-plus-continuation with special tokens removed, or
        an empty string when ``text`` is empty.
    """
    # An empty prompt tokenizes to a zero-length tensor, which generate()
    # cannot extend; there is nothing to continue, so return early.
    if not text:
        return ""

    max_length = 500
    # max_length counts prompt tokens too, so cap the prompt one token short
    # of it to leave room for at least one generated token.
    inputs = GPT2_tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length - 1,
    )
    outputs = GPT2_model.generate(
        inputs,
        max_length=max_length,
        # GPT-2 defines no pad token; reuse EOS so generate() has an explicit
        # padding id.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )
    # Skip special tokens so the end-of-text marker does not leak into the
    # document content.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Convert document
def convert_document(image, output_format='json'):
    """Run the full pipeline: OCR, GPT-2 continuation, then serialization.

    Args:
        image: Filesystem path of the uploaded image.
        output_format: ``'json'`` or ``'xml'``. ``None`` is treated as
            ``'json'``.

    Returns:
        The string produced by ``generate_machine_readable_format`` for the
        GPT-2 output.

    Raises:
        ValueError: If no image was supplied.
    """
    if not image:
        raise ValueError("No image provided")

    # A UI radio with nothing selected delivers None explicitly, which would
    # bypass the signature default and fall through to raw-text output.
    if output_format is None:
        output_format = 'json'

    text = extract_text(image)
    structured_output = generate_structured_output(text)
    return generate_machine_readable_format(structured_output, format_type=output_format)
# Gradio UI
# Web UI: one image upload (handed to the callback as a file path) and a
# format selector, with the result rendered as plain text.
iface = gr.Interface(
    fn=convert_document,
    inputs=[
        gr.Image(type="filepath"),
        # Preselect "json" so the callback never receives None for the format
        # when the user submits without clicking an option.
        gr.Radio(["json", "xml"], value="json", label="Output Format"),
    ],
    outputs="text",
    title="Document OCR and Conversion",
    description="Extracts text from images and converts it into structured JSON/XML format."
)
iface.launch()
|