"""Gradio app: OCR an image with TrOCR, extend the text with GPT-2, emit JSON/XML."""

import json
import xml.etree.ElementTree as ET

import cv2
import gradio as gr
import torch  # not referenced directly below; kept because the original file imports it
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TrOCRProcessor,
    VisionEncoderDecoderModel,
)

# Load OCR model (TrOCR)
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Load GPT-2 model
GPT2_model = GPT2LMHeadModel.from_pretrained("gpt2")
GPT2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


# Image preprocessing
def preprocess_image(image_path):
    """Read the image at *image_path* and return it as a grayscale array.

    Args:
        image_path: Filesystem path of the image to load.

    Returns:
        The single-channel (grayscale) image produced by OpenCV.

    Raises:
        ValueError: If no path was given or OpenCV could not read the file.
    """
    if not image_path:
        raise ValueError("No image was provided.")

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


# Extract text using TrOCR (instead of Tesseract)
def extract_text(image_path):
    """Run TrOCR on the image at *image_path* and return the decoded text.

    Args:
        image_path: Filesystem path of the image to recognise.

    Returns:
        The first decoded sequence, with special tokens removed.

    Raises:
        ValueError: Propagated from ``preprocess_image`` for a missing or
            unreadable image.
    """
    gray = preprocess_image(image_path)
    # The processor is documented for RGB input; replicate the gray channel
    # so the array has an explicit 3-channel axis.
    # NOTE(review): confirm against the installed transformers version.
    rgb = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)

    pixel_values = processor(rgb, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


# Generate structured format (JSON/XML)
def generate_machine_readable_format(text, format_type='json'):
    """Wrap *text* in a JSON object or an XML document.

    Args:
        text: The content to serialise.
        format_type: ``'json'`` or ``'xml'``.

    Returns:
        For ``'json'``, a JSON string ``{"content": text}``; for ``'xml'``, a
        ``<Document><Content>...</Content></Document>`` string. Any other
        *format_type* returns *text* unchanged.
    """
    if format_type == 'json':
        return json.dumps({"content": text})

    if format_type == 'xml':
        root = ET.Element("Document")
        content = ET.SubElement(root, "Content")
        content.text = text
        return ET.tostring(root, encoding='unicode')

    # Unknown format: fall back to the raw text.
    return text


# GPT-2 for structured output
def generate_structured_output(text):
    """Feed *text* to GPT-2 as a prompt and return the decoded generation.

    The returned string is the prompt followed by whatever GPT-2 generates,
    capped at 500 tokens in total (prompt included).

    Args:
        text: Prompt text, normally the OCR result.

    Returns:
        The decoded sequence without special tokens. Blank input is returned
        unchanged because it would tokenize to an empty prompt.
    """
    if not text or not text.strip():
        return text

    encoded = GPT2_tokenizer(text, return_tensors="pt")
    outputs = GPT2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=500,
        # GPT-2 has no pad token; name the EOS token explicitly rather than
        # relying on generate()'s implicit fallback.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )
    # Skip special tokens so "<|endoftext|>" does not end up in the payload.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)


# Convert document
def convert_document(image, output_format='json'):
    """Gradio callback: OCR *image*, pass the text through GPT-2, serialise it.

    Args:
        image: Filesystem path of the uploaded image.
        output_format: ``'json'`` or ``'xml'``.

    Returns:
        The serialised string from ``generate_machine_readable_format``.

    Raises:
        gr.Error: If the image is missing or unreadable, so the UI shows the
            message instead of a bare traceback.
    """
    try:
        text = extract_text(image)
    except ValueError as err:
        raise gr.Error(str(err)) from err

    structured_output = generate_structured_output(text)
    return generate_machine_readable_format(
        structured_output, format_type=output_format
    )


# Gradio UI
iface = gr.Interface(
    fn=convert_document,
    inputs=[
        gr.Image(type="filepath"),
        # Preselect "json" so the callback never receives None for the format.
        gr.Radio(["json", "xml"], value="json", label="Output Format"),
    ],
    outputs="text",
    title="Document OCR and Conversion",
    description="Extracts text from images and converts it into structured JSON/XML format.",
)

iface.launch()