File size: 2,257 Bytes
300310b
 
 
 
ddf7acc
 
300310b
ddf7acc
300310b
 
 
ddf7acc
 
 
 
 
300310b
 
 
 
 
ddf7acc
300310b
 
ddf7acc
 
 
300310b
 
ddf7acc
300310b
 
 
 
 
 
 
 
 
 
ddf7acc
300310b
 
 
 
 
ddf7acc
 
 
300310b
 
 
 
ddf7acc
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import torch
import cv2
import json
import xml.etree.ElementTree as ET
import gradio as gr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint identifiers passed to `from_pretrained` below.
TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
GPT2_CHECKPOINT = "gpt2"

# OCR stage: the TrOCR processor and its vision encoder-decoder model are
# loaded from the same checkpoint.
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

# Text-generation stage: GPT-2 tokenizer and language-model head.
GPT2_tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
GPT2_model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)

# Image preprocessing
def preprocess_image(image_path):
    """Load an image from disk and convert it to grayscale.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The grayscale image produced by ``cv2.cvtColor`` with
        ``cv2.COLOR_BGR2GRAY``.

    Raises:
        ValueError: If ``image_path`` is empty/``None``, or if the file
            cannot be read as an image.
    """
    if not image_path:
        raise ValueError("No image path was provided.")

    # cv2.imread reports failure by returning None rather than raising, so
    # check explicitly instead of letting cvtColor fail with an opaque error.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Extract text using TrOCR (instead of Tesseract)
def extract_text(image_path):
    """Run TrOCR on the image at ``image_path`` and return the decoded text.

    Args:
        image_path: Filesystem path of the image to transcribe.

    Returns:
        The first sequence decoded by ``processor.batch_decode`` with
        special tokens skipped.
    """
    image = preprocess_image(image_path)

    # preprocess_image yields a 2-D grayscale array, but the TrOCR processor
    # works on 3-channel RGB images. Replicate the gray channel so the
    # processor receives the layout it expects.
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Generate structured format (JSON/XML)
def generate_machine_readable_format(text, format_type='json'):
    """Wrap ``text`` in a JSON or XML envelope.

    Args:
        text: The string to embed.
        format_type: ``'json'`` yields a JSON object with a single
            ``"content"`` key; ``'xml'`` yields a ``Document`` element
            holding one ``Content`` child. Any other value returns
            ``text`` unchanged.

    Returns:
        The serialized string, or ``text`` itself for an unrecognized
        ``format_type``.
    """
    if format_type == 'json':
        payload = {"content": text}
        return json.dumps(payload)

    if format_type != 'xml':
        # Unknown format: pass the text through untouched.
        return text

    document = ET.Element("Document")
    ET.SubElement(document, "Content").text = text
    return ET.tostring(document, encoding='unicode')

# GPT-2 for structured output
def generate_structured_output(text):
    """Continue ``text`` with GPT-2 and return the decoded sequence.

    Args:
        text: Prompt string handed to the GPT-2 tokenizer.

    Returns:
        The decoded generation (prompt plus continuation) with special
        tokens such as the end-of-text marker removed. An empty ``text``
        is returned unchanged.
    """
    # An empty prompt encodes to zero tokens, leaving generate() nothing to
    # condition on; hand it back instead of failing inside the model.
    if not text:
        return text

    max_length = 500  # total sequence budget (prompt + continuation)

    # Truncate the prompt so it always leaves room under max_length.
    encoded = GPT2_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length - 1,
    )
    outputs = GPT2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        # GPT-2 defines no pad token; reuse EOS so padding is explicit.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )
    # Skip special tokens so the end-of-text marker does not end up in the
    # JSON/XML payload built from this string.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Convert document
def convert_document(image, output_format='json'):
    """Run the full pipeline: OCR, GPT-2 continuation, then serialization.

    Args:
        image: Path of the image to process (passed to ``extract_text``).
        output_format: ``'json'`` or ``'xml'``. A falsy value such as
            ``None`` is treated as ``'json'``.

    Returns:
        The string produced by ``generate_machine_readable_format``.

    Raises:
        ValueError: If no image was supplied.
    """
    if not image:
        raise ValueError("No image was provided.")

    # The format selector in the UI has no preselected value, so None can
    # arrive here; fall back to the default instead of emitting raw text.
    format_type = output_format or 'json'

    text = extract_text(image)
    structured_output = generate_structured_output(text)
    return generate_machine_readable_format(structured_output, format_type=format_type)

# Gradio UI
# Input widgets: the uploaded image is handed to the callback as a file
# path, and a radio group selects the output format.
image_input = gr.Image(type="filepath")
format_input = gr.Radio(["json", "xml"], label="Output Format")

# Wire the widgets to convert_document; the result is shown as plain text.
iface = gr.Interface(
    fn=convert_document,
    inputs=[image_input, format_input],
    outputs="text",
    title="Document OCR and Conversion",
    description="Extracts text from images and converts it into structured JSON/XML format.",
)

iface.launch()