# Hugging Face Space file view (Space status: Sleeping) - file size: 2,649 bytes, revision 300310b.
import torch
import pytesseract
import cv2
import json
import xml.etree.ElementTree as ET
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from layoutparser import Detectron2LayoutModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from stable_baselines3 import PPO
# Load OCR model: the TrOCR processor and encoder-decoder weights are fetched
# once, at import time, from the same handwritten-text checkpoint.
# NOTE(review): `processor` and `model` are not referenced by the functions
# visible in this file (extract_text uses pytesseract) - confirm they are needed.
_TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
def preprocess_image(image_path):
    """Load an image from disk and convert it to grayscale.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The grayscale image produced by ``cv2.cvtColor`` with
        ``cv2.COLOR_BGR2GRAY``.

    Raises:
        OSError: If OpenCV could not read the file (for example because it is
            missing, unreadable, or not a supported image format).
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; fail
    # here with a clear message instead of an opaque error inside cvtColor.
    if image is None:
        raise OSError(f"Could not read image: {image_path!r}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
def extract_text(image_path):
    """Run Tesseract OCR on the grayscale version of an image.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the image
        produced by ``preprocess_image``.
    """
    grayscale = preprocess_image(image_path)
    return pytesseract.image_to_string(grayscale)
_LAYOUT_MODEL_CONFIG = "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config"
_layout_model = None


def _get_layout_model():
    """Return the shared layout model, building it on first use."""
    global _layout_model
    if _layout_model is None:
        _layout_model = Detectron2LayoutModel(_LAYOUT_MODEL_CONFIG)
    return _layout_model


def analyze_layout(image_path):
    """Detect layout regions in an image with the PubLayNet Mask R-CNN model.

    The model is constructed once and reused, instead of being rebuilt on
    every call.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        Whatever ``Detectron2LayoutModel.detect`` returns for the image.

    Raises:
        OSError: If OpenCV could not read the file.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None on failure; do not hand None to the detector.
    if image is None:
        raise OSError(f"Could not read image: {image_path!r}")
    return _get_layout_model().detect(image)
def generate_machine_readable_format(text, format_type='json'):
    """Serialise text as JSON or XML.

    Args:
        text: The content to wrap.
        format_type: ``'json'`` for a ``{"content": ...}`` JSON object,
            ``'xml'`` for a ``<Document><Content>...</Content></Document>``
            string. Any other value returns ``text`` unchanged.

    Returns:
        The serialised string, or ``text`` itself for an unrecognised format.
    """
    if format_type == 'json':
        payload = {"content": text}
        return json.dumps(payload)

    if format_type == 'xml':
        document = ET.Element("Document")
        ET.SubElement(document, "Content").text = text
        return ET.tostring(document, encoding='unicode')

    # Unknown formats fall through and hand the text back as-is.
    return text
# Generative AI Model: GPT-2 weights and tokenizer, loaded once at import
# time from the same checkpoint name.
_GPT2_CHECKPOINT = "gpt2"
GPT2_model = GPT2LMHeadModel.from_pretrained(_GPT2_CHECKPOINT)
GPT2_tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_CHECKPOINT)
def generate_structured_output(text, max_new_tokens=500):
    """Continue ``text`` with GPT-2 and return the decoded result.

    The previous implementation capped the *total* length (prompt plus
    continuation) at 500 tokens, so a prompt of 500 tokens or more left no
    room to generate, and a prompt longer than the model's context window
    could not be processed at all. The prompt is now truncated to fit the
    context window and the generation budget is counted separately.

    Args:
        text: Prompt text (here, the OCR output).
        max_new_tokens: Maximum number of tokens to generate after the
            prompt. Clamped so that at least one prompt token always fits.

    Returns:
        The decoded prompt plus continuation, with special tokens removed.
        If the prompt tokenises to nothing, ``text`` is returned unchanged.
    """
    context_window = GPT2_model.config.n_positions
    max_new_tokens = max(1, min(max_new_tokens, context_window - 1))
    # Leave room for the continuation; prompt tokens beyond this are dropped.
    prompt_budget = context_window - max_new_tokens

    inputs = GPT2_tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=prompt_budget,
    )
    if inputs.shape[-1] == 0:
        # Nothing to condition on (e.g. empty OCR result).
        return text

    outputs = GPT2_model.generate(
        inputs,
        max_new_tokens=max_new_tokens,
        # GPT-2 defines no pad token; reuse EOS so generate() has one.
        pad_token_id=GPT2_tokenizer.eos_token_id,
    )
    # Drop markers such as the end-of-text token so they do not end up in the
    # serialised document.
    return GPT2_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Reinforcement Learning for Optimization
class DocumentConversionEnv:
    """Toy environment with two string states, "start" and "optimized".

    NOTE(review): this class declares no observation_space/action_space and
    does not subclass a Gym environment, which Stable-Baselines3 documents as
    required for custom environments - confirm it works with the PPO call
    elsewhere in this file.
    """

    def __init__(self):
        # No state until reset() is called.
        self.state = None

    def reset(self):
        """Set the state to "start" and return it."""
        self.state = "start"
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        The action "optimize" moves to "optimized" with reward 1; any other
        action moves to "start" with reward -1. ``done`` is always False and
        ``info`` is always an empty dict.
        """
        if action == "optimize":
            self.state, reward = "optimized", 1
        else:
            self.state, reward = "start", -1
        return self.state, reward, False, {}
# Build the environment and train a PPO agent on it at import time.
# NOTE(review): DocumentConversionEnv as defined in this file is a plain class
# without observation_space/action_space; verify that PPO accepts it.
_RL_TRAINING_TIMESTEPS = 1000
env = DocumentConversionEnv()
rl_model = PPO(policy="MlpPolicy", env=env, verbose=1)
rl_model.learn(total_timesteps=_RL_TRAINING_TIMESTEPS)
def convert_document(image_path, output_format='json'):
    """Convert a document image into a machine-readable string.

    Pipeline: OCR the image, run layout detection, continue the OCR text with
    GPT-2, then serialise the generated text.

    Args:
        image_path: Path of the document image.
        output_format: Passed to ``generate_machine_readable_format`` as
            ``format_type``.

    Returns:
        The string produced by ``generate_machine_readable_format``.
    """
    ocr_text = extract_text(image_path)
    # Layout detection still runs, but its result is not used in the output.
    analyze_layout(image_path)
    generated = generate_structured_output(ocr_text)
    return generate_machine_readable_format(generated, format_type=output_format)
# Example usage: convert a sample image and print the JSON result.
# Runs at import time, like the rest of the module-level code in this file.
document_path = "sample_document.png"
converted_document = convert_document(
    image_path=document_path,
    output_format='json',
)
print(converted_document)
|