|
import gradio as gr |
|
import torch |
|
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification |
|
import pytesseract |
|
import os |
|
|
|
|
|
# Tell pytesseract which Tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Startup diagnostics: report the Tesseract version, the configured binary
# path and the process PATH. Any failure here is reported on stdout and does
# not stop the app from starting.
try:
    tesseract_version = pytesseract.get_tesseract_version()
    print(f"Tesseract Version: {tesseract_version}")
    print(f"Tesseract Path: {pytesseract.pytesseract.tesseract_cmd}")
    print(f"Environment PATH: {os.environ['PATH']}")
except Exception as err:
    print(f"Tesseract Debugging Error: {err}")
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the processor and the token-classification model from the same
# checkpoint, then place the model on the chosen device in evaluation mode.
processor = LayoutLMv3Processor.from_pretrained("quadranttechnologies/Table_OCR")
model = LayoutLMv3ForTokenClassification.from_pretrained("quadranttechnologies/Table_OCR")
model.to(device)
model.eval()
|
|
|
def process_image(image):
    """Classify the tokens found in an image and return them as JSON-ready data.

    The image is encoded with the module-level ``processor``, run through the
    module-level ``model`` without gradient tracking, and every non-special
    token is reported with its bounding box and predicted label.

    Args:
        image: The image handed over by the Gradio input component, or
            ``None`` when nothing was uploaded.

    Returns:
        On success, a list of dicts with the keys ``"word"`` (decoded token
        text), ``"bounding_box"`` (the ``bbox`` entry the processor produced
        for that token) and ``"label"`` (the predicted class name looked up
        in ``model.config.id2label``).
        On failure, a dict ``{"error": <message>}``.
    """
    # Submitting the form without an upload passes None; answer with a clear
    # message instead of an opaque failure from inside the processor.
    if image is None:
        return {"error": "No image provided."}

    try:
        encoding = processor(
            image,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        # Move every tensor to the device the model lives on.
        encoding = {key: val.to(device) for key, val in encoding.items()}

        with torch.no_grad():
            outputs = model(**encoding)
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Select the single batch item explicitly; a bare squeeze() would
        # collapse any other size-1 axis as well.
        token_ids = encoding["input_ids"][0].tolist()
        bboxes = encoding["bbox"][0].tolist()
        labels = predictions[0].tolist()

        # Padding to max_length fills the sequence with special tokens that
        # decode to non-empty placeholder text; they are not document content
        # and must be excluded from the result.
        special_ids = set(processor.tokenizer.all_special_ids)
        decode = processor.tokenizer.decode
        id2label = model.config.id2label

        structured_output = []
        for token_id, bbox, label in zip(token_ids, bboxes, labels):
            if token_id in special_ids:
                continue
            word = decode([token_id]).strip()
            if word:
                structured_output.append({
                    "word": word,
                    "bounding_box": bbox,
                    "label": id2label[label],
                })

        return structured_output

    except Exception as e:
        # Top-level boundary for the UI callback: log the problem and return
        # it as JSON rather than letting the request crash.
        print("Error during processing:", str(e))
        return {"error": str(e)}
|
|
|
|
|
# Gradio front end: a PIL image goes in, the JSON produced by process_image
# comes out.
interface = gr.Interface(
    title="Table OCR",
    description=(
        "Upload an image (e.g., receipt or document) "
        "to extract structured information in JSON format."
    ),
    inputs=gr.Image(type="pil"),
    outputs="json",
    fn=process_image,
)
|
|
|
|
|
def _launch_app():
    """Print the startup banner and launch the Gradio interface with share=True."""
    print("Starting Table OCR App...")
    interface.launch(share=True)


# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    _launch_app()
|
|
|
|
|
|
|
|
|
|
|
|