import gradio as gr
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pretrained processor and token-classification model
processor = LayoutLMv3Processor.from_pretrained("quadranttechnologies/Table_OCR")
model = LayoutLMv3ForTokenClassification.from_pretrained("quadranttechnologies/Table_OCR")

# Place the model on the chosen device and switch it to evaluation mode
model.to(device)
model.eval()

def process_image(image):
    """Classify the tokens found in one image and return them as records.

    The image is run through the module-level ``processor``; the resulting
    encoding is fed to ``model`` and each real (non-special) token is
    reported together with its bounding box and predicted label name.

    Args:
        image: The image to analyse (the Gradio input below is configured
            with ``type="pil"``). ``None`` -- e.g. a submission with no
            image attached -- is accepted and yields an empty result.

    Returns:
        A list of dicts, one per kept token, with the keys ``"word"``
        (decoded token text), ``"bounding_box"`` (that token's entry from
        the encoding's ``bbox`` tensor) and ``"label"`` (name looked up in
        ``model.config.id2label``). Empty when ``image`` is ``None``.
    """
    # Nothing to analyse; return an empty result instead of crashing in the processor
    if image is None:
        return []

    # Preprocess the image using the processor
    encoding = processor(image, return_tensors="pt", truncation=True, padding="max_length", max_length=512)

    # Move inputs to the same device as the model
    encoding = {key: val.to(device) for key, val in encoding.items()}

    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = model(**encoding)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # Select the single batch item explicitly; a bare squeeze() would also
    # collapse any other dimension that happens to have size 1.
    token_ids = encoding["input_ids"][0].tolist()
    bboxes = encoding["bbox"][0].tolist()
    label_ids = predictions[0].tolist()

    tokenizer = processor.tokenizer
    id2label = model.config.id2label
    # The sequence is padded to max_length, so most positions are <pad>
    # (plus sequence start/end markers). Those are not words from the
    # image and must not be reported.
    special_ids = set(tokenizer.all_special_ids)

    structured_output = []
    for token_id, bbox, label_id in zip(token_ids, bboxes, label_ids):
        if token_id in special_ids:
            continue
        # Decode the token ID to text
        word = tokenizer.decode([token_id]).strip()
        if word:  # Avoid adding empty words
            structured_output.append({
                "word": word,
                "bounding_box": bbox,
                "label": id2label[label_id],  # Convert label ID to label name
            })

    return structured_output

# Build the web UI: one image in (handed to the function as a PIL image),
# the structured token list out, rendered as JSON.
interface = gr.Interface(fn=process_image, inputs=gr.Image(type="pil"), outputs="json")

# Start the app only when this file is executed directly, not on import
if __name__ == "__main__":
    interface.launch(share=False)