# Spaces: Running
from gradio import Interface, File, Dropdown, Textbox, Slider | |
import json | |
from gliner import GLiNER | |
from doctr.io import DocumentFile | |
from doctr.models import ocr_predictor | |
class DoctrHandler:
    """Run docTR OCR on a PDF or image file and return the recognized text.

    The OCR predictor is built once in ``__init__`` and reused by every
    ``extract_text`` call.
    """

    def __init__(self):
        # Detection/recognition architectures are fixed here; pretrained
        # weights are requested via ``pretrained=True``.
        self.model = ocr_predictor(
            det_arch="fast_base",
            reco_arch="crnn_vgg16_bn",
            pretrained=True,
        )

    def extract_text(self, file_path):
        """Return every recognized word of ``file_path`` as one string.

        Args:
            file_path: Path of the document. A path ending in ``.pdf`` (any
                letter case) is loaded as a PDF; anything else is loaded as
                an image.

        Returns:
            The recognized words in page/block/line/word order, separated by
            single spaces, with surrounding whitespace stripped. An empty
            string when no word was recognized.

        Raises:
            RuntimeError: If loading the document or running OCR fails. The
                original exception is chained as ``__cause__``.
        """
        try:
            # Compare case-insensitively so "SCAN.PDF" is not mistaken for an
            # image file.
            if str(file_path).lower().endswith(".pdf"):
                doc = DocumentFile.from_pdf(file_path)
            else:
                doc = DocumentFile.from_images(file_path)

            result = self.model(doc)

            # Collect the words first and join once instead of growing a
            # string with ``+=`` inside four nested loops.
            words = []
            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        words.extend(word.value for word in line.words)
            return " ".join(words).strip()
        except Exception as exc:
            # Keep the original message format, but chain the cause so the
            # underlying traceback is not lost.
            raise RuntimeError(f"Error during OCR processing: {exc}") from exc
class GlinerHandler:
    """Wrap a pretrained GLiNER model for label-driven entity extraction."""

    def __init__(self):
        # Forwarded to ``GLiNER.from_pretrained`` as ``max_length``.
        self.max_length = 384
        self.model = GLiNER.from_pretrained(
            "urchade/gliner_multi-v2.1",
            max_length=self.max_length,
        )

    def predict_entities(self, text, labels, threshold):
        """Return the entities the model finds in ``text`` for ``labels``.

        Args:
            text: Text to search.
            labels: Entity labels to look for.
            threshold: Passed through to the model as ``threshold``.

        Returns:
            Whatever ``self.model.predict_entities`` returns, or an empty
            list when ``text`` is empty/whitespace-only or ``labels`` is
            empty (the model is not called in that case).
        """
        # Blank text or no labels cannot yield entities; skip the model call
        # (e.g. OCR of a blank page produces an empty string).
        if not text or not text.strip() or not labels:
            return []
        return self.model.predict_entities(text, labels, threshold=threshold)
# Build both model wrappers once at import time; process_file uses these
# module-level instances for every request.
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()
# Entity labels offered as choices in the "Select Entities" dropdown.
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]
def _json_response(message, searched_for, entities):
    """Serialize the response envelope shared by every process_file outcome."""
    payload = {
        "message": message,
        "hits": len(entities),
        "searched_for": searched_for,
        "entities": entities,
    }
    # ensure_ascii=False keeps non-ASCII entity text readable instead of
    # emitting \uXXXX escapes; the parsed JSON value is unchanged.
    return json.dumps(payload, indent=4, ensure_ascii=False)


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """OCR an uploaded document and return the requested entities as JSON.

    Args:
        uploaded_file: The uploaded document, either a path string or an
            object whose ``name`` attribute holds the path. Falsy means no
            file was uploaded.
        selected_entities: Labels picked from the dropdown; may be ``None``
            or empty.
        custom_entities: Comma-separated extra labels; may be ``None`` or
            empty. Blank items are ignored.
        threshold: Confidence threshold forwarded to the NER handler.

    Returns:
        A JSON string (4-space indent) with the keys ``message``, ``hits``,
        ``searched_for`` and ``entities``. Found entities carry ``text``,
        ``label`` and ``confidence`` and are sorted by confidence, highest
        first.

    Raises:
        Exceptions raised by the OCR or NER handlers propagate unchanged.
    """
    # Parse the labels BEFORE validating: a custom string such as " , ,"
    # is truthy but contributes no label at all.
    custom_entity_list = [
        label.strip()
        for label in (custom_entities or "").split(",")
        if label.strip()
    ]
    # ``selected_entities`` may be None when nothing is selected; treat it
    # as empty. dict.fromkeys drops duplicate labels, keeping first-seen order.
    all_entities = list(
        dict.fromkeys([*(selected_entities or []), *custom_entity_list])
    )

    if not all_entities:
        return _json_response(
            "Please select or provide at least one entity to search for", [], []
        )

    if not uploaded_file:
        return _json_response("No file uploaded", [], [])

    # Accept both a plain path string and an object exposing ``.name``.
    if isinstance(uploaded_file, str):
        file_path = uploaded_file
    else:
        file_path = uploaded_file.name

    extracted_text = ocr_handler.extract_text(file_path)

    # No recognized text means there is nothing to label; skip the NER call.
    if extracted_text:
        entities = ner_handler.predict_entities(extracted_text, all_entities, threshold)
    else:
        entities = []

    if not entities:
        return _json_response(
            "No entities were found in the document", all_entities, []
        )

    # Keep only the fields exposed to the user, highest confidence first
    # (sorted() is stable, so ties keep the model's order).
    cleaned_entities = sorted(
        (
            {
                "text": entity["text"],
                "label": entity["label"],
                "confidence": entity["score"],
            }
            for entity in entities
        ),
        key=lambda item: item["confidence"],
        reverse=True,
    )
    return _json_response(
        "Document destroyed successfully!", all_entities, cleaned_entities
    )
# Gradio UI. The four inputs are listed in the same order as the parameters
# of process_file: file, selected entities, custom entities, threshold.
# NOTE(review): the "π" near the end of the description looks like a
# mis-decoded emoji -- confirm the intended character.
iface = Interface(
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
        Textbox(
            label="Custom Entities (comma-separated)",
            placeholder="entity1, entity2, ...",
        ),
        Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.5,
            step=0.1,
            label="Confidence Threshold",
        ),
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
    title="DocDestroyer11000",
    allow_flagging="never",
    # Adjacent literals are concatenated into a single description string.
    description=(
        "Extract valuable information from your documents in a snap! "
        "Upload your PDFs or images, select the entities you care about, "
        "get started now and watch your documents be **destroyed** "
        "(or in other words - turned into JSON)! π"
        "<br>Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ "
        "+ https://huggingface.co/urchade/gliner_multi-v2.1"
    ),
)
# Start the web app when this file is executed.
iface.launch()