# --- Hugging Face file-view header (page residue, not part of the program) ---
# KennethTM's picture
# Update app.py
# ae62ed9 verified
# raw
# history blame
# 4.66 kB
from gradio import Interface, File, Dropdown, Textbox, Slider
import json
from gliner import GLiNER
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
class DoctrHandler:
    """Run OCR on a PDF or image file and return its content as plain text."""

    def __init__(self):
        # One predictor instance is built up front and reused for every call.
        self.model = ocr_predictor(det_arch="fast_base", reco_arch="crnn_vgg16_bn", pretrained=True)

    def extract_text(self, file_path):
        """Return all recognised words of the document as one string.

        Args:
            file_path: Location of the document. Anything whose name ends in
                ``.pdf`` (compared case-insensitively) is loaded as a PDF;
                everything else is loaded as an image.

        Returns:
            The recognised words in reading order (page -> block -> line ->
            word), separated by single spaces, with surrounding whitespace
            stripped. Empty string when nothing was recognised.

        Raises:
            RuntimeError: If loading or OCR fails. The original exception is
                attached as ``__cause__``. ``RuntimeError`` is a subclass of
                ``Exception``, so existing ``except Exception`` handlers
                still catch it.
        """
        try:
            # Lower-case the name so "SCAN.PDF" is not mistaken for an image.
            if str(file_path).lower().endswith(".pdf"):
                doc = DocumentFile.from_pdf(file_path)
            else:
                doc = DocumentFile.from_images(file_path)

            result = self.model(doc)

            # Flatten the page/block/line/word hierarchy lazily and join once,
            # instead of growing a string with repeated concatenation.
            words = (
                word.value
                for page in result.pages
                for block in page.blocks
                for line in block.lines
                for word in line.words
            )
            return " ".join(words).strip()
        except Exception as e:
            # Keep the original traceback reachable through exception chaining.
            raise RuntimeError(f"Error during OCR processing: {e}") from e
class GlinerHandler:
    """Thin wrapper around a pretrained GLiNER model for entity prediction."""

    def __init__(self):
        # The same limit is stored on the handler and handed to the model loader.
        self.max_length = 384
        self.model = GLiNER.from_pretrained(
            "urchade/gliner_multi-v2.1",
            max_length=self.max_length,
        )

    def predict_entities(self, text, labels, threshold):
        """Delegate to the model and return whatever it predicts.

        Args:
            text: Text to search for entities.
            labels: Entity labels to look for.
            threshold: Forwarded to the model as its ``threshold`` keyword.

        Returns:
            The model's prediction result, unchanged.
        """
        return self.model.predict_entities(text, labels, threshold=threshold)
# Entity labels offered as dropdown choices in the UI
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]

# Shared handler instances: both models are constructed once, at import time
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()
def _json_response(message, searched_for, entities):
    """Serialize one result payload in the schema shared by every return path."""
    return json.dumps(
        {
            "message": message,
            "hits": len(entities),
            "searched_for": searched_for,
            "entities": entities,
        },
        indent=4,
        # Keep non-ASCII characters readable in the output textbox; the NER
        # model is multilingual, so such text is expected.
        ensure_ascii=False,
    )


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """OCR an uploaded document and report the entities found in it as JSON.

    Args:
        uploaded_file: The upload from the UI. Either an object exposing the
            file path as ``.name`` or a plain path string. Falsy means
            nothing was uploaded.
        selected_entities: Labels picked from the dropdown; may be empty or
            ``None``.
        custom_entities: Comma-separated extra labels; may be empty or
            ``None``. Blank items are ignored.
        threshold: Confidence threshold forwarded to the NER handler.

    Returns:
        A JSON string (4-space indent) with the keys ``message``, ``hits``,
        ``searched_for`` and ``entities``. ``entities`` is sorted by
        descending confidence; each item has ``text``, ``label`` and
        ``confidence``.

    Raises:
        Exception: Whatever the OCR or NER handler raises is propagated.
    """
    # Parse the custom labels BEFORE validating, so input such as " , " is
    # recognised as "no labels" instead of slipping past the check.
    custom_labels = [
        label.strip() for label in (custom_entities or "").split(",") if label.strip()
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order, so a
    # label that is both selected and typed is only searched once.
    all_entities = list(dict.fromkeys([*(selected_entities or []), *custom_labels]))

    if not all_entities:
        return _json_response(
            "Please select or provide at least one entity to search for", [], []
        )

    if not uploaded_file:
        return _json_response("No file uploaded", [], [])

    # Accept both file-like objects carrying ``.name`` and bare path strings.
    file_path = getattr(uploaded_file, "name", uploaded_file)
    extracted_text = ocr_handler.extract_text(file_path)

    # No recognised text means there is nothing to search, so skip the model.
    if extracted_text:
        entities = ner_handler.predict_entities(extracted_text, all_entities, threshold)
    else:
        entities = []

    if not entities:
        return _json_response("No entities were found in the document", all_entities, [])

    cleaned_entities = sorted(
        (
            {
                "text": entity["text"],
                "label": entity["label"],
                # float() is a no-op for Python floats and guards against
                # non-builtin scalar types that json cannot serialize
                # (NOTE(review): confirm what type the model returns).
                "confidence": float(entity["score"]),
            }
            for entity in entities
        ),
        key=lambda item: item["confidence"],
        reverse=True,
    )

    return _json_response("Document destroyed successfully!", all_entities, cleaned_entities)
# Create Gradio interface
# The input order below must match the positional parameters of process_file:
# (uploaded_file, selected_entities, custom_entities, threshold).
iface = Interface(
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
        Textbox(label="Custom Entities (comma-separated)", placeholder="entity1, entity2, ..."),
        Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.1, label="Confidence Threshold"),
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
    title="DocDestroyer11000",
    # Gradio expects the string mode here; the boolean form is deprecated.
    allow_flagging="never",
    # The rocket emoji is written as an escape so it survives re-encoding.
    description=(
        "Extract valuable information from your documents in a snap! "
        "Upload your PDFs or images, select the entities you care about, "
        "get started now and watch your documents be **destroyed** "
        "(or in other words - turned into JSON)! \U0001F680"
        "<br>Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ + "
        "https://huggingface.co/urchade/gliner_multi-v2.1"
    ),
)
iface.launch()