Spaces:

KennethTM
/

DocDestroyer11000

Running

File size: 4,666 Bytes

from gradio import Interface, File, Dropdown, Textbox, Slider
import json
from gliner import GLiNER
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

class DoctrHandler:
    """Thin wrapper around a docTR OCR predictor that returns plain text."""

    def __init__(self):
        # Pretrained detection + recognition pipeline; built once and reused
        # for every call to extract_text().
        self.model = ocr_predictor(det_arch="fast_base", reco_arch="crnn_vgg16_bn", pretrained=True)

    @staticmethod
    def _is_pdf(file_path):
        """Return True when *file_path* has a ``.pdf`` extension (any letter case)."""
        # Lower-casing matters: uploads named "SCAN.PDF" must not be routed to
        # the image loader.
        return str(file_path).lower().endswith(".pdf")

    @staticmethod
    def _join_words(result):
        """Flatten an OCR result (pages > blocks > lines > words) into one string.

        Words are emitted in traversal order and separated by single spaces.
        """
        words = []
        for page in result.pages:
            for block in page.blocks:
                for line in block.lines:
                    words.extend(word.value for word in line.words)
        return " ".join(words).strip()

    def extract_text(self, file_path):
        """Run OCR on a PDF or image file and return the recognized text.

        Args:
            file_path: Path of the document. A ``.pdf`` extension (case
                insensitive) selects the PDF loader; anything else is loaded
                as an image.

        Returns:
            All recognized words joined by single spaces, without leading or
            trailing whitespace.

        Raises:
            Exception: If loading the document or running OCR fails. The
                original error is attached as ``__cause__``.
        """
        try:
            # Handle both PDF and image files
            if self._is_pdf(file_path):
                doc = DocumentFile.from_pdf(file_path)
            else:
                doc = DocumentFile.from_images(file_path)
            # Perform OCR and flatten the structured result to text
            return self._join_words(self.model(doc))
        except Exception as e:
            # Same exception type and message as before; chaining keeps the
            # underlying traceback available for debugging.
            raise Exception(f"Error during OCR processing: {str(e)}") from e

class GlinerHandler:
    """Holds a pretrained GLiNER model and exposes entity prediction."""

    def __init__(self):
        # The limit is stored on the instance and also passed to the loader.
        self.max_length = 384
        self.model = GLiNER.from_pretrained(
            "urchade/gliner_multi-v2.1",
            max_length=self.max_length,
        )

    def predict_entities(self, text, labels, threshold):
        """Delegate to the model's ``predict_entities`` and return its result.

        Args:
            text: Text to search for entities.
            labels: Entity labels to look for.
            threshold: Forwarded to the model as the ``threshold`` keyword.

        Returns:
            Whatever the underlying model returns, unchanged.
        """
        return self.model.predict_entities(text, labels, threshold=threshold)

# Module-level singletons: both handlers are constructed once at import time
# and shared by every request.
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()

# Entity labels offered in the UI dropdown.
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]

def _json_response(message, searched_for, entities):
    """Serialize the response payload shown in the output textbox.

    ``hits`` is always derived from ``entities`` so the two cannot disagree.
    """
    payload = {
        "message": message,
        "hits": len(entities),
        "searched_for": searched_for,
        "entities": entities,
    }
    # ensure_ascii=False keeps non-ASCII entity text readable instead of
    # emitting \uXXXX escapes; the JSON content itself is unchanged.
    return json.dumps(payload, indent=4, ensure_ascii=False)


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """Run OCR and entity extraction on an uploaded document.

    Args:
        uploaded_file: The upload handed over by the UI. Either an object
            whose ``name`` attribute is the file path, or the path itself.
            A falsy value means nothing was uploaded.
        selected_entities: Labels picked from the predefined list; may be
            ``None`` or empty.
        custom_entities: Comma-separated extra labels typed by the user; may
            be ``None`` or empty.
        threshold: Minimum confidence forwarded to the NER handler.

    Returns:
        A JSON string (4-space indent) with the keys ``message``, ``hits``,
        ``searched_for`` and ``entities``. Entities are sorted by confidence,
        highest first.

    Raises:
        Exception: Propagated from the OCR handler when the document cannot
            be processed.
    """
    # Convert custom entities string to list and clean whitespace
    custom_entity_list = [
        part.strip() for part in (custom_entities or "").split(",") if part.strip()
    ]

    # Combine default and custom entities. dict.fromkeys removes duplicates
    # while keeping first-seen order, and `or []` tolerates a None selection.
    all_entities = list(dict.fromkeys([*(selected_entities or []), *custom_entity_list]))

    # Input validation: done on the parsed labels so that input such as " , "
    # is rejected rather than producing an empty search.
    if not all_entities:
        return _json_response(
            "Please select or provide at least one entity to search for", [], []
        )

    # Handle no file uploaded
    if not uploaded_file:
        return _json_response("No file uploaded", [], [])

    # Accept both a file-like wrapper exposing `.name` and a plain path string.
    file_path = getattr(uploaded_file, "name", uploaded_file)

    # Perform OCR on the uploaded file
    extracted_text = ocr_handler.extract_text(file_path)

    # Perform NER on the extracted text with threshold; with no recognized
    # text there is nothing to search, so the model call is skipped.
    if extracted_text and extracted_text.strip():
        entities = ner_handler.predict_entities(extracted_text, all_entities, threshold)
    else:
        entities = []

    if not entities:
        return _json_response("No entities were found in the document", all_entities, [])

    # Keep only the fields exposed to the user; float() guarantees the score
    # is a JSON-serializable Python number.
    cleaned_entities = [
        {
            "text": entity["text"],
            "label": entity["label"],
            "confidence": float(entity["score"]),
        }
        for entity in entities
    ]

    # Sort by confidence score in descending order
    cleaned_entities.sort(key=lambda item: item["confidence"], reverse=True)

    return _json_response("Document destroyed successfully!", all_entities, cleaned_entities)


# Create Gradio interface: one file upload, a multi-select of the predefined
# labels, a free-text field for extra labels and a confidence slider, wired to
# process_file(). The result is shown as JSON text.
iface = Interface(
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
        Textbox(label="Custom Entities (comma-separated)", placeholder="entity1, entity2, ..."),
        Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.1, label="Confidence Threshold"),
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
    title="DocDestroyer11000",
    allow_flagging="never",
    # The description is rendered as rich text, hence the ** and <br> markup.
    description=(
        "Extract valuable information from your documents in a snap! "
        "Upload your PDFs or images, select the entities you care about. "
        "Get started now and watch your documents be **destroyed** "
        "(or in other words - turned into JSON)! 🚀"
        "<br>Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ "
        "+ https://huggingface.co/urchade/gliner_multi-v2.1"
    ),
)

# Start the web server; this blocks when the file is run as the app entry point.
iface.launch()