File size: 4,666 Bytes
1594055
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edd7682
1594055
 
 
ae62ed9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from gradio import Interface, File, Dropdown, Textbox, Slider
import json
from gliner import GLiNER
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

class DoctrHandler:
    """OCR front-end that turns a PDF or image file into plain text via docTR."""

    def __init__(self):
        # Build the predictor once; pretrained weights are requested here.
        self.model = ocr_predictor(det_arch="fast_base", reco_arch="crnn_vgg16_bn", pretrained=True)

    def extract_text(self, file_path):
        """Run OCR on ``file_path`` and return the recognised words as one string.

        Args:
            file_path: Path of the document. A ``.pdf`` suffix (in any letter
                case) selects the PDF loader; anything else is handed to the
                image loader.

        Returns:
            Every recognised word, in the order the OCR result lists them,
            joined by single spaces and stripped of surrounding whitespace.

        Raises:
            Exception: If loading the file or running OCR fails. The original
                error is chained as ``__cause__``.
        """
        try:
            # Compare case-insensitively so "scan.PDF" is not mistaken for an image.
            is_pdf = str(file_path).lower().endswith(".pdf")
            doc = DocumentFile.from_pdf(file_path) if is_pdf else DocumentFile.from_images(file_path)
            # Perform OCR
            result = self.model(doc)
            # Walk pages -> blocks -> lines -> words and collect the word strings.
            words = []
            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        words.extend(word.value for word in line.words)
            return " ".join(words).strip()
        except Exception as e:
            # Keep the original exception type/message callers already see,
            # but chain the cause so the real traceback is not lost.
            raise Exception(f"Error during OCR processing: {str(e)}") from e

class GlinerHandler:
    """Wrapper around the ``urchade/gliner_multi-v2.1`` GLiNER checkpoint."""

    def __init__(self):
        # Kept on the instance and forwarded to the loader below.
        self.max_length = 384
        checkpoint = "urchade/gliner_multi-v2.1"
        self.model = GLiNER.from_pretrained(checkpoint, max_length=self.max_length)

    def predict_entities(self, text, labels, threshold):
        """Ask the wrapped model for entities and return its answer unchanged.

        Args:
            text: Text to search.
            labels: Entity labels to look for.
            threshold: Forwarded to the model as the ``threshold`` keyword.

        Returns:
            Whatever ``self.model.predict_entities`` returns.
        """
        return self.model.predict_entities(text, labels, threshold=threshold)

# Initialize handlers
# Both constructors build their models, so this happens once when the module is
# loaded; process_file reuses these two instances on every call.
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()

# Entity labels offered as the dropdown choices in the UI
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]

def _build_response(message, searched_for, entities):
    """Serialise one result payload in the JSON shape shown in the output box."""
    return json.dumps(
        {
            "message": message,
            "hits": len(entities),
            "searched_for": searched_for,
            "entities": entities,
        },
        indent=4,
        # Keep non-ASCII text readable instead of emitting \uXXXX escapes.
        ensure_ascii=False,
    )


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """OCR an uploaded document, run NER on the text and return a JSON report.

    Args:
        uploaded_file: The uploaded document, either an object exposing the
            file path as ``.name`` or the path itself. Falsy means no upload.
        selected_entities: Labels picked from the dropdown; may be ``None``
            or empty.
        custom_entities: Comma-separated extra labels; may be ``None`` or
            empty. Blank items are ignored.
        threshold: Minimum confidence passed to the NER model.

    Returns:
        A JSON string (indent 4) with the keys ``message``, ``hits``,
        ``searched_for`` and ``entities``. ``entities`` holds
        ``text``/``label``/``confidence`` dicts sorted by descending
        confidence. Validation problems and empty results are reported through
        ``message`` with ``hits`` set to 0.

    Raises:
        Exception: Propagated unchanged from the OCR or NER handlers.
    """
    # Convert custom entities string to list and clean whitespace
    custom_entity_list = [e.strip() for e in (custom_entities or "").split(",") if e.strip()]

    # Combine dropdown and custom labels. ``dict.fromkeys`` drops duplicates
    # while keeping first-seen order, and ``or []`` tolerates a None selection.
    all_entities = list(dict.fromkeys([*(selected_entities or []), *custom_entity_list]))

    # Validate on the parsed labels so input such as " , " is rejected too.
    if not all_entities:
        return _build_response("Please select or provide at least one entity to search for", [], [])

    # Handle no file uploaded
    if not uploaded_file:
        return _build_response("No file uploaded", [], [])

    # Accept both file-like objects carrying ``.name`` and plain path strings.
    file_path = getattr(uploaded_file, "name", uploaded_file)

    # Perform OCR on the uploaded file
    extracted_text = ocr_handler.extract_text(file_path)

    # Perform NER with the threshold; skip the model when OCR found no text.
    entities = ner_handler.predict_entities(extracted_text, all_entities, threshold) if extracted_text else []

    if not entities:
        return _build_response("No entities were found in the document", all_entities, [])

    # Keep only the fields shown to the user, highest confidence first.
    cleaned_entities = sorted(
        (
            {
                "text": entity["text"],
                "label": entity["label"],
                "confidence": entity["score"],
            }
            for entity in entities
        ),
        key=lambda item: item["confidence"],
        reverse=True,
    )

    return _build_response("Document destroyed successfully!", all_entities, cleaned_entities)


# Create Gradio interface
# The four inputs are passed positionally to process_file in this order:
# uploaded file, selected entities, custom entities, threshold.
iface = Interface(
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
        Textbox(label="Custom Entities (comma-separated)", placeholder="entity1, entity2, ..."),
        Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.1, label="Confidence Threshold")
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
    title="DocDestroyer11000",
    allow_flagging="never",
    # \U0001F680 is the rocket emoji; written as an escape so it survives re-encoding.
    description="Extract valuable information from your documents in a snap! Upload your PDFs or images, select the entities you care about. Get started now and watch your documents be **destroyed** (or in other words - turned into JSON)! \U0001F680<br>Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ + https://huggingface.co/urchade/gliner_multi-v2.1"
)

# Start the web server for the interface.
iface.launch()