"""Gradio app: OCR an uploaded PDF/image with docTR, then tag entities with GLiNER.

The result is returned to the UI as a pretty-printed JSON string.
"""

import json
import os

from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from gliner import GLiNER
from gradio import Interface, File, Dropdown, Textbox, Slider


class DoctrHandler:
    """Wrap a docTR OCR predictor and expose plain-text extraction."""

    def __init__(self):
        self.model = ocr_predictor(
            det_arch="fast_base",
            reco_arch="crnn_vgg16_bn",
            pretrained=True,
        )

    def extract_text(self, file_path: str) -> str:
        """Run OCR on *file_path* and return all recognised words joined by spaces.

        Files whose name ends in ``.pdf`` (any letter case) are loaded as PDFs;
        everything else is loaded as an image.

        Raises:
            Exception: if loading or OCR fails; the original error is chained
                as ``__cause__``.
        """
        try:
            # Compare case-insensitively so "SCAN.PDF" is not treated as an image.
            if file_path.lower().endswith(".pdf"):
                doc = DocumentFile.from_pdf(file_path)
            else:
                doc = DocumentFile.from_images(file_path)

            result = self.model(doc)

            # Walk pages -> blocks -> lines -> words and collect the word values.
            words = []
            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        words.extend(word.value for word in line.words)
            return " ".join(words).strip()
        except Exception as e:
            raise Exception(f"Error during OCR processing: {str(e)}") from e


class GlinerHandler:
    """Wrap the pretrained multilingual GLiNER model."""

    def __init__(self):
        self.max_length = 384
        self.model = GLiNER.from_pretrained(
            "urchade/gliner_multi-v2.1",
            max_length=self.max_length,
        )

    def predict_entities(self, text, labels, threshold):
        """Return the model's entity predictions for *text* restricted to *labels*."""
        return self.model.predict_entities(text, labels, threshold=threshold)


# Initialize handlers (both load their models at import time).
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()

# Default entities offered in the dropdown.
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]


def _build_response(message, searched_for=(), entities=()):
    """Serialise the response envelope shared by every outcome of process_file."""
    entity_list = list(entities)
    payload = {
        "message": message,
        "hits": len(entity_list),
        "searched_for": list(searched_for),
        "entities": entity_list,
    }
    # ensure_ascii=False keeps non-ASCII document text readable in the textbox
    # instead of emitting \uXXXX escapes.
    return json.dumps(payload, indent=4, ensure_ascii=False)


def _collect_labels(selected_entities, custom_entities):
    """Merge dropdown and free-text labels, dropping blanks and duplicates."""
    labels = list(selected_entities or [])
    if custom_entities:
        labels.extend(
            part.strip() for part in custom_entities.split(",") if part.strip()
        )
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(labels))


def _resolve_path(uploaded_file):
    """Return a filesystem path for either a path-like value or a file wrapper."""
    if isinstance(uploaded_file, (str, os.PathLike)):
        return os.fspath(uploaded_file)
    # Otherwise expect a file-like object exposing its path as ``.name``.
    return uploaded_file.name


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """OCR the uploaded document, run NER on the text and return a JSON report.

    Args:
        uploaded_file: The upload, given either as a file path or as an object
            whose ``name`` attribute is the path. Falsy means nothing uploaded.
        selected_entities: Labels picked in the dropdown (a list, or ``None``).
        custom_entities: Comma-separated extra labels typed by the user.
        threshold: Threshold forwarded to the NER model.

    Returns:
        A JSON string with the keys ``message``, ``hits``, ``searched_for`` and
        ``entities``; entities are sorted by descending confidence.
    """
    all_entities = _collect_labels(selected_entities, custom_entities)

    # Input validation. Checked after parsing so that a custom field containing
    # only commas/whitespace is rejected too.
    if not all_entities:
        return _build_response(
            "Please select or provide at least one entity to search for"
        )

    # Handle no file uploaded
    if not uploaded_file:
        return _build_response("No file uploaded")

    # Perform OCR on the uploaded file; report failures in the same JSON shape
    # as every other outcome rather than letting the exception escape to the UI.
    try:
        extracted_text = ocr_handler.extract_text(_resolve_path(uploaded_file))
    except Exception as exc:
        return _build_response(str(exc), searched_for=all_entities)

    # Nothing was recognised, so there is nothing to hand to the NER model.
    if not extracted_text:
        return _build_response(
            "No entities were found in the document", searched_for=all_entities
        )

    # Perform NER on the extracted text with threshold
    entities = ner_handler.predict_entities(extracted_text, all_entities, threshold)
    if not entities:
        return _build_response(
            "No entities were found in the document", searched_for=all_entities
        )

    # Keep only the fields shown to the user; float() makes sure the score is a
    # plain Python number that json can serialise.
    cleaned_entities = [
        {
            "text": entity["text"],
            "label": entity["label"],
            "confidence": float(entity["score"]),
        }
        for entity in entities
    ]
    # Sort by confidence score in descending order
    cleaned_entities.sort(key=lambda item: item["confidence"], reverse=True)

    return _build_response(
        "Document destroyed successfully!",
        searched_for=all_entities,
        entities=cleaned_entities,
    )


_DESCRIPTION = (
    "Extract valuable information from your documents in a snap! Upload your "
    "PDFs or images, select the entities you care about et started now and "
    "watch your documents be **destroyed** (or in other words - turned into "
    "JSON)! 🚀\n"
    "Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ + "
    "https://huggingface.co/urchade/gliner_multi-v2.1"
)

# Create Gradio interface
iface = Interface(
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
        Textbox(
            label="Custom Entities (comma-separated)",
            placeholder="entity1, entity2, ...",
        ),
        Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.5,
            step=0.1,
            label="Confidence Threshold",
        ),
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
    title="DocDestroyer11000",
    # The string form is the supported way to disable flagging; the boolean
    # form is deprecated.
    allow_flagging="never",
    description=_DESCRIPTION,
)

# Launched at module level, as in the original script.
iface.launch()