from gradio import Interface, File, Dropdown, Textbox, Slider
import json
from gliner import GLiNER
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
class DoctrHandler:
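    """Thin wrapper around a pretrained docTR OCR pipeline (fast_base detection, crnn_vgg16_bn recognition)."""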
def __init__(self):
self.model = ocr_predictor(det_arch="fast_base", reco_arch="crnn_vgg16_bn", pretrained=True)
def extract_text(self, file_path):
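        """Run OCR on a PDF or image file and return the recognized words as a single space-separated string."""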
try:
# Handle both PDF and image files
            doc = DocumentFile.from_pdf(file_path) if file_path.lower().endswith('.pdf') else DocumentFile.from_images(file_path)
# Perform OCR
result = self.model(doc)
            # Collect the recognized words from every page, block, and line
            words = [
                word.value
                for page in result.pages
                for block in page.blocks
                for line in block.lines
                for word in line.words
            ]
            return " ".join(words)
except Exception as e:
            raise RuntimeError(f"Error during OCR processing: {e}") from e
class GlinerHandler:
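    """Thin wrapper around the multilingual GLiNER model; inputs beyond max_length (384 tokens) may be truncated."""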
def __init__(self):
self.max_length = 384
self.model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1", max_length=self.max_length)
def predict_entities(self, text, labels, threshold):
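        """Run zero-shot entity prediction for the given label set, keeping only hits that meet the confidence threshold."""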
entities = self.model.predict_entities(text, labels, threshold=threshold)
return entities
# Initialize handlers
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()
# Default entities
DEFAULT_ENTITIES = ["name", "person", "bank account number", "email", "address", "phone number", "date", "currency", "amount", "document number", "iban", "country"]
def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
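    """OCR the uploaded document, run zero-shot NER with the selected and custom labels, and return the matches as pretty-printed JSON."""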
# Input validation
if not selected_entities and not custom_entities:
return json.dumps({
"message": "Please select or provide at least one entity to search for",
"hits": 0,
"searched_for": [],
"entities": []
}, indent=4)
# Handle no file uploaded
if not uploaded_file:
return json.dumps({
"message": "No file uploaded",
"hits": 0,
"searched_for": [],
"entities": []
}, indent=4)
# Convert custom entities string to list and clean whitespace
custom_entity_list = [e.strip() for e in custom_entities.split(",") if e.strip()] if custom_entities else []
    # Combine selected and custom entities (the dropdown may return None when nothing is selected)
    all_entities = list(selected_entities or []) + custom_entity_list
    # Perform OCR on the uploaded file (Gradio may pass a filepath string or a tempfile wrapper)
    file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
    extracted_text = ocr_handler.extract_text(file_path)
# Perform NER on the extracted text with threshold
entities = ner_handler.predict_entities(extracted_text, all_entities, threshold)
if not entities:
return json.dumps({
"message": "No entities were found in the document",
"hits": 0,
"searched_for": all_entities,
"entities": []
}, indent=4)
    # Keep only the relevant fields and rename "score" to "confidence"
    cleaned_entities = [
        {"text": entity["text"], "label": entity["label"], "confidence": entity["score"]}
        for entity in entities
    ]
# Sort by confidence score in descending order
cleaned_entities.sort(key=lambda x: x["confidence"], reverse=True)
# Return structured response
response = {
"message": "Document destroyed successfully!",
"hits": len(cleaned_entities),
"searched_for": all_entities,
"entities": cleaned_entities
}
return json.dumps(response, indent=4)
# Create Gradio interface
iface = Interface(
fn=process_file,
inputs=[
File(label="Upload Document (PDF or Image)"),
Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
Textbox(label="Custom Entities (comma-separated)", placeholder="entity1, entity2, ..."),
Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.1, label="Confidence Threshold")
],
outputs=Textbox(label="Extracted Entities (JSON)"),
title="DocDestroyer11000",
allow_flagging="never",
    description="Extract valuable information from your documents in a snap! Upload your PDFs or images, select the entities you care about, get started now, and watch your documents be **destroyed** (or in other words - turned into JSON)! πŸš€<br>Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ + https://huggingface.co/urchade/gliner_multi-v2.1"
)
iface.launch()