deepseek-r1dotcom

Runtime error

App Files Files Community

deepseek-r1dotcom / src /main /file_extractors.py

hadadrjt

ai: Restructured repo for production.

f99ad65 about 1 month ago

raw

history blame

8.48 kB

	#
	# SPDX-FileCopyrightText: Hadad <[email protected]>
	# SPDX-License-Identifier: Apache-2.0
	#

	import pdfplumber # PDF
	import pytesseract # OCR
	import docx # Microsoft Word
	import zipfile # Microsoft Word
	import io
	import pandas as pd # Microsoft Excel
	import warnings
	import re

	from openpyxl import load_workbook # Microsoft Excel
	from pptx import Presentation # Microsoft PowerPoint
	from PIL import Image, ImageEnhance, ImageFilter # OCR
	from pathlib import Path

	def clean_text(text):
	    """
	    Sanitize extracted text.
	    Keeps only ASCII letters, digits, whitespace and a small set of
	    punctuation marks, removes stand-alone single letters (treated as
	    OCR debris), trims every line and discards lines left empty.
	    Returns the surviving lines joined by newlines.
	    """
	    # Throw away every character that is not on the allow-list
	    filtered = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
	    # Throw away one-letter "words"
	    without_strays = re.sub(r'\b[a-zA-Z]\b', '', filtered)
	    kept = []
	    for raw_line in without_strays.splitlines():
	        trimmed = raw_line.strip()
	        if trimmed:
	            kept.append(trimmed)
	    return "\n".join(kept)

	def format_table(df, max_rows=10):
	    """
	    Render a pandas DataFrame as plain text, showing at most max_rows rows.
	    Rows and columns that are entirely NaN are dropped and remaining NaN
	    cells become empty strings. Returns "" when nothing is left to show;
	    when rows were cut off, a trailing line reports how many.
	    """
	    if df.empty:
	        return ""
	    # Remove all-NaN rows first, then all-NaN columns, then blank out leftovers
	    trimmed = df.dropna(axis=0, how='all')
	    trimmed = trimmed.dropna(axis=1, how='all')
	    trimmed = trimmed.fillna('')
	    if trimmed.empty:
	        return ""
	    rendered = trimmed.head(max_rows).to_string(index=False)
	    hidden = len(trimmed) - max_rows
	    if hidden > 0:
	        rendered += f"\n... ({hidden} more rows)"
	    return rendered

	def preprocess_image(img):
	    """
	    Prepare an image for OCR: grayscale, contrast boost, median filter,
	    then a hard black/white threshold.
	    Best effort: if any step raises, the image as it stood after the last
	    successful step is returned instead of propagating the error.
	    """
	    def binarize(level):
	        # Levels below 140 become black, everything else white
	        return 255 if level >= 140 else 0
	    try:
	        img = img.convert("L")  # grayscale
	        img = ImageEnhance.Contrast(img).enhance(2)  # stronger contrast
	        img = img.filter(ImageFilter.MedianFilter())  # noise reduction
	        img = img.point(binarize, '1')  # 1-bit threshold
	    except Exception:
	        # Fall through and hand back whatever we managed to produce
	        pass
	    return img

	def ocr_image(img):
	    """
	    Run OCR on an image and return the cleaned text.
	    The image goes through preprocess_image, then pytesseract (English,
	    page segmentation mode 6), then clean_text. Any failure along the way
	    yields an empty string.
	    """
	    try:
	        prepared = preprocess_image(img)
	        raw = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
	        return clean_text(raw)
	    except Exception:
	        return ""

	def extract_pdf_content(fp):
	    """
	    Extract text content from a PDF file.
	    For every page this emits the cleaned page text, OCR text for each
	    embedded image that yields any, and each non-empty table rendered
	    through format_table (first table row is used as the header).
	    fp is the path handed to pdfplumber.open.
	    Returns the combined text with surrounding whitespace stripped. If an
	    exception occurs, an "[Error reading PDF ...]" note is appended to
	    whatever was collected so far instead of raising.
	    """
	    parts = []
	    try:
	        with pdfplumber.open(fp) as pdf:
	            for i, page in enumerate(pdf.pages, 1):
	                text = page.extract_text() or ""
	                parts.append(f"Page {i} Text:\n{clean_text(text)}\n\n")
	                # OCR on images if any
	                if page.images:
	                    rendered = page.to_image(resolution=300).original
	                    # Image boxes are expressed in page units, but the rendered
	                    # bitmap is in pixels: convert before cropping, otherwise
	                    # the wrong region of the bitmap is OCR'd.
	                    page_x0, page_top, page_x1, page_bottom = page.bbox
	                    scale_x = rendered.width / (page_x1 - page_x0)
	                    scale_y = rendered.height / (page_bottom - page_top)
	                    for img in page.images:
	                        # Clamp to the bitmap: images may overhang the page
	                        left = max(0, int(round((img["x0"] - page_x0) * scale_x)))
	                        upper = max(0, int(round((img["top"] - page_top) * scale_y)))
	                        right = min(rendered.width, int(round((img["x1"] - page_x0) * scale_x)))
	                        lower = min(rendered.height, int(round((img["bottom"] - page_top) * scale_y)))
	                        if right <= left or lower <= upper:
	                            # Nothing of this image is visible on the page
	                            continue
	                        cropped = rendered.crop((left, upper, right, lower))
	                        ocr_text = ocr_image(cropped)
	                        if ocr_text:
	                            parts.append(f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n")
	                # Extract tables; the first row of each table is its header
	                tables = page.extract_tables()
	                for idx, table in enumerate(tables, 1):
	                    if table:
	                        df = pd.DataFrame(table[1:], columns=table[0])
	                        parts.append(f"Table {idx} on page {i}:\n{format_table(df)}\n\n")
	    except Exception as e:
	        # Keep what was extracted so far and report the failure inline
	        parts.append(f"\n[Error reading PDF {fp}: {e}]")
	    return "".join(parts).strip()

	def extract_docx_content(fp):
	    """
	    Pull text out of a Microsoft Word file.
	    Collects non-blank paragraphs, renders every table through
	    format_table (first row used as header), then opens the file as a zip
	    archive and OCRs each entry under "word/media/". Entries that cannot
	    be opened as images are skipped.
	    Returns the combined, stripped text; on failure an
	    "[Error reading Microsoft Word ...]" note is appended instead of raising.
	    """
	    pieces = []
	    try:
	        document = docx.Document(fp)
	        body_lines = []
	        for para in document.paragraphs:
	            stripped = para.text.strip()
	            if stripped:
	                body_lines.append(stripped)
	        if body_lines:
	            pieces.append("Paragraphs:\n" + "\n".join(body_lines) + "\n\n")
	        # Build every table frame first, then emit them in order
	        frames = []
	        for table in document.tables:
	            grid = [[cell.text.strip() for cell in row.cells] for row in table.rows]
	            if grid:
	                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
	        for number, frame in enumerate(frames, 1):
	            pieces.append(f"Table {number}:\n{format_table(frame)}\n\n")
	        # Embedded pictures live inside the archive under word/media/
	        with zipfile.ZipFile(fp) as archive:
	            for member in archive.namelist():
	                if not member.startswith("word/media/"):
	                    continue
	                payload = archive.read(member)
	                try:
	                    picture = Image.open(io.BytesIO(payload))
	                    recognized = ocr_image(picture)
	                    if recognized:
	                        pieces.append(f"[OCR Text from embedded image]:\n{recognized}\n\n")
	                except Exception:
	                    # Media entry is not a readable image; ignore it
	                    pass
	    except Exception as e:
	        pieces.append(f"\n[Error reading Microsoft Word {fp}: {e}]")
	    return "".join(pieces).strip()

	def extract_excel_content(fp):
	    """
	    Extract content from Microsoft Excel files.
	    Every sheet is read into a DataFrame and rendered through format_table
	    under a "Sheet: <name>" heading. Images are not extracted.
	    fp is the workbook path; its extension decides the reader engine.
	    Returns the combined, stripped text; on failure an
	    "[Error reading Microsoft Excel ...]" note is appended instead of raising.
	    """
	    parts = []
	    try:
	        # openpyxl cannot read the legacy binary .xls format, so forcing it
	        # made every .xls file fail. For .xls let pandas choose the engine;
	        # keep openpyxl explicit for everything else.
	        engine = None if Path(fp).suffix.lower() == ".xls" else "openpyxl"
	        with warnings.catch_warnings():
	            warnings.simplefilter("ignore")  # Suppress reader warnings
	            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
	        for sheet_name, df in sheets.items():
	            parts.append(f"Sheet: {sheet_name}\n")
	            parts.append(format_table(df) + "\n\n")
	    except Exception as e:
	        # Keep what was extracted so far and report the failure inline
	        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
	    return "".join(parts).strip()

	def extract_pptx_content(fp):
	    """
	    Pull text out of a Microsoft PowerPoint presentation, slide by slide.
	    For each slide: text of every shape exposing non-blank text, OCR text
	    of shapes whose shape_type equals 13 and that carry an image, and then
	    every table rendered through format_table (first row used as header).
	    Returns the combined, stripped text; on failure an
	    "[Error reading Microsoft PowerPoint ...]" note is appended instead of
	    raising.
	    """
	    chunks = []
	    try:
	        deck = Presentation(fp)
	        for number, slide in enumerate(deck.slides, 1):
	            found = []
	            for shape in slide.shapes:
	                if hasattr(shape, "text") and shape.text.strip():
	                    found.append(shape.text.strip())
	                has_picture = shape.shape_type == 13 and hasattr(shape, "image") and shape.image
	                if not has_picture:
	                    continue
	                try:
	                    picture = Image.open(io.BytesIO(shape.image.blob))
	                    recognized = ocr_image(picture)
	                    if recognized:
	                        found.append(f"[OCR Text from image]:\n{recognized}")
	                except Exception:
	                    # Unreadable picture data; skip this shape's OCR
	                    pass
	            if not found:
	                chunks.append(f"Slide {number} Text:\nNo text found on this slide.\n\n")
	            else:
	                chunks.append(f"Slide {number} Text:\n" + "\n".join(found) + "\n\n")
	            # Second pass over the shapes: tables only
	            for shape in slide.shapes:
	                if not shape.has_table:
	                    continue
	                grid = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
	                if grid:
	                    frame = pd.DataFrame(grid[1:], columns=grid[0])
	                    chunks.append(f"Table on slide {number}:\n{format_table(frame)}\n\n")
	    except Exception as e:
	        chunks.append(f"\n[Error reading Microsoft PowerPoint {fp}: {e}]")
	    return "".join(chunks).strip()

	def extract_file_content(fp):
	    """
	    Route a file to the matching extractor based on its lower-cased
	    extension: PDF, Word (.doc/.docx), Excel (.xlsx/.xls) or PowerPoint
	    (.ppt/.pptx). Any other extension is read as UTF-8 text and passed
	    through clean_text; if that fails, an "[Error reading file ...]"
	    message is returned instead of raising.
	    """
	    suffix = Path(fp).suffix.lower()
	    if suffix == ".pdf":
	        return extract_pdf_content(fp)
	    if suffix in (".doc", ".docx"):
	        return extract_docx_content(fp)
	    if suffix in (".xlsx", ".xls"):
	        return extract_excel_content(fp)
	    if suffix in (".ppt", ".pptx"):
	        return extract_pptx_content(fp)
	    # Unknown extension: treat the file as plain text
	    try:
	        raw = Path(fp).read_text(encoding="utf-8")
	        return clean_text(raw)
	    except Exception as e:
	        return f"\n[Error reading file {fp}: {e}]"