deepseek-r1dotcom / src /main /file_extractors.py
hadadrjt's picture
ai: Restructured repo for production.
f99ad65
raw
history blame
8.48 kB
#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import pdfplumber # PDF
import pytesseract # OCR
import docx # Microsoft Word
import zipfile # Microsoft Word
import io
import pandas as pd # Microsoft Excel
import warnings
import re
from openpyxl import load_workbook # Microsoft Excel
from pptx import Presentation # Microsoft PowerPoint
from PIL import Image, ImageEnhance, ImageFilter # OCR
from pathlib import Path
def clean_text(text):
    """Clean and normalize extracted text.

    Steps, in order:
      1. Drop every character that is not an ASCII letter, digit,
         whitespace, or one of ``. , ? ! ( ) : ; ' " -``.
      2. Drop stand-alone single letters (likely OCR noise). A letter that
         touches an apostrophe or hyphen is kept, so contractions and
         hyphenated words such as "don't" or "x-ray" survive intact.
      3. Collapse runs of whitespace inside each line to one space, trim
         each line, and discard lines that end up empty.

    Args:
        text: Raw extracted text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with lines joined by ``"\\n"``; ``""`` when nothing
        is left.
    """
    if not text:
        return ""
    # Remove non-printable and special characters except common punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # Remove isolated single letters; the lookarounds (instead of a plain \b)
    # stop the trailing letter of a contraction from being treated as noise.
    text = re.sub(r"(?<![\w'-])[a-zA-Z](?![\w'-])", '', text)
    # Removing characters above can leave double spaces, so squeeze inner
    # whitespace as well as trimming the ends of every line.
    squeezed = (" ".join(line.split()) for line in text.splitlines())
    return "\n".join(line for line in squeezed if line)
def format_table(df, max_rows=10):
    """Render a pandas DataFrame as plain text, showing at most ``max_rows`` rows.

    Rows and columns that are entirely NaN are removed first and any
    remaining NaN cells are shown as empty strings. When rows are cut off,
    a trailing ``... (N more rows)`` line reports how many were hidden.

    Args:
        df: The DataFrame to render.
        max_rows: Maximum number of data rows to include.

    Returns:
        The rendered table (index column omitted), or ``""`` when there is
        nothing to show.
    """
    if df.empty:
        return ""

    # Strip all-NaN rows, then all-NaN columns, then blank out leftover NaNs.
    trimmed = df.dropna(axis=0, how='all')
    trimmed = trimmed.dropna(axis=1, how='all')
    trimmed = trimmed.fillna('')
    if trimmed.empty:
        return ""

    rendered = trimmed.head(max_rows).to_string(index=False)
    hidden = len(trimmed) - max_rows
    if hidden > 0:
        rendered += f"\n... ({hidden} more rows)"
    return rendered
def preprocess_image(img):
    """Prepare an image for OCR.

    The image is converted to grayscale ("L"), its contrast is doubled, a
    median filter is applied to reduce noise, and finally every pixel is
    thresholded at 140 into a 1-bit image.

    Args:
        img: The image to prepare (a PIL image is expected).

    Returns:
        The prepared image. If any step raises, the image is returned in
        whatever state it had reached before the failing step.
    """
    try:
        img = img.convert("L")  # Grayscale
        img = ImageEnhance.Contrast(img).enhance(2)  # Increase contrast
        img = img.filter(ImageFilter.MedianFilter())  # Reduce noise
        # Binarize: values below 140 become black, everything else white.
        return img.point(lambda level: 255 if level >= 140 else 0, '1')
    except Exception:
        # Best effort: hand back the last successfully produced image.
        return img
def ocr_image(img):
    """Run OCR on an image and return the cleaned text.

    The image goes through ``preprocess_image``, is recognised by
    Tesseract (English, ``--psm 6``), and the raw output is passed through
    ``clean_text``.

    Args:
        img: The image to recognise (a PIL image is expected).

    Returns:
        The cleaned recognised text, or ``""`` if any step fails.
    """
    try:
        prepared = preprocess_image(img)
        raw = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(raw)
    except Exception:
        # OCR is best effort; a failure simply yields no text.
        return ""
def extract_pdf_content(fp):
    """Extract text, OCR text and tables from a PDF file.

    For every page the output contains, in order:
      * the page text, cleaned with ``clean_text``;
      * OCR text for each embedded image that yields any text;
      * each detected table, rendered with ``format_table`` (the first row
        of a table is used as its header).

    Args:
        fp: Path (or other object accepted by ``pdfplumber.open``) of the PDF.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was collected so far is returned followed by an
        ``[Error reading PDF ...]`` line; no exception is raised.
    """
    ocr_resolution = 300
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for i, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                parts.append(f"Page {i} Text:\n{clean_text(text)}\n\n")

                # OCR on images if any
                if page.images:
                    rendered = page.to_image(resolution=ocr_resolution).original
                    # Image boxes are expressed in PDF points, while the
                    # rendered bitmap is in pixels at `ocr_resolution` DPI.
                    # Convert the box to pixel coordinates before cropping,
                    # otherwise the wrong region of the page is OCR'd.
                    scale = rendered.width / float(page.width)
                    origin_x = float(page.bbox[0])
                    origin_y = float(page.bbox[1])
                    for img in page.images:
                        box = (
                            round((float(img["x0"]) - origin_x) * scale),
                            round((float(img["top"]) - origin_y) * scale),
                            round((float(img["x1"]) - origin_x) * scale),
                            round((float(img["bottom"]) - origin_y) * scale),
                        )
                        ocr_text = ocr_image(rendered.crop(box))
                        if ocr_text:
                            parts.append(
                                f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n"
                            )

                # Extract tables; the first row is treated as the header.
                for idx, table in enumerate(page.extract_tables(), 1):
                    if table:
                        df = pd.DataFrame(table[1:], columns=table[0])
                        parts.append(
                            f"Table {idx} on page {i}:\n{format_table(df)}\n\n"
                        )
    except Exception as e:
        # Keep what was extracted before the failure and report the error inline.
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    return "".join(parts).strip()
def extract_docx_content(fp):
    """Extract paragraphs, tables and image text from a Microsoft Word file.

    Non-empty paragraphs are listed under a ``Paragraphs:`` heading, each
    table is rendered with ``format_table`` (first row used as header), and
    every file under ``word/media/`` inside the document archive is run
    through OCR.

    Args:
        fp: Path of the Word document.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was collected so far is returned followed by an
        ``[Error reading Microsoft Word ...]`` line; no exception is raised.
    """
    chunks = []
    try:
        document = docx.Document(fp)

        body_lines = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                body_lines.append(stripped)
        if body_lines:
            chunks.append("Paragraphs:\n" + "\n".join(body_lines) + "\n\n")

        # Build every table first, then emit them, so a malformed table
        # prevents any table output rather than leaving a partial list.
        frames = []
        for tbl in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
            if grid:
                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
        for number, frame in enumerate(frames, 1):
            chunks.append(f"Table {number}:\n{format_table(frame)}\n\n")

        # A .docx is a zip archive; embedded media lives under word/media/.
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                blob = archive.read(member)
                try:
                    recognized = ocr_image(Image.open(io.BytesIO(blob)))
                    if recognized:
                        chunks.append(
                            f"[OCR Text from embedded image]:\n{recognized}\n\n"
                        )
                except Exception:
                    # Media that cannot be opened as an image is skipped.
                    pass
    except Exception as e:
        chunks.append(f"\n[Error reading Microsoft Word {fp}: {e}]")
    return "".join(chunks).strip()
def extract_excel_content(fp):
    """Extract every sheet of a Microsoft Excel workbook as readable text.

    Each sheet is emitted as a ``Sheet: <name>`` heading followed by the
    sheet rendered with ``format_table`` (which blanks out NaN values).
    Images are deliberately not extracted.

    Args:
        fp: Path of the workbook.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was collected so far is returned followed by an
        ``[Error reading Microsoft Excel ...]`` line; no exception is raised.
    """
    parts = []
    try:
        # openpyxl only understands the OOXML formats (.xlsx and friends).
        # Legacy .xls workbooks are routed here too, so for those let pandas
        # pick a suitable engine instead of forcing one that cannot read them.
        is_legacy_xls = str(fp).lower().endswith(".xls")
        engine = None if is_legacy_xls else 'openpyxl'
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Suppress openpyxl warnings
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
            for sheet_name, df in sheets.items():
                parts.append(f"Sheet: {sheet_name}\n")
                parts.append(format_table(df) + "\n\n")
    except Exception as e:
        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
    return "".join(parts).strip()
def extract_pptx_content(fp):
    """Extract slide text, image text and tables from a PowerPoint file.

    For every slide the output contains a ``Slide N Text:`` section with
    the text of each shape plus OCR text from picture shapes (or a
    placeholder line when the slide has none), followed by each table on
    the slide rendered with ``format_table`` (first row used as header).

    Args:
        fp: Path of the presentation.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was collected so far is returned followed by an
        ``[Error reading Microsoft PowerPoint ...]`` line; no exception is
        raised.
    """
    pieces = []
    try:
        deck = Presentation(fp)
        for number, slide in enumerate(deck.slides, 1):
            found = []
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    caption = shape.text.strip()
                    if caption:
                        found.append(caption)

                # shape_type 13 is the picture shape type in python-pptx.
                is_picture = (
                    shape.shape_type == 13
                    and hasattr(shape, "image")
                    and shape.image
                )
                if is_picture:
                    try:
                        picture = Image.open(io.BytesIO(shape.image.blob))
                        recognized = ocr_image(picture)
                        if recognized:
                            found.append(f"[OCR Text from image]:\n{recognized}")
                    except Exception:
                        # Pictures that cannot be decoded are skipped.
                        pass

            if not found:
                pieces.append(f"Slide {number} Text:\nNo text found on this slide.\n\n")
            else:
                pieces.append(f"Slide {number} Text:\n" + "\n".join(found) + "\n\n")

            # Tables are emitted after the slide's text section.
            for shape in slide.shapes:
                if not shape.has_table:
                    continue
                grid = [
                    [cell.text.strip() for cell in row.cells]
                    for row in shape.table.rows
                ]
                if grid:
                    frame = pd.DataFrame(grid[1:], columns=grid[0])
                    pieces.append(
                        f"Table on slide {number}:\n{format_table(frame)}\n\n"
                    )
    except Exception as e:
        pieces.append(f"\n[Error reading Microsoft PowerPoint {fp}: {e}]")
    return "".join(pieces).strip()
def extract_file_content(fp):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively:
      * ``.pdf`` -> ``extract_pdf_content``
      * ``.doc`` / ``.docx`` -> ``extract_docx_content``
      * ``.xlsx`` / ``.xls`` -> ``extract_excel_content``
      * ``.ppt`` / ``.pptx`` -> ``extract_pptx_content``

    Anything else is read as UTF-8 plain text and passed through
    ``clean_text``.

    Args:
        fp: Path of the file to read.

    Returns:
        The extracted text. For the plain-text fallback, a failure yields
        an ``[Error reading file ...]`` message instead of raising.
    """
    suffix = Path(fp).suffix.lower()

    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)

    # Unknown extension: best-effort plain-text read.
    try:
        raw = Path(fp).read_text(encoding="utf-8")
        return clean_text(raw)
    except Exception as e:
        return f"\n[Error reading file {fp}: {e}]"