deepseek-r1dotcom

Runtime error

File size: 8,478 Bytes

f99ad65

#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#

import pdfplumber # PDF
import pytesseract # OCR
import docx # Microsoft Word
import zipfile # Microsoft Word
import io
import pandas as pd # Microsoft Excel
import warnings
import re

from openpyxl import load_workbook # Microsoft Excel
from pptx import Presentation # Microsoft PowerPoint
from PIL import Image, ImageEnhance, ImageFilter # OCR
from pathlib import Path

def clean_text(text):
    """
    Normalize extracted text.

    Every character other than ASCII letters, digits, whitespace and the
    punctuation ``. , ? ! ( ) : ; ' " -`` is removed, stand-alone single
    ASCII letters are deleted (they are usually OCR noise), each line is
    trimmed, and lines that end up empty are discarded.

    Returns the remaining lines joined with newline characters.
    """
    # Whitelist filter: anything outside the allowed character set is dropped
    whitelisted = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # One-letter "words" are treated as recognition artefacts and removed
    without_strays = re.sub(r'\b[a-zA-Z]\b', '', whitelisted)
    # Trim each line; filter(None, ...) discards the ones that became empty
    trimmed_lines = map(str.strip, without_strays.splitlines())
    return "\n".join(filter(None, trimmed_lines))

def format_table(df, max_rows=10):
    """
    Render a pandas DataFrame as plain text, showing at most ``max_rows`` rows.

    Rows and columns that consist solely of missing values are removed and
    any remaining missing values are shown as empty strings. When rows are
    cut off, a trailing "... (N more rows)" line reports how many.

    Returns an empty string if the frame is empty before or after cleanup.
    """
    if df.empty:
        return ""
    # Strip all-NaN rows first, then all-NaN columns
    trimmed = df.dropna(axis=0, how='all')
    trimmed = trimmed.dropna(axis=1, how='all')
    # Blank out leftover NaN so the rendering does not contain 'NaN'
    trimmed = trimmed.fillna('')
    if trimmed.empty:
        return ""
    rendered = trimmed.head(max_rows).to_string(index=False)
    hidden_rows = len(trimmed) - max_rows
    if hidden_rows > 0:
        rendered = rendered + f"\n... ({hidden_rows} more rows)"
    return rendered

def preprocess_image(img):
    """
    Prepare an image for OCR.

    Steps, in order: convert to grayscale, double the contrast, apply a
    median filter, then binarize with a threshold of 140 (values below it
    become 0, everything else 255).

    If any step raises, the image as it stood after the last successful
    step is returned instead (the untouched input if the first step failed).
    """
    result = img
    try:
        result = result.convert("L")  # grayscale
        result = ImageEnhance.Contrast(result).enhance(2)  # stronger contrast
        result = result.filter(ImageFilter.MedianFilter())  # noise reduction
        # Threshold to a 1-bit black/white image
        result = result.point(lambda level: 255 if level >= 140 else 0, '1')
    except Exception:
        # Best effort: fall through with whatever stage was reached
        pass
    return result

def ocr_image(img):
    """
    Run OCR on an image and return the cleaned text.

    The image goes through preprocess_image, is handed to Tesseract
    (English, page segmentation mode 6) and the recognized text is passed
    through clean_text. Any failure along the way yields an empty string.
    """
    try:
        prepared = preprocess_image(img)
        recognized = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(recognized)
    except Exception:
        # OCR is best effort; callers treat "" as "no text found"
        return ""

def extract_pdf_content(fp):
    """
    Extract text content from a PDF file.

    For each page, in order, the output contains the cleaned page text, the
    OCR text of every embedded image that yields any, and every detected
    table rendered with format_table (the first table row is used as the
    header).

    Args:
        fp: The PDF to read; passed to pdfplumber.open.

    Returns:
        The collected text with surrounding whitespace stripped. If an
        exception occurs, whatever was collected so far is returned followed
        by an "[Error reading PDF ...]" marker; nothing is raised.
    """
    ocr_resolution = 300  # DPI used when rasterizing a page for OCR
    content = ""
    try:
        with pdfplumber.open(fp) as pdf:
            for i, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                content += f"Page {i} Text:\n{clean_text(text)}\n\n"
                # OCR on images if any
                if page.images:
                    rendered = page.to_image(resolution=ocr_resolution).original
                    # Image boxes are expressed in page units (PDF points),
                    # whereas the rendered bitmap is in pixels. Cropping with
                    # the raw box would select the wrong region, so convert
                    # the coordinates to pixel space first.
                    scale_x = rendered.width / float(page.width)
                    scale_y = rendered.height / float(page.height)
                    origin_x = float(page.bbox[0])
                    origin_top = float(page.bbox[1])
                    for img in page.images:
                        left = max(0, int((float(img["x0"]) - origin_x) * scale_x))
                        upper = max(0, int((float(img["top"]) - origin_top) * scale_y))
                        right = min(
                            rendered.width,
                            int(round((float(img["x1"]) - origin_x) * scale_x)),
                        )
                        lower = min(
                            rendered.height,
                            int(round((float(img["bottom"]) - origin_top) * scale_y)),
                        )
                        # Skip images that lie outside the page or have no area
                        if right <= left or lower <= upper:
                            continue
                        cropped = rendered.crop((left, upper, right, lower))
                        ocr_text = ocr_image(cropped)
                        if ocr_text:
                            content += f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n"
                # Extract tables and render them as text
                tables = page.extract_tables()
                for idx, table in enumerate(tables, 1):
                    if table:
                        df = pd.DataFrame(table[1:], columns=table[0])
                        content += f"Table {idx} on page {i}:\n{format_table(df)}\n\n"
    except Exception as e:
        content += f"\n[Error reading PDF {fp}: {e}]"
    return content.strip()

def extract_docx_content(fp):
    """
    Extract text from a Microsoft Word document.

    The output contains, in order: the non-empty paragraphs, every table
    rendered with format_table (first row used as header), and OCR text for
    each file stored under "word/media/" in the document archive that can be
    opened as an image and yields text.

    On failure, the content gathered so far is returned followed by an
    "[Error reading Microsoft Word ...]" marker; no exception is raised.
    """
    content = ""
    try:
        document = docx.Document(fp)

        # Paragraph text, skipping blank paragraphs
        para_texts = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                para_texts.append(stripped)
        if para_texts:
            content += "Paragraphs:\n" + "\n".join(para_texts) + "\n\n"

        # Build every table frame first, then render them in order
        frames = []
        for table in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            if grid:
                header, *body = grid
                frames.append(pd.DataFrame(body, columns=header))
        for number, frame in enumerate(frames, 1):
            content += f"Table {number}:\n{format_table(frame)}\n\n"

        # The document is a zip archive; scan its media folder for images
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                payload = archive.read(member)
                try:
                    picture = Image.open(io.BytesIO(payload))
                    recognized = ocr_image(picture)
                    if recognized:
                        content += f"[OCR Text from embedded image]:\n{recognized}\n\n"
                except Exception:
                    # Media entries that cannot be opened as images are skipped
                    pass
    except Exception as e:
        content += f"\n[Error reading Microsoft Word {fp}: {e}]"
    return content.strip()

def extract_excel_content(fp):
    """
    Extract content from Microsoft Excel files.

    Every sheet is read into a DataFrame and rendered with format_table
    under a "Sheet: <name>" heading. Images are not extracted.

    Legacy ".xls" workbooks cannot be read by openpyxl, so for that
    extension the engine is left for pandas to choose; all other inputs
    (including non-path inputs such as buffers) keep using openpyxl.

    Args:
        fp: Path of the workbook, or any other input accepted by
            pandas.read_excel.

    Returns:
        The rendered sheets, stripped. On failure, the content gathered so
        far is returned followed by an "[Error reading Microsoft Excel ...]"
        marker; no exception is raised.
    """
    content = ""
    try:
        try:
            suffix = Path(fp).suffix.lower()
        except TypeError:
            # fp is not path-like (e.g. a file object); keep the default engine
            suffix = ""
        engine = None if suffix == ".xls" else "openpyxl"
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Suppress openpyxl warnings
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
        for sheet_name, df in sheets.items():
            content += f"Sheet: {sheet_name}\n"
            content += format_table(df) + "\n\n"
    except Exception as e:
        content += f"\n[Error reading Microsoft Excel {fp}: {e}]"
    return content.strip()

def extract_pptx_content(fp):
    """
    Extract text from a Microsoft PowerPoint presentation.

    For each slide the output holds a "Slide N Text:" section with the text
    of every shape that has any, plus OCR text for shapes whose shape_type
    equals 13 and that carry an image, followed by one "Table on slide N:"
    section per table shape (first row used as header).

    On failure, the content gathered so far is returned followed by an
    "[Error reading Microsoft PowerPoint ...]" marker; no exception is raised.
    """
    content = ""
    try:
        deck = Presentation(fp)
        for slide_no, slide in enumerate(deck.slides, 1):
            # First pass: plain text and OCR of pictures, in shape order
            pieces = []
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    shape_text = shape.text.strip()
                    if shape_text:
                        pieces.append(shape_text)
                is_picture = shape.shape_type == 13 and hasattr(shape, "image") and shape.image
                if not is_picture:
                    continue
                try:
                    picture = Image.open(io.BytesIO(shape.image.blob))
                    recognized = ocr_image(picture)
                    if recognized:
                        pieces.append(f"[OCR Text from image]:\n{recognized}")
                except Exception:
                    # Unreadable pictures are skipped silently
                    pass
            if not pieces:
                content += f"Slide {slide_no} Text:\nNo text found on this slide.\n\n"
            else:
                content += f"Slide {slide_no} Text:\n" + "\n".join(pieces) + "\n\n"
            # Second pass: tables, emitted after the slide's text section
            for shape in slide.shapes:
                if not shape.has_table:
                    continue
                grid = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
                if grid:
                    header, *body = grid
                    frame = pd.DataFrame(body, columns=header)
                    content += f"Table on slide {slide_no}:\n{format_table(frame)}\n\n"
    except Exception as e:
        content += f"\n[Error reading Microsoft PowerPoint {fp}: {e}]"
    return content.strip()

def extract_file_content(fp):
    """
    Extract text from a file, choosing the extractor by file extension.

    The lower-cased suffix selects the handler: ".pdf" goes to the PDF
    extractor, ".doc"/".docx" to the Word extractor, ".xlsx"/".xls" to the
    Excel extractor and ".ppt"/".pptx" to the PowerPoint extractor. Any
    other file is read as UTF-8 text and cleaned with clean_text; if that
    fails, an "[Error reading file ...]" message is returned instead.
    """
    suffix = Path(fp).suffix.lower()
    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)
    # Unknown extension: treat the file as plain text
    try:
        raw = Path(fp).read_text(encoding="utf-8")
        return clean_text(raw)
    except Exception as e:
        return f"\n[Error reading file {fp}: {e}]"