# # SPDX-FileCopyrightText: Hadad # SPDX-License-Identifier: Apache-2.0 # import pdfplumber # PDF import pytesseract # OCR import docx # Microsoft Word import zipfile # Microsoft Word import io import pandas as pd # Microsoft Excel import warnings import re from openpyxl import load_workbook # Microsoft Excel from pptx import Presentation # Microsoft PowerPoint from PIL import Image, ImageEnhance, ImageFilter # OCR from pathlib import Path def clean_text(text): """Clean and normalize extracted outputs.""" # Remove non-printable and special characters except common punctuation text = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text) # Remove isolated single letters (likely OCR noise) text = re.sub(r'\b[a-zA-Z]\b', '', text) # Normalize whitespace and remove empty lines lines = [line.strip() for line in text.splitlines() if line.strip()] return "\n".join(lines) def format_table(df, max_rows=10): """Format pandas DataFrame as a readable table string, limited to max rows.""" if df.empty: return "" # Drop fully empty rows and columns to reduce NaN clutter df_clean = df.dropna(axis=0, how='all').dropna(axis=1, how='all') # Replace NaN with empty string to avoid 'NaN' in output df_clean = df_clean.fillna('') if df_clean.empty: return "" display_df = df_clean.head(max_rows) table_str = display_df.to_string(index=False) if len(df_clean) > max_rows: table_str += f"\n... ({len(df_clean) - max_rows} more rows)" return table_str def preprocess_image(img): """Preprocess image for better OCR accuracy.""" try: img = img.convert("L") # Grayscale enhancer = ImageEnhance.Contrast(img) img = enhancer.enhance(2) # Increase contrast img = img.filter(ImageFilter.MedianFilter()) # Reduce noise # Binarize image (threshold) img = img.point(lambda x: 0 if x < 140 else 255, '1') return img except Exception: return img def ocr_image(img): """Perform OCR on PIL Image with preprocessing and clean result.""" try: img = preprocess_image(img) text = pytesseract.image_to_string(img, lang='eng', config='--psm 6') text = clean_text(text) return text except Exception: return "" def extract_pdf_content(fp): """ Extract text content from PDF file. Includes OCR on embedded images to capture text within images. Also extracts tables as tab-separated text. """ content = "" try: with pdfplumber.open(fp) as pdf: for i, page in enumerate(pdf.pages, 1): text = page.extract_text() or "" content += f"Page {i} Text:\n{clean_text(text)}\n\n" # OCR on images if any if page.images: img_obj = page.to_image(resolution=300) for img in page.images: bbox = (img["x0"], img["top"], img["x1"], img["bottom"]) cropped = img_obj.original.crop(bbox) ocr_text = ocr_image(cropped) if ocr_text: content += f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n" # Extract tables as TSV tables = page.extract_tables() for idx, table in enumerate(tables, 1): if table: df = pd.DataFrame(table[1:], columns=table[0]) content += f"Table {idx} on page {i}:\n{format_table(df)}\n\n" except Exception as e: content += f"\n[Error reading PDF {fp}: {e}]" return content.strip() def extract_docx_content(fp): """ Extract text from Microsoft Word files. Also performs OCR on embedded images inside the Microsoft Word archive. """ content = "" try: doc = docx.Document(fp) paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()] if paragraphs: content += "Paragraphs:\n" + "\n".join(paragraphs) + "\n\n" # Extract tables tables = [] for table in doc.tables: rows = [] for row in table.rows: cells = [cell.text.strip() for cell in row.cells] rows.append(cells) if rows: df = pd.DataFrame(rows[1:], columns=rows[0]) tables.append(df) for i, df in enumerate(tables, 1): content += f"Table {i}:\n{format_table(df)}\n\n" # OCR on embedded images inside Microsoft Word with zipfile.ZipFile(fp) as z: for file in z.namelist(): if file.startswith("word/media/"): data = z.read(file) try: img = Image.open(io.BytesIO(data)) ocr_text = ocr_image(img) if ocr_text: content += f"[OCR Text from embedded image]:\n{ocr_text}\n\n" except Exception: pass except Exception as e: content += f"\n[Error reading Microsoft Word {fp}: {e}]" return content.strip() def extract_excel_content(fp): """ Extract content from Microsoft Excel files. Converts sheets to readable tables and replaces NaN values. Does NOT attempt to extract images to avoid errors. """ content = "" try: with warnings.catch_warnings(): warnings.simplefilter("ignore") # Suppress openpyxl warnings # Explicitly specify the engine to avoid potential issues sheets = pd.read_excel(fp, sheet_name=None, engine='openpyxl') for sheet_name, df in sheets.items(): content += f"Sheet: {sheet_name}\n" content += format_table(df) + "\n\n" except Exception as e: content += f"\n[Error reading Microsoft Excel {fp}: {e}]" return content.strip() def extract_pptx_content(fp): """ Extract text content from Microsoft PowerPoint presentation slides. Includes text from shapes and tables. Performs OCR on embedded images. """ content = "" try: prs = Presentation(fp) for i, slide in enumerate(prs.slides, 1): slide_texts = [] for shape in slide.shapes: if hasattr(shape, "text") and shape.text.strip(): slide_texts.append(shape.text.strip()) if shape.shape_type == 13 and hasattr(shape, "image") and shape.image: try: img = Image.open(io.BytesIO(shape.image.blob)) ocr_text = ocr_image(img) if ocr_text: slide_texts.append(f"[OCR Text from image]:\n{ocr_text}") except Exception: pass if slide_texts: content += f"Slide {i} Text:\n" + "\n".join(slide_texts) + "\n\n" else: content += f"Slide {i} Text:\nNo text found on this slide.\n\n" # Extract tables for shape in slide.shapes: if shape.has_table: rows = [] table = shape.table for row in table.rows: cells = [cell.text.strip() for cell in row.cells] rows.append(cells) if rows: df = pd.DataFrame(rows[1:], columns=rows[0]) content += f"Table on slide {i}:\n{format_table(df)}\n\n" except Exception as e: content += f"\n[Error reading Microsoft PowerPoint {fp}: {e}]" return content.strip() def extract_file_content(fp): """ Determine file type by extension and extract text content accordingly. For unknown types, attempts to read as plain text. """ ext = Path(fp).suffix.lower() if ext == ".pdf": return extract_pdf_content(fp) elif ext in [".doc", ".docx"]: return extract_docx_content(fp) elif ext in [".xlsx", ".xls"]: return extract_excel_content(fp) elif ext in [".ppt", ".pptx"]: return extract_pptx_content(fp) else: try: text = Path(fp).read_text(encoding="utf-8") return clean_text(text) except Exception as e: return f"\n[Error reading file {fp}: {e}]"