Spaces:
Runtime error
Runtime error
# | |
# SPDX-FileCopyrightText: Hadad <[email protected]> | |
# SPDX-License-Identifier: Apache-2.0 | |
# | |
import pdfplumber # PDF | |
import pytesseract # OCR | |
import docx # Microsoft Word | |
import zipfile # Microsoft Word | |
import io | |
import pandas as pd # Microsoft Excel | |
import warnings | |
import re | |
from openpyxl import load_workbook # Microsoft Excel | |
from pptx import Presentation # Microsoft PowerPoint | |
from PIL import Image, ImageEnhance, ImageFilter # OCR | |
from pathlib import Path | |
def clean_text(text):
    """Clean and normalize extracted text.

    Keeps only ASCII letters, digits, whitespace and a small set of common
    punctuation marks, drops stray single letters (typical OCR noise),
    collapses runs of whitespace inside each line and removes blank lines.

    Args:
        text: Raw extracted text. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text, one non-empty line per row, joined by newlines.
    """
    if not text:
        return ""
    # Remove non-printable and special characters except common punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # Remove isolated single letters (likely OCR noise). A plain \b would
    # treat an apostrophe or hyphen as a boundary and mangle words such as
    # "don't" or "x-ray", so letters touching those characters are kept.
    text = re.sub(r"(?<![\w'-])[a-zA-Z](?![\w'-])", '', text)
    # Collapse whitespace inside each line (removals above leave gaps) and
    # drop lines that end up empty.
    normalized = (" ".join(line.split()) for line in text.splitlines())
    return "\n".join(line for line in normalized if line)
def format_table(df, max_rows=10):
    """Render a pandas DataFrame as plain text, showing at most ``max_rows`` rows.

    Rows and columns that contain only NaN are discarded and the remaining
    NaN cells are blanked before rendering. When rows are cut off, a
    trailing ``... (N more rows)`` line reports how many were omitted.

    Args:
        df: The DataFrame to render.
        max_rows: Maximum number of rows to include in the output.

    Returns:
        The rendered table, or ``""`` when there is nothing to show.
    """
    if df.empty:
        return ""

    # Strip rows, then columns, that hold nothing but NaN.
    trimmed = df.dropna(axis=0, how='all')
    trimmed = trimmed.dropna(axis=1, how='all')
    # Blank the remaining NaN cells so they do not print as 'NaN'.
    trimmed = trimmed.fillna('')
    if trimmed.empty:
        return ""

    rendered = trimmed.head(max_rows).to_string(index=False)
    total_rows = len(trimmed)
    if total_rows <= max_rows:
        return rendered
    return rendered + f"\n... ({total_rows - max_rows} more rows)"
def preprocess_image(img):
    """Prepare a PIL image for OCR.

    Steps, in order: grayscale conversion, contrast doubling, median
    filtering, then thresholding to a 1-bit image. If any step raises, the
    image as it stood before that step is returned instead.

    Args:
        img: The image to process.

    Returns:
        The processed image, or the last successfully produced image
        (possibly the untouched input) when a step fails.
    """

    def _threshold(level):
        # Pixels darker than 140 become black, everything else white.
        return 0 if level < 140 else 255

    try:
        img = img.convert("L")  # Grayscale
        img = ImageEnhance.Contrast(img).enhance(2)  # Increase contrast
        img = img.filter(ImageFilter.MedianFilter())  # Reduce noise
        img = img.point(_threshold, '1')  # Binarize
    except Exception:
        # Best effort: fall through and hand back whatever we have so far.
        pass
    return img
def ocr_image(img):
    """Run OCR on a PIL image and return the cleaned text.

    The image is passed through ``preprocess_image``, recognised with
    Tesseract (English, page segmentation mode 6) and the result is tidied
    with ``clean_text``.

    Args:
        img: The image to recognise.

    Returns:
        The cleaned OCR text, or ``""`` if any step raises.
    """
    try:
        prepared = preprocess_image(img)
        raw = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(raw)
    except Exception:
        return ""
def extract_pdf_content(fp):
    """
    Extract text content from a PDF file.

    For every page this collects the cleaned page text, OCR text from each
    embedded image, and every table rendered through ``format_table``.

    Args:
        fp: The PDF to read; passed straight to ``pdfplumber.open``.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was gathered so far is returned followed by an
        ``[Error reading PDF ...]`` marker; the exception is not raised.
    """
    render_resolution = 300  # DPI used when rasterising a page for OCR
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for i, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                parts.append(f"Page {i} Text:\n{clean_text(text)}\n\n")

                # OCR on images if any
                if page.images:
                    rendered = page.to_image(resolution=render_resolution).original
                    # Image boxes are expressed in page units while the
                    # rendered bitmap is in pixels, so the boxes must be
                    # shifted to the page origin and scaled before cropping.
                    origin_x = float(page.bbox[0])
                    origin_top = float(page.bbox[1])
                    scale_x = rendered.width / float(page.width)
                    scale_y = rendered.height / float(page.height)
                    for img in page.images:
                        left = max(0, round((float(img["x0"]) - origin_x) * scale_x))
                        upper = max(0, round((float(img["top"]) - origin_top) * scale_y))
                        right = min(
                            rendered.width,
                            round((float(img["x1"]) - origin_x) * scale_x),
                        )
                        lower = min(
                            rendered.height,
                            round((float(img["bottom"]) - origin_top) * scale_y),
                        )
                        if right <= left or lower <= upper:
                            continue  # nothing of this image is on the page
                        ocr_text = ocr_image(rendered.crop((left, upper, right, lower)))
                        if ocr_text:
                            parts.append(
                                f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n"
                            )

                # Extract tables; the first row of each is used as the header
                for idx, table in enumerate(page.extract_tables(), 1):
                    if table:
                        df = pd.DataFrame(table[1:], columns=table[0])
                        parts.append(
                            f"Table {idx} on page {i}:\n{format_table(df)}\n\n"
                        )
    except Exception as e:
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    return "".join(parts).strip()
def extract_docx_content(fp):
    """
    Extract text from a Microsoft Word file.

    Collects non-empty paragraphs, then every table (first row used as the
    header) rendered through ``format_table``, then OCR text from each
    entry under ``word/media/`` inside the document archive.

    Args:
        fp: The document to read; passed to ``docx.Document`` and
            ``zipfile.ZipFile``.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was gathered so far is returned followed by an
        ``[Error reading Microsoft Word ...]`` marker.
    """
    pieces = []
    try:
        document = docx.Document(fp)

        texts = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                texts.append(stripped)
        if texts:
            pieces.append("Paragraphs:\n" + "\n".join(texts) + "\n\n")

        # Build every table's DataFrame first, then render them in order.
        frames = []
        for tbl in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
            if grid:
                header, *body = grid
                frames.append(pd.DataFrame(body, columns=header))
        for number, frame in enumerate(frames, 1):
            pieces.append(f"Table {number}:\n{format_table(frame)}\n\n")

        # OCR on embedded images inside the Word archive
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                payload = archive.read(member)
                try:
                    recognised = ocr_image(Image.open(io.BytesIO(payload)))
                    if recognised:
                        pieces.append(
                            f"[OCR Text from embedded image]:\n{recognised}\n\n"
                        )
                except Exception:
                    # Media entries that cannot be opened as images are skipped.
                    pass
    except Exception as e:
        pieces.append(f"\n[Error reading Microsoft Word {fp}: {e}]")
    return "".join(pieces).strip()
def extract_excel_content(fp):
    """
    Extract content from a Microsoft Excel file.

    Every sheet is rendered through ``format_table`` under a
    ``Sheet: <name>`` heading. Images are not extracted.

    Args:
        fp: The workbook to read; passed to ``pandas.read_excel``.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was gathered so far is returned followed by an
        ``[Error reading Microsoft Excel ...]`` marker.
    """
    # openpyxl cannot open legacy .xls workbooks, so it is only forced for
    # other inputs; for .xls pandas is left to pick a suitable reader.
    try:
        suffix = Path(fp).suffix.lower()
    except TypeError:
        suffix = ""  # not path-like (e.g. a file object): keep openpyxl
    engine = None if suffix == ".xls" else "openpyxl"

    parts = []
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Suppress openpyxl warnings
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
            for sheet_name, df in sheets.items():
                parts.append(f"Sheet: {sheet_name}\n")
                parts.append(format_table(df) + "\n\n")
    except Exception as e:
        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
    return "".join(parts).strip()
def extract_pptx_content(fp):
    """
    Extract text content from a Microsoft PowerPoint presentation.

    For each slide this gathers the text of every shape exposing ``text``,
    OCR text from picture shapes, and afterwards every table (first row
    used as the header) rendered through ``format_table``.

    Args:
        fp: The presentation to read; passed to ``Presentation``.

    Returns:
        The collected text with surrounding whitespace stripped. If reading
        fails, whatever was gathered so far is returned followed by an
        ``[Error reading Microsoft PowerPoint ...]`` marker.
    """
    picture_shape_type = 13  # value of python-pptx's MSO_SHAPE_TYPE.PICTURE
    chunks = []
    try:
        deck = Presentation(fp)
        for number, slide in enumerate(deck.slides, 1):
            found = []
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    found.append(shape.text.strip())
                is_picture = (
                    shape.shape_type == picture_shape_type
                    and hasattr(shape, "image")
                    and shape.image
                )
                if is_picture:
                    try:
                        picture = Image.open(io.BytesIO(shape.image.blob))
                        recognised = ocr_image(picture)
                        if recognised:
                            found.append(f"[OCR Text from image]:\n{recognised}")
                    except Exception:
                        # Pictures that cannot be decoded are skipped.
                        pass

            if not found:
                chunks.append(f"Slide {number} Text:\nNo text found on this slide.\n\n")
            else:
                chunks.append(f"Slide {number} Text:\n" + "\n".join(found) + "\n\n")

            # Extract tables
            for shape in slide.shapes:
                if not shape.has_table:
                    continue
                grid = [
                    [cell.text.strip() for cell in row.cells]
                    for row in shape.table.rows
                ]
                if grid:
                    header, *body = grid
                    frame = pd.DataFrame(body, columns=header)
                    chunks.append(
                        f"Table on slide {number}:\n{format_table(frame)}\n\n"
                    )
    except Exception as e:
        chunks.append(f"\n[Error reading Microsoft PowerPoint {fp}: {e}]")
    return "".join(chunks).strip()
def extract_file_content(fp):
    """
    Pick an extractor from the file extension and return the file's text.

    ``.pdf``, ``.doc``/``.docx``, ``.xlsx``/``.xls`` and ``.ppt``/``.pptx``
    are handed to their dedicated extractors. Any other extension is read
    as UTF-8 plain text and passed through ``clean_text``.

    Args:
        fp: Path of the file to read.

    Returns:
        The extracted text, or an ``[Error reading file ...]`` marker when
        the plain-text fallback fails.
    """
    suffix = Path(fp).suffix.lower()

    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)

    # Unknown extension: treat the file as UTF-8 plain text.
    try:
        raw = Path(fp).read_text(encoding="utf-8")
        return clean_text(raw)
    except Exception as e:
        return f"\n[Error reading file {fp}: {e}]"