Spaces:

Soltane777
/

textgeneration

Sleeping

App Files Files Community

Soltane777 committed on Apr 22

Commit

70cb71f

verified ·

1 Parent(s): 36c62ac

Update backend/utils.py

Browse files

Files changed (1) hide show

backend/utils.py +71 -15

backend/utils.py CHANGED Viewed

@@ -1,15 +1,71 @@
-import fitz  # pymupdf لاستخراج النصوص من PDF
-from tika import parser
-# Function that extracts the text of a PDF file.
-def extract_text_from_pdf(file_path):
-    doc = fitz.open(file_path)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    # Leading and trailing whitespace is removed from the combined page text.
-    return text.strip()
-# Function that extracts text from any document (PDF, DOCX, PPTX) via the tika parser.
-def extract_text_from_document(file_path):
-    parsed = parser.from_file(file_path)
-    # NOTE(review): presumably "content" is None when nothing was extracted - confirm.
-    return parsed["content"]

+import fitz  # pymupdf
+from docx import Document
+import pptx
+import os
+from typing import Optional
+def extract_text_from_pdf(file_path: str) -> Optional[str]:
+    """Extract the text of a PDF file using PyMuPDF (faster than Tika).
+
+    Args:
+        file_path: Path of the PDF file to read.
+
+    Returns:
+        The text of all pages concatenated in page order, with surrounding
+        whitespace stripped, or None when the document yields no text or
+        cannot be read.
+    """
+    try:
+        # The context manager closes the document (and its file handle)
+        # even when a page fails to parse.
+        with fitz.open(file_path) as doc:
+            text = "".join(page.get_text() for page in doc)
+    except Exception as e:
+        print(f"Error reading PDF: {e}")
+        return None
+    # A whitespace-only result counts as "no text", same as an empty one.
+    return text.strip() or None
+def extract_text_from_docx(file_path: str) -> Optional[str]:
+    """Extract the text of a Word (DOCX) file.
+
+    Args:
+        file_path: Path of the DOCX file to read.
+
+    Returns:
+        The non-blank paragraphs joined with newlines, or None when the
+        document has no non-blank paragraph or cannot be read.
+    """
+    try:
+        doc = Document(file_path)
+        # Paragraphs that are empty or whitespace-only are skipped.
+        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+    except Exception as e:
+        print(f"Error reading DOCX: {e}")
+        return None
+    # Return None rather than "" for an empty document, matching the
+    # PDF and PPTX extractors.
+    return "\n".join(paragraphs) or None
+def extract_text_from_pptx(file_path: str) -> Optional[str]:
+    """Extract the text of a PowerPoint (PPTX) file.
+
+    Args:
+        file_path: Path of the PPTX file to read.
+
+    Returns:
+        The text of every text-bearing shape, in slide order, joined with
+        newlines, or None when no shape contains text or the file cannot
+        be read.
+    """
+    try:
+        presentation = pptx.Presentation(file_path)
+        chunks = []
+        for slide in presentation.slides:
+            for shape in slide.shapes:
+                # Shapes without a `text` attribute are skipped, and so are
+                # empty placeholders: otherwise a deck with no text would
+                # produce a newline-only string instead of None.
+                if hasattr(shape, "text") and shape.text.strip():
+                    chunks.append(shape.text)
+    except Exception as e:
+        print(f"Error reading PPTX: {e}")
+        return None
+    return "\n".join(chunks) or None
+def extract_text_from_document(file_path: str) -> Optional[str]:
+    """Extract text from a document, dispatching on its file extension.
+
+    Supported formats are PDF, DOCX, PPTX and plain-text TXT files; the
+    extension check is case-insensitive.
+
+    Args:
+        file_path: Path of the document to read.
+
+    Returns:
+        The extracted text, or None when the path does not exist, the
+        extension is not supported, or the file cannot be read.
+    """
+    if not os.path.exists(file_path):
+        print(f"File not found: {file_path}")
+        return None
+    # Lower-case the path once instead of once per branch.
+    lowered = file_path.lower()
+    if lowered.endswith(".pdf"):
+        return extract_text_from_pdf(file_path)
+    if lowered.endswith(".docx"):
+        return extract_text_from_docx(file_path)
+    if lowered.endswith(".pptx"):
+        return extract_text_from_pptx(file_path)
+    if lowered.endswith(".txt"):
+        try:
+            # "utf-8-sig" reads plain UTF-8 and also drops a leading byte
+            # order mark, which would otherwise leak into the text as U+FEFF.
+            with open(file_path, "r", encoding="utf-8-sig") as f:
+                return f.read()
+        except Exception as e:
+            print(f"Error reading TXT: {e}")
+            return None
+    print(f"Unsupported file format: {file_path}")
+    return None