Spaces:
Sleeping
Sleeping
Update backend/utils.py
Browse files- backend/utils.py +71 -15
backend/utils.py
CHANGED
@@ -1,15 +1,71 @@
|
|
1 |
-
import fitz # pymupdf
|
2 |
-
from
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fitz # pymupdf
|
2 |
+
from docx import Document
|
3 |
+
import pptx
|
4 |
+
import os
|
5 |
+
from typing import Optional
|
6 |
+
|
7 |
+
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract the text of a PDF file using PyMuPDF (``fitz``).

    The original note on this function describes PyMuPDF as faster than tika.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace removed, or ``None`` when the document yields no
        text (including whitespace-only output) or cannot be read.
    """
    try:
        # The context manager closes the document handle even if a page
        # fails to extract, so the underlying file is never left open.
        with fitz.open(file_path) as doc:
            page_texts = [page.get_text() for page in doc]
        # Strip before the emptiness check so whitespace-only output is
        # reported as "no text" (None) rather than an empty string.
        text = "".join(page_texts).strip()
        return text or None
    except Exception as e:
        # Best-effort API: report the problem and signal failure with None.
        print(f"Error reading PDF: {e}")
        return None
|
20 |
+
|
21 |
+
def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extract the text of a Word (DOCX) file.

    Only the document's paragraphs are read. Paragraphs whose text is empty
    or whitespace-only are skipped; the remaining ones are joined with
    newlines.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The joined paragraph text (an empty string when no paragraph has
        text), or ``None`` if the file cannot be read.
    """
    try:
        kept = []
        for paragraph in Document(file_path).paragraphs:
            if paragraph.text.strip():
                kept.append(paragraph.text)
        return "\n".join(kept)
    except Exception as err:
        # Best-effort API: report the problem and signal failure with None.
        print(f"Error reading DOCX: {err}")
        return None
|
31 |
+
|
32 |
+
def extract_text_from_pptx(file_path: str) -> Optional[str]:
    """Extract the text of a PowerPoint (PPTX) file.

    Walks every shape on every slide, in order, and collects the ``text``
    of each shape that exposes such an attribute.

    Args:
        file_path: Path of the PPTX file to read.

    Returns:
        The collected shape texts joined with newlines, or ``None`` when no
        shape exposes text or the file cannot be read.
    """
    try:
        deck = pptx.Presentation(file_path)
        chunks = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        if not chunks:
            return None
        return "\n".join(chunks)
    except Exception as err:
        # Best-effort API: report the problem and signal failure with None.
        print(f"Error reading PPTX: {err}")
        return None
|
47 |
+
|
48 |
+
def extract_text_from_document(file_path: str) -> Optional[str]:
    """Extract text from a document, dispatching on its file extension.

    Supported extensions (matched case-insensitively): ``.pdf``, ``.docx``,
    ``.pptx`` and ``.txt``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or ``None`` when the path does not exist, the
        extension is not supported, or reading fails. PDF, DOCX and PPTX
        files are delegated to the format-specific extractors and their
        return value is passed through unchanged.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    # Lower-case once so every extension check is case-insensitive.
    lowered = file_path.lower()

    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.pptx'):
        return extract_text_from_pptx(file_path)
    if lowered.endswith('.txt'):
        try:
            # 'utf-8-sig' decodes plain UTF-8 identically but drops a
            # leading byte-order mark, which would otherwise appear as a
            # stray '\ufeff' at the start of the returned text.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        except Exception as e:
            # Best-effort API: report the problem and signal failure.
            print(f"Error reading TXT: {e}")
            return None

    print(f"Unsupported file format: {file_path}")
    return None
|