Soltane777 committed on
Commit
70cb71f
·
verified ·
1 Parent(s): 36c62ac

Update backend/utils.py

Browse files
Files changed (1) hide show
  1. backend/utils.py +71 -15
backend/utils.py CHANGED
@@ -1,15 +1,71 @@
1
- import fitz # pymupdf لاستخراج النصوص من PDF
2
- from tika import parser
3
-
4
- # دالة لاستخراج النص من ملف PDF
5
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file using PyMuPDF.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to parse; the original left the document open.
    with fitz.open(file_path) as doc:
        # join() avoids rebuilding the accumulated string on every page.
        return "".join(page.get_text() for page in doc).strip()
11
-
12
- # دالة لاستخراج النصوص من أي مستند (PDF, DOCX, PPTX)
13
def extract_text_from_document(file_path):
    """Extract the text of a document by handing it to the Tika parser.

    Args:
        file_path: Path of the document to parse.

    Returns:
        The value stored under the "content" key of the parse result.
    """
    return parser.from_file(file_path)["content"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # pymupdf
2
+ from docx import Document
3
+ import pptx
4
+ import os
5
+ from typing import Optional
6
+
7
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract the text of a PDF file using PyMuPDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order with surrounding
        whitespace stripped, or ``None`` when the document yields no text or
        cannot be read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse, so the underlying file handle is never leaked.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort boundary: report the problem and signal "no text".
        print(f"Error reading PDF: {e}")
        return None
    # Strip first, then test: whitespace-only output counts as "no text".
    return text.strip() or None
20
+
21
def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extract the text of a Word (DOCX) file.

    Only the paragraphs exposed by ``Document(...).paragraphs`` are read;
    paragraphs whose text is blank are skipped.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The non-blank paragraphs joined with newlines, or ``None`` when there
        are none or the file cannot be read (the error is printed, not
        raised).
    """
    try:
        paragraphs = [
            p.text for p in Document(file_path).paragraphs if p.text.strip()
        ]
    except Exception as e:
        # Best-effort boundary: report the problem and signal "no text".
        print(f"Error reading DOCX: {e}")
        return None
    # An empty join result becomes None so "no text" is reported the same way
    # as by the PDF and PPTX extractors.
    return "\n".join(paragraphs) or None
31
+
32
def extract_text_from_pptx(file_path: str) -> Optional[str]:
    """Extract the text of a PowerPoint (PPTX) file.

    Walks every shape of every slide and collects the ``text`` of shapes that
    expose one, skipping shapes whose text is blank.

    Args:
        file_path: Path of the PPTX file to read.

    Returns:
        The collected shape texts joined with newlines, or ``None`` when no
        shape carries text or the file cannot be read (the error is printed,
        not raised).
    """
    try:
        presentation = pptx.Presentation(file_path)
        chunks = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Not every shape exposes text (pictures, for instance).
                if not hasattr(shape, "text"):
                    continue
                shape_text = shape.text
                # Skip empty placeholders; otherwise a deck without any text
                # would produce a string of bare newlines instead of None.
                if shape_text.strip():
                    chunks.append(shape_text)
    except Exception as e:
        # Best-effort boundary: report the problem and signal "no text".
        print(f"Error reading PPTX: {e}")
        return None
    return "\n".join(chunks) or None
47
+
48
def extract_text_from_document(file_path: str) -> Optional[str]:
    """Extract text from a document, dispatching on its file extension.

    Supported extensions (matched case-insensitively): ``.pdf``, ``.docx``,
    ``.pptx`` and ``.txt``.

    Args:
        file_path: Path of the document to read.

    Returns:
        For PDF/DOCX/PPTX, whatever the format-specific extractor returns.
        For TXT, the file content decoded as UTF-8 (a leading byte-order mark
        is dropped), or ``None`` when the file holds only whitespace.
        ``None`` is also returned, after printing a message, when the path
        does not exist, the extension is unsupported, or the text file cannot
        be read or decoded.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    # Lowercase once so the extension checks are case-insensitive.
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.pptx'):
        return extract_text_from_pptx(file_path)
    if lowered.endswith('.txt'):
        try:
            # utf-8-sig reads plain UTF-8 and also strips a BOM if present,
            # so '\ufeff' never leaks into the extracted text.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading TXT: {e}")
            return None
        # Report "no text" as None, but leave real content untouched.
        return content if content.strip() else None

    print(f"Unsupported file format: {file_path}")
    return None