File size: 1,279 Bytes
b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a b6f8046 4932f0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import os
import re
from docx import Document
from PyPDF2 import PdfReader
def read_txt_file(file_path):
    """Return the full contents of a UTF-8 encoded text file as a string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading byte-order
    # mark when one is present, so U+FEFF never leaks into the returned text.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        return f.read()
def read_docx_file(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Opens the document with ``Document`` and joins the ``text`` of every
    entry in its ``paragraphs`` collection with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
def read_pdf_file(file_path):
    """Return the extracted text of a PDF, pages separated by newlines.

    Each page of the ``PdfReader`` contributes the result of its
    ``extract_text()`` call; a page whose extraction yields a falsy value
    (such as ``None``) contributes an empty string instead.
    """
    pdf = PdfReader(file_path)
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
def split_arabic_text(text, chunk_size=500):
    """Group the sentences of *text* into chunks of at most ``chunk_size`` characters.

    Sentences are delimited by a full stop, an Arabic question mark or an
    exclamation mark followed by whitespace. Consecutive sentences are
    packed into one chunk, separated by single spaces, for as long as the
    chunk stays within ``chunk_size``. Sentences are never cut: a single
    sentence longer than ``chunk_size`` becomes its own oversized chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum length of a chunk built from several sentences.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    # The lookbehind keeps the terminating punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    pending = []
    # Length of the pending sentences counting one separator space after each.
    pending_len = 0
    for sentence in sentences:
        if not sentence:
            # Produced by empty input or trailing whitespace; carries no text.
            continue
        if pending_len + len(sentence) > chunk_size:
            # Flush only real content: when the very first sentence is already
            # too long there is nothing pending and no empty chunk is emitted.
            chunk = " ".join(pending).strip()
            if chunk:
                chunks.append(chunk)
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += len(sentence) + 1
    chunk = " ".join(pending).strip()
    if chunk:
        chunks.append(chunk)
    return chunks
def process_documents(file_path):
    """Read a document and return its text as a list of chunks.

    The reader is chosen by the lower-cased file extension: ``.txt``,
    ``.docx`` or ``.pdf``. Any other extension yields an empty list. The
    text has its newline and carriage-return characters replaced by
    spaces and is stripped before being passed to ``split_arabic_text``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clause: unsupported file types produce no chunks.
    if suffix not in (".txt", ".docx", ".pdf"):
        return []

    if suffix == ".txt":
        raw = read_txt_file(file_path)
    elif suffix == ".docx":
        raw = read_docx_file(file_path)
    else:
        raw = read_pdf_file(file_path)

    # Flatten line breaks so sentence splitting sees one continuous line.
    flattened = raw.replace('\n', ' ')
    flattened = flattened.replace('\r', ' ')
    return split_arabic_text(flattened.strip())
|