|
import os |
|
import re |
|
from docx import Document |
|
from PyPDF2 import PdfReader |
|
|
|
def read_txt_file(file_path):
    """Return the full contents of a UTF-8 text file as a string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents, without a leading byte-order mark.
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading BOM, so "\ufeff" never leaks into the first chunk of text.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        return f.read()
|
|
|
def read_docx_file(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Only the paragraphs exposed by the document's ``paragraphs`` attribute
    are read; their text is joined with newline characters.

    Args:
        file_path: Path of the .docx file to open with ``Document``.

    Returns:
        A single string containing every paragraph's text.
    """
    document = Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
|
|
|
def read_pdf_file(file_path):
    """Return the extracted text of a PDF, one page per line.

    Args:
        file_path: Path of the PDF file to open with ``PdfReader``.

    Returns:
        The text of every page joined with newline characters. A page whose
        extraction yields nothing (``None`` or an empty string) contributes
        an empty string.
    """
    pdf = PdfReader(file_path)
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        # Normalise a falsy extraction result so the join below never fails.
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
|
|
|
def split_arabic_text(text, chunk_size=500):
    """Split text into sentence-aligned chunks of about ``chunk_size`` characters.

    The text is cut after sentence terminators (``.``, ``!``, ``?`` and the
    Arabic question mark) that are followed by whitespace. Consecutive
    sentences are then packed greedily into chunks. A single sentence longer
    than ``chunk_size`` is never cut; it becomes an oversized chunk on its own.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of non-empty, stripped chunk strings. Empty or whitespace-only
        input yields an empty list.
    """
    # \u061F is the Arabic question mark. It is written as an escape so the
    # pattern cannot be corrupted by a file-encoding round trip.
    sentences = re.split(r'(?<=[.!?\u061F])\s+', text)

    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # their combined length, counting one separator each

    def flush():
        chunk = " ".join(pending).strip()
        if chunk:
            chunks.append(chunk)

    for sentence in sentences:
        if not sentence:
            # re.split yields "" for empty input or trailing whitespace.
            continue
        # Only flush when something is pending; otherwise an oversized first
        # sentence would produce an empty leading chunk.
        if pending and pending_len + len(sentence) > chunk_size:
            flush()
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += len(sentence) + 1

    flush()
    return chunks
|
|
|
def process_documents(file_path):
    """Read a supported document and return its text as a list of chunks.

    The reader is chosen from the lower-cased file extension: ``.txt``,
    ``.docx`` or ``.pdf``. Line breaks in the extracted text are replaced
    with spaces before the text is handed to ``split_arabic_text``.

    Args:
        file_path: Path of the document to process.

    Returns:
        The list of chunks produced by ``split_arabic_text``, or an empty
        list when the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clause: anything that is not a known document type is ignored.
    if suffix not in (".txt", ".docx", ".pdf"):
        return []

    if suffix == ".txt":
        raw_text = read_txt_file(file_path)
    elif suffix == ".docx":
        raw_text = read_docx_file(file_path)
    else:
        raw_text = read_pdf_file(file_path)

    # Flatten line breaks so sentence splitting sees one continuous line.
    without_newlines = raw_text.replace('\n', ' ')
    without_returns = without_newlines.replace('\r', ' ')
    flattened = without_returns.strip()
    return split_arabic_text(flattened)
|
|