Spaces:

ramy2018
/

pope30

Sleeping

File size: 1,279 Bytes

b6f8046
 
 
 
 
 
4932f0a
 
b6f8046
 
 
4932f0a
b6f8046
 
 
4932f0a
b6f8046
4932f0a
b6f8046
 
4932f0a
b6f8046
4932f0a
 
b6f8046
4932f0a
 
 
 
 
b6f8046
 
 
 
4932f0a
b6f8046
4932f0a
b6f8046
4932f0a
b6f8046
 
 
4932f0a

import os
import re
from docx import Document
from PyPDF2 import PdfReader

def read_txt_file(file_path):
    """Read a UTF-8 text file and return its entire contents as a string.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    (often written by Windows editors) is dropped instead of leaking into the
    returned text as a stray U+FEFF character. Files without a BOM decode
    exactly as they would with plain ``utf-8``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8-sig") as f:
        return f.read()

def read_docx_file(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Opens the file with ``Document`` and joins the ``text`` of each entry in
    its ``paragraphs`` collection with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def read_pdf_file(file_path):
    """Return the extracted text of a PDF, one page per line.

    Each page's ``extract_text()`` result is used as-is; a falsy result
    (e.g. ``None`` or an empty string) contributes an empty string so the
    join never fails.
    """
    page_texts = []
    for page in PdfReader(file_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def split_arabic_text(text, chunk_size=500):
    """Split text into sentence-aligned chunks of about ``chunk_size`` characters.

    Sentences are delimited by whitespace that follows ``.``, ``؟`` or ``!``.
    Consecutive sentences are packed into one chunk, separated by single
    spaces, for as long as they fit; a sentence that does not fit starts a
    new chunk. A single sentence longer than ``chunk_size`` is never cut and
    becomes an oversized chunk of its own.

    Empty chunks are never emitted: empty or whitespace-only input yields
    ``[]``, and an over-long first sentence no longer produces a leading
    ``""`` entry.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.
    """
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    pending = []     # sentences collected for the chunk currently being built
    pending_len = 0  # size of ``pending``, counting one separator space per sentence

    for sentence in sentences:
        # The fit test counts the separator after every sentence already
        # collected, so chunk boundaries depend on that accounting.
        if pending_len + len(sentence) > chunk_size:
            chunk = " ".join(pending).strip()
            if chunk:  # skip the empty chunk produced when nothing is pending
                chunks.append(chunk)
            pending, pending_len = [], 0
        pending.append(sentence)
        pending_len += len(sentence) + 1

    chunk = " ".join(pending).strip()
    if chunk:
        chunks.append(chunk)
    return chunks

def process_documents(file_path):
    """Load a document and return its text as a list of chunks.

    The reader is chosen from the lower-cased file extension: ``.txt``,
    ``.docx`` or ``.pdf``. Any other extension yields an empty list without
    touching the file. Line breaks in the loaded text are replaced by spaces
    and the result is stripped before being handed to ``split_arabic_text``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Bail out early for anything we have no reader for.
    if suffix not in (".txt", ".docx", ".pdf"):
        return []

    if suffix == ".txt":
        raw = read_txt_file(file_path)
    elif suffix == ".docx":
        raw = read_docx_file(file_path)
    else:
        raw = read_pdf_file(file_path)

    # Map both newline and carriage return to a single space each.
    flattened = raw.translate(str.maketrans("\n\r", "  ")).strip()
    return split_arabic_text(flattened)