Spaces:

ramy2018
/

pope30

Sleeping

App Files Files Community

ramy2018 committed on May 30

Commit

7153947

verified ·

1 Parent(s): cb11377

Delete utils.py

Browse files

Files changed (1) hide show

utils.py +0 -62

utils.py DELETED Viewed

@@ -1,62 +0,0 @@
-import os
-import re
-from docx import Document
-from PyPDF2 import PdfReader
def read_txt_file(file_path):
    """Return the full contents of a text file as a string.

    The file is decoded as UTF-8 first; if that fails, it is re-read as
    Windows-1256 (cp1256).

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents, without a leading byte-order mark.

    Raises:
        OSError: If the file cannot be opened.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 as well, but additionally strips a
        # leading BOM so U+FEFF never leaks into the returned text.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the legacy Windows Arabic code page.
        with open(file_path, "r", encoding="cp1256") as f:
            return f.read()
def read_docx_file(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Opens the file with ``Document`` and joins the ``text`` of every entry in
    its ``paragraphs`` collection with newline characters.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined by newlines.
    """
    # NOTE(review): presumably only body paragraphs are exposed here (not
    # table cells or headers) - confirm against the python-docx docs.
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
def read_pdf_file(file_path):
    """Return the extracted text of every page of a PDF, concatenated.

    Pages are visited in the order given by ``PdfReader(...).pages`` and their
    text is joined with no separator.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text; an empty string if no page yields text.
    """
    pages = PdfReader(file_path).pages
    # A page whose extract_text() result is falsy (e.g. None) contributes
    # nothing to the output.
    return "".join(page.extract_text() or "" for page in pages)
def _overlap_tail(chunk, overlap):
    """Return at most `overlap` trailing characters of `chunk`, word-aligned."""
    if overlap <= 0:
        return ""
    if len(chunk) <= overlap:
        return chunk
    tail = chunk[-overlap:]
    if not chunk[-overlap - 1].isspace() and not tail[0].isspace():
        # The cut landed inside a word: drop the fragment so the overlap
        # starts on a whole word.
        pieces = tail.split(None, 1)
        tail = pieces[1] if len(pieces) == 2 else ""
    return tail.strip()


def split_arabic_text(text, chunk_size=500, overlap=50):
    """Split text into sentence-aligned chunks with a small overlap.

    Sentences are detected by whitespace following '.', '؟' (Arabic question
    mark) or '!'. Consecutive sentences are packed greedily into a chunk
    while the running length stays within `chunk_size`. Every chunk after the
    first is then prefixed with the tail of the preceding chunk so that
    context is preserved across chunk boundaries.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters packed into one chunk before
            the overlap prefix is added. A single sentence longer than this
            is kept whole as its own chunk rather than being cut.
        overlap: Maximum number of characters taken from the end of the
            previous chunk and prepended to the next one. Values <= 0
            disable overlapping.

    Returns:
        A list of chunk strings; empty if `text` contains no non-whitespace
        sentence.
    """
    sentences = re.split(r'(?<=[.؟!])\s+', text)

    chunks = []
    pending = []
    pending_len = 0  # length of the pending sentences, one separator each
    for sentence in sentences:
        if not sentence.strip():
            # Empty input or trailing whitespace after the final terminator
            # yields an empty piece; it must not become a chunk.
            continue
        # Only flush when something is pending, so an over-long first
        # sentence does not produce an empty chunk ahead of it.
        if pending and pending_len + len(sentence) > chunk_size:
            chunks.append(" ".join(pending).strip())
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += len(sentence) + 1
    if pending:
        chunks.append(" ".join(pending).strip())

    overlapped_chunks = []
    previous = None
    for chunk in chunks:
        prefix = "" if previous is None else _overlap_tail(previous, overlap)
        overlapped_chunks.append(prefix + " " + chunk if prefix else chunk)
        previous = chunk
    return overlapped_chunks
def process_documents(file_path):
    """Read a .txt, .docx or .pdf file and return its text as chunks.

    The reader is chosen from the lower-cased file extension. Line breaks in
    the extracted text are replaced by spaces, surrounding whitespace is
    stripped, and the result is passed to ``split_arabic_text`` with
    ``chunk_size=500``.

    Args:
        file_path: Path of the document to process.

    Returns:
        The list of chunks, or an empty list when the extension is not one of
        the three supported types.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix not in (".txt", ".docx", ".pdf"):
        return []

    if suffix == ".pdf":
        content = read_pdf_file(file_path)
    elif suffix == ".docx":
        content = read_docx_file(file_path)
    else:
        content = read_txt_file(file_path)

    # Flatten line breaks so sentence splitting sees one continuous line.
    flattened = content.replace('\n', ' ')
    flattened = flattened.replace('\r', ' ')
    return split_arabic_text(flattened.strip(), chunk_size=500)