ramy2018 committed on
Commit
7153947
·
verified ·
1 Parent(s): cb11377

Delete utils.py

Browse files
Files changed (1) hide show
  1. utils.py +0 -62
utils.py DELETED
@@ -1,62 +0,0 @@
1
-
2
- import os
3
- import re
4
- from docx import Document
5
- from PyPDF2 import PdfReader
6
-
7
def read_txt_file(file_path):
    """Read a text file and return its contents as a string.

    The file is decoded as UTF-8 first; if that fails with a
    ``UnicodeDecodeError`` it is re-read as Windows-1256 (cp1256).

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents, without a leading byte-order mark.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and additionally strips a leading
        # BOM, which would otherwise end up as "\ufeff" at the start of the text.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read()
    except UnicodeDecodeError:
        # Fallback for files that are not valid UTF-8.
        with open(file_path, "r", encoding="cp1256") as f:
            return f.read()
14
-
15
def read_docx_file(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The text of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
19
-
20
def read_pdf_file(file_path):
    """Return the extracted text of every page of a PDF, concatenated.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined together with no separator. A page
        whose extraction yields a falsy value (``None`` or an empty
        string) contributes an empty string.
    """
    reader = PdfReader(file_path)
    # Join once instead of growing a string with ``+=`` page by page.
    return "".join(page.extract_text() or "" for page in reader.pages)
26
-
27
def split_arabic_text(text, chunk_size=500, overlap=50):
    """Split text into sentence-based chunks, each prefixed by its predecessor.

    The text is split after ``.``, ``؟`` or ``!`` followed by whitespace.
    Sentences are packed greedily into chunks whose accumulated length stays
    within ``chunk_size``; a single sentence longer than ``chunk_size``
    becomes a chunk of its own. Every returned item except the first is the
    previous chunk joined to the current chunk with a space, so consecutive
    items overlap by one whole chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum accumulated length used when packing sentences.
        overlap: Currently unused; kept so existing callers keep working.
            The overlap is always the entire previous chunk.

    Returns:
        A list of chunk strings. Empty or whitespace-only input yields an
        empty list, and no empty chunks are ever produced.
    """
    sentences = re.split(r'(?<=[.؟!])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence + " "
        else:
            # Flush only real content: when the very first sentence is already
            # too long, the pending chunk is still empty and must be skipped.
            pending = current_chunk.strip()
            if pending:
                chunks.append(pending)
            current_chunk = sentence + " "

    # The trailing chunk may be whitespace only (e.g. for empty input).
    pending = current_chunk.strip()
    if pending:
        chunks.append(pending)

    # Pair each chunk with the one before it to provide context overlap.
    return [
        " ".join(chunks[max(0, i - 1):i + 1])
        for i in range(len(chunks))
    ]
48
-
49
def process_documents(file_path, chunk_size=500):
    """Load a .txt, .docx or .pdf file and return its text as chunks.

    The reader is selected from the lower-cased file extension. Newline and
    carriage-return characters in the loaded text are replaced with spaces
    before the text is handed to ``split_arabic_text``.

    Args:
        file_path: Path of the document to process.
        chunk_size: Chunk size forwarded to ``split_arabic_text``.
            Defaults to 500, the value that was previously hard-coded.

    Returns:
        The list of chunks produced by ``split_arabic_text``, or an empty
        list when the file extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        raw_text = read_txt_file(file_path)
    elif ext == ".docx":
        raw_text = read_docx_file(file_path)
    elif ext == ".pdf":
        raw_text = read_pdf_file(file_path)
    else:
        # Unsupported types are skipped rather than treated as an error.
        return []

    clean_text = raw_text.replace('\n', ' ').replace('\r', ' ').strip()
    return split_arabic_text(clean_text, chunk_size=chunk_size)