Spaces:
Sleeping
Sleeping
neuralworm
committed on
Commit
•
382e2e7
1
Parent(s):
8e59f09
add util
Browse files
util.py
CHANGED
@@ -1,26 +1,16 @@
|
|
1 |
import json
|
2 |
import re
|
3 |
|
4 |
-
def process_json_files(start, end
|
5 |
base_path = "texts"
|
6 |
results = []
|
7 |
|
8 |
-
for i in range(start, end + 1
|
9 |
file_name = f"{base_path}/{i:02}.json"
|
10 |
try:
|
11 |
with open(file_name, 'r', encoding='utf-8') as file:
|
12 |
data = json.load(file)
|
13 |
-
|
14 |
-
|
15 |
-
full_text = " ".join([' '.join(block) for block in text_blocks])
|
16 |
-
|
17 |
-
if strip_in_braces:
|
18 |
-
full_text = re.sub(r"\[.*?\]", "", full_text, flags=re.DOTALL)
|
19 |
-
if strip_diacritics:
|
20 |
-
full_text = re.sub(r"[^\u05D0-\u05EA ]+", "", full_text)
|
21 |
-
|
22 |
-
# Check if the full text is not empty after processing
|
23 |
-
if full_text.strip():
|
24 |
results.append({
|
25 |
"book": i,
|
26 |
"title": data.get("title", "No title"),
|
|
|
1 |
import json
|
2 |
import re
|
3 |
|
4 |
+
def process_json_files(start, end):
|
5 |
base_path = "texts"
|
6 |
results = []
|
7 |
|
8 |
+
for i in range(start, end + 1):
|
9 |
file_name = f"{base_path}/{i:02}.json"
|
10 |
try:
|
11 |
with open(file_name, 'r', encoding='utf-8') as file:
|
12 |
data = json.load(file)
|
13 |
+
if data:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
results.append({
|
15 |
"book": i,
|
16 |
"title": data.get("title", "No title"),
|