neuralworm committed on
Commit
382e2e7
1 Parent(s): 8e59f09
Files changed (1) hide show
  1. util.py +3 -13
util.py CHANGED
@@ -1,26 +1,16 @@
1
  import json
2
  import re
3
 
4
- def process_json_files(start, end, step, strip_in_braces=False, strip_diacritics=False):
5
  base_path = "texts"
6
  results = []
7
 
8
- for i in range(start, end + 1, step):
9
  file_name = f"{base_path}/{i:02}.json"
10
  try:
11
  with open(file_name, 'r', encoding='utf-8') as file:
12
  data = json.load(file)
13
- text_blocks = data.get("text", [])
14
-
15
- full_text = " ".join([' '.join(block) for block in text_blocks])
16
-
17
- if strip_in_braces:
18
- full_text = re.sub(r"\[.*?\]", "", full_text, flags=re.DOTALL)
19
- if strip_diacritics:
20
- full_text = re.sub(r"[^\u05D0-\u05EA ]+", "", full_text)
21
-
22
- # Check if the full text is not empty after processing
23
- if full_text.strip():
24
  results.append({
25
  "book": i,
26
  "title": data.get("title", "No title"),
 
1
  import json
2
  import re
3
 
4
+ def process_json_files(start, end):
5
  base_path = "texts"
6
  results = []
7
 
8
+ for i in range(start, end + 1):
9
  file_name = f"{base_path}/{i:02}.json"
10
  try:
11
  with open(file_name, 'r', encoding='utf-8') as file:
12
  data = json.load(file)
13
+ if data:
 
 
 
 
 
 
 
 
 
 
14
  results.append({
15
  "book": i,
16
  "title": data.get("title", "No title"),