File size: 1,416 Bytes
8e59f09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import json
import re

def process_json_files(start, end, step, strip_in_braces=False, strip_diacritics=False):
    """Load numbered JSON books from ``texts/`` and keep those with non-empty text.

    For every book number ``i`` in ``range(start, end + 1, step)`` the file
    ``texts/<i>.json`` is read (``i`` zero-padded to at least two digits).
    The value under ``"text"`` is treated as an iterable of blocks, each block
    an iterable of strings; all strings are joined with single spaces into one
    working string that is used only to decide whether the book is kept.

    Args:
        start: First book number to load.
        end: Last book number to load (inclusive).
        step: Increment between book numbers, passed straight to ``range``.
        strip_in_braces: If true, every ``[...]`` span (square brackets,
            matched non-greedily and across newlines) is removed from the
            working string before the emptiness check.
        strip_diacritics: If true, every character other than the Hebrew
            letters U+05D0-U+05EA and the space is removed from the working
            string before the emptiness check.

    Returns:
        A list of dicts in book order. A kept book yields
        ``{"book": i, "title": ..., "text": ...}``, where ``"title"`` falls
        back to ``"No title"`` and ``"text"`` is the raw value stored in the
        file (the strip options influence only whether the book is kept, not
        the returned text). A file that is missing, is not valid JSON, or does
        not have the expected structure yields ``{"error": message}``. A book
        whose working string is blank (including a book with no ``"text"``
        key) contributes nothing.

    Raises:
        ValueError: If ``step`` is zero (raised by ``range``).
        OSError: I/O failures other than a missing file are not handled here.
    """
    base_path = "texts"
    results = []

    # Loop-invariant patterns are built once instead of on every iteration.
    bracketed = re.compile(r"\[.*?\]", flags=re.DOTALL)
    non_hebrew = re.compile(r"[^\u05D0-\u05EA ]+")

    for i in range(start, end + 1, step):
        file_name = f"{base_path}/{i:02}.json"

        # Only the I/O and parsing live in this try, so each handler maps to
        # exactly one failure source.
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except FileNotFoundError:
            results.append({"error": f"File {file_name} not found."})
            continue
        except json.JSONDecodeError as e:
            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
            continue

        # A top-level value that is not an object has no ``.get``
        # (AttributeError); a "text" value that is not iterable, or blocks
        # holding non-strings, break the join (TypeError). Report these per
        # file like the other failures instead of aborting the whole batch.
        try:
            text_blocks = data.get("text", [])
            title = data.get("title", "No title")
            full_text = " ".join(" ".join(block) for block in text_blocks)
        except (AttributeError, TypeError) as e:
            results.append({"error": f"File {file_name} has an unexpected structure: {e}"})
            continue

        if strip_in_braces:
            full_text = bracketed.sub("", full_text)
        if strip_diacritics:
            full_text = non_hebrew.sub("", full_text)

        # Skip books whose text is blank after processing.
        if not full_text.strip():
            continue

        results.append({
            "book": i,
            "title": title,
            # Raw value from the file; a non-blank working string implies the
            # "text" key was present, so no fallback is needed here.
            "text": text_blocks,
        })

    return results