Spaces:
Sleeping
Sleeping
import json
import re
def process_json_files(start, end, step, strip_in_braces=False, strip_diacritics=False):
    """Load numbered JSON books from ``texts/`` and collect those with usable text.

    For every ``i`` in ``range(start, end + 1, step)`` the file
    ``texts/<i>.json`` (``i`` zero-padded to at least two digits) is read.
    The nested ``"text"`` entry is flattened into one space-joined string,
    optionally filtered, and the book is kept only if something other than
    whitespace remains.

    Note that the filtered string is used only to decide whether a book is
    kept; the ``"text"`` value placed in the result is the unmodified value
    from the JSON file.

    Args:
        start: First book number (inclusive).
        end: Last book number (inclusive).
        step: Stride between book numbers; passed straight to ``range``, so
            ``0`` raises ``ValueError``.
        strip_in_braces: When true, remove every ``[...]`` span (non-greedy,
            may cross newlines) before the emptiness check.
        strip_diacritics: When true, remove every character that is not in
            U+05D0..U+05EA or a space before the emptiness check.

    Returns:
        A list, in iteration order, whose items are either
        ``{"book": i, "title": ..., "text": ...}`` for a kept book or
        ``{"error": "<message>"}`` for a file that is missing, is not valid
        JSON, lacks the ``"text"`` key, or has a structure that cannot be
        flattened. Books whose text is empty after filtering yield no item.
    """
    base_path = "texts"
    # Loop-invariant patterns, compiled once instead of per file.
    bracketed_span = re.compile(r"\[.*?\]", flags=re.DOTALL)
    non_hebrew_letters = re.compile(r"[^\u05D0-\u05EA ]+")
    results = []

    for i in range(start, end + 1, step):
        file_name = f"{base_path}/{i:02}.json"
        try:
            # Keep the handle open only for the read itself.
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)

            # Index (rather than .get) so that a missing key is reported by
            # the KeyError handler below instead of being silently skipped.
            text_blocks = data["text"]
            # Assumes "text" is a list of lists of strings, as implied by the
            # nested join; anything else surfaces as TypeError below.
            full_text = " ".join(" ".join(block) for block in text_blocks)
        except FileNotFoundError:
            results.append({"error": f"File {file_name} not found."})
            continue
        except json.JSONDecodeError as e:
            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
            continue
        except KeyError as e:
            results.append({"error": f"Expected key 'text' is missing in {file_name}: {e}"})
            continue
        except TypeError as e:
            # Non-object JSON root, null/non-iterable "text", or non-string
            # items: report it per file rather than aborting the whole batch.
            results.append({"error": f"File {file_name} has an unexpected structure: {e}"})
            continue

        if strip_in_braces:
            full_text = bracketed_span.sub("", full_text)
        if strip_diacritics:
            full_text = non_hebrew_letters.sub("", full_text)

        # Only keep books that still have content after the optional filters.
        if full_text.strip():
            results.append({
                "book": i,
                "title": data.get("title", "No title"),
                "text": text_blocks,
            })

    return results