gematria_date_sums

Sleeping

gematria_date_sums / util.py

add util

8e59f09 6 months ago

1.42 kB

	import json
	import re

	def process_json_files(start, end, step, strip_in_braces=False,
	                       strip_diacritics=False, base_path="texts"):
	    """Collect the numbered JSON books whose text is non-empty.

	    For every ``i`` in ``range(start, end + 1, step)`` the file
	    ``{base_path}/{i:02}.json`` is opened and parsed. Its ``"text"`` value
	    is treated as an iterable of blocks, each block an iterable of strings;
	    all strings are joined with single spaces and optionally filtered. If
	    anything other than whitespace remains, an entry for the book is added
	    to the result.

	    Args:
	        start: First book number (inclusive).
	        end: Last book number (inclusive).
	        step: Increment between book numbers, passed straight to ``range``.
	        strip_in_braces: When true, remove every ``[...]`` span
	            (square brackets, non-greedy, may cross newlines) before the
	            emptiness check.
	        strip_diacritics: When true, remove every character that is not in
	            U+05D0..U+05EA or a space before the emptiness check. This
	            drops more than diacritics: Latin letters, digits and
	            punctuation go too.
	        base_path: Directory holding the ``NN.json`` files. Defaults to
	            ``"texts"``, the location that used to be hard-coded.

	    Returns:
	        A list of dicts in book order. A kept book yields
	        ``{"book": i, "title": ..., "text": ...}`` where ``"text"`` is the
	        value exactly as stored in the file; the strip flags only decide
	        whether the book counts as empty, they do not alter the returned
	        text. A file that is missing, is not valid JSON, or does not have
	        the expected structure yields ``{"error": message}`` instead.
	        Books that are empty after filtering yield nothing.
	    """
	    # Compiled once: both patterns are loop-invariant.
	    bracketed_span = re.compile(r"\[.*?\]", flags=re.DOTALL)
	    disallowed_chars = re.compile(r"[^\u05D0-\u05EA ]+")
	    results = []

	    for i in range(start, end + 1, step):
	        file_name = f"{base_path}/{i:02}.json"

	        # Only the I/O and parsing step is guarded here, so unrelated
	        # failures further down are not misreported as file problems.
	        try:
	            with open(file_name, 'r', encoding='utf-8') as file:
	                data = json.load(file)
	        except FileNotFoundError:
	            results.append({"error": f"File {file_name} not found."})
	            continue
	        except json.JSONDecodeError as e:
	            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
	            continue

	        # Valid JSON whose root is not an object has no "text"/"title" to
	        # read; report it rather than letting one bad file abort the scan.
	        if not isinstance(data, dict):
	            results.append({"error": f"File {file_name} does not contain a JSON object."})
	            continue

	        try:
	            full_text = " ".join(" ".join(block) for block in data.get("text", []))
	        except TypeError as e:
	            # e.g. "text" is null, or a block holds non-string items.
	            results.append({"error": f"File {file_name} has a malformed 'text' field: {e}"})
	            continue

	        if strip_in_braces:
	            full_text = bracketed_span.sub("", full_text)
	        if strip_diacritics:
	            full_text = disallowed_chars.sub("", full_text)

	        # Skip books that have nothing left after filtering.
	        if not full_text.strip():
	            continue

	        results.append({
	            "book": i,
	            "title": data.get("title", "No title"),
	            # NOTE(review): the raw stored value is returned, not the
	            # filtered string; kept as-is because callers may rely on the
	            # original block structure. Confirm this is intended.
	            "text": data.get("text", "No text"),
	        })

	    return results