File size: 8,977 Bytes
3b58069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import logging
logger = logging.getLogger(__name__)

import json
import os
import re
from deep_translator import GoogleTranslator
from gematria import calculate_gematria
import math


def process_json_files(start, end, step, rounds="1", length=0, tlang="en", strip_spaces=True, strip_in_braces=True,
                       strip_diacritics=True, translate=False):
    """
    Processes Torah JSON files and extracts every `step`-th character from each book.

    For each book number in [start, end] the file "texts/torah/<NN>.json" is loaded,
    its "text" blocks are joined and cleaned, and characters are picked at a fixed
    skip distance, once per entry in `rounds`.

    Parameters:
    - start (int): Number of the first book file to read.
    - end (int): Number of the last book file to read (inclusive).
    - step (int): Skip distance between picked characters. Must not be 0. A negative
      step selects no characters.
    - rounds (str): Comma-separated list of round values. A negative value walks the text
      backwards; for values above 1 only the last full pass is kept; a fractional part
      appends a proportional partial pass.
    - length (int): If non-zero, the result text is cut with result_text[:length].
    - tlang (str): Target language for translation.
    - strip_spaces (bool): Whether to remove spaces from the text. When False, runs of
      whitespace are collapsed to a single space instead.
    - strip_in_braces (bool): Whether to remove text within square brackets.
    - strip_diacritics (bool): Whether to remove every character that is not a Hebrew
      letter (U+05D0-U+05EA) or a space.
    - translate (bool): Whether to translate the result text.

    Returns:
    - list or None: A list of dictionaries containing processed data or error messages,
      or None when nothing was produced at all.

    Raises:
    - ValueError: If step is 0 or an entry of `rounds` is not a valid number.
    """
    if step == 0:
        # Previously surfaced as an opaque ZeroDivisionError in the middle of the file loop.
        raise ValueError("step must be a non-zero integer")

    base_path = "texts/torah"
    # The translator is only built when a translation was actually requested.
    translator = GoogleTranslator(source='auto', target=tlang) if translate else None
    # Parsed once up front: the value does not depend on the file being processed.
    rounds_list = [float(r) for r in rounds.split(',')]  # Allow floats
    results = []

    for i in range(start, end + 1):
        file_name = f"{base_path}/{i:02}.json"
        try:
            # Close the file as soon as it is parsed instead of holding it during processing.
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)
            text_blocks = data["text"]

            # NOTE(review): blocks are concatenated without a separator, so with
            # strip_spaces=False the last word of one block is fused with the first
            # word of the next. Kept as in the original - confirm whether intended.
            full_text = "".join(' '.join(block) for block in text_blocks)

            clean_text = _clean_text(full_text, strip_spaces, strip_in_braces, strip_diacritics)
            if not clean_text:
                # If after cleaning, there's no text, skip processing
                continue

            result_text = "".join(_select_round_chars(clean_text, step, r) for r in rounds_list)

            # Truncate before translating so that translated_text always describes
            # the same text that is returned as result_text / result_sum.
            if length != 0:
                result_text = result_text[:length]

            if not result_text:
                continue

            translated_text = translator.translate(result_text) if translator is not None else ""

            results.append({
                "book": f"Torah {i}.",
                "title": data.get("title", "Unknown Title"),
                "result_text": result_text,
                "result_sum": calculate_gematria(result_text),
                "translated_text": translated_text,
                "source_language": "iw",
            })

        except FileNotFoundError:
            results.append({"error": f"File {file_name} not found."})
        except json.JSONDecodeError as e:
            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
        except KeyError as e:
            # NOTE: any KeyError raised while processing this file ends up here,
            # not only a missing "text" key.
            results.append({"error": f"Expected key 'text' is missing in {file_name}: {e}"})

    return results or None


def _clean_text(text, strip_spaces, strip_in_braces, strip_diacritics):
    """Applies the optional bracket, non-Hebrew and whitespace clean-up steps to `text`."""
    if strip_in_braces:
        text = re.sub(r"\[.*?\]", "", text, flags=re.DOTALL)
    if strip_diacritics:
        text = re.sub(r"[^\u05D0-\u05EA ]+", "", text)
    if strip_spaces:
        text = text.replace(" ", "")
    else:
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
    return text


def _select_round_chars(clean_text, step, r):
    """
    Returns the characters a single round value `r` contributes.

    `clean_text` must be non-empty and `step` non-zero. The walk wraps around the
    text; only the last full pass is returned, followed by the partial pass that
    corresponds to the fractional part of `r`.
    """
    text_length = len(clean_text)
    abs_r = abs(r)

    # Number of full passes and the fractional remainder.
    full_passes = math.floor(abs_r)
    remainder = abs_r - full_passes

    base_chars = text_length // step
    if base_chars == 0:
        # step > text_length: one character per full pass, but only for rounds > 1;
        # a fractional remainder contributes nothing.
        chars_per_full_pass = 1 if abs_r > 1 else 0
        chars_for_remainder = 0
    else:
        chars_per_full_pass = base_chars
        chars_for_remainder = math.floor(base_chars * remainder)  # Partial pass

    if r > 0:
        current_index = (step - 1) % text_length
        stride = step
    else:
        current_index = (text_length - step) % text_length
        stride = -step

    picked = []

    if full_passes > 0 and chars_per_full_pass > 0:
        # Only the last full pass is kept, so jump over the earlier passes
        # arithmetically instead of walking them character by character.
        skipped = (full_passes - 1) * chars_per_full_pass
        current_index = (current_index + skipped * stride) % text_length
        for _ in range(chars_per_full_pass):
            picked.append(clean_text[current_index])
            current_index = (current_index + stride) % text_length

    # Remainder pass, for fractional rounds; it continues where the full passes ended.
    if chars_for_remainder > 0:
        for _ in range(chars_for_remainder):
            picked.append(clean_text[current_index])
            current_index = (current_index + stride) % text_length

    return "".join(picked)




# Tests
# Self-check that runs when the module is loaded. It reads texts/torah/00.json,
# which presumably holds the 22 Hebrew letters alef..tav in order (inferred from
# the expected values below - confirm against the fixture).
# Each entry pairs the function's return value with the expected result_text of
# its first record; None means that no result at all is expected.
test_results = [
    (process_json_files(0, 0, 21, rounds="3", length=0), "ק"),
    (process_json_files(0, 0, 22, rounds="1", length=0), "ת"),
    (process_json_files(0, 0, 22, rounds="3", length=0), "ת"),
    (process_json_files(0, 0, 23, rounds="3", length=0), "ג"),
    (process_json_files(0, 0, 11, rounds="1", length=0), "כת"),
    (process_json_files(0, 0, 2, rounds="1", length=0), "בדוחילנעצרת"),
    (process_json_files(0, 0, 23, rounds="1", length=0), None),  # Expect None, when no results
    (process_json_files(0, 0, 23, rounds="-1", length=0), None),  # Expect None, when no results
    (process_json_files(0, 0, 22, rounds="-1", length=0), "א"),
    (process_json_files(0, 0, 22, rounds="-2", length=0), "א"),
    (process_json_files(0, 0, 1, rounds="1,-1", length=0), "אבגדהוזחטיכלמנסעפצקרשתתשרקצפעסנמלכיטחזוהדגבא"), # Combined rounds
    (process_json_files(0, 0, 1, rounds="-1", length=0), "תשרקצפעסנמלכיטחזוהדגבא"), # Reversed Hebrew alphabet
    (process_json_files(0, 0, 1, rounds="-1.5", length=0), "תשרקצפעסנמלכיטחזוהדגבאתשרקצפעסנמל"), # Fractional rounds
]

all_tests_passed = True
for result, expected in test_results:
    first_entry = result[0] if result else {}
    # The first entry may be an {"error": ...} record (e.g. fixture file missing),
    # so read the text with .get() instead of indexing, which raised KeyError.
    result_text = first_entry.get('result_text')
    # What to show in failure messages: the text, or the error when there is no text.
    shown = result_text if result_text is not None else first_entry.get('error')

    if expected is None:  # Check if no result is expected
        if not result:
            logger.warning("Test passed: Expected no results, got no results.")
        else:
            logger.error("Test failed: Expected no results, but got: %s", shown)
            all_tests_passed = False
    elif not result:
        logger.error("Test failed: Expected '%s', but got no results", expected)
        all_tests_passed = False
    elif result_text == expected:
        logger.warning("Test passed: Expected '%s', got '%s'", expected, result_text)
    else:
        logger.error("Test failed: Expected '%s', but got '%s'", expected, shown)
        all_tests_passed = False

if all_tests_passed:
    logger.info("All round tests passed.")