Spaces:

neuralworm
/

els_journal

Sleeping

File size: 15,384 Bytes

3b58069

import logging
import json
import os
import re
from deep_translator import GoogleTranslator
from gematria import calculate_gematria
import math
import csv

# Module-level logger, named after this module.
# No handler or level is configured here; to see the DEBUG messages emitted
# below, uncomment the next line (or configure logging in the application).
# logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')
logger = logging.getLogger(__name__)

# Default location of the tab-separated OpenGNT source text.
_DEFAULT_TEXT_FILE = "texts/bible/OpenGNT_version3_3.csv"

# Column headers read from the source file.
_BOOK_COLUMN = '〔Book｜Chapter｜Verse〕'
_GREEK_COLUMN = '〔OGNTk｜OGNTu｜OGNTa｜lexeme｜rmac｜sn〕'

# Patterns used while cleaning a book's text (compiled once, not per book).
_BRACKETED_RE = re.compile(r"\[.*?\]|\{.*?\}|\<.*?\>", flags=re.DOTALL)
_NON_GREEK_RE = re.compile(r"[^\u0370-\u03FF\u1F00-\u1FFF ]+")
_WHITESPACE_RE = re.compile(r'\s+')

# English names of the 27 New Testament books, keyed by book number.
_NT_BOOKS = {
    40: "Matthew",
    41: "Mark",
    42: "Luke",
    43: "John",
    44: "Acts",
    45: "Romans",
    46: "1 Corinthians",
    47: "2 Corinthians",
    48: "Galatians",
    49: "Ephesians",
    50: "Philippians",
    51: "Colossians",
    52: "1 Thessalonians",
    53: "2 Thessalonians",
    54: "1 Timothy",
    55: "2 Timothy",
    56: "Titus",
    57: "Philemon",
    58: "Hebrews",
    59: "James",
    60: "1 Peter",
    61: "2 Peter",
    62: "1 John",
    63: "2 John",
    64: "3 John",
    65: "Jude",
    66: "Revelation",
}


def _parse_rounds(rounds):
    """Turn a comma-separated rounds value into a list of finite floats.

    Raises ValueError when a part is not a number or is inf/nan.
    """
    values = [float(part) for part in str(rounds).split(',')]
    for value in values:
        if not math.isfinite(value):
            raise ValueError(f"round values must be finite numbers, got {value}")
    return values


def _read_book_words(file_name, start, end):
    """Collect the first Greek sub-field of every row whose book is in [start, end].

    Returns a dict mapping book number -> list of words, in file order.
    Rows that cannot be parsed are logged and skipped.
    """
    row_errors = (KeyError, IndexError, ValueError, AttributeError, TypeError)
    book_words = {}
    # newline='' is what the csv module asks for when it is given a file object.
    with open(file_name, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        for row_num, row in enumerate(reader, start=1):
            try:
                # The field looks like '〔40｜1｜1〕'; the book number comes first.
                book = int(row[_BOOK_COLUMN].split('｜')[0].lstrip('〔'))
            except row_errors as e:
                logger.error(f"Error parsing row {row_num}: {e}")
                continue

            if book < start or book > end:
                continue

            try:
                greek_field = row[_GREEK_COLUMN]
                # Drop the opening '〔' (when present) and keep the first
                # '｜'-separated sub-field.
                if '〔' in greek_field:
                    greek_field = greek_field.split('〔')[1]
                word = greek_field.split('｜')[0]
            except row_errors as e:
                # Short rows yield None for the missing column; skip them
                # instead of aborting the whole run.
                logger.error(f"Error parsing row {row_num}: {e}")
                continue

            # setdefault keeps earlier words even if a book's rows are not
            # contiguous in the file.
            book_words.setdefault(book, []).append(word)
    return book_words


def _clean_text(text, strip_spaces, strip_in_braces, strip_diacritics):
    """Apply the optional stripping steps and collapse whitespace."""
    if strip_in_braces:
        text = _BRACKETED_RE.sub("", text)
    if strip_diacritics:
        # Despite the flag's name, this drops every character that is not in
        # the Greek / Greek Extended Unicode blocks or a plain space.
        text = _NON_GREEK_RE.sub("", text)
    text = _WHITESPACE_RE.sub(' ', text).strip()
    if strip_spaces:
        text = text.replace(" ", "")
    return text


def _select_round(text, step, r):
    """Return the characters picked from non-empty *text* for one round value.

    A positive *r* walks forward starting at index ``step - 1``; any other
    value walks backward starting at index ``len(text) - step``. The walk
    wraps around the text. Only the last full pass is kept, followed by the
    fractional pass (if any).
    """
    text_length = len(text)
    magnitude = abs(r)
    full_passes = math.floor(magnitude)
    remainder = magnitude - full_passes
    base_chars = text_length // step

    if base_chars == 0:
        # step exceeds the text length: one character per full pass, but only
        # when more than one round was requested; never a fractional pick.
        chars_per_pass = 1 if magnitude > 1 else 0
        chars_for_remainder = 0
    else:
        chars_per_pass = base_chars
        chars_for_remainder = math.floor(base_chars * remainder)

    if r > 0:
        first_index = (step - 1) % text_length
        stride = step
    else:
        first_index = (text_length - step) % text_length
        stride = -step

    # Every pick advances the cursor by one stride, so the passes that would
    # be discarded are skipped arithmetically instead of being walked.
    skipped = chars_per_pass * max(full_passes - 1, 0)
    picked = (chars_per_pass if full_passes else 0) + chars_for_remainder
    return "".join(
        text[(first_index + (skipped + k) * stride) % text_length]
        for k in range(picked)
    )


def process_json_files(start=40, end=66, step=1, rounds="1", length=0, tlang="en", strip_spaces=True,
                       strip_in_braces=True, strip_diacritics=True, translate=False, *,
                       file_name=_DEFAULT_TEXT_FILE):
    """
    Extract equidistant letter sequences from the books of a tab-separated OpenGNT file.

    For every book in the requested range the Greek words are concatenated,
    cleaned, and sampled every `step` characters once per entry in `rounds`.

    Parameters:
    - start (int): First book number to include (inclusive).
    - end (int): Last book number to include (inclusive).
    - step (int): Distance between selected characters; must be >= 1.
    - rounds (str): Comma-separated round values (floats allowed). A positive
      value reads forward, any other value reads backward; for values above 1
      only the last full pass is kept, followed by the fractional part.
    - length (int): If non-zero, the result text is sliced to `[:length]`.
    - tlang (str): Target language for translation.
    - strip_spaces (bool): Whether to remove spaces from the text.
    - strip_in_braces (bool): Whether to remove text inside [], {} or <>.
    - strip_diacritics (bool): Whether to remove every character outside the
      Greek and Greek Extended Unicode blocks (spaces are kept).
    - translate (bool): Whether to translate the result text.
    - file_name (str, keyword-only): Path of the tab-separated source file.

    Returns:
    - list | None: One dictionary per book that produced text (keys 'book',
      'title', 'result_text', 'result_sum', 'translated_text',
      'source_language'), or dictionaries with an 'error' key when something
      went wrong. None when there is nothing to report.
    """
    translator = GoogleTranslator(source='auto', target=tlang) if translate else None

    # Validate the sampling parameters once, before any file work.
    try:
        rounds_list = _parse_rounds(rounds)
    except ValueError as e:
        logger.error(f"Invalid rounds parameter: {e}")
        return [{"error": f"Invalid rounds parameter: {e}"}]

    try:
        step_is_valid = step >= 1
    except TypeError:
        step_is_valid = False
    if not step_is_valid:
        logger.error(f"Invalid step parameter: {step!r}")
        return [{"error": f"Invalid step parameter: {step!r} (must be a positive integer)"}]

    results = []

    try:
        book_words = _read_book_words(file_name, start, end)

        for book, words in book_words.items():
            logger.debug(f"Processing book {book}")
            clean_text = _clean_text(" ".join(words), strip_spaces, strip_in_braces, strip_diacritics)
            logger.debug(f"Clean text for book {book}: Length = {len(clean_text)}")

            if not clean_text:
                logger.warning(f"No text available for book {book} after cleaning.")
                continue

            result_text = "".join(_select_round(clean_text, step, r) for r in rounds_list)
            logger.debug(f"Result text for book {book}: {result_text}")

            if length != 0:
                result_text = result_text[:length]
                logger.debug(f"Book {book}: Result text truncated to length {length}.")

            if not result_text:
                # Nothing was selected, so there is nothing to translate or sum.
                continue

            # Translation is best-effort: a failure leaves the field empty.
            try:
                translated_text = translator.translate(result_text) if translator else ""
            except Exception as e:
                logger.error(f"Book {book}: Translation error: {e}")
                translated_text = ""

            # The gematria sum is best-effort as well.
            try:
                result_sum = calculate_gematria(result_text)
            except Exception as e:
                logger.error(f"Book {book}: Gematria calculation error: {e}")
                result_sum = None

            results.append({
                'book': f"Bible {book}.",
                'title': _NT_BOOKS.get(book, "Unknown Book"),
                'result_text': result_text,
                'result_sum': result_sum,
                'translated_text': translated_text,
                'source_language': 'el'
            })

    except FileNotFoundError:
        logger.error(f"File {file_name} not found.")
        results.append({"error": f"File {file_name} not found."})
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        results.append({"error": f"An unexpected error occurred: {e}"})

    return results or None


# Tests
# Each entry is an (actual, expected) pair. The process_json_files call in an
# entry runs as soon as this list is built, i.e. when the module is loaded.
# `expected` is the 'result_text' of the first returned result, or None when
# the call should yield no results.
test_results = [
    (process_json_files(40,40,386,rounds="1,0.5,-1,-0.5"),  "τωιεοννναυοοαμπυρααυοιξοηαϲοιιωομκνοοουομρυοιεχοοδεαλαννοτοκϲααυϲϲτεαδαϲαιεευαιηαεηαμαλκμαιακιγνμυνετνυυθθεγιεδρεαοαντηοκεοατϲνπναολαεοοφεηεϲωμκουμερρυοϲαοοαϲλτιηιωδυνϲυτυιχοονεαηωντολθβοτεαιαυοηετιτεαννυεινϲεενωκωξρυρηρνϲξεαγεαϲατωιεοννναυοοαμπυρααυοιξοηαϲοιιωομκνοοουομρυοιεχοοδεαλαννοτοκϲααυϲϲτεαδαϲαιεευαιηαεηαμαλκμαιακιγνμυνετνυυθθεγιεδρεαοϲτοαϲωθειιυνδνδξλονυταολαϲαττμτννοερτεοροανιεκτεεαιιϲωεαμϲωικμτποκϲϲιορϲπμοκαιουτωτδοωαξεαγωεφτωυπμαλυττταεομττλυαεπατονεαξτομυχαωηνυοβωπτυυκξαπιτααπυενεροοτϲαααυηοανηλταιθεαντινοιλκιβπγοδαιοηωαδαετακυϲοηυουαυνωαεαοττυαεεωτανεκβγεϲτοαϲωθειιυνδνδξλονυταολαϲαττμτννοερτεοροανιεκτεεαιιϲωεαμϲωικμτποκϲϲιορϲπμοκαιουτωτδοωαξεαγωεφτωυπμαλυττταεομττλυαε"),
    (process_json_files(40,40,386,rounds="1,-1"),  "τωιεοννναυοοαμπυρααυοιξοηαϲοιιωομκνοοουομρυοιεχοοδεαλαννοτοκϲααυϲϲτεαδαϲαιεευαιηαεηαμαλκμαιακιγνμυνετνυυθθεγιεδρεαοαντηοκεοατϲνπναολαεοοφεηεϲωμκουμερρυοϲαοοαϲλτιηιωδυνϲυτυιχοονεαηωντολθβοτεαιαυοηετιτεαννυεινϲεενωκωξρυρηρνϲξεαγεαϲαϲτοαϲωθειιυνδνδξλονυταολαϲαττμτννοερτεοροανιεκτεεαιιϲωεαμϲωικμτποκϲϲιορϲπμοκαιουτωτδοωαξεαγωεφτωυπμαλυττταεομττλυαεπατονεαξτομυχαωηνυοβωπτυυκξαπιτααπυενεροοτϲαααυηοανηλταιθεαντινοιλκιβπγοδαιοηωαδαετακυϲοηυουαυνωαεαοττυαεεωτανεκβγε"),
    # NOTE(review): the disabled cases below use book 1 and, in the last one,
    # an `average_compile` argument that process_json_files does not accept;
    # they look like leftovers from another variant of this module — confirm
    # before re-enabling.
    #(process_json_files(1, 1, 21, rounds="3", length=0), ""),
    #(process_json_files(1, 1, 22, rounds="1", length=0), ""),
    #(process_json_files(1, 1, 22, rounds="3", length=0), ""),
    #(process_json_files(1, 1, 23, rounds="3", length=0), ""),
    #(process_json_files(1, 1, 11, rounds="1", length=0), ""),
    #(process_json_files(1, 1, 2, rounds="1", length=0), ""),
    #(process_json_files(1, 1, 23, rounds="1", length=0), None),  # Expect None, when no results
    #(process_json_files(1, 1, 23, rounds="-1", length=0), None),  # Expect None, when no results
    #(process_json_files(1, 1, 22, rounds="-1", length=0), ""),
    #(process_json_files(1, 1, 22, rounds="-2", length=0), ""),
    #(process_json_files(1, 1, 1, rounds="-1", length=0), ""), # Reversed Hebrew alphabet
    #(process_json_files(1, 1, 1, rounds="1,-1", length=0), ""), # Combined rounds
    #(process_json_files(1, 1, 22, rounds="1,-1", length=0, average_compile=True), ""),  # average compile test (400+1) / 2 = math.ceil(200.5)=201=200+1="רא"
]

# Walk through every (actual, expected) pair and log its outcome.
# Passes are logged at WARNING level and failures at ERROR level; a single
# failure clears `all_tests_passed`.
all_tests_passed = True
for result, expected in test_results:
    if expected is None:
        # A None expectation means the call should have produced nothing.
        if result:
            logger.error(f"Test failed: Expected no results, but got: {result}")
            all_tests_passed = False
        else:
            logger.warning(f"Test passed: Expected no results, got no results.")
        continue

    if not result:
        # Text was expected but the call returned nothing at all.
        logger.error(f"Test failed: Expected '{expected}', but got no results")
        all_tests_passed = False
        continue

    # Only the first result's text is compared with the expectation.
    result_text = result[0]['result_text']
    if result_text != expected:
        logger.error(f"Test failed: Expected '{expected}', but got '{result_text}'")
        all_tests_passed = False
    else:
        logger.warning(f"Test passed: Expected '{expected}', got '{result_text}'")

if all_tests_passed:
    logger.info("All round tests passed.")