File size: 1,664 Bytes
124e2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Read the JSON file
import json
import re
import unicodedata

def unicode_to_ascii(text):
    """Return *text* reduced to ASCII, folding accented letters to bases.

    The string is first NFKD-normalized so accented characters decompose
    into a base letter plus combining marks; encoding to ASCII with
    errors ignored then drops the combining marks and any remaining
    non-ASCII characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

def clean_html_tags(html_string):
    """
    Remove HTML tags from the input string and reduce it to ASCII.

    Tags are stripped with a regex, and the result is then passed through
    unicode_to_ascii, which folds accented characters to their base
    letters and drops any remaining non-ASCII characters. (The previous
    docstring only mentioned tag removal, which was misleading.)

    NOTE(review): the regex cannot handle a literal '<' inside text or
    malformed/nested markup; presumed acceptable for this export — verify
    against the input data if that assumption changes.

    Args:
        html_string (str): String possibly containing HTML tags

    Returns:
        str: ASCII-only string with all HTML tags removed
    """
    # One HTML tag: '<', one or more non-'>' characters, then '>'
    tag_pattern = re.compile(r'<[^>]+>')

    # Use the compiled pattern's own sub() rather than re.sub(pattern, ...)
    without_tags = tag_pattern.sub('', html_string)

    # Fold the remaining text down to plain ASCII
    return unicode_to_ascii(without_tags)

# The input is JSON-lines: each line is a standalone JSON object mapping a
# sugya name to its list of text sections, so parse line by line.
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
    megillah_data = file.readlines()

with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:
    for line in megillah_data:
        full_talmud = json.loads(line)

        for sugya, texts in full_talmud.items():
            # Collect cleaned English text and section ids per sugya; join
            # once at the end instead of repeated `content +=` concatenation
            # (linear instead of quadratic in total text size).
            cleaned_parts = []
            section_ids = []
            for text in texts:
                cleaned_parts.append(clean_html_tags(text['english']))
                section_ids.append(text['sefaria_id'])

            # Each section (including the last) is followed by one space,
            # matching the original concatenation's output exactly.
            content = "".join(f"{part} " for part in cleaned_parts)
            metadata = {"sugya": sugya, "sections": section_ids}
            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")