# Read the Megillah JSON export, strip HTML markup and non-ASCII characters
# from the English text, and write one JSON record per sugya.
import json
import re
import unicodedata

def unicode_to_ascii(text):
    # Normalize to decomposed form (separate characters and combining marks)
    normalized = unicodedata.normalize('NFKD', text)
    # Remove non-ASCII chars (keeps only ASCII)
    ascii_text = normalized.encode('ascii', 'ignore').decode('ascii')
    return ascii_text

def clean_html_tags(html_string):
    """
    Remove all HTML tags from the input string.

    Args:
        html_string (str): String containing HTML tags

    Returns:
        str: String with all HTML tags removed
    """
    # This pattern matches HTML tags: < followed by anything except >, then >
    pattern = re.compile(r'<[^>]+>')
    # Replace all occurrences of HTML tags with empty string
    clean_text = re.sub(pattern, '', html_string)
    super_clean_text = unicode_to_ascii(clean_text)
    return super_clean_text
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
megillah_data = file.readlines()
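
# The input is assumed to be JSON Lines: each line is a standalone JSON object
# mapping sugya identifiers to lists of text segments.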
with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:
# Loop through each line in the file
for line in megillah_data:
full_talmud = json.loads(line)
        for sugya, texts in full_talmud.items():
            metadata = {"sugya": sugya, "sections": []}
            content = ""
            for text in texts:
                cleaned_text = clean_html_tags(text['english'])
                content += f"{cleaned_text} "
                metadata["sections"].append(text['sefaria_id'])
            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")