"""Convert the Megillah sugya map into one JSON-lines record per sugya.

Reads ``Megillah_map_to_english.json``, where each line is a JSON object
mapping a sugya name to a list of sections (each section carrying an
``english`` text and a ``sefaria_id``), and writes ``megillah_sugyot.json``
with one ``{"id", "metadata", "content"}`` JSON object per line.
"""

import json
import re
import unicodedata
from typing import Any, Dict, Iterable, Iterator

INPUT_PATH = "Megillah_map_to_english.json"
OUTPUT_PATH = "megillah_sugyot.json"

# Matches an HTML tag: "<", one or more characters other than ">", then ">".
# Compiled once here instead of on every call to clean_html_tags().
HTML_TAG_PATTERN = re.compile(r'<[^>]+>')


def unicode_to_ascii(text: str) -> str:
    """Return *text* reduced to plain ASCII.

    The text is NFKD-normalized so that composed characters are split into
    a base character plus combining marks; every character that still has
    no ASCII encoding is then silently dropped.

    Args:
        text: Input string, possibly containing non-ASCII characters.

    Returns:
        The ASCII-only version of the input.
    """
    normalized = unicodedata.normalize('NFKD', text)
    # errors='ignore' discards anything that cannot be encoded as ASCII.
    return normalized.encode('ascii', 'ignore').decode('ascii')


def clean_html_tags(html_string: str) -> str:
    """Remove all HTML tags from the input string and reduce it to ASCII.

    Args:
        html_string: String containing HTML tags.

    Returns:
        The string with every HTML tag removed and all non-ASCII
        characters stripped via ``unicode_to_ascii``.
    """
    without_tags = HTML_TAG_PATTERN.sub('', html_string)
    return unicode_to_ascii(without_tags)


def build_sugya_record(sugya: str, texts: Iterable[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the output record for a single sugya.

    Args:
        sugya: Name of the sugya; used both as the record id and in the
            metadata.
        texts: Sections of the sugya. Each one must provide an ``english``
            string and a ``sefaria_id``.

    Returns:
        A dict with keys ``id``, ``metadata`` (``sugya`` and the list of
        ``sections`` ids) and ``content`` (the cleaned English text of all
        sections, each followed by a single space).

    Raises:
        KeyError: If a section lacks ``english`` or ``sefaria_id``.
    """
    section_ids = []
    cleaned_parts = []
    for text in texts:
        # Each section is followed by one space (including the last one),
        # which keeps the output identical to the historical format.
        cleaned_parts.append(f"{clean_html_tags(text['english'])} ")
        section_ids.append(text['sefaria_id'])
    return {
        "id": sugya,
        "metadata": {"sugya": sugya, "sections": section_ids},
        "content": "".join(cleaned_parts),
    }


def iter_sugya_records(lines: Iterable[str]) -> Iterator[Dict[str, Any]]:
    """Yield one output record per sugya found in JSON-lines input.

    Args:
        lines: Iterable of strings, each holding one JSON object that maps
            sugya names to their list of sections.

    Yields:
        The record produced by ``build_sugya_record`` for every sugya, in
        input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for line in lines:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # data and would otherwise make json.loads raise.
        if not line.strip():
            continue
        full_talmud = json.loads(line)
        for sugya, texts in full_talmud.items():
            yield build_sugya_record(sugya, texts)


def main() -> None:
    """Read the sugya map and write one JSON record per sugya."""
    # The input is opened first, so a missing input file raises before the
    # output file is created or truncated.
    with open(INPUT_PATH, "r", encoding="utf-8") as file, \
            open(OUTPUT_PATH, "w", encoding="utf-8") as output_file:
        # Stream line by line instead of loading the whole input at once.
        for output in iter_sugya_records(file):
            output_file.write(f"{json.dumps(output)}\n")


if __name__ == "__main__":
    main()