# Build a JSON Lines corpus of Megillah sugyot: read the sugya-to-English
# mapping, strip HTML tags and non-ASCII characters from each section, and
# write one cleaned document per sugya.

import json
import re
import unicodedata


def unicode_to_ascii(text):
    """
    Convert a Unicode string to its closest ASCII representation.

    Decomposes characters with NFKD normalization, then drops any
    code points that cannot be encoded as ASCII.

    Args:
        text (str): Input string, possibly containing non-ASCII characters

    Returns:
        str: ASCII-only version of the input string
    """
    normalized = unicodedata.normalize('NFKD', text)
    ascii_text = normalized.encode('ascii', 'ignore').decode('ascii')
    return ascii_text


def clean_html_tags(html_string):
    """
    Remove all HTML tags from the input string and normalize it to ASCII.

    Args:
        html_string (str): String containing HTML tags

    Returns:
        str: String with all HTML tags removed and non-ASCII characters dropped
    """
    pattern = re.compile(r'<[^>]+>')
    clean_text = pattern.sub('', html_string)
    return unicode_to_ascii(clean_text)


# Read the sugya-to-English mapping (one JSON object per line).
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
    megillah_data = file.readlines()

# Write one cleaned document per sugya as JSON Lines.
with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:
    for line in megillah_data:
        full_talmud = json.loads(line)

        for sugya, texts in full_talmud.items():
            metadata = {"sugya": sugya, "sections": []}
            content = ""
            for text in texts:
                cleaned_text = clean_html_tags(text['english'])
                content += f"{cleaned_text} "
                metadata["sections"].append(text['sefaria_id'])

            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")