Spaces:

davidr70
/

embedder

Sleeping

App Files Files Community

davidr70 committed on Mar 20

Commit

124e2e4

1 Parent(s): 6fb6f87

code for creating dataset

Browse files

Files changed (1) hide show

dataset_creator.py +50 -0

dataset_creator.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# Read the JSON file
+import json
+import re
+import unicodedata
+def unicode_to_ascii(text):
+    """
+    Reduce a string to plain ASCII.
+    The text is NFKD-normalized so that composed and compatibility
+    characters are split into a base character plus combining marks; every
+    code point that still cannot be encoded as ASCII is then dropped.
+    Args:
+        text (str): Input string, possibly containing non-ASCII characters
+    Returns:
+        str: The ASCII-only version of the input
+    """
+    decomposed = unicodedata.normalize('NFKD', text)
+    # errors='ignore' silently discards anything outside the ASCII range
+    ascii_bytes = decomposed.encode('ascii', errors='ignore')
+    return ascii_bytes.decode('ascii')
+def clean_html_tags(html_string):
+    """
+    Strip HTML tags from a string and reduce the remainder to ASCII.
+    Args:
+        html_string (str): Text that may contain HTML tags
+    Returns:
+        str: The text with every tag deleted and then passed through
+            unicode_to_ascii, so non-ASCII characters are removed as well
+    """
+    # A tag is "<", then one or more characters other than ">", then ">"
+    tag_re = re.compile(r'<[^>]+>')
+    # Each tag is deleted outright; nothing is inserted in its place
+    without_tags = tag_re.sub('', html_string)
+    return unicode_to_ascii(without_tags)
+# Convert the sugya map into a JSON-lines dataset: one output record per sugya.
+# Each non-blank input line is parsed as a JSON object that maps a sugya name
+# to a list of sections, where every section has an 'english' text and a
+# 'sefaria_id'.
+with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
+    megillah_data = file.readlines()
+with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:
+    for line in megillah_data:
+        # A blank or whitespace-only line is not a JSON document; json.loads
+        # would raise on it and abort the run mid-file, so skip it.
+        if not line.strip():
+            continue
+        full_talmud = json.loads(line)
+        for sugya, texts in full_talmud.items():
+            # Every cleaned section is followed by a single space, so the
+            # joined content ends with a trailing space.
+            content = "".join(
+                f"{clean_html_tags(text['english'])} " for text in texts
+            )
+            metadata = {
+                "sugya": sugya,
+                "sections": [text['sefaria_id'] for text in texts],
+            }
+            output = {"id": sugya, "metadata": metadata, "content": content}
+            output_file.write(f"{json.dumps(output)}\n")