code for creating dataset
Browse files- dataset_creator.py +50 -0
dataset_creator.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
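# Build the Megillah sugyot dataset: read the line-delimited JSON mapping,
# strip HTML tags and non-ASCII characters from the English text, and write
# one JSON record per sugya.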
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
import unicodedata
|
5 |
+
|
6 |
+
# Typographic punctuation that NFKD does not decompose to ASCII. Without this
# table the final ASCII encode would delete these characters outright, which
# strips apostrophes from possessives and fuses words joined by a dash.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / typographic apostrophe
    "\u201a": "'",   # single low-9 quotation mark
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u201e": '"',   # double low-9 quotation mark
    "\u2010": "-",   # hyphen (also the NFKD target of the non-breaking hyphen)
    "\u2012": "-",   # figure dash
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2015": "-",   # horizontal bar
    "\u2212": "-",   # minus sign
})


def unicode_to_ascii(text):
    """
    Convert a Unicode string to its closest ASCII-only form.

    The text is NFKD-normalized so accented letters split into a base letter
    plus combining marks and compatibility characters (ligatures, ellipsis,
    no-break space) expand to their plain equivalents. Common typographic
    quotes and dashes are then mapped to ', " and -. Any character that is
    still outside ASCII (combining marks, non-Latin scripts) is dropped.

    Args:
        text (str): Input string, possibly containing non-ASCII characters

    Returns:
        str: ASCII-only string
    """
    # Normalize to decomposed form (separate characters and combining marks)
    normalized = unicodedata.normalize('NFKD', text)

    # Fold punctuation after normalization so compatibility variants that
    # decompose into a mapped character are covered as well.
    folded = normalized.translate(_TYPOGRAPHIC_TO_ASCII)

    # Remove whatever is still non-ASCII (keeps only ASCII)
    return folded.encode('ascii', 'ignore').decode('ascii')
|
14 |
+
|
15 |
+
# Matches an HTML tag: "<", one or more characters other than ">", then ">".
# Compiled once at import time instead of on every call.
_HTML_TAG_PATTERN = re.compile(r'<[^>]+>')


def clean_html_tags(html_string):
    """
    Turn an HTML fragment into plain ASCII text.

    Tags are removed, HTML character references such as ``&amp;`` or
    ``&nbsp;`` are decoded to the characters they stand for, and the result
    is passed through ``unicode_to_ascii``.

    Args:
        html_string (str): String containing HTML tags

    Returns:
        str: ASCII-only string with all HTML tags removed and character
            references decoded
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import html

    # Replace all occurrences of HTML tags with empty string
    without_tags = _HTML_TAG_PATTERN.sub('', html_string)

    # Decode only after stripping tags, so an escaped "&lt;b&gt;" survives as
    # the literal text "<b>" rather than being mistaken for markup.
    decoded_text = html.unescape(without_tags)

    return unicode_to_ascii(decoded_text)
|
32 |
+
|
33 |
+
# Load every line of the input up front. Each non-blank line is parsed below
# as a JSON object whose items are (sugya key, list of entries), where every
# entry is indexed by 'english' and 'sefaria_id'.
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
    megillah_data = file.readlines()

# Write one JSON object per line:
#   {"id": <sugya>, "metadata": {"sugya": <sugya>, "sections": [...]}, "content": <text>}
with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:
    # Loop through each line in the file
    for line in megillah_data:
        # A blank line (e.g. a stray trailing newline) is not valid JSON and
        # would make json.loads raise; skip it instead of aborting the export.
        if not line.strip():
            continue
        full_talmud = json.loads(line)

        for sugya, texts in full_talmud.items():
            metadata = {
                "sugya": sugya,
                "sections": [text['sefaria_id'] for text in texts],
            }
            # Each cleaned section is followed by a single space (including
            # the last one), matching the previously emitted format.
            content = "".join(
                f"{clean_html_tags(text['english'])} " for text in texts
            )
            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")
|
50 |
+
|