davidr70 committed on
Commit
124e2e4
·
1 Parent(s): 6fb6f87

code for creating dataset

Browse files
Files changed (1) hide show
  1. dataset_creator.py +50 -0
dataset_creator.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the JSON file
2
+ import json
3
+ import re
4
+ import unicodedata
5
+
6
def unicode_to_ascii(text):
    """Return *text* reduced to plain ASCII.

    The input is first normalized to NFKD form, which decomposes each
    character into its base character plus separate combining marks;
    encoding to ASCII with ``errors='ignore'`` then drops every code
    point outside the ASCII range (accents, Hebrew letters, etc.).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
14
+
15
def clean_html_tags(html_string):
    """
    Remove all HTML tags from the input string and reduce it to ASCII.

    Args:
        html_string (str): String possibly containing HTML tags.

    Returns:
        str: The input with every HTML tag removed, then passed through
        unicode_to_ascii so any remaining non-ASCII characters are
        decomposed and dropped.
    """
    # Matches one complete tag: "<", one or more non-">" chars, then ">".
    # NOTE: lexical strip only — not a real HTML parser (comments and
    # "<"/">" inside attribute values are not handled).
    pattern = re.compile(r'<[^>]+>')

    # Call sub() on the compiled pattern directly instead of routing the
    # compiled object back through module-level re.sub().
    clean_text = pattern.sub('', html_string)
    return unicode_to_ascii(clean_text)
32
+
33
# Read the JSON-lines mapping of Megillah sugyot to English text.
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as source:
    raw_lines = source.readlines()

# Emit one cleaned JSON record per sugya, one per line.
with open("megillah_sugyot.json", "w", encoding="utf-8") as sink:
    for raw_line in raw_lines:
        talmud_record = json.loads(raw_line)

        for sugya_id, section_texts in talmud_record.items():
            section_ids = []
            pieces = []
            for section in section_texts:
                # Each piece keeps a trailing space, matching the
                # original concatenation format exactly.
                pieces.append(f"{clean_html_tags(section['english'])} ")
                section_ids.append(section['sefaria_id'])

            record = {
                "id": sugya_id,
                "metadata": {"sugya": sugya_id, "sections": section_ids},
                "content": "".join(pieces),
            }
            sink.write(f"{json.dumps(record)}\n")