manaviel85370 committed
Commit 58c260c · 1 Parent(s): fc86982

create new date extractor, optimize testing and pipelines

pages/9_Testing.py CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
 from src.nlp.experimental.textclassification.classify_title import train_data
 from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
 from src.persistence.db import init_db
-from src.utils.Event import Event, DateTime, Address
+from src.utils.Event import Event, Schedule
 from src.utils.apis.googlemaps_api import GoogleMapsAPI
 from src.utils.helpers import normalize_data

@@ -90,7 +90,7 @@ if start_tests:

 dates = el.get("information", {}).get("actual", {}).get("dates", [])
 actual_event.schedule = [
-DateTime(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
+Schedule(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
 date.get("end_time", None), date.get("admittance_time", None))
 for date in dates]
 
src/nlp/playground/pipelines/date_experimentals.py ADDED
@@ -0,0 +1,297 @@
+ import os
+ import re
+ from datetime import datetime
+
+ import joblib
+ import spacy
+ from dotenv import load_dotenv
+ from huggingface_hub import hf_hub_download, login
+ from spacy import Language
+ from spacy.tokenizer import Tokenizer
+ from spacy.util import compile_suffix_regex, compile_infix_regex
+
+ from src.resources.TEXTS import TEXTS
+ from src.utils.Event import Schedule
+ from src.utils.helpers import normalize_data
+ from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
+
+ load_dotenv()
+ token = os.getenv("HUGGING_FACE_SPACES_TOKEN")
+ login(token=token)
+
+
+ placeholder = {
+     "DATE_RANGE_TIME_RANGE": "[DATE] [TIME] - [DATE] [TIME]",
+     "DATE_RANGE": "[DATE] - [DATE]",
+     "DATE_TIME_RANGE": "[DATE] [TIME] - [TIME]",
+     "TIME_RANGE": "[TIME] - [TIME]",
+     "DATE_TIME": "[DATE] [TIME]",
+     "DATE": "[DATE]",
+     "TIME": "[TIME]"
+ }
+
+
+ def convert_to_schedule(date_time, label):
+     print("Converting ", date_time, label)
+     try:
+         if label == "DATE_RANGE_TIME_RANGE":
+             return Schedule(
+                 start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                 end_date=datetime.strptime(date_time[2], "%d.%m.%Y").date(),
+                 start_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                 end_time=datetime.strptime(date_time[3], "%H:%M").time(),
+                 admittance_time=None
+             )
+
+         if label == "DATE_RANGE":
+             return Schedule(
+                 start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                 end_date=datetime.strptime(date_time[1], "%d.%m.%Y").date(),
+                 start_time=None,
+                 end_time=None,
+                 admittance_time=None
+             )
+
+         if label == "DATE_TIME_RANGE":
+             return Schedule(
+                 start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                 end_date=None,
+                 start_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                 end_time=datetime.strptime(date_time[2], "%H:%M").time(),
+                 admittance_time=None
+             )
+
+         if label == "TIME_RANGE":
+             return Schedule(
+                 start_date=None,
+                 end_date=None,
+                 start_time=datetime.strptime(date_time[0], "%H:%M").time(),
+                 end_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                 admittance_time=None
+             )
+
+         if label == "DATE_TIME":
+             return Schedule(
+                 start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                 end_date=None,
+                 start_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                 end_time=None,
+                 admittance_time=None
+             )
+
+         if label == "DATE":
+             return Schedule(
+                 start_date=datetime.strptime(date_time, "%d.%m.%Y").date(),
+                 end_date=None,
+                 start_time=None,
+                 end_time=None,
+                 admittance_time=None
+             )
+
+         if label == "TIME":
+             return Schedule(
+                 start_date=None,
+                 end_date=None,
+                 start_time=datetime.strptime(date_time, "%H:%M").time(),
+                 end_time=None,
+                 admittance_time=None
+             )
+     except Exception as e:
+         print(e)
+         return None
+
+
+ def _load_classifier(repo_id, model_name):
+     return joblib.load(
+         hf_hub_download(repo_id=repo_id, filename=model_name + ".pkl")
+     )
+
+
+ def classify_date_time(date_times, label, text):
+     # Text anhand des Platzhalters [LABEL] in Segmente teilen
+     segments = text.split(f"[{label}]")
+     tokens = []
+     # print(date_times)
+     date_time_positions = []
+     for i, segment in enumerate(segments):
+         tokens.extend(segment.split())  # Segment als Token hinzufügen
+         if i < len(date_times):  # Falls noch Date-Times übrig sind
+             tokens.append(placeholder.get(label, "ERROR"))  # Date-Time als eigenes Token einfügen
+             date_time_positions.append(len(tokens) - 1)
+
+     # print("TOKENS:", tokens)
+     # print(date_time_positions)
+     # print(len(date_time_positions)==len(date_times))
+
+     # sliding window classification
+     window_size = 5
+     event_date_total = 0
+     other_total = 0
+
+     schedules = []
+     for i, date_time in enumerate(date_times):
+         # Berechne den Start-Index für das Fenster
+         start = max(0, date_time_positions[i] - (window_size - 1))
+
+         # Führe Klassifikation für jedes Fenster durch
+         while start + window_size <= len(tokens):  # Solange das Fenster in den Tokens bleibt
+             window = tokens[start:start + window_size]
+             # print(window)
+
+             # Klassifikation durchführen
+             if label == "TIME":
+                 time_class = time_classifier(" ".join(window))
+                 # print(time_class)
+             else:
+                 date_class = date_classifier(" ".join(window))
+                 # print(date_class)
+
+                 # Aufaddieren der Werte
+                 # (only the date classifier feeds these totals; TIME-only hits are discarded below)
+                 event_date_total += date_class.get('EVENT_DATE', 0)
+                 other_total += date_class.get('OTHER', 0)
+
+             # Fenster verschieben
+             start += 1
+
+         # Rückgabe der Gesamtsummen
+         if label == "TIME":
+             pass
+         else:
+             # print("Gesamtsumme EVENT_DATE:", event_date_total)
+             # print("Gesamtsumme OTHER:", other_total)
+             if event_date_total > other_total:
+                 schedule = convert_to_schedule(date_time, label)
+                 schedules.append(schedule)
+                 # print(date_time)
+                 # print("EVENT DATE: ", schedule)
+     return schedules
+
+
+ try:
+     date_classifier = _load_classifier("adojode/date_classifier", "date_classifier")
+     time_classifier = _load_classifier("adojode/time_classifier", "time_classifier")
+ except Exception as e:
+     print("Error loading classifier models from hugging face: ", e)
+
+
+ def extract_schedules(text):
+     try:
+         normalized = normalize_data(text)
+         # print("*"*100)
+         # print(normalized)
+         # print("*"*100)
+         cleaned = re.sub(r"\*", " ", normalized)
+         cleaned = re.sub(r"=", " ", cleaned)
+         cleaned = re.sub(r"#", " ", cleaned)
+         cleaned = re.sub(r"(-|—|–|bis)", "-", cleaned)
+         cleaned = re.sub(r"(und|sowie)", "+", cleaned)
+         # cleaned = re.sub( r"\b(?:mo|di|mi|do|fr|sa|so|montag|dienstag|mittwoch|donnerstag|freitag|samstag|sonntag)(?:s?)\b",
+         #                   " ", cleaned, flags=re.IGNORECASE)
+
+         cleaned = re.sub(r"(von|vom|am|um|ab)", " ", cleaned, flags=re.IGNORECASE)
+         cleaned = re.sub(r",", " ", cleaned)
+         cleaned = re.sub(r"\|", " ", cleaned)
+         cleaned = re.sub(r"\s+", " ", cleaned)
+
+         matches = {}
+
+         # Match für das Datum und die Zeit mit einer Zeitspanne
+         date_range_time_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"
+         match = re.findall(date_range_time_range_pattern, cleaned)
+         if match:
+             matches["DATE_RANGE_TIME_RANGE"] = match
+             # print("DATE_RANGE_TIME_RANGE matches:", matches["DATE_RANGE_TIME_RANGE"])
+             cleaned = re.sub(date_range_time_range_pattern, "[DATE_RANGE_TIME_RANGE]", cleaned)
+
+         # Match für das Datum mit einem Zeitraum ohne Zeitangabe
+         date_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*-\s*(\d{2}\.\d{2}\.\d{4})"
+         match = re.findall(date_range_pattern, cleaned)
+         if match:
+             matches["DATE_RANGE"] = match
+             # print("DATE_RANGE matches:", matches["DATE_RANGE"])
+             cleaned = re.sub(date_range_pattern, "[DATE_RANGE]", cleaned)
+
+         # Match für das Datum mit einer Zeitspanne ohne Start- und Enddatum
+         date_time_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"
+         match = re.findall(date_time_range_pattern, cleaned)
+         if match:
+             matches["DATE_TIME_RANGE"] = match
+             # print("DATE_TIME_RANGE matches:", matches["DATE_TIME_RANGE"])
+             cleaned = re.sub(date_time_range_pattern, "[DATE_TIME_RANGE]", cleaned)
+
+         # Match für eine reine Zeitspanne ohne Datum
+         time_range_pattern = r"(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"
+         match = re.findall(time_range_pattern, cleaned)
+         if match:
+             matches["TIME_RANGE"] = match
+             # print("TIME_RANGE matches:", matches["TIME_RANGE"])
+             cleaned = re.sub(time_range_pattern, "[TIME_RANGE]", cleaned)
+
+         # Match für Datum mit Zeitangabe
+         date_time_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"
+         match = re.findall(date_time_pattern, cleaned)
+         if match:
+             matches["DATE_TIME"] = match
+             # print("DATE_TIME matches:", matches["DATE_TIME"])
+             cleaned = re.sub(date_time_pattern, "[DATE_TIME]", cleaned)
+
+         date_pattern = r"(\d{2}\.\d{2}\.\d{4})"
+         match = re.findall(date_pattern, cleaned)
+         if match:
+             matches["DATE"] = match
+             # print("DATE matches:", matches["DATE"])
+             cleaned = re.sub(date_pattern, "[DATE]", cleaned)
+
+         time_pattern = r"(\d{2}:\d{2})"
+         match = re.findall(time_pattern, cleaned)
+         if match:
+             matches["TIME"] = match
+             # print("TIME matches:", matches["TIME"])
+             cleaned = re.sub(time_pattern, "[TIME]", cleaned)
+
+         event_schedules = []
+
+         # return date_time if only one found
+         if len(matches) == 1:
+             key, value = next(iter(matches.items()))
+             event_schedules.append(convert_to_schedule(label=key, date_time=value[0]))
+             return event_schedules
+
+         for key, value in matches.items():
+             # print(f"{key}: {value}")
+             schedules = classify_date_time(date_times=value, label=key, text=cleaned)
+             if schedules:
+                 event_schedules.extend(schedules)
+
+         if len(event_schedules) == 1:
+             return event_schedules
+
+         print(event_schedules)
+         unique_schedules = []
+         for i, schedule in enumerate(event_schedules):
+             if any(schedule in other for j, other in enumerate(event_schedules) if
+                    i != j):
+                 continue
+             unique_schedules.append(schedule)
+         return unique_schedules
+
+     except Exception as ex:
+         print(ex)
+
+
+ # TEXTS = ["\n\nTermin für öffentliche Besichtigung\n=================================== \n\n07.01.2025\n\n * Am 07.01.2025\n* Von 18:00 bis 19:00 Uhr\n* Tasköprüstraße 10 (ehemalige Selgros-Markthalle)\n* Termin im Kalender speichern\n"]
+
+
+ for text in TEXTS:
+     print(text)
+     schedules = extract_schedules(text)
+     print("*" * 100)
+     print("EXTRACTED SCHEDULES: ")
+     print(schedules)
+     print("*" * 100)
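
How the tagging step in extract_schedules behaves, as a minimal standalone sketch: each regex collects its matches and then replaces the matched span with a placeholder token, so the sliding-window classification afterwards only sees [LABEL] markers in their surrounding context. The sample string below is hypothetical; the pattern is copied from the module above.

import re

sample = "Termin 07.01.2025 18:00 - 19:00 Tasköprüstraße 10"
date_time_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"

print(re.findall(date_time_range_pattern, sample))   # [('07.01.2025', '18:00', '19:00')] -> stored in matches["DATE_TIME_RANGE"]
print(re.sub(date_time_range_pattern, "[DATE_TIME_RANGE]", sample))  # 'Termin [DATE_TIME_RANGE] Tasköprüstraße 10'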
src/nlp/playground/pipelines/date_extractor_v2.py CHANGED
@@ -6,7 +6,7 @@ from spacy.tokenizer import Tokenizer
 from spacy.util import compile_suffix_regex, compile_infix_regex
 import os
 from dotenv import load_dotenv
-from src.utils.Event import DateTime
+from src.utils.Event import Schedule
 from huggingface_hub import hf_hub_download
 import joblib
 from huggingface_hub import login
@@ -202,10 +202,10 @@ class ScheduleExtractor(NLPProcessor):
 datetime.strptime(token.text, "%H:%M").time())

 if start_date and end_date and start_time and end_time and admittance_time:
-date_times.append(DateTime(start_date, end_date, start_time, end_time, admittance_time))
+date_times.append(Schedule(start_date, end_date, start_time, end_time, admittance_time))
 start_date = end_date = start_time = end_time = admittance_time = None

-date_times.append(DateTime(start_date, end_date, start_time, end_time, admittance_time))
+date_times.append(Schedule(start_date, end_date, start_time, end_time, admittance_time))
 date_times = self.__remove_subsets(date_times)
 return list(set(date_times))
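
The return list(set(date_times)) above only deduplicates because Schedule implements __hash__ and __eq__ (see the src/utils/Event.py diff below). A minimal sketch with made-up values, assuming Schedule.__str__ renders the five fields as the __eq__ implementation expects:

from datetime import date, time
from src.utils.Event import Schedule

a = Schedule(date(2025, 1, 7), None, time(18, 0), time(19, 0), None)
b = Schedule(date(2025, 1, 7), None, time(18, 0), time(19, 0), None)

# __hash__ is built from the five fields, __eq__ compares the rendered string,
# so identical entries collapse to one element in a set
print(len({a, b}))  # 1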
src/nlp/playground/pipelines/date_extractor_v3.py ADDED
@@ -0,0 +1,288 @@
+ import re
+ import os
+ from dotenv import load_dotenv
+
+ from src.utils.Event import Schedule
+ from huggingface_hub import hf_hub_download
+ import joblib
+ from huggingface_hub import login
+ from datetime import datetime
+
+ from src.utils.helpers import normalize_data
+
+ load_dotenv()
+ token = os.getenv("HUGGING_FACE_SPACES_TOKEN")
+ login(token=token)
+
+
+ class ScheduleExtractorV3:
+     def __init__(self):
+         try:
+             self.date_classifier = self._load_classifier("adojode/date_classifier", "date_classifier")
+             self.time_classifier = self._load_classifier("adojode/time_classifier", "time_classifier")
+         except Exception as e:
+             print("Error loading classifier models from hugging face: ", e)
+
+     def _load_classifier(self, repo_id, model_name):
+         return joblib.load(
+             hf_hub_download(repo_id=repo_id, filename=model_name + ".pkl")
+         )
+
+     def extract(self, text):
+         try:
+             normalized = normalize_data(text)
+             # print("*"*100)
+             # print(normalized)
+             # print("*"*100)
+             cleaned = re.sub(r"\*", " ", normalized)
+             cleaned = re.sub(r"=", " ", cleaned)
+             cleaned = re.sub(r"#", " ", cleaned)
+             cleaned = re.sub(r"(-|—|–|bis)", "-", cleaned)
+             cleaned = re.sub(r"(und|sowie)", "+", cleaned)
+             # cleaned = re.sub( r"\b(?:mo|di|mi|do|fr|sa|so|montag|dienstag|mittwoch|donnerstag|freitag|samstag|sonntag)(?:s?)\b",
+             #                   " ", cleaned, flags=re.IGNORECASE)
+
+             cleaned = re.sub(r"(von|vom|am|um|ab)", " ", cleaned, flags=re.IGNORECASE)
+             cleaned = re.sub(r",", " ", cleaned)
+             cleaned = re.sub(r"\|", " ", cleaned)
+             cleaned = re.sub(r"\s+", " ", cleaned)
+
+             matches = {}
+
+             # Match für das Datum und die Zeit mit einer Zeitspanne
+             date_range_time_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"
+             match = re.findall(date_range_time_range_pattern, cleaned)
+             if match:
+                 matches["DATE_RANGE_TIME_RANGE"] = match
+                 # print("DATE_RANGE_TIME_RANGE matches:", matches["DATE_RANGE_TIME_RANGE"])
+                 cleaned = re.sub(date_range_time_range_pattern, "[DATE_RANGE_TIME_RANGE]", cleaned)
+
+             # Match für das Datum mit einem Zeitraum ohne Zeitangabe
+             date_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*-\s*(\d{2}\.\d{2}\.\d{4})"
+             match = re.findall(date_range_pattern, cleaned)
+             if match:
+                 matches["DATE_RANGE"] = match
+                 # print("DATE_RANGE matches:", matches["DATE_RANGE"])
+                 cleaned = re.sub(date_range_pattern, "[DATE_RANGE]", cleaned)
+
+             # Match für das Datum mit einer Zeitspanne ohne Start- und Enddatum
+             date_time_range_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"
+             match = re.findall(date_time_range_pattern, cleaned)
+             if match:
+                 matches["DATE_TIME_RANGE"] = match
+                 # print("DATE_TIME_RANGE matches:", matches["DATE_TIME_RANGE"])
+                 cleaned = re.sub(date_time_range_pattern, "[DATE_TIME_RANGE]", cleaned)
+
+             # Match für eine reine Zeitspanne ohne Datum
+             time_range_pattern = r"(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"
+             match = re.findall(time_range_pattern, cleaned)
+             if match:
+                 matches["TIME_RANGE"] = match
+                 # print("TIME_RANGE matches:", matches["TIME_RANGE"])
+                 cleaned = re.sub(time_range_pattern, "[TIME_RANGE]", cleaned)
+
+             # Match für Datum mit Zeitangabe
+             date_time_pattern = r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"
+             match = re.findall(date_time_pattern, cleaned)
+             if match:
+                 matches["DATE_TIME"] = match
+                 # print("DATE_TIME matches:", matches["DATE_TIME"])
+                 cleaned = re.sub(date_time_pattern, "[DATE_TIME]", cleaned)
+
+             date_pattern = r"(\d{2}\.\d{2}\.\d{4})"
+             match = re.findall(date_pattern, cleaned)
+             if match:
+                 matches["DATE"] = match
+                 # print("DATE matches:", matches["DATE"])
+                 cleaned = re.sub(date_pattern, "[DATE]", cleaned)
+
+             time_pattern = r"(\d{2}:\d{2})"
+             match = re.findall(time_pattern, cleaned)
+             if match:
+                 matches["TIME"] = match
+                 # print("TIME matches:", matches["TIME"])
+                 cleaned = re.sub(time_pattern, "[TIME]", cleaned)
+
+             event_schedules = []
+
+             # return date_time if only one found
+             if len(matches) == 1:
+                 key, value = next(iter(matches.items()))
+                 event_schedules.append(self.convert_to_schedule(label=key, date_time=value[0]))
+                 return event_schedules
+
+             for key, value in matches.items():
+                 # print(f"{key}: {value}")
+                 schedules = self.classify_date_time(date_times=value, label=key, text=cleaned)
+                 if schedules:
+                     event_schedules.extend(schedules)
+
+             if len(event_schedules) == 1:
+                 return event_schedules
+
+             # print(event_schedules)
+             unique_schedules = []
+             for i, schedule in enumerate(event_schedules):
+                 if any(schedule in other for j, other in enumerate(event_schedules) if
+                        i != j):
+                     continue
+                 unique_schedules.append(schedule)
+
+             if len(unique_schedules) == 2:
+                 first, second = unique_schedules
+                 print("Versuche Schedules zu mergen....", first, second)
+                 if any(not e for e in [first.start_date, second.start_date]) and any(not e for e in [first.end_date, second.end_date]) and any(not e for e in [first.start_time, second.start_time]) and any(not e for e in [first.end_time, second.end_time]) and any(not e for e in [first.admittance_time, second.admittance_time]):
+                     merged = Schedule(
+                         start_date=first.start_date or second.start_date,
+                         end_date=first.end_date or second.end_date,
+                         start_time=first.start_time or second.start_time,
+                         end_time=first.end_time or second.end_time,
+                         admittance_time=first.admittance_time or second.admittance_time
+                     )
+                     print("Merged:", merged)
+                     return [merged]
+             return unique_schedules
+
+         except Exception as ex:
+             print(ex)
+
+     def classify_date_time(self, date_times, label, text):
+         # Text anhand des Platzhalters [LABEL] in Segmente teilen
+         segments = text.split(f"[{label}]")
+         tokens = []
+         # print(date_times)
+         date_time_positions = []
+         for i, segment in enumerate(segments):
+             tokens.extend(segment.split())  # Segment als Token hinzufügen
+             if i < len(date_times):  # Falls noch Date-Times übrig sind
+                 tokens.append(placeholder.get(label, "ERROR"))  # Date-Time als eigenes Token einfügen
+                 date_time_positions.append(len(tokens) - 1)
+
+         # sliding window classification
+         window_size = 5
+         event_date_total = 0
+         other_total = 0
+
+         schedules = []
+         for i, date_time in enumerate(date_times):
+             # Berechne den Start-Index für das Fenster
+             start = max(0, date_time_positions[i] - (window_size - 1))
+
+             # Führe Klassifikation für jedes Fenster durch
+             while start + window_size <= len(tokens):  # Solange das Fenster in den Tokens bleibt
+                 window = tokens[start:start + window_size]
+                 # print(window)
+
+                 # Klassifikation durchführen
+                 if label == "TIME":
+                     time_class = self.time_classifier(" ".join(window))
+                     # print(time_class)
+                 else:
+                     date_class = self.date_classifier(" ".join(window))
+                     # print(date_class)
+
+                     # Aufaddieren der Werte
+                     # (only the date classifier feeds these totals; TIME-only hits are discarded below)
+                     event_date_total += date_class.get('EVENT_DATE', 0)
+                     other_total += date_class.get('OTHER', 0)
+
+                 # Fenster verschieben
+                 start += 1
+
+             # Rückgabe der Gesamtsummen
+             if label == "TIME":
+                 pass
+             else:
+                 # print("Gesamtsumme EVENT_DATE:", event_date_total)
+                 # print("Gesamtsumme OTHER:", other_total)
+                 if event_date_total > other_total:
+                     schedule = self.convert_to_schedule(date_time, label)
+                     schedules.append(schedule)
+                     # print(date_time)
+                     # print("EVENT DATE: ", schedule)
+         return schedules
+
+     def convert_to_schedule(self, date_time, label):
+         try:
+             if label == "DATE_RANGE_TIME_RANGE":
+                 return Schedule(
+                     start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                     end_date=datetime.strptime(date_time[2], "%d.%m.%Y").date(),
+                     start_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                     end_time=datetime.strptime(date_time[3], "%H:%M").time(),
+                     admittance_time=None
+                 )
+
+             if label == "DATE_RANGE":
+                 return Schedule(
+                     start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                     end_date=datetime.strptime(date_time[1], "%d.%m.%Y").date(),
+                     start_time=None,
+                     end_time=None,
+                     admittance_time=None
+                 )
+
+             if label == "DATE_TIME_RANGE":
+                 return Schedule(
+                     start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                     end_date=None,
+                     start_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                     end_time=datetime.strptime(date_time[2], "%H:%M").time(),
+                     admittance_time=None
+                 )
+
+             if label == "TIME_RANGE":
+                 return Schedule(
+                     start_date=None,
+                     end_date=None,
+                     start_time=datetime.strptime(date_time[0], "%H:%M").time(),
+                     end_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                     admittance_time=None
+                 )
+
+             if label == "DATE_TIME":
+                 return Schedule(
+                     start_date=datetime.strptime(date_time[0], "%d.%m.%Y").date(),
+                     end_date=None,
+                     start_time=datetime.strptime(date_time[1], "%H:%M").time(),
+                     end_time=None,
+                     admittance_time=None
+                 )
+
+             if label == "DATE":
+                 return Schedule(
+                     start_date=datetime.strptime(date_time, "%d.%m.%Y").date(),
+                     end_date=None,
+                     start_time=None,
+                     end_time=None,
+                     admittance_time=None
+                 )
+
+             if label == "TIME":
+                 return Schedule(
+                     start_date=None,
+                     end_date=None,
+                     start_time=datetime.strptime(date_time, "%H:%M").time(),
+                     end_time=None,
+                     admittance_time=None
+                 )
+         except Exception as e:
+             print(e)
+             return None
+
+
+ # Placeholder strings for the tagged labels (module level, referenced in classify_date_time)
+ placeholder = {
+     "DATE_RANGE_TIME_RANGE": "[DATE] [TIME] - [DATE] [TIME]",
+     "DATE_RANGE": "[DATE] - [DATE]",
+     "DATE_TIME_RANGE": "[DATE] [TIME] - [TIME]",
+     "TIME_RANGE": "[TIME] - [TIME]",
+     "DATE_TIME": "[DATE] [TIME]",
+     "DATE": "[DATE]",
+     "TIME": "[TIME]"
+ }
src/nlp/playground/pipelines/event_data_extractor.py CHANGED
@@ -3,6 +3,7 @@ import re
 from src.nlp.playground.ner import GlinerHandler
 from src.nlp.playground.pipelines.address_extractor import AddressExtractor
 from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
+from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3
 from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
 from src.nlp.playground.pipelines.title_extractor import TitleExtractor
 from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
@@ -14,7 +15,8 @@ class EventDataExtractor:
 self.title_extractor = TitleExtractor()
 self.zero_shot_classifier = ZeroShotClassifier()
 self.gliner_handler = GlinerHandler()
-self.schedule_extractor = ScheduleExtractor()
+# self.schedule_extractor = ScheduleExtractor()
+self.schedule_extractor = ScheduleExtractorV3()
 self.address_extractor = AddressExtractor()
 self.description_extractor = DescriptionExtractor()

@@ -26,8 +28,9 @@ class EventDataExtractor:
 event.locations = self.extract_locations(data)
 event.organizers = self.extract_organizers(data)
 event.address = self.extract_address(data)
-event.schedule = self.extract_schedule(data)
+event.schedule = self.extract_schedule(data)
 event.description = self.extract_description(data, event.title)
+event.prices = self.extract_prices(data)

 print("Extraction process completed.")
 return event
@@ -96,9 +99,9 @@ class EventDataExtractor:

 def extract_prices(self, data):
 print("Extracting prices...")
-entities = self.gliner_handler.extract_entities(data, ["PRICE"])
-
-filtered_entities = [e["text"] for e in entities if e["text"] and re.search(r'\d', e["text"])]
+entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis der Veranstaltung"])
+print(entities)
+filtered_entities = [e["text"] for e in entities if e["text"] and re.search(r'\d', e["text"]) and e["score"]>=0.4]

 prices = [re.findall(r'\d+(?:[.,]\d+)?', price) for price in filtered_entities]

@@ -110,7 +113,7 @@
 ))[0].label

 if entrance_free_category == "Eintritt frei" and not prices:
-return ["Eintritt frei"]
+return ["kostenlos"]

 return prices

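The reworked extract_prices filter keeps only entities that contain a digit and score at least 0.4 before the numeric values are pulled out, and falls back to "kostenlos" when the zero-shot classifier says the event is free. A small sketch of that post-processing with hypothetical GLiNER output:

import re

# hypothetical output of GlinerHandler.extract_entities(data, ["Eintrittspreis der Veranstaltung"])
entities = [
    {"text": "Erwachsene: 6,50 Euro", "score": 0.83},
    {"text": "Eintritt", "score": 0.55},           # no digit -> dropped
    {"text": "Kinder 3,50 Euro", "score": 0.25},   # below the 0.4 threshold -> dropped
]

filtered = [e["text"] for e in entities if e["text"] and re.search(r'\d', e["text"]) and e["score"] >= 0.4]
prices = [re.findall(r'\d+(?:[.,]\d+)?', p) for p in filtered]
print(prices)  # [['6,50']]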
src/nlp/playground/pipelines/testing/date_extractor_testing.py CHANGED
The diff for this file is too large to render. See raw diff
 
src/nlp/playground/pipelines/testing/event_data_extractor_testing.py CHANGED
@@ -1,14 +1,13 @@
 import csv
 import gc
 import time
-from collections import defaultdict

 import pandas as pd

 from src.nlp.experimental.textclassification.classify_title import train_data
 from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
 from src.persistence.db import init_db
-from src.utils.Event import Event, DateTime
+from src.utils.Event import Event, Schedule
 from src.utils.apis.googlemaps_api import GoogleMapsAPI
 from src.utils.helpers import normalize_data
 import matplotlib.pyplot as plt
@@ -25,7 +24,7 @@ def init_db_entries():
 if all(f not in el.get("markdown", "") for f in filter_data):
 filtered_elements.append(el)
 print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
-return filtered_elements
+return filtered_elements[20]

 def event_similarity(actual, predicted):
 # Liste der Attribute, die verglichen werden
@@ -75,11 +74,11 @@ for el in elements:
 actual_event.url = el.get("url")
 print(actual_event.url)
 actual_event.title = el.get("information", {}).get("actual", {}).get("title", "")
-actual_event.organizers = [org for org in el.get("information", {}).get("actual", {}).get("organizers", []) if
+actual_event.organizers = [org.strip() for org in el.get("information", {}).get("actual", {}).get("organizers", []) if
 org.strip()]
 actual_event.categories = el.get("information", {}).get("actual", {}).get("categories", [])
 actual_event.locations = [
-loc for loc in el.get("information", {}).get("actual", {}).get("locations", []) if loc
+loc.strip() for loc in el.get("information", {}).get("actual", {}).get("locations", []) if loc.strip()
 ]
 actual_event.prices = el.get("information", {}).get("actual", {}).get("prices", [])
 address = el.get("information", {}).get("actual", {}).get("address")
@@ -91,7 +90,7 @@

 dates = el.get("information", {}).get("actual", {}).get("dates", [])
 actual_event.schedule = [
-DateTime(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
+Schedule(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
 date.get("end_time", None), date.get("admittance_time", None))
 for date in dates]

@@ -162,18 +161,29 @@ field_sums = {
 "address": df["address"].sum(),
 "organizers": df["organizers"].sum(),
 }
-print(df['extraction_time'])
-print(len(df))
+
+total_events = len(df)  # Gesamtanzahl der Events
+percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()}  # Berechne Prozentwerte

 # 📊 Graphen erstellen
-plt.figure(figsize=(10, 5))
-plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
+plt.figure(figsize=(10, 6))
+bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
+
+# Prozentwerte unter den Balken hinzufügen
+for bar, (key, percent) in zip(bars, percentages.items()):
+    plt.text(bar.get_x() + bar.get_width() / 2, -0.05 * total_events, f"{percent:.1f}%",
+             ha="center", va="top", fontsize=10, color="black")

 # 🏷️ Achsenbeschriftungen & Titel
 plt.xlabel("Event Attribute")
 plt.ylabel("Anzahl der Übereinstimmungen")
-plt.title(f"Summierte Übereinstimmungen pro Event-Attribut. Durchschittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / len(df)}")
-plt.ylim(0, len(df)) # Maximale Höhe entspricht der Anzahl der Events
+plt.title(f"Summierte Übereinstimmungen pro Event-Attribut")
+
+# 📝 Info-Box mit Anzahl der Events
+info_text = f"Getestete Events: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
+plt.text(0.5, total_events * 1.05, info_text, fontsize=12, ha="center", bbox=dict(facecolor="white", alpha=0.8))
+
+plt.ylim(0, total_events * 1.2)  # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
 plt.grid(axis="y", linestyle="--", alpha=0.7)

 # 📈 Zeige den Graphen
src/nlp/playground/pipelines/testing/price_extractor_testing.py CHANGED
@@ -6,9 +6,34 @@ from src.nlp.playground.textclassification import CustomMode, ZeroShotClassifier

 event_extractor = EventDataExtractor()

-texts = [
+texts = [
+"""Kunst- und Handwerksmesse 2025
+
+📅 Datum: 12. – 14. September 2025
+📍 Ort: Messehalle Frankfurt, Deutschland
+
+Die Kunst- und Handwerksmesse 2025 bringt talentierte Kunsthandwerker, Designer und kreative Köpfe aus ganz Europa zusammen. Besucher können sich auf eine Vielzahl handgefertigter Produkte freuen – von Keramik und Schmuck bis hin zu maßgeschneiderten Möbeln und Mode.
+
+🔹 Highlights der Messe:
+
+Live-Werkstätten: Erleben Sie, wie Meister ihres Fachs Kunstwerke aus Glas, Holz und Metall fertigen.
+
+Gourmetbereich: Probieren Sie handgemachte Schokoladenkreationen (ab 4,50 € pro Stück) und exklusive Bio-Kaffeesorten (250g-Packung für 12,99 €).
+
+Workshops: Nehmen Sie an einem Kalligraphie-Kurs teil (Materialkostenpauschale 15 €) oder gestalten Sie Ihre eigene Tonvase (30 € inkl. Brennkosten).
+
+Antiquitätenmarkt: Entdecken Sie einzigartige Sammlerstücke wie historische Postkarten (ab 3 € pro Stück) oder antike Silberlöffel (Preis je nach Gewicht und Reinheitsgrad).
+
+Rahmenservice: Lassen Sie Ihr vor Ort erworbenes Kunstwerk direkt rahmen (ab 25 € je nach Größe und Material).
+
+Ein Muss für alle, die handgefertigte Unikate und künstlerische Inspiration lieben!
+
+Die Teilnahme an der Messe kostet 20 € für Erwachsene, 10 € für Kinder.
+
+""",
+
 "Das ist unser Museum! – Kinderstudio\nSo 15.12. 14:00\nZurück zur Übersicht\n\nTickets kaufen\nAngebot für: Kinder & Familien\nKosten: Kostenlos\nDauer: 1,5 Stunden\nKontakt: [email protected]\nWir treffen uns im Farblabor und entdecken zusammen spielerisch Farben und Formen. Mit unseren Lieblingsfarben ausgerüstet geht es dann ab in die Ausstellung – wer findet das erste Grashüpfergrün? Können wir zusammen ein Dreieck sein? Und hast du schon den Wal gefunden?\nDas Vermittlungsprogramm für Kinder und Familien setzt regelmäßig einen anderen Schwerpunkt. Das Thema ist momentan: Farbe ist alles!\nWeitere Veranstaltungen\n[\nSo 15.12. 14:00\nmre Blickwinkel – Rundgang Architektur](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-blickwinkel-rundgang-architektur-748/)\n[\nSo 15.12. 14:01\nmre Blickwinkel – Rundgang Architektur](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-blickwinkel-rundgang-architektur-6098/)\n[\nSo 15.12. 15:00\nRundgang Sonderausstellung – Fumihiko Maki und Maki & Associates](https://www.museum-re.de/de/besuch/veranstaltungskalender/rundgang-sonderausstellung-fumihiko-maki-und-maki-associates-3644/)\n[\nSo 15.12. 15:00\nFarbe ist alles! Rundgang Sammlung](https://www.museum-re.de/de/besuch/veranstaltungskalender/farbe-ist-alles-rundgang-sammlung-697/)\n[\nSo 15.12. 15:01\nFarbe ist alles! Rundgang Sammlung](https://www.museum-re.de/de/besuch/veranstaltungskalender/farbe-ist-alles-rundgang-sammlung-5110/)\n[\nMi 18.12. 18:00\nmre Tiefenrausch](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-tiefenrausch-6116/)",
 "Rundgang Sonderausstellung – Fumihiko Maki und Maki & Associates\nSo 15.12. 15:00\nTickets kaufen\nAngebot für: Erwachsene\nKosten: 5 €\nDauer: 1 Stunde\nKontakt: [email protected]\nLassen sie uns den „Zuckerwürfel“ einmal genauer unter die Lupe nehmen. Wir gehen auf gemeinsame Entdeckungsreise durch das Museumsgebäude und erfahren nebenbei Spannendes über die Entstehung des mre.\nEntdecken Sie Konzepte und Gestaltungselemente der Architektur von Fumihiko Maki in unserer Sonderausstellung Fumihiko Maki und Maki & Associates – Für eine menschliche Architektur und finden Sie sie am Gebäude des mre wieder.",
 "Geeignet für Kinder ab etwa 5 Jahren.\n(F) = Familienveranstaltung\nEintrittspreise:\nErwachsene: 6,50 Euro\nKinder/Ermäßigt: 3,50 Euro\nSchulvorführungen:\n2,50 Euro pro Person\nKartenreservierung:\nTelefon: 0541 323-7000\[email protected]\nBitte reservieren Sie Ihre Tickets vor Ihrem Besuch per Telefon, E-Mail oder direkt hier online. Viele Vorstellungen sind frühzeitig ausgebucht. Reservierte Karten müssen bis spätestens 15 Minuten vor Veranstaltungsbeginn an der Kasse abgeholt werden.\nTermine & Tickets:\n01.04.2025 - 14:30 Tickets",
@@ -35,5 +60,8 @@ texts = [

 ]

-for text in texts:
-print(event_extractor.extract_prices(text))
+for text in texts:
+print(text)
+print("*" * 100)
+print("Preise: ", event_extractor.extract_prices(text))
+print("*" * 100)

src/resources/TEXTS.py ADDED
The diff for this file is too large to render. See raw diff
 
src/utils/Event.py CHANGED
@@ -3,7 +3,7 @@ from textwrap import indent
 from sympy import false


-class DateTime:
+class Schedule:
 def __init__(self, start_date, end_date, start_time, end_time, admittance_time):
 self.start_date = start_date
 self.end_date = end_date
@@ -33,24 +33,41 @@ class DateTime:

 def __eq__(self, other):

-if isinstance(other, DateTime):
-return str(self) == str(other)
+if isinstance(other, Schedule):
+print("Equals?", str(self), str(other), str(self) == str(other))
+return str(self).strip() == str(other).strip()
 else:
 return False

 def __hash__(self):
 return hash((self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time))

-class Address:
-def __init__(self, street, house_number, postal_code, city):
-self.street = street
-self.house_number = house_number
-self.postal_code = postal_code
-self.city = city
-
-def __str__(self):
-return f"🏠 {self.street if self.street else ''} {self.house_number if self.house_number else ''}, {self.postal_code if self.postal_code else ''} {self.city if self.city else ''}"
+def __len__(self):
+return len([element for element in [self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time] if element])
+
+def __contains__(self,item):
+if not isinstance(item, Schedule): # Falls other kein Schedule-Objekt ist, direkt False
+return False
+print("SELF: ", self)
+print("ITEM: ", item)
+return (
+(self.start_date == item.start_date or self.start_date == item.end_date or item.start_date is None) and
+(self.end_date == item.end_date or self.end_date == item.start_date or item.end_date is None) and
+(self.start_time == item.start_time or self.start_time == self.end_time or item.start_time is None) and
+(self.end_time == item.end_time or self.end_time == self.start_time or item.end_time is None) and
+(self.admittance_time == item.admittance_time or item.admittance_time is None)
+)
+
+
+# class Address:
+# def __init__(self, street, house_number, postal_code, city):
+# self.street = street
+# self.house_number = house_number
+# self.postal_code = postal_code
+# self.city = city
+#
+# def __str__(self):
+# return f"🏠 {self.street if self.street else ''} {self.house_number if self.house_number else ''}, {self.postal_code if self.postal_code else ''} {self.city if self.city else ''}"


 class Event:
@@ -95,13 +112,13 @@ class Event:

 @schedule.setter
 def schedule(self, value):
-if isinstance(value, list) and all(isinstance(entry, DateTime) for entry in value):
+if isinstance(value, list) and all(isinstance(entry, Schedule) for entry in value):
 self._schedule = value
 else:
 raise ValueError("Schedule must be a list of Schedule objects")

 def add_schedule_entry(self, start_date, end_date, start_time, end_time, admittance_time):
-self._schedule.append(DateTime(start_date, end_date, start_time, end_time, admittance_time))
+self._schedule.append(Schedule(start_date, end_date, start_time, end_time, admittance_time))

 @property
 def address(self):
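
The new __contains__ is what ScheduleExtractorV3.extract() relies on when it drops partial results: a Schedule counts as contained in another when every field it does set is matched by the other one. A small sketch with made-up values:

from datetime import date, time
from src.utils.Event import Schedule

full    = Schedule(date(2025, 1, 7), None, time(18, 0), time(19, 0), None)
partial = Schedule(date(2025, 1, 7), None, time(18, 0), None, None)  # same date and start time, no end time

print(partial in full)  # True  -> 'partial' is filtered out as a subset of 'full'
print(full in partial)  # False -> 'full' survives the deduplication loop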
src/utils/helpers.py CHANGED
@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup, Comment
 import re
 from dateparser import DateDataParser

+
 def normalize_data(input):
 def normalize_dates(input_text):
 days = r"(?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag|Mo|Di|Mi|Do|Fr|Sa|So)"
@@ -12,7 +13,6 @@ def normalize_data(input):
 iso_pattern = r"(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})"
 german_date_pattern = day_month_year_pattern + "|" + dd_mm_yyyy_pattern + "|" + iso_pattern

-
 compiled_pattern = re.compile(german_date_pattern, re.VERBOSE)

 matches = compiled_pattern.findall(input_text)
@@ -31,20 +31,20 @@ def normalize_data(input):
 print(f"Fehler bei der Verarbeitung von '{match}': {e}")

 # Ersetze alle Vorkommen von '20.03. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
-german_date_pattern = r"(?<!\.)(\d{2})\.(\d{2})\.?\s*(und|\+|&|bis|bis zum|-)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})"
-input_text = re.sub(german_date_pattern, r" \1.\2.\6 \3 \4.\5.\6 ", input_text)
+german_date_pattern = r"(?<!\.)(\d{2})\.(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
+input_text = re.sub(german_date_pattern, r" \1.\2.\7 \4 \5.\6.\7 ", input_text)

 # Ersetze alle Vorkommen von '20. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
-german_date_pattern = r"(?<!\d)(\d{2})\.?\s*(und|\+|&|bis|bis zum|-)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})"
-input_text = re.sub(german_date_pattern, r" \1.\4.\5 \2 \3.\4.\5 ", input_text)
+german_date_pattern = r"(?<!\d)(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
+input_text = re.sub(german_date_pattern, r" \1.\5.\6 \2 \3 \4.\5.\6 \7 ", input_text)

 # Ersetze alle Vorkommen von '20.03.2025 bis/bis zum 21.03.2025' durch '20.03.2025 - 21.03.2025'
-german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})\.?\s*(bis|bis zum|-)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})"
-input_text = re.sub(german_date_pattern, r" \1.\2.\3 - \5.\6.\7 ", input_text)
+german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?\s*(bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
+input_text = re.sub(german_date_pattern, r" \1.\2.\3 \4 - \6.\7.\8 \9 ", input_text)

 # Ersetze alle Vorkommen von '20.03.2025 und/& 21.03.2025' durch '20.03.2025 + 21.03.2025'
-german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})\.?\s*(und|\+|&)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})"
-input_text = re.sub(german_date_pattern, r" \1.\2.\3 + \5.\6.\7 ", input_text)
+german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})\.?(\s*\d{2}:\d{2})?\s*(und|\+|&)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
+input_text = re.sub(german_date_pattern, r" \1.\2.\3 \4 + \6.\7.\8 \9 ", input_text)
 return input_text

 def normalize_times(input_text):
@@ -95,7 +95,8 @@
 normalized_data = normalize_text(normalized_data)
 return normalized_data

-def clean_html(html:str):
+
+def clean_html(html: str):
 soup = BeautifulSoup(html, "lxml")
 body_content = soup.body

@@ -117,31 +118,35 @@
 cleaned_html = "\n".join(clean_html_lines)
 return cleaned_html

-def strip_html_to_text(html:str):
-soup = BeautifulSoup(html,"lxml")
+
+def strip_html_to_text(html: str):
+soup = BeautifulSoup(html, "lxml")
 return soup.get_text(separator=' ', strip=True)


-# text = """
-# Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03. bis 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.
-# 15. November 2024 & 13. Dezember 2024: Kunstausstellung "Der erweiterte Raum,"
-# Der siebte Workshop Retrodigitalisierung findet am 20. und 21.03.2025 bei ZB MED –.
-# 2. März bis 21. März 2025 \n
-# **Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB
-# """
-#
-# expected = """
-# Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03.2025 - 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.
-# 15.11.2024 + 13.12.2024: Kunstausstellung "Der erweiterte Raum,"
-# Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 + 21.03.2025 bei ZB MED –.
-# 02.03.2025 - 21.03.2025 \n
-# **Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB
-# """
-#
+# texts = [
+# "Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03. bis 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.",
+# "15. November 2024 & 13. Dezember 2024: Kunstausstellung 'Der erweiterte Raum'",
+# "Der siebte Workshop Retrodigitalisierung findet am 20. und 21.03.2025 bei ZB MED.",
+# "2. März bis 21. März 2025 \n"
+# "**Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB",
+# "22.04.25 15 Uhr bis 23.04.25 16 Uhr."
+# ]
 #
-# normalized = normalize_data(text)
+# expected_texts = [
+# "Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03.2025 - 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.",
+# "15.11.2024 + 13.12.2024: Kunstausstellung 'Der erweiterte Raum'",
+# "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 + 21.03.2025 bei ZB MED.",
+# "02.03.2025 - 21.03.2025 \n",
+# "**Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB",
+# "22.04.2025 15:00 - 23.04.2025 16:00."
+# ]
 #
-# if normalized == expected:
-# print("Normalization successful!")
-# else:
-# print("Normalization failed!")
+# for i, text in enumerate(texts):
+# normalized = normalize_data(text)
+# normalized = re.sub("\s*", " ",normalized)
+# expected = re.sub("\s*", " ",expected_texts[i])
+# if normalized == expected:
+# print("Normalization successful!")
+# else:
+# print("Normalization failed!")
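
To make the regex change concrete: each range pattern now also captures an optional HH:MM after the dates and accepts en and em dashes, so combined date-time ranges survive normalization instead of losing their times. A standalone sketch of the third substitution in isolation (sample string hypothetical; the surrounding normalize_data steps also collapse the extra whitespace):

import re

pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?\s*(bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
text = "22.04.2025 15:00 bis 23.04.2025 16:00"
out = re.sub(pattern, r" \1.\2.\3 \4 - \6.\7.\8 \9 ", text)
print(re.sub(r"\s+", " ", out).strip())  # 22.04.2025 15:00 - 23.04.2025 16:00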