manaviel85370
committed on
Commit
·
58c260c
1
Parent(s):
fc86982
create new date extractor, optimize testing and pipelines
Browse files
- pages/9_Testing.py +2 -2
- src/nlp/playground/pipelines/date_experimentals.py +297 -0
- src/nlp/playground/pipelines/date_extractor_v2.py +3 -3
- src/nlp/playground/pipelines/date_extractor_v3.py +288 -0
- src/nlp/playground/pipelines/event_data_extractor.py +9 -6
- src/nlp/playground/pipelines/testing/date_extractor_testing.py +0 -0
- src/nlp/playground/pipelines/testing/event_data_extractor_testing.py +22 -12
- src/nlp/playground/pipelines/testing/price_extractor_testing.py +31 -3
- src/resources/TEXTS.py +0 -0
- src/utils/Event.py +31 -14
- src/utils/helpers.py +38 -33
pages/9_Testing.py
CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
|
|
3 |
from src.nlp.experimental.textclassification.classify_title import train_data
|
4 |
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
|
5 |
from src.persistence.db import init_db
|
6 |
-
from src.utils.Event import Event,
|
7 |
from src.utils.apis.googlemaps_api import GoogleMapsAPI
|
8 |
from src.utils.helpers import normalize_data
|
9 |
|
@@ -90,7 +90,7 @@ if start_tests:
|
|
90 |
|
91 |
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
92 |
actual_event.schedule = [
|
93 |
-
|
94 |
date.get("end_time", None), date.get("admittance_time", None))
|
95 |
for date in dates]
|
96 |
|
|
|
3 |
from src.nlp.experimental.textclassification.classify_title import train_data
|
4 |
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
|
5 |
from src.persistence.db import init_db
|
6 |
+
from src.utils.Event import Event, Schedule
|
7 |
from src.utils.apis.googlemaps_api import GoogleMapsAPI
|
8 |
from src.utils.helpers import normalize_data
|
9 |
|
|
|
90 |
|
91 |
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
92 |
actual_event.schedule = [
|
93 |
+
Schedule(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
|
94 |
date.get("end_time", None), date.get("admittance_time", None))
|
95 |
for date in dates]
|
96 |
|
src/nlp/playground/pipelines/date_experimentals.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
import joblib
|
6 |
+
import spacy
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from huggingface_hub import hf_hub_download, login
|
9 |
+
from spacy import Language
|
10 |
+
from spacy.tokenizer import Tokenizer
|
11 |
+
from spacy.util import compile_suffix_regex, compile_infix_regex
|
12 |
+
|
13 |
+
from src.resources.TEXTS import TEXTS
|
14 |
+
from src.utils.Event import Schedule
|
15 |
+
from src.utils.helpers import normalize_data
|
16 |
+
from src.utils.markdown_processing.CustomMarkdownAnalyzer.MarkdownAnalyzer import MarkdownAnalyzer
|
17 |
+
|
18 |
+
# Read HUGGING_FACE_SPACES_TOKEN from the environment (optionally supplied via a
# local .env file) and authenticate against the Hugging Face Hub.
load_dotenv()
token = os.getenv("HUGGING_FACE_SPACES_TOKEN")
if token:
    login(token=token)
else:
    # Without a token, login() asks for one interactively, which would stall a
    # non-interactive import of this module; skip the login instead.
    print("HUGGING_FACE_SPACES_TOKEN is not set - skipping Hugging Face login.")
|
21 |
+
|
22 |
+
|
23 |
+
# Text that stands in for a matched date/time expression when the token stream
# for the classifiers is built, keyed by match label.
placeholder = dict(
    DATE_RANGE_TIME_RANGE="[DATE] [TIME] - [DATE] [TIME]",
    DATE_RANGE="[DATE] - [DATE]",
    DATE_TIME_RANGE="[DATE] [TIME] - [TIME]",
    TIME_RANGE="[TIME] - [TIME]",
    DATE_TIME="[DATE] [TIME]",
    DATE="[DATE]",
    TIME="[TIME]",
)
|
32 |
+
|
33 |
+
def convert_to_schedule(date_time, label):
    """Build a ``Schedule`` from the raw strings captured for one regex match.

    Args:
        date_time: For the labels ``DATE`` and ``TIME`` the matched string
            itself; for every other label an indexable of matched strings
            (dates as ``dd.mm.YYYY``, times as ``HH:MM``).
        label: Name of the pattern that produced the match, e.g.
            ``"DATE_RANGE"``.

    Returns:
        The ``Schedule`` for the match, or ``None`` when the label is unknown
        or building it raised (the exception is printed, not propagated).
    """
    print("Converting ", date_time, label)

    def as_date(raw):
        return datetime.strptime(raw, "%d.%m.%Y").date()

    def as_time(raw):
        return datetime.strptime(raw, "%H:%M").time()

    # label -> index of (start_date, end_date, start_time, end_time) inside
    # date_time; None marks a field the pattern does not capture.
    layouts = {
        "DATE_RANGE_TIME_RANGE": (0, 2, 1, 3),
        "DATE_RANGE": (0, 1, None, None),
        "DATE_TIME_RANGE": (0, None, 1, 2),
        "TIME_RANGE": (None, None, 0, 1),
        "DATE_TIME": (0, None, 1, None),
    }

    try:
        # Single-group patterns hand over the bare string, not a tuple.
        if label == "DATE":
            return Schedule(
                start_date=as_date(date_time),
                end_date=None,
                start_time=None,
                end_time=None,
                admittance_time=None,
            )
        if label == "TIME":
            return Schedule(
                start_date=None,
                end_date=None,
                start_time=as_time(date_time),
                end_time=None,
                admittance_time=None,
            )

        layout = layouts.get(label)
        if layout is None:
            return None
        start_date_at, end_date_at, start_time_at, end_time_at = layout
        return Schedule(
            start_date=None if start_date_at is None else as_date(date_time[start_date_at]),
            end_date=None if end_date_at is None else as_date(date_time[end_date_at]),
            start_time=None if start_time_at is None else as_time(date_time[start_time_at]),
            end_time=None if end_time_at is None else as_time(date_time[end_time_at]),
            admittance_time=None,
        )
    except Exception as e:
        print(e)
        return None
|
101 |
+
|
102 |
+
def _load_classifier(repo_id, model_name):
    """Download ``<model_name>.pkl`` from a Hugging Face repo and load it.

    Args:
        repo_id: Hub repository that holds the model file.
        model_name: File name of the model without the ``.pkl`` suffix.

    Returns:
        Whatever object ``joblib.load`` restores from the downloaded file.
    """
    # NOTE(review): joblib.load unpickles the file, so the repo must be trusted.
    local_path = hf_hub_download(repo_id=repo_id, filename=model_name + ".pkl")
    return joblib.load(local_path)
|
106 |
+
|
107 |
+
def classify_date_time(date_times, label, text):
    """Keep the matches of one label that the classifier rates as event dates.

    The text is split at the ``[label]`` markers, tokenised on whitespace, and
    each marker is replaced by the placeholder text for that label. Five-token
    windows are then passed to ``date_classifier``, whose result is read as a
    mapping with ``EVENT_DATE`` and ``OTHER`` scores.

    Args:
        date_times: Raw regex matches for ``label``, in order of appearance.
        label: Pattern name, e.g. ``"DATE_TIME"``.
        text: Cleaned text in which the matches were replaced by ``[label]``.

    Returns:
        List of ``Schedule`` objects for the accepted matches (possibly empty).
    """
    if label == "TIME":
        # Stand-alone times never produced a schedule, and their scoring step
        # read the date result that is not assigned for this label. Return the
        # same (empty) result without that failure.
        return []

    window_size = 5

    # Rebuild the token stream and remember where each match ended up.
    tokens = []
    date_time_positions = []
    for i, segment in enumerate(text.split(f"[{label}]")):
        tokens.extend(segment.split())
        if i < len(date_times):
            tokens.append(placeholder.get(label, "ERROR"))
            date_time_positions.append(len(tokens) - 1)

    # NOTE(review): the totals are shared by all matches and every window from
    # the match up to the end of the text is scored - confirm this is intended.
    event_date_total = 0
    other_total = 0

    schedules = []
    for position, date_time in zip(date_time_positions, date_times):
        # First window is the one that ends on the match (clamped at 0).
        start = max(0, position - (window_size - 1))
        while start + window_size <= len(tokens):
            scores = date_classifier(" ".join(tokens[start:start + window_size]))
            event_date_total += scores.get("EVENT_DATE", 0)
            other_total += scores.get("OTHER", 0)
            start += 1

        if event_date_total > other_total:
            schedule = convert_to_schedule(date_time, label)
            # convert_to_schedule yields None when parsing fails; skip those.
            if schedule is not None:
                schedules.append(schedule)
    return schedules
|
168 |
+
|
169 |
+
# Load both classifiers once at import time. The names are bound to None first
# so that a failed download leaves them defined instead of raising NameError
# at the first classification.
date_classifier = None
time_classifier = None
try:
    date_classifier = _load_classifier("adojode/date_classifier", "date_classifier")
    time_classifier = _load_classifier("adojode/time_classifier", "time_classifier")
except Exception as e:
    print("Error loading classifier models from hugging face: ", e)
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
def extract_schedules(text):
    """Extract event schedules from free text.

    The text is normalised, stripped of markup and connecting words, and
    scanned for numeric dates (``dd.mm.YYYY``) and times (``HH:MM``) from the
    most specific pattern to the least specific. Every match is replaced by a
    ``[LABEL]`` marker so that a later pattern cannot match it again.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``Schedule`` objects (possibly empty), or ``None`` when an
        exception occurred (it is printed, not propagated).
    """
    # (pattern, replacement, flags), applied in this order.
    # NOTE(review): the word patterns carry no \b anchors, so they also rewrite
    # matches inside longer words (e.g. "am" in "Samstag") - confirm intended.
    cleanup_steps = (
        (r"\*", " ", 0),
        (r"=", " ", 0),
        (r"#", " ", 0),
        (r"(-|—|–|bis)", "-", 0),
        (r"(und|sowie)", "+", 0),
        (r"(von|vom|am|um|ab)", " ", re.IGNORECASE),
        (r",", " ", 0),
        (r"\|", " ", 0),
        (r"\s+", " ", 0),
    )
    # (label, pattern), most specific first.
    match_patterns = (
        ("DATE_RANGE_TIME_RANGE",
         r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"),
        ("DATE_RANGE", r"(\d{2}\.\d{2}\.\d{4})\s*-\s*(\d{2}\.\d{2}\.\d{4})"),
        ("DATE_TIME_RANGE", r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"),
        ("TIME_RANGE", r"(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"),
        ("DATE_TIME", r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"),
        ("DATE", r"(\d{2}\.\d{2}\.\d{4})"),
        ("TIME", r"(\d{2}:\d{2})"),
    )

    try:
        cleaned = normalize_data(text)
        for pattern, replacement, flags in cleanup_steps:
            cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

        matches = {}
        for label, pattern in match_patterns:
            found = re.findall(pattern, cleaned)
            if found:
                matches[label] = found
                cleaned = re.sub(pattern, f"[{label}]", cleaned)

        # Only one kind of expression found: take its first match unclassified.
        if len(matches) == 1:
            label, found = next(iter(matches.items()))
            schedule = convert_to_schedule(label=label, date_time=found[0])
            # A failed conversion yields None; report "nothing found" instead.
            return [] if schedule is None else [schedule]

        event_schedules = []
        for label, found in matches.items():
            schedules = classify_date_time(date_times=found, label=label, text=cleaned)
            if schedules:
                # None entries would break the containment test below.
                event_schedules.extend(s for s in schedules if s is not None)

        if len(event_schedules) == 1:
            return event_schedules

        print(event_schedules)
        # Drop every schedule that is contained in another one.
        # NOTE(review): relies on the `in` semantics of Schedule - confirm how
        # two identical schedules are treated.
        return [
            schedule
            for i, schedule in enumerate(event_schedules)
            if not any(schedule in other
                       for j, other in enumerate(event_schedules) if i != j)
        ]

    except Exception as ex:
        print(ex)
        return None
|
286 |
+
|
287 |
+
|
288 |
+
# TEXTS = ["\n\nTermin für öffentliche Besichtigung\n=================================== \n\n07.01.2025\n\n * Am 07.01.2025\n* Von 18:00 bis 19:00 Uhr\n* Tasköprüstraße 10 (ehemalige Selgros-Markthalle)\n* Termin im Kalender speichern\n"]
|
289 |
+
|
290 |
+
|
291 |
+
if __name__ == "__main__":
    # Manual smoke run over the sample texts. Guarded so that importing this
    # module does not run the whole extraction as a side effect.
    for text in TEXTS:
        print(text)
        schedules = extract_schedules(text)
        print("*" * 100)
        print("EXTRACTED SCHEDULES: ")
        print(schedules)
        print("*" * 100)
|
src/nlp/playground/pipelines/date_extractor_v2.py
CHANGED
@@ -6,7 +6,7 @@ from spacy.tokenizer import Tokenizer
|
|
6 |
from spacy.util import compile_suffix_regex, compile_infix_regex
|
7 |
import os
|
8 |
from dotenv import load_dotenv
|
9 |
-
from src.utils.Event import
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
import joblib
|
12 |
from huggingface_hub import login
|
@@ -202,10 +202,10 @@ class ScheduleExtractor(NLPProcessor):
|
|
202 |
datetime.strptime(token.text, "%H:%M").time())
|
203 |
|
204 |
if start_date and end_date and start_time and end_time and admittance_time:
|
205 |
-
date_times.append(
|
206 |
start_date = end_date = start_time = end_time = admittance_time = None
|
207 |
|
208 |
-
date_times.append(
|
209 |
date_times = self.__remove_subsets(date_times)
|
210 |
return list(set(date_times))
|
211 |
|
|
|
6 |
from spacy.util import compile_suffix_regex, compile_infix_regex
|
7 |
import os
|
8 |
from dotenv import load_dotenv
|
9 |
+
from src.utils.Event import Schedule
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
import joblib
|
12 |
from huggingface_hub import login
|
|
|
202 |
datetime.strptime(token.text, "%H:%M").time())
|
203 |
|
204 |
if start_date and end_date and start_time and end_time and admittance_time:
|
205 |
+
date_times.append(Schedule(start_date, end_date, start_time, end_time, admittance_time))
|
206 |
start_date = end_date = start_time = end_time = admittance_time = None
|
207 |
|
208 |
+
date_times.append(Schedule(start_date, end_date, start_time, end_time, admittance_time))
|
209 |
date_times = self.__remove_subsets(date_times)
|
210 |
return list(set(date_times))
|
211 |
|
src/nlp/playground/pipelines/date_extractor_v3.py
ADDED
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
|
5 |
+
from src.utils.Event import Schedule
|
6 |
+
from huggingface_hub import hf_hub_download
|
7 |
+
import joblib
|
8 |
+
from huggingface_hub import login
|
9 |
+
from datetime import datetime
|
10 |
+
|
11 |
+
from src.utils.helpers import normalize_data
|
12 |
+
|
13 |
+
# Read HUGGING_FACE_SPACES_TOKEN from the environment (optionally supplied via a
# local .env file) and authenticate against the Hugging Face Hub.
load_dotenv()
token = os.getenv("HUGGING_FACE_SPACES_TOKEN")
if token:
    login(token=token)
else:
    # Without a token, login() asks for one interactively, which would stall a
    # non-interactive import of this module; skip the login instead.
    print("HUGGING_FACE_SPACES_TOKEN is not set - skipping Hugging Face login.")
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
class ScheduleExtractorV3:
    """Extract event schedules from text with regex matching and classifiers.

    Numeric dates (``dd.mm.YYYY``) and times (``HH:MM``) are located with
    regular expressions and replaced by ``[LABEL]`` markers. When more than one
    kind of expression occurs, the matches are filtered with the date
    classifier before they are converted to ``Schedule`` objects.
    """

    # Number of tokens in each context window handed to the classifier.
    _WINDOW_SIZE = 5

    # (pattern, replacement, flags), applied in this order.
    # NOTE(review): the word patterns carry no \b anchors, so they also rewrite
    # matches inside longer words (e.g. "am" in "Samstag") - confirm intended.
    _CLEANUP_STEPS = (
        (r"\*", " ", 0),
        (r"=", " ", 0),
        (r"#", " ", 0),
        (r"(-|—|–|bis)", "-", 0),
        (r"(und|sowie)", "+", 0),
        (r"(von|vom|am|um|ab)", " ", re.IGNORECASE),
        (r",", " ", 0),
        (r"\|", " ", 0),
        (r"\s+", " ", 0),
    )

    # (label, pattern), most specific first; a match is replaced by "[label]"
    # so that a later, less specific pattern cannot match it again.
    _MATCH_PATTERNS = (
        ("DATE_RANGE_TIME_RANGE",
         r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"),
        ("DATE_RANGE", r"(\d{2}\.\d{2}\.\d{4})\s*-\s*(\d{2}\.\d{2}\.\d{4})"),
        ("DATE_TIME_RANGE", r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"),
        ("TIME_RANGE", r"(\d{2}:\d{2})\s*-\s*(\d{2}:\d{2})"),
        ("DATE_TIME", r"(\d{2}\.\d{2}\.\d{4})\s*(\d{2}:\d{2})"),
        ("DATE", r"(\d{2}\.\d{2}\.\d{4})"),
        ("TIME", r"(\d{2}:\d{2})"),
    )

    # label -> index of (start_date, end_date, start_time, end_time) inside a
    # multi-group match; None marks a field the pattern does not capture.
    _FIELD_LAYOUT = {
        "DATE_RANGE_TIME_RANGE": (0, 2, 1, 3),
        "DATE_RANGE": (0, 1, None, None),
        "DATE_TIME_RANGE": (0, None, 1, 2),
        "TIME_RANGE": (None, None, 0, 1),
        "DATE_TIME": (0, None, 1, None),
    }

    # Schedule attributes compared and combined when two schedules are merged.
    _SCHEDULE_FIELDS = ("start_date", "end_date", "start_time", "end_time", "admittance_time")

    def __init__(self):
        """Load the date and time classifiers from the Hugging Face Hub."""
        # Bind the attributes up front so a failed download leaves them defined
        # (as None) instead of raising AttributeError on first use.
        self.date_classifier = None
        self.time_classifier = None
        try:
            self.date_classifier = self._load_classifier("adojode/date_classifier", "date_classifier")
            self.time_classifier = self._load_classifier("adojode/time_classifier", "time_classifier")
        except Exception as e:
            print("Error loading classifier models from hugging face: ", e)

    def _load_classifier(self, repo_id, model_name):
        """Download ``<model_name>.pkl`` from ``repo_id`` and load it with joblib."""
        # NOTE(review): joblib.load unpickles the file, so the repo must be trusted.
        return joblib.load(
            hf_hub_download(repo_id=repo_id, filename=model_name + ".pkl")
        )

    def extract(self, text):
        """Extract event schedules from free text.

        Args:
            text: Raw text to analyse.

        Returns:
            A list of ``Schedule`` objects (possibly empty), or ``None`` when an
            exception occurred (it is printed, not propagated).
        """
        try:
            cleaned = normalize_data(text)
            for pattern, replacement, flags in self._CLEANUP_STEPS:
                cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

            matches = {}
            for label, pattern in self._MATCH_PATTERNS:
                found = re.findall(pattern, cleaned)
                if found:
                    matches[label] = found
                    cleaned = re.sub(pattern, f"[{label}]", cleaned)

            # Only one kind of expression found: take its first match unclassified.
            if len(matches) == 1:
                label, found = next(iter(matches.items()))
                schedule = self.convert_to_schedule(label=label, date_time=found[0])
                # A failed conversion yields None; report "nothing found" instead.
                return [] if schedule is None else [schedule]

            event_schedules = []
            for label, found in matches.items():
                schedules = self.classify_date_time(date_times=found, label=label, text=cleaned)
                if schedules:
                    event_schedules.extend(schedules)

            if len(event_schedules) == 1:
                return event_schedules

            # Drop every schedule that is contained in another one.
            # NOTE(review): relies on the `in` semantics of Schedule - confirm
            # how two identical schedules are treated.
            unique_schedules = [
                schedule
                for i, schedule in enumerate(event_schedules)
                if not any(schedule in other
                           for j, other in enumerate(event_schedules) if i != j)
            ]

            if len(unique_schedules) == 2:
                first, second = unique_schedules
                print("Versuche Schedules zu mergen....", first, second)
                # Merge only when no field is set on both schedules, i.e. the
                # two carry complementary information.
                complementary = all(
                    any(not value for value in [getattr(first, field), getattr(second, field)])
                    for field in self._SCHEDULE_FIELDS
                )
                if complementary:
                    merged = Schedule(
                        start_date=first.start_date or second.start_date,
                        end_date=first.end_date or second.end_date,
                        start_time=first.start_time or second.start_time,
                        end_time=first.end_time or second.end_time,
                        admittance_time=first.admittance_time or second.admittance_time
                    )
                    print("Merged:", merged)
                    return [merged]
            return unique_schedules

        except Exception as ex:
            print(ex)
            return None

    def classify_date_time(self, date_times, label, text):
        """Keep the matches of one label that the classifier rates as event dates.

        The text is split at the ``[label]`` markers, tokenised on whitespace,
        and each marker is replaced by the placeholder text for that label.
        Token windows are then passed to ``self.date_classifier``, whose result
        is read as a mapping with ``EVENT_DATE`` and ``OTHER`` scores.

        Args:
            date_times: Raw regex matches for ``label``, in order of appearance.
            label: Pattern name, e.g. ``"DATE_TIME"``.
            text: Cleaned text in which the matches were replaced by ``[label]``.

        Returns:
            List of ``Schedule`` objects for the accepted matches (possibly empty).
        """
        if label == "TIME":
            # Stand-alone times never produced a schedule, and their scoring
            # step read the date result that is not assigned for this label.
            # Return the same (empty) result without that failure.
            return []

        window_size = self._WINDOW_SIZE

        # Rebuild the token stream and remember where each match ended up.
        tokens = []
        date_time_positions = []
        for i, segment in enumerate(text.split(f"[{label}]")):
            tokens.extend(segment.split())
            if i < len(date_times):
                tokens.append(placeholder.get(label, "ERROR"))
                date_time_positions.append(len(tokens) - 1)

        # NOTE(review): the totals are shared by all matches and every window
        # from the match up to the end of the text is scored - confirm intended.
        event_date_total = 0
        other_total = 0

        schedules = []
        for position, date_time in zip(date_time_positions, date_times):
            # First window is the one that ends on the match (clamped at 0).
            start = max(0, position - (window_size - 1))
            while start + window_size <= len(tokens):
                scores = self.date_classifier(" ".join(tokens[start:start + window_size]))
                event_date_total += scores.get("EVENT_DATE", 0)
                other_total += scores.get("OTHER", 0)
                start += 1

            if event_date_total > other_total:
                schedule = self.convert_to_schedule(date_time, label)
                # convert_to_schedule yields None when parsing fails; skip those.
                if schedule is not None:
                    schedules.append(schedule)
        return schedules

    def convert_to_schedule(self, date_time, label):
        """Build a ``Schedule`` from the raw strings captured for one regex match.

        Args:
            date_time: For the labels ``DATE`` and ``TIME`` the matched string
                itself; for every other label an indexable of matched strings
                (dates as ``dd.mm.YYYY``, times as ``HH:MM``).
            label: Name of the pattern that produced the match.

        Returns:
            The ``Schedule`` for the match, or ``None`` when the label is
            unknown or building it raised (the exception is printed).
        """
        def as_date(raw):
            return datetime.strptime(raw, "%d.%m.%Y").date()

        def as_time(raw):
            return datetime.strptime(raw, "%H:%M").time()

        try:
            # Single-group patterns hand over the bare string, not a tuple.
            if label == "DATE":
                return Schedule(
                    start_date=as_date(date_time),
                    end_date=None,
                    start_time=None,
                    end_time=None,
                    admittance_time=None
                )
            if label == "TIME":
                return Schedule(
                    start_date=None,
                    end_date=None,
                    start_time=as_time(date_time),
                    end_time=None,
                    admittance_time=None
                )

            layout = self._FIELD_LAYOUT.get(label)
            if layout is None:
                return None
            start_date_at, end_date_at, start_time_at, end_time_at = layout
            return Schedule(
                start_date=None if start_date_at is None else as_date(date_time[start_date_at]),
                end_date=None if end_date_at is None else as_date(date_time[end_date_at]),
                start_time=None if start_time_at is None else as_time(date_time[start_time_at]),
                end_time=None if end_time_at is None else as_time(date_time[end_time_at]),
                admittance_time=None
            )
        except Exception as e:
            print(e)
            return None
|
276 |
+
|
277 |
+
# Text that stands in for a matched date/time expression when the token stream
# for the classifiers is built, keyed by match label.
placeholder = dict(
    DATE_RANGE_TIME_RANGE="[DATE] [TIME] - [DATE] [TIME]",
    DATE_RANGE="[DATE] - [DATE]",
    DATE_TIME_RANGE="[DATE] [TIME] - [TIME]",
    TIME_RANGE="[TIME] - [TIME]",
    DATE_TIME="[DATE] [TIME]",
    DATE="[DATE]",
    TIME="[TIME]",
)
|
286 |
+
|
287 |
+
|
288 |
+
|
src/nlp/playground/pipelines/event_data_extractor.py
CHANGED
@@ -3,6 +3,7 @@ import re
|
|
3 |
from src.nlp.playground.ner import GlinerHandler
|
4 |
from src.nlp.playground.pipelines.address_extractor import AddressExtractor
|
5 |
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
|
|
|
6 |
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
|
7 |
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
|
8 |
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
|
@@ -14,7 +15,8 @@ class EventDataExtractor:
|
|
14 |
self.title_extractor = TitleExtractor()
|
15 |
self.zero_shot_classifier = ZeroShotClassifier()
|
16 |
self.gliner_handler = GlinerHandler()
|
17 |
-
self.schedule_extractor = ScheduleExtractor()
|
|
|
18 |
self.address_extractor = AddressExtractor()
|
19 |
self.description_extractor = DescriptionExtractor()
|
20 |
|
@@ -26,8 +28,9 @@ class EventDataExtractor:
|
|
26 |
event.locations = self.extract_locations(data)
|
27 |
event.organizers = self.extract_organizers(data)
|
28 |
event.address = self.extract_address(data)
|
29 |
-
event.schedule =
|
30 |
event.description = self.extract_description(data, event.title)
|
|
|
31 |
|
32 |
print("Extraction process completed.")
|
33 |
return event
|
@@ -96,9 +99,9 @@ class EventDataExtractor:
|
|
96 |
|
97 |
def extract_prices(self, data):
|
98 |
print("Extracting prices...")
|
99 |
-
entities = self.gliner_handler.extract_entities(data, ["
|
100 |
-
|
101 |
-
filtered_entities = [e["text"] for e in entities if e["text"] and re.search(r'\d', e["text"])]
|
102 |
|
103 |
prices = [re.findall(r'\d+(?:[.,]\d+)?', price) for price in filtered_entities]
|
104 |
|
@@ -110,7 +113,7 @@ class EventDataExtractor:
|
|
110 |
))[0].label
|
111 |
|
112 |
if entrance_free_category == "Eintritt frei" and not prices:
|
113 |
-
return ["
|
114 |
|
115 |
return prices
|
116 |
|
|
|
3 |
from src.nlp.playground.ner import GlinerHandler
|
4 |
from src.nlp.playground.pipelines.address_extractor import AddressExtractor
|
5 |
from src.nlp.playground.pipelines.date_extractor_v2 import ScheduleExtractor
|
6 |
+
from src.nlp.playground.pipelines.date_extractor_v3 import ScheduleExtractorV3
|
7 |
from src.nlp.playground.pipelines.description_extractor import DescriptionExtractor
|
8 |
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
|
9 |
from src.nlp.playground.textclassification import ZeroShotClassifier, CustomMode
|
|
|
15 |
self.title_extractor = TitleExtractor()
|
16 |
self.zero_shot_classifier = ZeroShotClassifier()
|
17 |
self.gliner_handler = GlinerHandler()
|
18 |
+
# self.schedule_extractor = ScheduleExtractor()
|
19 |
+
self.schedule_extractor = ScheduleExtractorV3()
|
20 |
self.address_extractor = AddressExtractor()
|
21 |
self.description_extractor = DescriptionExtractor()
|
22 |
|
|
|
28 |
event.locations = self.extract_locations(data)
|
29 |
event.organizers = self.extract_organizers(data)
|
30 |
event.address = self.extract_address(data)
|
31 |
+
event.schedule = self.extract_schedule(data)
|
32 |
event.description = self.extract_description(data, event.title)
|
33 |
+
event.prices = self.extract_prices(data)
|
34 |
|
35 |
print("Extraction process completed.")
|
36 |
return event
|
|
|
99 |
|
100 |
def extract_prices(self, data):
|
101 |
print("Extracting prices...")
|
102 |
+
entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis der Veranstaltung"])
|
103 |
+
print(entities)
|
104 |
+
filtered_entities = [e["text"] for e in entities if e["text"] and re.search(r'\d', e["text"]) and e["score"]>=0.4]
|
105 |
|
106 |
prices = [re.findall(r'\d+(?:[.,]\d+)?', price) for price in filtered_entities]
|
107 |
|
|
|
113 |
))[0].label
|
114 |
|
115 |
if entrance_free_category == "Eintritt frei" and not prices:
|
116 |
+
return ["kostenlos"]
|
117 |
|
118 |
return prices
|
119 |
|
src/nlp/playground/pipelines/testing/date_extractor_testing.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src/nlp/playground/pipelines/testing/event_data_extractor_testing.py
CHANGED
@@ -1,14 +1,13 @@
|
|
1 |
import csv
|
2 |
import gc
|
3 |
import time
|
4 |
-
from collections import defaultdict
|
5 |
|
6 |
import pandas as pd
|
7 |
|
8 |
from src.nlp.experimental.textclassification.classify_title import train_data
|
9 |
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
|
10 |
from src.persistence.db import init_db
|
11 |
-
from src.utils.Event import Event,
|
12 |
from src.utils.apis.googlemaps_api import GoogleMapsAPI
|
13 |
from src.utils.helpers import normalize_data
|
14 |
import matplotlib.pyplot as plt
|
@@ -25,7 +24,7 @@ def init_db_entries():
|
|
25 |
if all(f not in el.get("markdown", "") for f in filter_data):
|
26 |
filtered_elements.append(el)
|
27 |
print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
|
28 |
-
return filtered_elements
|
29 |
|
30 |
def event_similarity(actual, predicted):
|
31 |
# Liste der Attribute, die verglichen werden
|
@@ -75,11 +74,11 @@ for el in elements:
|
|
75 |
actual_event.url = el.get("url")
|
76 |
print(actual_event.url)
|
77 |
actual_event.title = el.get("information", {}).get("actual", {}).get("title", "")
|
78 |
-
actual_event.organizers = [org for org in el.get("information", {}).get("actual", {}).get("organizers", []) if
|
79 |
org.strip()]
|
80 |
actual_event.categories = el.get("information", {}).get("actual", {}).get("categories", [])
|
81 |
actual_event.locations = [
|
82 |
-
loc for loc in el.get("information", {}).get("actual", {}).get("locations", []) if loc
|
83 |
]
|
84 |
actual_event.prices = el.get("information", {}).get("actual", {}).get("prices", [])
|
85 |
address = el.get("information", {}).get("actual", {}).get("address")
|
@@ -91,7 +90,7 @@ for el in elements:
|
|
91 |
|
92 |
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
93 |
actual_event.schedule = [
|
94 |
-
|
95 |
date.get("end_time", None), date.get("admittance_time", None))
|
96 |
for date in dates]
|
97 |
|
@@ -162,18 +161,29 @@ field_sums = {
|
|
162 |
"address": df["address"].sum(),
|
163 |
"organizers": df["organizers"].sum(),
|
164 |
}
|
165 |
-
|
166 |
-
|
|
|
167 |
|
168 |
# 📊 Graphen erstellen
|
169 |
-
plt.figure(figsize=(10,
|
170 |
-
plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
# 🏷️ Achsenbeschriftungen & Titel
|
173 |
plt.xlabel("Event Attribute")
|
174 |
plt.ylabel("Anzahl der Übereinstimmungen")
|
175 |
-
plt.title(f"Summierte Übereinstimmungen pro Event-Attribut
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
177 |
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
178 |
|
179 |
# 📈 Zeige den Graphen
|
|
|
1 |
import csv
|
2 |
import gc
|
3 |
import time
|
|
|
4 |
|
5 |
import pandas as pd
|
6 |
|
7 |
from src.nlp.experimental.textclassification.classify_title import train_data
|
8 |
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
|
9 |
from src.persistence.db import init_db
|
10 |
+
from src.utils.Event import Event, Schedule
|
11 |
from src.utils.apis.googlemaps_api import GoogleMapsAPI
|
12 |
from src.utils.helpers import normalize_data
|
13 |
import matplotlib.pyplot as plt
|
|
|
24 |
if all(f not in el.get("markdown", "") for f in filter_data):
|
25 |
filtered_elements.append(el)
|
26 |
print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
|
27 |
+
return filtered_elements[20]
|
28 |
|
29 |
def event_similarity(actual, predicted):
|
30 |
# Liste der Attribute, die verglichen werden
|
|
|
74 |
actual_event.url = el.get("url")
|
75 |
print(actual_event.url)
|
76 |
actual_event.title = el.get("information", {}).get("actual", {}).get("title", "")
|
77 |
+
actual_event.organizers = [org.strip() for org in el.get("information", {}).get("actual", {}).get("organizers", []) if
|
78 |
org.strip()]
|
79 |
actual_event.categories = el.get("information", {}).get("actual", {}).get("categories", [])
|
80 |
actual_event.locations = [
|
81 |
+
loc.strip() for loc in el.get("information", {}).get("actual", {}).get("locations", []) if loc.strip()
|
82 |
]
|
83 |
actual_event.prices = el.get("information", {}).get("actual", {}).get("prices", [])
|
84 |
address = el.get("information", {}).get("actual", {}).get("address")
|
|
|
90 |
|
91 |
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
92 |
actual_event.schedule = [
|
93 |
+
Schedule(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
|
94 |
date.get("end_time", None), date.get("admittance_time", None))
|
95 |
for date in dates]
|
96 |
|
|
|
161 |
"address": df["address"].sum(),
|
162 |
"organizers": df["organizers"].sum(),
|
163 |
}
|
164 |
+
|
165 |
+
total_events = len(df) # Gesamtanzahl der Events
|
166 |
+
percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
|
167 |
|
168 |
# 📊 Graphen erstellen
|
169 |
+
plt.figure(figsize=(10, 6))
|
170 |
+
bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
|
171 |
+
|
172 |
+
# Prozentwerte unter den Balken hinzufügen
|
173 |
+
for bar, (key, percent) in zip(bars, percentages.items()):
|
174 |
+
plt.text(bar.get_x() + bar.get_width() / 2, -0.05 * total_events, f"{percent:.1f}%",
|
175 |
+
ha="center", va="top", fontsize=10, color="black")
|
176 |
|
177 |
# 🏷️ Achsenbeschriftungen & Titel
|
178 |
plt.xlabel("Event Attribute")
|
179 |
plt.ylabel("Anzahl der Übereinstimmungen")
|
180 |
+
plt.title(f"Summierte Übereinstimmungen pro Event-Attribut")
|
181 |
+
|
182 |
+
# 📝 Info-Box mit Anzahl der Events
|
183 |
+
info_text = f"Getestete Events: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
|
184 |
+
plt.text(0.5, total_events * 1.05, info_text, fontsize=12, ha="center", bbox=dict(facecolor="white", alpha=0.8))
|
185 |
+
|
186 |
+
plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
|
187 |
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
188 |
|
189 |
# 📈 Zeige den Graphen
|
src/nlp/playground/pipelines/testing/price_extractor_testing.py
CHANGED
@@ -6,9 +6,34 @@ from src.nlp.playground.textclassification import CustomMode, ZeroShotClassifier
|
|
6 |
|
7 |
event_extractor = EventDataExtractor()
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
-
texts = [
|
12 |
"Das ist unser Museum! – Kinderstudio\nSo 15.12. 14:00\nZurück zur Übersicht\n\nTickets kaufen\nAngebot für: Kinder & Familien\nKosten: Kostenlos\nDauer: 1,5 Stunden\nKontakt: [email protected]\nWir treffen uns im Farblabor und entdecken zusammen spielerisch Farben und Formen. Mit unseren Lieblingsfarben ausgerüstet geht es dann ab in die Ausstellung – wer findet das erste Grashüpfergrün? Können wir zusammen ein Dreieck sein? Und hast du schon den Wal gefunden?\nDas Vermittlungsprogramm für Kinder und Familien setzt regelmäßig einen anderen Schwerpunkt. Das Thema ist momentan: Farbe ist alles!\nWeitere Veranstaltungen\n[\nSo 15.12. 14:00\nmre Blickwinkel – Rundgang Architektur](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-blickwinkel-rundgang-architektur-748/)\n[\nSo 15.12. 14:01\nmre Blickwinkel – Rundgang Architektur](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-blickwinkel-rundgang-architektur-6098/)\n[\nSo 15.12. 15:00\nRundgang Sonderausstellung – Fumihiko Maki und Maki & Associates](https://www.museum-re.de/de/besuch/veranstaltungskalender/rundgang-sonderausstellung-fumihiko-maki-und-maki-associates-3644/)\n[\nSo 15.12. 15:00\nFarbe ist alles! Rundgang Sammlung](https://www.museum-re.de/de/besuch/veranstaltungskalender/farbe-ist-alles-rundgang-sammlung-697/)\n[\nSo 15.12. 15:01\nFarbe ist alles! Rundgang Sammlung](https://www.museum-re.de/de/besuch/veranstaltungskalender/farbe-ist-alles-rundgang-sammlung-5110/)\n[\nMi 18.12. 18:00\nmre Tiefenrausch](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-tiefenrausch-6116/)",
|
13 |
"Rundgang Sonderausstellung – Fumihiko Maki und Maki & Associates\nSo 15.12. 15:00\nTickets kaufen\nAngebot für: Erwachsene\nKosten: 5 €\nDauer: 1 Stunde\nKontakt: [email protected]\nLassen sie uns den „Zuckerwürfel“ einmal genauer unter die Lupe nehmen. Wir gehen auf gemeinsame Entdeckungsreise durch das Museumsgebäude und erfahren nebenbei Spannendes über die Entstehung des mre.\nEntdecken Sie Konzepte und Gestaltungselemente der Architektur von Fumihiko Maki in unserer Sonderausstellung Fumihiko Maki und Maki & Associates – Für eine menschliche Architektur und finden Sie sie am Gebäude des mre wieder.",
|
14 |
"Geeignet für Kinder ab etwa 5 Jahren.\n(F) = Familienveranstaltung\nEintrittspreise:\nErwachsene: 6,50 Euro\nKinder/Ermäßigt: 3,50 Euro\nSchulvorführungen:\n2,50 Euro pro Person\nKartenreservierung:\nTelefon: 0541 323-7000\[email protected]\nBitte reservieren Sie Ihre Tickets vor Ihrem Besuch per Telefon, E-Mail oder direkt hier online. Viele Vorstellungen sind frühzeitig ausgebucht. Reservierte Karten müssen bis spätestens 15 Minuten vor Veranstaltungsbeginn an der Kasse abgeholt werden.\nTermine & Tickets:\n01.04.2025 - 14:30 Tickets",
|
@@ -35,5 +60,8 @@ texts = [
|
|
35 |
|
36 |
]
|
37 |
|
38 |
-
for text in
|
39 |
-
print(
|
|
|
|
|
|
|
|
6 |
|
7 |
event_extractor = EventDataExtractor()
|
8 |
|
9 |
+
texts = [
|
10 |
+
"""Kunst- und Handwerksmesse 2025
|
11 |
+
|
12 |
+
📅 Datum: 12. – 14. September 2025
|
13 |
+
📍 Ort: Messehalle Frankfurt, Deutschland
|
14 |
+
|
15 |
+
Die Kunst- und Handwerksmesse 2025 bringt talentierte Kunsthandwerker, Designer und kreative Köpfe aus ganz Europa zusammen. Besucher können sich auf eine Vielzahl handgefertigter Produkte freuen – von Keramik und Schmuck bis hin zu maßgeschneiderten Möbeln und Mode.
|
16 |
+
|
17 |
+
🔹 Highlights der Messe:
|
18 |
+
|
19 |
+
Live-Werkstätten: Erleben Sie, wie Meister ihres Fachs Kunstwerke aus Glas, Holz und Metall fertigen.
|
20 |
+
|
21 |
+
Gourmetbereich: Probieren Sie handgemachte Schokoladenkreationen (ab 4,50 € pro Stück) und exklusive Bio-Kaffeesorten (250g-Packung für 12,99 €).
|
22 |
+
|
23 |
+
Workshops: Nehmen Sie an einem Kalligraphie-Kurs teil (Materialkostenpauschale 15 €) oder gestalten Sie Ihre eigene Tonvase (30 € inkl. Brennkosten).
|
24 |
+
|
25 |
+
Antiquitätenmarkt: Entdecken Sie einzigartige Sammlerstücke wie historische Postkarten (ab 3 € pro Stück) oder antike Silberlöffel (Preis je nach Gewicht und Reinheitsgrad).
|
26 |
+
|
27 |
+
Rahmenservice: Lassen Sie Ihr vor Ort erworbenes Kunstwerk direkt rahmen (ab 25 € je nach Größe und Material).
|
28 |
+
|
29 |
+
Ein Muss für alle, die handgefertigte Unikate und künstlerische Inspiration lieben!
|
30 |
+
|
31 |
+
Die Teilnahme an der Messe kostet 20 € für Erwachsene, 10 € für Kinder.
|
32 |
+
|
33 |
+
""",
|
34 |
+
|
35 |
|
36 |
|
|
|
37 |
"Das ist unser Museum! – Kinderstudio\nSo 15.12. 14:00\nZurück zur Übersicht\n\nTickets kaufen\nAngebot für: Kinder & Familien\nKosten: Kostenlos\nDauer: 1,5 Stunden\nKontakt: [email protected]\nWir treffen uns im Farblabor und entdecken zusammen spielerisch Farben und Formen. Mit unseren Lieblingsfarben ausgerüstet geht es dann ab in die Ausstellung – wer findet das erste Grashüpfergrün? Können wir zusammen ein Dreieck sein? Und hast du schon den Wal gefunden?\nDas Vermittlungsprogramm für Kinder und Familien setzt regelmäßig einen anderen Schwerpunkt. Das Thema ist momentan: Farbe ist alles!\nWeitere Veranstaltungen\n[\nSo 15.12. 14:00\nmre Blickwinkel – Rundgang Architektur](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-blickwinkel-rundgang-architektur-748/)\n[\nSo 15.12. 14:01\nmre Blickwinkel – Rundgang Architektur](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-blickwinkel-rundgang-architektur-6098/)\n[\nSo 15.12. 15:00\nRundgang Sonderausstellung – Fumihiko Maki und Maki & Associates](https://www.museum-re.de/de/besuch/veranstaltungskalender/rundgang-sonderausstellung-fumihiko-maki-und-maki-associates-3644/)\n[\nSo 15.12. 15:00\nFarbe ist alles! Rundgang Sammlung](https://www.museum-re.de/de/besuch/veranstaltungskalender/farbe-ist-alles-rundgang-sammlung-697/)\n[\nSo 15.12. 15:01\nFarbe ist alles! Rundgang Sammlung](https://www.museum-re.de/de/besuch/veranstaltungskalender/farbe-ist-alles-rundgang-sammlung-5110/)\n[\nMi 18.12. 18:00\nmre Tiefenrausch](https://www.museum-re.de/de/besuch/veranstaltungskalender/mre-tiefenrausch-6116/)",
|
38 |
"Rundgang Sonderausstellung – Fumihiko Maki und Maki & Associates\nSo 15.12. 15:00\nTickets kaufen\nAngebot für: Erwachsene\nKosten: 5 €\nDauer: 1 Stunde\nKontakt: [email protected]\nLassen sie uns den „Zuckerwürfel“ einmal genauer unter die Lupe nehmen. Wir gehen auf gemeinsame Entdeckungsreise durch das Museumsgebäude und erfahren nebenbei Spannendes über die Entstehung des mre.\nEntdecken Sie Konzepte und Gestaltungselemente der Architektur von Fumihiko Maki in unserer Sonderausstellung Fumihiko Maki und Maki & Associates – Für eine menschliche Architektur und finden Sie sie am Gebäude des mre wieder.",
|
39 |
"Geeignet für Kinder ab etwa 5 Jahren.\n(F) = Familienveranstaltung\nEintrittspreise:\nErwachsene: 6,50 Euro\nKinder/Ermäßigt: 3,50 Euro\nSchulvorführungen:\n2,50 Euro pro Person\nKartenreservierung:\nTelefon: 0541 323-7000\[email protected]\nBitte reservieren Sie Ihre Tickets vor Ihrem Besuch per Telefon, E-Mail oder direkt hier online. Viele Vorstellungen sind frühzeitig ausgebucht. Reservierte Karten müssen bis spätestens 15 Minuten vor Veranstaltungsbeginn an der Kasse abgeholt werden.\nTermine & Tickets:\n01.04.2025 - 14:30 Tickets",
|
|
|
60 |
|
61 |
]
|
62 |
|
63 |
+
for text in texts:
|
64 |
+
print(text)
|
65 |
+
print("*" * 100)
|
66 |
+
print("Preise: ", event_extractor.extract_prices(text))
|
67 |
+
print("*" * 100)
|
src/resources/TEXTS.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
src/utils/Event.py
CHANGED
@@ -3,7 +3,7 @@ from textwrap import indent
|
|
3 |
from sympy import false
|
4 |
|
5 |
|
6 |
-
class
|
7 |
def __init__(self, start_date, end_date, start_time, end_time, admittance_time):
|
8 |
self.start_date = start_date
|
9 |
self.end_date = end_date
|
@@ -33,24 +33,41 @@ class DateTime:
|
|
33 |
|
34 |
def __eq__(self, other):
|
35 |
|
36 |
-
if isinstance(other,
|
37 |
-
|
|
|
38 |
else:
|
39 |
return False
|
40 |
|
41 |
def __hash__(self):
|
42 |
return hash((self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time))
|
43 |
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
|
56 |
class Event:
|
@@ -95,13 +112,13 @@ class Event:
|
|
95 |
|
96 |
@schedule.setter
|
97 |
def schedule(self, value):
|
98 |
-
if isinstance(value, list) and all(isinstance(entry,
|
99 |
self._schedule = value
|
100 |
else:
|
101 |
raise ValueError("Schedule must be a list of Schedule objects")
|
102 |
|
103 |
def add_schedule_entry(self, start_date, end_date, start_time, end_time, admittance_time):
|
104 |
-
self._schedule.append(
|
105 |
|
106 |
@property
|
107 |
def address(self):
|
|
|
3 |
from sympy import false
|
4 |
|
5 |
|
6 |
+
class Schedule:
|
7 |
def __init__(self, start_date, end_date, start_time, end_time, admittance_time):
|
8 |
self.start_date = start_date
|
9 |
self.end_date = end_date
|
|
|
33 |
|
34 |
def __eq__(self, other):
|
35 |
|
36 |
+
if isinstance(other, Schedule):
|
37 |
+
print("Equals?", str(self),str(other), str(self) == str(other))
|
38 |
+
return str(self).strip() == str(other).strip()
|
39 |
else:
|
40 |
return False
|
41 |
|
42 |
def __hash__(self):
|
43 |
return hash((self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time))
|
44 |
|
45 |
+
def __len__(self):
|
46 |
+
return len([element for element in [self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time] if element])
|
47 |
|
48 |
+
def __contains__(self, item):
    """Return True when ``item`` is a partial match of this schedule.

    Supports ``partial in full`` checks between two ``Schedule`` objects.
    A field of ``item`` that is ``None`` acts as a wildcard. Start and end
    values are also accepted when they appear swapped on ``item`` (this
    schedule's start equal to the item's end, and vice versa).

    Args:
        item: The candidate object; anything that is not a ``Schedule``
            is never contained.

    Returns:
        bool: True if every date/time field matches under the rules above.
    """
    if not isinstance(item, Schedule):  # non-Schedule objects never match
        return False

    # Debug trace kept from the original implementation.
    print("SELF: ", self)
    print("ITEM: ", item)

    # Fix: the time clauses previously compared self.start_time with
    # self.end_time (self against itself). They now mirror the date clauses
    # and compare against the corresponding field of ``item``.
    return (
        (self.start_date == item.start_date or self.start_date == item.end_date or item.start_date is None) and
        (self.end_date == item.end_date or self.end_date == item.start_date or item.end_date is None) and
        (self.start_time == item.start_time or self.start_time == item.end_time or item.start_time is None) and
        (self.end_time == item.end_time or self.end_time == item.start_time or item.end_time is None) and
        (self.admittance_time == item.admittance_time or item.admittance_time is None)
    )
|
60 |
+
|
61 |
+
|
62 |
+
# class Address:
|
63 |
+
# def __init__(self, street, house_number, postal_code, city):
|
64 |
+
# self.street = street
|
65 |
+
# self.house_number = house_number
|
66 |
+
# self.postal_code = postal_code
|
67 |
+
# self.city = city
|
68 |
+
#
|
69 |
+
# def __str__(self):
|
70 |
+
# return f"🏠 {self.street if self.street else ''} {self.house_number if self.house_number else ''}, {self.postal_code if self.postal_code else ''} {self.city if self.city else ''}"
|
71 |
|
72 |
|
73 |
class Event:
|
|
|
112 |
|
113 |
@schedule.setter
|
114 |
def schedule(self, value):
|
115 |
+
if isinstance(value, list) and all(isinstance(entry, Schedule) for entry in value):
|
116 |
self._schedule = value
|
117 |
else:
|
118 |
raise ValueError("Schedule must be a list of Schedule objects")
|
119 |
|
120 |
def add_schedule_entry(self, start_date, end_date, start_time, end_time, admittance_time):
|
121 |
+
self._schedule.append(Schedule(start_date, end_date, start_time, end_time, admittance_time))
|
122 |
|
123 |
@property
|
124 |
def address(self):
|
src/utils/helpers.py
CHANGED
@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup, Comment
|
|
2 |
import re
|
3 |
from dateparser import DateDataParser
|
4 |
|
|
|
5 |
def normalize_data(input):
|
6 |
def normalize_dates(input_text):
|
7 |
days = r"(?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag|Mo|Di|Mi|Do|Fr|Sa|So)"
|
@@ -12,7 +13,6 @@ def normalize_data(input):
|
|
12 |
iso_pattern = r"(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})"
|
13 |
german_date_pattern = day_month_year_pattern + "|" + dd_mm_yyyy_pattern + "|" + iso_pattern
|
14 |
|
15 |
-
|
16 |
compiled_pattern = re.compile(german_date_pattern, re.VERBOSE)
|
17 |
|
18 |
matches = compiled_pattern.findall(input_text)
|
@@ -31,20 +31,20 @@ def normalize_data(input):
|
|
31 |
print(f"Fehler bei der Verarbeitung von '{match}': {e}")
|
32 |
|
33 |
# Ersetze alle Vorkommen von '20.03. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
|
34 |
-
german_date_pattern = r"(?<!\.)(\d{2})\.(\d{2})
|
35 |
-
input_text = re.sub(german_date_pattern, r" \1.\2.\
|
36 |
|
37 |
# Ersetze alle Vorkommen von '20. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
|
38 |
-
german_date_pattern = r"(?<!\d)(\d{2})
|
39 |
-
input_text = re.sub(german_date_pattern, r" \1.\
|
40 |
|
41 |
# Ersetze alle Vorkommen von '20.03.2025 bis/bis zum 21.03.2025' durch '20.03.2025 - 21.03.2025'
|
42 |
-
german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})
|
43 |
-
input_text = re.sub(german_date_pattern, r" \1.\2.\3 - \
|
44 |
|
45 |
# Ersetze alle Vorkommen von '20.03.2025 und/& 21.03.2025' durch '20.03.2025 + 21.03.2025'
|
46 |
-
german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})
|
47 |
-
input_text = re.sub(german_date_pattern, r" \1.\2.\3 + \
|
48 |
return input_text
|
49 |
|
50 |
def normalize_times(input_text):
|
@@ -95,7 +95,8 @@ def normalize_data(input):
|
|
95 |
normalized_data = normalize_text(normalized_data)
|
96 |
return normalized_data
|
97 |
|
98 |
-
|
|
|
99 |
soup = BeautifulSoup(html, "lxml")
|
100 |
body_content = soup.body
|
101 |
|
@@ -117,31 +118,35 @@ def clean_html(html:str):
|
|
117 |
cleaned_html = "\n".join(clean_html_lines)
|
118 |
return cleaned_html
|
119 |
|
120 |
-
|
121 |
-
|
|
|
122 |
return soup.get_text(separator=' ', strip=True)
|
123 |
|
124 |
|
125 |
-
#
|
126 |
-
# Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03. bis 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.
|
127 |
-
# 15. November 2024 & 13. Dezember 2024: Kunstausstellung
|
128 |
-
# Der siebte Workshop Retrodigitalisierung findet am 20. und 21.03.2025 bei ZB MED
|
129 |
-
# 2. März bis 21. März 2025 \n
|
130 |
-
# **Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB
|
131 |
-
# ""
|
132 |
-
#
|
133 |
-
# expected = """
|
134 |
-
# Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03.2025 - 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.
|
135 |
-
# 15.11.2024 + 13.12.2024: Kunstausstellung "Der erweiterte Raum,"
|
136 |
-
# Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 + 21.03.2025 bei ZB MED –.
|
137 |
-
# 02.03.2025 - 21.03.2025 \n
|
138 |
-
# **Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB
|
139 |
-
# """
|
140 |
-
#
|
141 |
#
|
142 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
#
|
144 |
-
#
|
145 |
-
#
|
146 |
-
#
|
147 |
-
#
|
|
|
|
|
|
|
|
|
|
2 |
import re
|
3 |
from dateparser import DateDataParser
|
4 |
|
5 |
+
|
6 |
def normalize_data(input):
|
7 |
def normalize_dates(input_text):
|
8 |
days = r"(?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag|Mo|Di|Mi|Do|Fr|Sa|So)"
|
|
|
13 |
iso_pattern = r"(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})"
|
14 |
german_date_pattern = day_month_year_pattern + "|" + dd_mm_yyyy_pattern + "|" + iso_pattern
|
15 |
|
|
|
16 |
compiled_pattern = re.compile(german_date_pattern, re.VERBOSE)
|
17 |
|
18 |
matches = compiled_pattern.findall(input_text)
|
|
|
31 |
print(f"Fehler bei der Verarbeitung von '{match}': {e}")
|
32 |
|
33 |
# Ersetze alle Vorkommen von '20.03. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
|
34 |
+
german_date_pattern = r"(?<!\.)(\d{2})\.(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
|
35 |
+
input_text = re.sub(german_date_pattern, r" \1.\2.\7 \4 \5.\6.\7 ", input_text)
|
36 |
|
37 |
# Ersetze alle Vorkommen von '20. und 21.03.2025' durch '20.03.2025 und 21.03.2025'
|
38 |
+
german_date_pattern = r"(?<!\d)(\d{2})\.(\s*\d{2}:\d{2})?\s*(und|\+|&|bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
|
39 |
+
input_text = re.sub(german_date_pattern, r" \1.\5.\6 \2 \3 \4.\5.\6 \7 ", input_text)
|
40 |
|
41 |
# Ersetze alle Vorkommen von '20.03.2025 bis/bis zum 21.03.2025' durch '20.03.2025 - 21.03.2025'
|
42 |
+
german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?\s*(bis|bis zum|-|—|–)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
|
43 |
+
input_text = re.sub(german_date_pattern, r" \1.\2.\3 \4 - \6.\7.\8 \9 ", input_text)
|
44 |
|
45 |
# Ersetze alle Vorkommen von '20.03.2025 und/& 21.03.2025' durch '20.03.2025 + 21.03.2025'
|
46 |
+
german_date_pattern = r"(\d{1,2})\.(\d{1,2})\.(\d{4})\.?(\s*\d{2}:\d{2})?\s*(und|\+|&)\s*(\d{1,2})\.(\d{1,2})\.(\d{4})(\s*\d{2}:\d{2})?"
|
47 |
+
input_text = re.sub(german_date_pattern, r" \1.\2.\3 \4 + \6.\7.\8 \9 ", input_text)
|
48 |
return input_text
|
49 |
|
50 |
def normalize_times(input_text):
|
|
|
95 |
normalized_data = normalize_text(normalized_data)
|
96 |
return normalized_data
|
97 |
|
98 |
+
|
99 |
+
def clean_html(html: str):
|
100 |
soup = BeautifulSoup(html, "lxml")
|
101 |
body_content = soup.body
|
102 |
|
|
|
118 |
cleaned_html = "\n".join(clean_html_lines)
|
119 |
return cleaned_html
|
120 |
|
121 |
+
|
122 |
+
def strip_html_to_text(html: str):
|
123 |
+
soup = BeautifulSoup(html, "lxml")
|
124 |
return soup.get_text(separator=' ', strip=True)
|
125 |
|
126 |
|
127 |
+
# texts = [
|
128 |
+
# "Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03. bis 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.",
|
129 |
+
# "15. November 2024 & 13. Dezember 2024: Kunstausstellung 'Der erweiterte Raum'",
|
130 |
+
# "Der siebte Workshop Retrodigitalisierung findet am 20. und 21.03.2025 bei ZB MED.",
|
131 |
+
# "2. März bis 21. März 2025 \n",
|
132 |
+
# "**Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB",
|
133 |
+
# "22.04.25 15 Uhr bis 23.04.25 16 Uhr."
|
134 |
+
# ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
#
|
136 |
+
# expected_texts = [
|
137 |
+
# "Die 18. Koblenzer Literaturtage „ganzOhr“ finden vom 22.03.2025 - 05.04.2025 statt. Das Programm wird im Januar 2025 veröffentlicht, der Vorverkauf startet im Februar.",
|
138 |
+
# "15.11.2024 + 13.12.2024: Kunstausstellung 'Der erweiterte Raum'",
|
139 |
+
# "Der siebte Workshop Retrodigitalisierung findet am 20.03.2025 + 21.03.2025 bei ZB MED.",
|
140 |
+
# "02.03.2025 - 21.03.2025 \n",
|
141 |
+
# "**Wann?** 05.12.2024, 19:00-21:00 **Wo?** Lesesaal im Marstallgebäude, TIB",
|
142 |
+
# "22.04.2025 15:00 - 23.04.2025 16:00."
|
143 |
+
# ]
|
144 |
#
|
145 |
+
# for i, text in enumerate(texts):
|
146 |
+
# normalized = normalize_data(text)
|
147 |
+
# normalized = re.sub(r"\s+", " ", normalized)
|
148 |
+
# expected = re.sub(r"\s+", " ", expected_texts[i])
|
149 |
+
# if normalized == expected:
|
150 |
+
# print("Normalization successful!")
|
151 |
+
# else:
|
152 |
+
# print("Normalization failed!")
|