refactor testing
Browse files
- src/nlp/experimental/textclassification/classify_title.py +78 -96
- src/nlp/playground/pipelines/event_data_extractor.py +13 -12
- src/nlp/playground/pipelines/testing/event_data_extractor_testing.py +37 -18
- src/nlp/playground/pipelines/testing/location_extractor_testing.py +11 -0
- src/nlp/playground/pipelines/testing/results.csv +94 -45
- src/nlp/playground/pipelines/testing/results.txt +0 -0
- src/nlp/playground/pipelines/title_extractor.py +2 -1
- src/utils/Event.py +14 -23
src/nlp/experimental/textclassification/classify_title.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
-
import
|
2 |
-
|
|
|
3 |
|
4 |
train_data ={
|
5 |
"Veranstaltungstitel": [
|
@@ -338,104 +339,85 @@ train_data ={
|
|
338 |
]
|
339 |
}
|
340 |
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
# test_data = {"Veranstaltungstitel": [d for d in test_data["Veranstaltungstitel"] if d not in train_data["Veranstaltungstitel"]],
|
412 |
# "Abschnittstitel": [d for d in test_data["Abschnittstitel"] if d not in train_data["Abschnittstitel"]]}
|
413 |
# print(len(test_data["Veranstaltungstitel"]), " | " , len(test_data["Abschnittstitel"]))
|
414 |
#
|
415 |
-
# nlp = spacy.load("de_core_news_lg")
|
416 |
-
# nlp.add_pipe(
|
417 |
-
# "classy_classification",
|
418 |
-
# config={
|
419 |
-
# "data": train_data,
|
420 |
-
# "model": "spacy",
|
421 |
-
# }
|
422 |
-
# )
|
423 |
#
|
424 |
#
|
425 |
-
#
|
426 |
-
#
|
427 |
-
#
|
428 |
-
#
|
429 |
-
|
430 |
-
# print(f"{cats[0][0]}")
|
431 |
-
# print("*"*100)
|
432 |
-
#
|
433 |
-
# print("\n\n\n\n\n")
|
434 |
-
# for text in test_data["Abschnittstitel"]:
|
435 |
-
# print(text)
|
436 |
-
# print("*"*100)
|
437 |
-
# # print(nlp(text)._.cats)
|
438 |
-
# cats = sorted(nlp(text)._.cats.items(), key=lambda x: x[1], reverse=True)
|
439 |
-
# print(f"{cats[0][0]}")
|
440 |
-
# print("*"*100)
|
441 |
|
|
|
1 |
+
import pickle
|
2 |
+
|
3 |
+
from classy_classification import ClassyClassifier
|
4 |
|
5 |
train_data ={
|
6 |
"Veranstaltungstitel": [
|
|
|
339 |
]
|
340 |
}
|
341 |
|
342 |
+
test_data = {
|
343 |
+
"Veranstaltungstitel": [
|
344 |
+
"Feuer & Flamme – Eine Nacht voller Überraschungen",
|
345 |
+
"Jetzt oder nie!",
|
346 |
+
"Die vergessenen Töne",
|
347 |
+
"Warum nicht? Ein philosophischer Abend",
|
348 |
+
"Von Wundern und Wirklichkeit",
|
349 |
+
"Erzähl mir eine Geschichte...",
|
350 |
+
"Unter Strom – Eine Licht- und Soundinstallation",
|
351 |
+
"HORIZONTE ERWEITERN",
|
352 |
+
"Zuhören, Staunen, Lachen",
|
353 |
+
"Ein Blick genügt.",
|
354 |
+
"Schatten der Vergangenheit – Eine Spurensuche",
|
355 |
+
"ALLES MUSS RAUS!",
|
356 |
+
"Und plötzlich war alles anders.",
|
357 |
+
"DAS ENDE ODER EIN NEUER ANFANG?",
|
358 |
+
"Ein Moment für die Ewigkeit",
|
359 |
+
"Die Farbe der Nacht",
|
360 |
+
"Flüstern & Rauschen",
|
361 |
+
"GRENZENLOS",
|
362 |
+
"Was, wenn…?",
|
363 |
+
"Sonderführung: Das Unsichtbare sichtbar machen",
|
364 |
+
"Tanz der Elemente",
|
365 |
+
"Magie des Augenblicks",
|
366 |
+
"DIE ZEIT STEHT STILL",
|
367 |
+
"Bühne frei! – Eine Nacht voller Geschichten",
|
368 |
+
"Letzte Runde – Abschiedskonzert der Moonlight Band",
|
369 |
+
"Reise in eine andere Welt",
|
370 |
+
"Heute schon gestaunt?",
|
371 |
+
"Ungehörte Stimmen",
|
372 |
+
"Frei sein.",
|
373 |
+
"Zwischen den Zeilen lesen",
|
374 |
+
"Cro",
|
375 |
+
"Helene Fischer live"
|
376 |
+
],
|
377 |
+
"Abschnittstitel": [
|
378 |
+
"Hier gibt’s Infos",
|
379 |
+
"Wichtig!",
|
380 |
+
"So läuft’s ab",
|
381 |
+
"Wann & wo?",
|
382 |
+
"PLÄNE & ABLAUF",
|
383 |
+
"Hinkommen & mitmachen",
|
384 |
+
"Wie funktioniert das?",
|
385 |
+
"Ankommen leicht gemacht",
|
386 |
+
"Was kostet es?",
|
387 |
+
"Preise & Buchung",
|
388 |
+
"Sitzplätze & Reservierungen",
|
389 |
+
"Freier Eintritt oder nicht?",
|
390 |
+
"Anmeldung erforderlich?",
|
391 |
+
"Noch Fragen?",
|
392 |
+
"Regeln & Hinweise",
|
393 |
+
"Vorsicht & Sicherheit",
|
394 |
+
"Besucherinfos",
|
395 |
+
"Speis & Trank",
|
396 |
+
"Essen & Drinks",
|
397 |
+
"Party danach?",
|
398 |
+
"Meetup & Austausch",
|
399 |
+
"Soziale Medien & Hashtags",
|
400 |
+
"Helfer & Team",
|
401 |
+
"Mit wem arbeiten wir?",
|
402 |
+
"Unsere Partner",
|
403 |
+
"DIE WICHTIGSTEN INFOS",
|
404 |
+
"Presse & Berichte",
|
405 |
+
"Hier melden!",
|
406 |
+
"Kontakt & Support",
|
407 |
+
"KONTAKTIERE UNS",
|
408 |
+
"Was ist, wenn…?",
|
409 |
+
"Das sollten Sie wissen!"
|
410 |
+
]
|
411 |
+
}
|
412 |
# test_data = {"Veranstaltungstitel": [d for d in test_data["Veranstaltungstitel"] if d not in train_data["Veranstaltungstitel"]],
|
413 |
# "Abschnittstitel": [d for d in test_data["Abschnittstitel"] if d not in train_data["Abschnittstitel"]]}
|
414 |
# print(len(test_data["Veranstaltungstitel"]), " | " , len(test_data["Abschnittstitel"]))
|
415 |
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
#
|
417 |
#
|
418 |
+
# classifier = ClassyClassifier(data=train_data)
|
419 |
+
# classifier.set_embedding_model(model="stsb-xlm-r-multilingual")
|
420 |
+
# with open("title_classifier.pkl", "wb") as f:
|
421 |
+
# pickle.dump(classifier, f)
|
422 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
|
src/nlp/playground/pipelines/event_data_extractor.py
CHANGED
@@ -25,9 +25,9 @@ class EventDataExtractor:
|
|
25 |
event = Event()
|
26 |
event.title = self.extract_title(data)
|
27 |
event.categories = self.extract_categories(data)
|
28 |
-
event.locations = self.extract_locations(data)
|
29 |
-
event.organizers = self.extract_organizers(data)
|
30 |
event.address = self.extract_address(data)
|
|
|
|
|
31 |
event.schedule = self.extract_schedule(data)
|
32 |
event.description = self.extract_description(data, event.title)
|
33 |
event.prices = self.extract_prices(data)
|
@@ -71,17 +71,18 @@ class EventDataExtractor:
|
|
71 |
print(f"Extracted categories: {categories}")
|
72 |
return categories
|
73 |
|
74 |
-
def extract_locations(self, data):
|
|
|
75 |
print("Extracting locations...")
|
76 |
-
entities = self.gliner_handler.extract_entities(data, ["
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
return
|
81 |
|
82 |
def extract_organizers(self, data):
|
83 |
print("Extracting organizers...")
|
84 |
-
entities = self.gliner_handler.extract_entities(data, ["
|
85 |
organizers = list(set([item["text"] for item in entities if item["label"] == "EVENT_ORGANIZER"]))
|
86 |
|
87 |
print(f"Extracted organizers: {organizers}")
|
@@ -99,13 +100,13 @@ class EventDataExtractor:
|
|
99 |
|
100 |
def extract_prices(self, data):
|
101 |
print("Extracting prices...")
|
102 |
-
entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis
|
103 |
print(entities)
|
104 |
-
filtered_entities = [e["text"] for e in entities if e["text"] and re.search(r'\d', e["text"]) and e["score"]>=0.4]
|
105 |
|
106 |
prices = [re.findall(r'\d+(?:[.,]\d+)?', price) for price in filtered_entities]
|
107 |
|
108 |
-
prices = [p.replace(",", ".") + "€" for sublist in prices for p in sublist]
|
109 |
|
110 |
entrance_free_category = self.zero_shot_classifier.classify(data, CustomMode(
|
111 |
["Eintritt frei", "Ticket", "Preis"],
|
|
|
25 |
event = Event()
|
26 |
event.title = self.extract_title(data)
|
27 |
event.categories = self.extract_categories(data)
|
|
|
|
|
28 |
event.address = self.extract_address(data)
|
29 |
+
event.locations = self.extract_locations(data, event.address)
|
30 |
+
event.organizers = self.extract_organizers(data)
|
31 |
event.schedule = self.extract_schedule(data)
|
32 |
event.description = self.extract_description(data, event.title)
|
33 |
event.prices = self.extract_prices(data)
|
|
|
71 |
print(f"Extracted categories: {categories}")
|
72 |
return categories
|
73 |
|
74 |
+
def extract_locations(self, data, address=None):
    """Extract venue names ("Lokalität" entities) from the event text.

    Args:
        data: Text handed to the GLiNER handler for entity extraction.
        address: Already extracted address string, if any. Entity texts that
            occur as a substring of it are dropped. Optional, so callers
            that have no address can omit it.

    Returns:
        De-duplicated location texts in first-seen order; an empty list when
        the handler returns no entities.
    """
    address = address or ""
    print("Extracting locations...")
    # NOTE(review): "Adresse" is requested but never used below - presumably
    # it helps the model separate venue names from addresses; confirm.
    entities = self.gliner_handler.extract_entities(data, ["Lokalität", "Adresse"])
    print(entities)
    if not entities:
        return []
    locations = [
        entity["text"]
        for entity in entities
        if entity["label"] == "Lokalität"
        and entity["text"] != ""
        and entity["text"] not in address
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a set would not guarantee any order).
    return list(dict.fromkeys(locations))
|
82 |
|
83 |
def extract_organizers(self, data):
|
84 |
print("Extracting organizers...")
|
85 |
+
entities = self.gliner_handler.extract_entities(data, ["EVENT_ORGANIZER"])
|
86 |
organizers = list(set([item["text"] for item in entities if item["label"] == "EVENT_ORGANIZER"]))
|
87 |
|
88 |
print(f"Extracted organizers: {organizers}")
|
|
|
100 |
|
101 |
def extract_prices(self, data):
|
102 |
print("Extracting prices...")
|
103 |
+
entities = self.gliner_handler.extract_entities(data, ["Eintrittspreis"])
|
104 |
print(entities)
|
105 |
+
filtered_entities = [e["text"] for e in entities if e["text"] and re.search(r'\d\s*(€|EUR|eur|Eur|Euro|euro|euros|EURO)', e["text"]) and e["score"]>=0.4]
|
106 |
|
107 |
prices = [re.findall(r'\d+(?:[.,]\d+)?', price) for price in filtered_entities]
|
108 |
|
109 |
+
prices = [p.replace(",", ".") + " €" for sublist in prices for p in sublist]
|
110 |
|
111 |
entrance_free_category = self.zero_shot_classifier.classify(data, CustomMode(
|
112 |
["Eintritt frei", "Ticket", "Preis"],
|
src/nlp/playground/pipelines/testing/event_data_extractor_testing.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import csv
|
2 |
import gc
|
|
|
3 |
import time
|
4 |
|
5 |
import pandas as pd
|
@@ -22,22 +23,33 @@ def init_db_entries():
|
|
22 |
filtered_elements = []
|
23 |
for el in elements:
|
24 |
if all(f not in el.get("markdown", "") for f in filter_data):
|
25 |
-
|
|
|
|
|
26 |
print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
|
27 |
-
return filtered_elements
|
28 |
|
29 |
def event_similarity(actual, predicted):
|
30 |
# Liste der Attribute, die verglichen werden
|
31 |
attributes = {
|
32 |
"title": (actual.title, predicted.title),
|
33 |
"schedule": (actual.schedule, predicted.schedule),
|
34 |
-
"prices": (actual.prices, predicted.prices),
|
35 |
"address": (actual.address, predicted.address),
|
36 |
-
"organizers": (actual.organizers, predicted.organizers),
|
|
|
37 |
}
|
38 |
|
39 |
# Dictionary mit 1 für Übereinstimmung, 0 für Abweichung
|
40 |
-
match_results = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# Berechnung der Gesamtähnlichkeit
|
43 |
similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
|
@@ -59,7 +71,7 @@ count = 0
|
|
59 |
|
60 |
with open('results.csv', 'a', newline='') as csvfile:
|
61 |
writer = csv.writer(csvfile, delimiter=' ')
|
62 |
-
header = ["url", "title", "schedule", "prices", "address", "organizers", "extraction_time"]
|
63 |
writer.writerow(header)
|
64 |
|
65 |
for el in elements:
|
@@ -77,10 +89,15 @@ for el in elements:
|
|
77 |
actual_event.organizers = [org.strip() for org in el.get("information", {}).get("actual", {}).get("organizers", []) if
|
78 |
org.strip()]
|
79 |
actual_event.categories = el.get("information", {}).get("actual", {}).get("categories", [])
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
82 |
]
|
83 |
-
actual_event.prices =
|
84 |
address = el.get("information", {}).get("actual", {}).get("address")
|
85 |
if address:
|
86 |
address = address.get("formatted")
|
@@ -89,11 +106,12 @@ for el in elements:
|
|
89 |
|
90 |
|
91 |
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
92 |
-
|
93 |
Schedule(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
|
94 |
date.get("end_time", None), date.get("admittance_time", None))
|
95 |
for date in dates]
|
96 |
|
|
|
97 |
|
98 |
if not actual_event.schedule:
|
99 |
continue
|
@@ -114,7 +132,7 @@ for el in elements:
|
|
114 |
quotechar='|', quoting=csv.QUOTE_MINIMAL)
|
115 |
writer.writerow(
|
116 |
[actual_event.url, match_results["title"], match_results["schedule"], match_results["prices"],
|
117 |
-
match_results["address"], match_results["organizers"], duration])
|
118 |
|
119 |
# prediction_results.append({"similarity": similarity, "match_results": match_results})
|
120 |
print("************** ORIGINAL NORMALIZED *******************")
|
@@ -160,28 +178,29 @@ field_sums = {
|
|
160 |
"prices": df["prices"].sum(),
|
161 |
"address": df["address"].sum(),
|
162 |
"organizers": df["organizers"].sum(),
|
|
|
163 |
}
|
164 |
|
165 |
total_events = len(df) # Gesamtanzahl der Events
|
166 |
percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
|
167 |
|
168 |
-
# 📊 Graphen erstellen
|
169 |
plt.figure(figsize=(10, 6))
|
170 |
bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
|
171 |
|
172 |
-
# Prozentwerte
|
173 |
for bar, (key, percent) in zip(bars, percentages.items()):
|
174 |
-
plt.text(bar.get_x() + bar.get_width() / 2, -0.
|
175 |
ha="center", va="top", fontsize=10, color="black")
|
176 |
|
177 |
# 🏷️ Achsenbeschriftungen & Titel
|
178 |
plt.xlabel("Event Attribute")
|
179 |
plt.ylabel("Anzahl der Übereinstimmungen")
|
180 |
-
plt.title(
|
181 |
|
182 |
-
#
|
183 |
-
info_text = f"Getestete
|
184 |
-
plt.
|
|
|
185 |
|
186 |
plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
|
187 |
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
|
|
1 |
import csv
|
2 |
import gc
|
3 |
+
import re
|
4 |
import time
|
5 |
|
6 |
import pandas as pd
|
|
|
23 |
filtered_elements = []
|
24 |
for el in elements:
|
25 |
if all(f not in el.get("markdown", "") for f in filter_data):
|
26 |
+
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
27 |
+
if dates:
|
28 |
+
filtered_elements.append(el)
|
29 |
print(f"{len(filtered_elements)} Testdatensätze in der Datenbank")
|
30 |
+
return filtered_elements[52:]
|
31 |
|
32 |
def event_similarity(actual, predicted):
|
33 |
# Liste der Attribute, die verglichen werden
|
34 |
attributes = {
|
35 |
"title": (actual.title, predicted.title),
|
36 |
"schedule": (actual.schedule, predicted.schedule),
|
37 |
+
"prices": (sorted(actual.prices), sorted(predicted.prices)),
|
38 |
"address": (actual.address, predicted.address),
|
39 |
+
"organizers": (sorted(actual.organizers), sorted(predicted.organizers)),
|
40 |
+
"location": (sorted(actual.locations), sorted(predicted.locations)),
|
41 |
}
|
42 |
|
43 |
# Dictionary mit 1 für Übereinstimmung, 0 für Abweichung
|
44 |
+
match_results = {
|
45 |
+
"title": int(actual.title == predicted.title),
|
46 |
+
"schedule": int(actual.schedule == predicted.schedule),
|
47 |
+
"prices": len([price for price in predicted.prices if price in actual.prices]) / len(actual.prices) if actual.prices and predicted.prices else int(actual.prices == predicted.prices),
|
48 |
+
"address": int(actual.address == predicted.address),
|
49 |
+
"organizers": len([organizer for organizer in predicted.organizers if organizer in actual.organizers]) / len(actual.organizers) if actual.organizers and predicted.organizers else int(actual.organizers == predicted.organizers),
|
50 |
+
"location": len([location for location in predicted.locations if location in actual.locations]) / len(actual.locations) if actual.locations and predicted.locations else int(actual.locations == predicted.locations),
|
51 |
+
}
|
52 |
+
# match_results = {attr: int(act == pred) for attr, (act, pred) in attributes.items()}
|
53 |
|
54 |
# Berechnung der Gesamtähnlichkeit
|
55 |
similarity_percentage = (sum(match_results.values()) / len(attributes)) * 100
|
|
|
71 |
|
72 |
with open('results.csv', 'a', newline='') as csvfile:
|
73 |
writer = csv.writer(csvfile, delimiter=' ')
|
74 |
+
header = ["url", "title", "schedule", "prices", "address", "organizers", "location", "extraction_time"]
|
75 |
writer.writerow(header)
|
76 |
|
77 |
for el in elements:
|
|
|
89 |
actual_event.organizers = [org.strip() for org in el.get("information", {}).get("actual", {}).get("organizers", []) if
|
90 |
org.strip()]
|
91 |
actual_event.categories = el.get("information", {}).get("actual", {}).get("categories", [])
|
92 |
+
location = el.get("information", {}).get("actual", {}).get("location", "")
|
93 |
+
actual_event.locations = [location.strip()] if location else []
|
94 |
+
prices = el.get("information", {}).get("actual", {}).get("prices", [])
|
95 |
+
formatted_prices = [
|
96 |
+
"kostenlos" if "kostenlos" in price.lower() or "frei" in price.lower()
|
97 |
+
else f"{p.replace(',', '.')} €"
|
98 |
+
for price in prices for p in re.findall(r'\d+(?:[.,]\d+)?', price)
|
99 |
]
|
100 |
+
actual_event.prices = formatted_prices
|
101 |
address = el.get("information", {}).get("actual", {}).get("address")
|
102 |
if address:
|
103 |
address = address.get("formatted")
|
|
|
106 |
|
107 |
|
108 |
dates = el.get("information", {}).get("actual", {}).get("dates", [])
|
109 |
+
schedules = [
|
110 |
Schedule(date.get("start_date", None), date.get("end_date", None), date.get("start_time", None),
|
111 |
date.get("end_time", None), date.get("admittance_time", None))
|
112 |
for date in dates]
|
113 |
|
114 |
+
actual_event.schedule = [schedule for schedule in schedules if len(schedule) > 0]
|
115 |
|
116 |
if not actual_event.schedule:
|
117 |
continue
|
|
|
132 |
quotechar='|', quoting=csv.QUOTE_MINIMAL)
|
133 |
writer.writerow(
|
134 |
[actual_event.url, match_results["title"], match_results["schedule"], match_results["prices"],
|
135 |
+
match_results["address"], match_results["organizers"], match_results["location"], duration])
|
136 |
|
137 |
# prediction_results.append({"similarity": similarity, "match_results": match_results})
|
138 |
print("************** ORIGINAL NORMALIZED *******************")
|
|
|
178 |
"prices": df["prices"].sum(),
|
179 |
"address": df["address"].sum(),
|
180 |
"organizers": df["organizers"].sum(),
|
181 |
+
"location":df["location"].sum(),
|
182 |
}
|
183 |
|
184 |
total_events = len(df) # Gesamtanzahl der Events
|
185 |
percentages = {key: (value / total_events) * 100 for key, value in field_sums.items()} # Berechne Prozentwerte
|
186 |
|
|
|
187 |
plt.figure(figsize=(10, 6))
|
188 |
bars = plt.bar(field_sums.keys(), field_sums.values(), color=["blue", "orange", "green", "red", "purple"])
|
189 |
|
190 |
+
# 📊 Place each percentage label just below the top edge of its bar
|
191 |
for bar, (key, percent) in zip(bars, percentages.items()):
|
192 |
+
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - 0.2, f"{percent:.1f}%",
|
193 |
ha="center", va="top", fontsize=10, color="black")
|
194 |
|
195 |
# 🏷️ Achsenbeschriftungen & Titel
|
196 |
plt.xlabel("Event Attribute")
|
197 |
plt.ylabel("Anzahl der Übereinstimmungen")
|
198 |
+
plt.title("Summierte Übereinstimmungen pro Event-Attribut")
|
199 |
|
200 |
+
# 📌 Info-Box OBEN LINKS im Graphen platzieren
|
201 |
+
info_text = f"Getestete Daten: {total_events}\nDurchschnittliche Verarbeitungszeit: {float(df['extraction_time'].sum()) / total_events:.2f}s"
|
202 |
+
plt.annotate(info_text, xy=(0.02, 0.85), xycoords="axes fraction", fontsize=12, ha="left",
|
203 |
+
bbox=dict(facecolor="white", alpha=0.8))
|
204 |
|
205 |
plt.ylim(0, total_events * 1.2) # Maximale Höhe etwas erhöhen für bessere Lesbarkeit
|
206 |
plt.grid(axis="y", linestyle="--", alpha=0.7)
|
src/nlp/playground/pipelines/testing/location_extractor_testing.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.nlp.playground.pipelines.event_data_extractor import EventDataExtractor
|
2 |
+
from src.resources.TEXTS import TEXTS
|
3 |
+
|
4 |
+
event_extractor = EventDataExtractor()
|
5 |
+
|
6 |
+
# Smoke-test loop: run location extraction over every sample text and print
# the result for manual inspection.
for text in TEXTS:
    print(text)
    # extract_locations(data, address) takes the extracted address as its
    # second argument. None is passed because no address is available for
    # these raw texts; the extractor treats a falsy address as "".
    location = event_extractor.extract_locations(text, None)
    print("*"*50," LOCATION ", "*"*50)
    print(location)
    print("*"*100)
|
src/nlp/playground/pipelines/testing/results.csv
CHANGED
@@ -1,45 +1,94 @@
|
|
1 |
-
url title schedule prices address organizers extraction_time
|
2 |
-
https://www.tib.eu/de/die-tib/neuigkeiten-und-termine/termine/detail/technik-salon-an-der-tib-fly-rocket-fly-am-5-dezember-2024-termin 1 0 1
|
3 |
-
https://www.tib.eu/de/die-tib/neuigkeiten-und-termine/termine/detail/7-workshop-retrodigitalisierung 1 0 1 0 0
|
4 |
-
https://www.tib.eu/de/die-tib/neuigkeiten-und-termine/aktuelles/detail/acm-wsdm-2025-renommierte-konferenz-zu-websuche-und-data-mining-in-hannover 1 1 1 1 1
|
5 |
-
https://www.eventbrite.de/e/infoveranstaltung-fur-geistliche-mutter-und-vater-tickets-1054784050489 1 1 1 1 0
|
6 |
-
http://www.cz-darmstadt.de/heiligabend 1 1 1 0 1
|
7 |
-
https://www.hamburg.de/politik-und-verwaltung/bezirke/altona/aktuelles/veranstaltungen/oeffentliche-besichtigung-notstandort-taskoepruestrasse-991296 1
|
8 |
-
https://www.hamburg.de/kultur/ausstellung/immersiv/leonardo-da-vinci-uomo-universale-960112 1 0 1 1 1
|
9 |
-
https://www.hamburg.de/kultur/musical-show/mj-das-michael-jackson-musical-401234 1 1 0 1 1
|
10 |
-
https://www.hannover.de/Museum-August-Kestner/Veranstaltungen/Veranstaltungskalender/Stadtansichten 1
|
11 |
-
https://www.bottrop.de/veranstaltungskalender/veranstaltungen/nikolausmarkt.php 1 0
|
12 |
-
https://schauspiel-erlangen.de/spielzeiten/2024-25/verleihung-erlanger-theaterpreis 1 0
|
13 |
-
http://foodklub.de/events/silvester-2024 1 0 0 1 0
|
14 |
-
https://www.rpi-heilbronn.de/veranstaltungen/zwischendrin.html 0 0 1 1 1 23.
|
15 |
-
https://sda.drs.de/grund-haupt-werkreal-real-gemeinschaftsschulen-u-sbbz/heilbronn/aktuelles.html
|
16 |
-
https://www.grimmwelt.de/de/kalender/fuehrung-fuer-personen-mit-demenz 1 0 0 0
|
17 |
-
https://www.grimmwelt.de/de/kalender/dornroeschen-und-frau-holle 0
|
18 |
-
https://www.grimmwelt.de/de/kalender/die-grimmwelt-von-a-z-3 1 0 0 0
|
19 |
-
https://www.grimmwelt.de/de/kalender/von-der-maerchensammlung-zum-woerterbuch 0 0 0 0
|
20 |
-
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-16/mo-torres-uebertrieben-unplugged-tour-2024 1
|
21 |
-
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-29/die-schoene-und-das-biest-das-musical 1
|
22 |
-
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-11/groundstar-user-conference-2024 1 0 1 1 0
|
23 |
-
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-15/weihnachtskonzert-staedtische-musikdirektion 1
|
24 |
-
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-09/last-christmas-miracle 1 0 1 1 0
|
25 |
-
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-20/hoehner-weihnacht 1 0 1 1 0
|
26 |
-
https://www.alivechurch.de/weihnachten/ 1
|
27 |
-
https://www.alivechurch.de/alphakurs 1 0 1 0 1
|
28 |
-
https://www.emk-karlsruhe.de/nachbarschaftsfest-am-6-7-juli-in-durlach-aue/ 0 0 1 0 0
|
29 |
-
https://www.emk-karlsruhe.de/einfuehrung-des-neuen-leiters-der-jugendkirche-am-1-12/ 1 0 1 0
|
30 |
-
https://www.emk-karlsruhe.de/offene-kirche-engel/ 0 0 1 0
|
31 |
-
https://www.emk-karlsruhe.de/adventsmarkt-in-der-kapelle-am-30-11/ 0
|
32 |
-
https://theater-koblenz.de/programm/ganzohr-literaturfestival/ 1 0 1 0
|
33 |
-
https://www.visit-lahnstein.de/theater/programm/ 0 0 1 0 1
|
34 |
-
https://www.museum-am-schoelerberg.de/buchungsangebote/captain-schnuppes-weltraumreise-2/20278 1
|
35 |
-
https://www.museum-am-schoelerberg.de/buchungsangebote/das-astronomiejahr-2025/20416 1 1
|
36 |
-
https://www.museum-am-schoelerberg.de/buchungsangebote/die-olchis-das-grosse-weltraumabenteuer/20743 1 0 0 0
|
37 |
-
https://www.museum-am-schoelerberg.de/buchungsangebote/wer-rettet-den-weihnachtsmann/ 1 0 0 0 1
|
38 |
-
https://www.museum-am-schoelerberg.de/buchungsangebote/queen-heaven/ 1 0 0 0
|
39 |
-
https://ev-stjohann.de/events/eschberg-lebendiger-adventskalender-am-3-und-4-advent/ 1 0 1 0 0
|
40 |
-
https://ev-stjohann.de/events/oek-weihnachsvesper/ 1
|
41 |
-
https://ev-stjohann.de/events/kindergottesdienst-70/ 1
|
42 |
-
https://ev-stjohann.de/events/kindergottesdienst-69/ 1
|
43 |
-
https://ev-stjohann.de/events/johanneskirche-das-licht-scheint-in-der-finsternis-gottesdienst-mit-dance-performance/ 1
|
44 |
-
https://ahmadiyya.de/events/islamausstellung-in-hemer-2/ 1 0 1 0 0
|
45 |
-
https://halo.club/event/why-so-serious-13-12/ 1 0 0 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
url title schedule prices address organizers location extraction_time
|
2 |
+
https://www.tib.eu/de/die-tib/neuigkeiten-und-termine/termine/detail/technik-salon-an-der-tib-fly-rocket-fly-am-5-dezember-2024-termin 1 1 0 1 0.0 0.0 43.717700242996216
|
3 |
+
https://www.tib.eu/de/die-tib/neuigkeiten-und-termine/termine/detail/7-workshop-retrodigitalisierung 1 0 1 1 0.75 0 49.21727967262268
|
4 |
+
https://www.tib.eu/de/die-tib/neuigkeiten-und-termine/aktuelles/detail/acm-wsdm-2025-renommierte-konferenz-zu-websuche-und-data-mining-in-hannover 1 1 1 1 0 1 34.70475196838379
|
5 |
+
https://www.eventbrite.de/e/infoveranstaltung-fur-geistliche-mutter-und-vater-tickets-1054784050489 1 1 1 1 1.0 1 37.44318437576294
|
6 |
+
http://www.cz-darmstadt.de/heiligabend 1 1 1 1 0 1 20.86623740196228
|
7 |
+
https://www.hamburg.de/politik-und-verwaltung/bezirke/altona/aktuelles/veranstaltungen/oeffentliche-besichtigung-notstandort-taskoepruestrasse-991296 1 1 1 0 1.0 0.0 29.26156735420227
|
8 |
+
https://www.hamburg.de/kultur/ausstellung/immersiv/leonardo-da-vinci-uomo-universale-960112 1 0 1 1 1 1 73.98112225532532
|
9 |
+
https://www.hamburg.de/kultur/musical-show/mj-das-michael-jackson-musical-401234 1 1 1.0 1 0 1 58.78098440170288
|
10 |
+
https://www.hannover.de/Museum-August-Kestner/Veranstaltungen/Veranstaltungskalender/Stadtansichten 1 1 1 1 0 0.0 16.56206512451172
|
11 |
+
https://www.bottrop.de/veranstaltungskalender/veranstaltungen/nikolausmarkt.php 1 0 0 0 0 1.0 80.8017590045929
|
12 |
+
https://schauspiel-erlangen.de/spielzeiten/2024-25/verleihung-erlanger-theaterpreis 1 0 0 1 0.0 1.0 13.861144542694092
|
13 |
+
http://foodklub.de/events/silvester-2024 1 0 1.0 1 1.0 0 20.870990991592407
|
14 |
+
https://www.rpi-heilbronn.de/veranstaltungen/zwischendrin.html 0 0 1 1 0 1.0 23.867390632629395
|
15 |
+
https://sda.drs.de/grund-haupt-werkreal-real-gemeinschaftsschulen-u-sbbz/heilbronn/aktuelles.html 1 1 1 1 0 0.0 49.501890659332275
|
16 |
+
https://www.grimmwelt.de/de/kalender/fuehrung-fuer-personen-mit-demenz 1 0 1.0 1 0 0.0 35.93979334831238
|
17 |
+
https://www.grimmwelt.de/de/kalender/dornroeschen-und-frau-holle 0 1 1.0 1 0 1.0 81.40064716339111
|
18 |
+
https://www.grimmwelt.de/de/kalender/die-grimmwelt-von-a-z-3 1 0 1.0 0 0 0.0 34.69148111343384
|
19 |
+
https://www.grimmwelt.de/de/kalender/von-der-maerchensammlung-zum-woerterbuch 0 0 1.0 1 0 0.0 48.23841381072998
|
20 |
+
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-16/mo-torres-uebertrieben-unplugged-tour-2024 1 1 1 0 1.0 1.0 39.58735251426697
|
21 |
+
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-29/die-schoene-und-das-biest-das-musical 1 1 1 1 1.0 1.0 33.0947539806366
|
22 |
+
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-11/groundstar-user-conference-2024 1 0 1 1 1.0 1.0 20.838589191436768
|
23 |
+
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-15/weihnachtskonzert-staedtische-musikdirektion 1 1 1 1 0.0 1.0 43.034565925598145
|
24 |
+
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-09/last-christmas-miracle 1 0 1 1 1.0 1.0 27.90658688545227
|
25 |
+
http://www.eurogress-aachen.de/veranstaltungskalender/2024-12-20/hoehner-weihnacht 1 0 1 1 1.0 1.0 39.0615758895874
|
26 |
+
https://www.alivechurch.de/weihnachten/ 1 1 0 1 0 1.0 53.24011707305908
|
27 |
+
https://www.alivechurch.de/alphakurs 1 0 0 1 0 1 60.60767650604248
|
28 |
+
https://www.emk-karlsruhe.de/nachbarschaftsfest-am-6-7-juli-in-durlach-aue/ 0 0 1 0 0.0 1.0 58.35703468322754
|
29 |
+
https://www.emk-karlsruhe.de/einfuehrung-des-neuen-leiters-der-jugendkirche-am-1-12/ 1 0 1 1 0 0 21.811877727508545
|
30 |
+
https://www.emk-karlsruhe.de/offene-kirche-engel/ 0 0 1 0 0 0.0 14.054759502410889
|
31 |
+
https://www.emk-karlsruhe.de/adventsmarkt-in-der-kapelle-am-30-11/ 0 1 1 1 1 1.0 16.296363353729248
|
32 |
+
https://theater-koblenz.de/programm/ganzohr-literaturfestival/ 1 0 1 0 0 0 22.599891901016235
|
33 |
+
https://www.visit-lahnstein.de/theater/programm/ 0 0 1 0 0 1.0 73.70467042922974
|
34 |
+
https://www.museum-am-schoelerberg.de/buchungsangebote/captain-schnuppes-weltraumreise-2/20278 1 1 1.0 1 0 1 57.995845794677734
|
35 |
+
https://www.museum-am-schoelerberg.de/buchungsangebote/das-astronomiejahr-2025/20416 1 0 1.0 0 1 1 35.048099994659424
|
36 |
+
https://www.museum-am-schoelerberg.de/buchungsangebote/die-olchis-das-grosse-weltraumabenteuer/20743 1 0 1.0 1 0 0 49.85187029838562
|
37 |
+
https://www.museum-am-schoelerberg.de/buchungsangebote/wer-rettet-den-weihnachtsmann/ 1 0 1.0 0 1 1 35.63845634460449
|
38 |
+
https://www.museum-am-schoelerberg.de/buchungsangebote/queen-heaven/ 1 0 1.0 1 0 0 36.98330879211426
|
39 |
+
https://ev-stjohann.de/events/eschberg-lebendiger-adventskalender-am-3-und-4-advent/ 1 0 1 0 0.0 0 17.110201120376587
|
40 |
+
https://ev-stjohann.de/events/oek-weihnachsvesper/ 1 1 1 1 1.0 0 12.05985713005066
|
41 |
+
https://ev-stjohann.de/events/kindergottesdienst-70/ 1 1 1 1 0 1.0 11.393477201461792
|
42 |
+
https://ev-stjohann.de/events/kindergottesdienst-69/ 1 1 1 1 0 1.0 10.89378833770752
|
43 |
+
https://ev-stjohann.de/events/johanneskirche-das-licht-scheint-in-der-finsternis-gottesdienst-mit-dance-performance/ 1 1 1 1 0 1.0 38.91781949996948
|
44 |
+
https://ahmadiyya.de/events/islamausstellung-in-hemer-2/ 1 0 1 0 0.0 1.0 107.58875632286072
|
45 |
+
https://halo.club/event/why-so-serious-13-12/ 1 0 1.0 0 0 0.0 182.2700481414795
|
46 |
+
https://halo.club/event/pretty-nice-vibe-21-12/ 1 0 0.0 0 0 1.0 59.138978242874146
|
47 |
+
https://halo.club/event/blackout-thursdays-19-12/ 1 0 1.0 1 0 0 136.42740631103516
|
48 |
+
https://halo.club/event/why-so-serious-27-12/ 1 0 1.0 0 0 0.0 181.50714254379272
|
49 |
+
https://halo.club/event/pretty-nice-vibe-28-12/ 1 0 1.0 0 0 1.0 158.82524609565735
|
50 |
+
https://halo.club/event/blackout-thursdays-26-12/ 1 0 1.0 1 0 0 130.22622179985046
|
51 |
+
https://halo.club/event/blackout-thursdays-12-12/ 1 0 1.0 1 0 0.0 85.08140540122986
|
52 |
+
https://halo.club/event/why-so-serious-20-12/ 0 0 1.0 0 0 0.0 99.1928482055664
|
53 |
+
http://www.ga-ga.de/events/18.01.2025-eatdancelove1801-10281 0 0 1.0 1 1 0 123.07266283035278
|
54 |
+
http://www.blankenesekiezinternat.de/event/%e0%a5%90-bki-bescherungs-stampf-%e0%a5%90/ 1 0 1 1 0 0 27.504791021347046
|
55 |
+
http://www.blankenesekiezinternat.de/event/%e0%a5%90-trip-to-galactica-%e0%a5%90-24/ 1 1 1 1 0 0 26.146658658981323
|
56 |
+
http://www.blankenesekiezinternat.de/event/%e0%a5%90-roads-of-proggy-%e0%a5%90-25/ 1 1 1 0 0 0 25.835285186767578
|
57 |
+
http://www.blankenesekiezinternat.de/event/%e0%a5%90-bkistampf-thursday-%e0%a5%90-17/ 1 1 1 0 0 0 32.729358196258545
|
58 |
+
http://www.blankenesekiezinternat.de/event/%e0%a5%90-silvester-stampf-%e0%a5%90-2/ 1 0 1 0 0 0 25.82702112197876
|
59 |
+
https://www.my-private.club/h1club-hamburg/events/wild-wednesday-_2024-12-18_10242 1 1 0 0 1 1.0 18.63705825805664
|
60 |
+
https://www.my-private.club/h1club-hamburg/events/sweet-dreams_2024-12-21_10237 1 1 1 0 0 0.0 15.092102527618408
|
61 |
+
https://www.my-private.club/h1club-hamburg/events/bad-santa_2024-12-25_10243 1 1 0 0 0 0.0 17.851985216140747
|
62 |
+
https://www.nomadenland.de/veranstaltung/ziegenwanderung/ 0 0 0 0 0.0 0 59.95140218734741
|
63 |
+
http://www.wuhlheide.de/programm/berliner-rundfunk-open-air-2025/2025-07-05 1 0 1.0 1 0 1 15.058967590332031
|
64 |
+
http://www.wuhlheide.de/programm/cro/2025-08-09 0 0 1.0 1 0.0 1 13.188536882400513
|
65 |
+
http://www.wuhlheide.de/programm/k-i-z/2025-08-23 1 0 1 1 0.0 1.0 12.820086240768433
|
66 |
+
http://www.wuhlheide.de/programm/tokio-hotel/2025-08-15 1 0 1.0 1 0.0 0 14.821857690811157
|
67 |
+
http://www.wuhlheide.de/programm/25-jahre-sido/2025-07-12 1 0 1.0 1 0.0 1 15.217310428619385
|
68 |
+
https://gestoert-aber-geil.online-ticket.de/gestoert-aber-geil-das-festival-tickets-2025 0 1 1 1 1.0 1.0 23.675548553466797
|
69 |
+
http://www.wuhlheide.de/programm/feine-sahne-fischfilet/2025-07-19 1 0 1 1 0.0 1 15.708831548690796
|
70 |
+
https://www.koeln.de/event/hafen-weihnachtsmarkt/2024-12-13/ 1 1 1 0 0 0 21.582816123962402
|
71 |
+
https://www.koeln.de/event/flohmarkt-an-der-galopprennbahn-2/2024-12-13/ 1 0 1 1 1 0 23.040252685546875
|
72 |
+
https://www.augsburg.de/detail-kalender-2/fur-selbsthilfeaktive/event-5436-5436-20250115 1 0 1 0 0 1 23.105677843093872
|
73 |
+
https://www.augsburg.de/aktuelles-aus-der-stadt/detail/eine-ganze-nacht-lang-demokratie-und-was-uns-zusammenhaelt 1 0 1 0 0 0 89.41105699539185
|
74 |
+
https://www.mainz.de/freizeit-und-sport/feste-und-veranstaltungen/oktoberfest.php 1 1 1 0 0 0 28.638829469680786
|
75 |
+
https://www.mainz.de/leben-und-arbeit/migration-und-integration/interkulturelle-woche.php 1 0 1 0 0 1 56.86952352523804
|
76 |
+
https://www.mainz.de/freizeit-und-sport/feste-und-veranstaltungen/oktoberfest.php 1 1 1 0 0 0 27.58851170539856
|
77 |
+
https://www.mainz.de/microsite/wissenimherzen/wissenschafts-events/wissenschaftsmarkt.php 1 0 1 1 0 0 20.361992597579956
|
78 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/vst/2025/145265.html 1 0 1 0 1.0 1.0 21.989774465560913
|
79 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/vst/2019/jl_142683.html 1 0 1 0 0.0 1.0 56.0450382232666
|
80 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/vst/2022/143177.html 1 0 0 1 1.0 0.0 94.01974368095398
|
81 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/ast/2024/km_149460.html 1 0 1 1 1.0 0.0 89.23220944404602
|
82 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/vst/2024/ts_149122.html 0 1 0 0 0.5 0 54.83802843093872
|
83 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/maerkte/117680.html 1 1 1 0 1.0 1.0 21.97410273551941
|
84 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/vst/2024/km_149250.html 1 1 1 0 1.0 0.0 36.96526265144348
|
85 |
+
https://www.erfurt.de/ef/de/erleben/veranstaltungen/vst/2025/km_149497.html 1 0 1 0 1.0 0.0 40.81477332115173
|
86 |
+
https://lust-auf-leverkusen.de/veranstaltung/hoehner-2/ 1 1 0.5 0 0.0 0 74.72248268127441
|
87 |
+
https://lust-auf-leverkusen.de/veranstaltung/christmas-in-vienna-2024-live-aus-wien-auf-der-kinoleinwand/ 0 1 1 0 1.0 1.0 69.38979887962341
|
88 |
+
https://www.wuerzburg.de/events-termine/kilianivolksfest 0 0 1 0 1 0 12.064836263656616
|
89 |
+
https://www.wuerzburg.de/themen/kultur-bildung-kulturangebot/stadtbuecherei/veranstaltungen/buecherflohmarkt 1 0 1 1 0 1.0 27.614226579666138
|
90 |
+
https://www.wuerzburg.de/themen/kultur-bildung-kulturangebot/stadtbuecherei/veranstaltungen/schreibwerkstatt 1 0 1 1 0 0.0 100.45706009864807
|
91 |
+
http://www.liederhalle-stuttgart.de/events/dudes-live-tour-2025/ 0 0 1 1 1.0 1.0 23.20697021484375
|
92 |
+
http://www.liederhalle-stuttgart.de/events/blumengarten-ich-liebe-dich-fuer-immer-tour-2025/ 0 0 1 1 1.0 1.0 28.90886640548706
|
93 |
+
https://www.easyticket.de/veranstaltung/manfred-mann-s-earth-band/99316/ 1 1 1.0 0 0 0 55.74727940559387
|
94 |
+
https://www.easyticket.de/event-date-redirect/99089 1 1 1.0 1 0 0.0 41.58063340187073
|
src/nlp/playground/pipelines/testing/results.txt
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src/nlp/playground/pipelines/title_extractor.py
CHANGED
@@ -54,12 +54,13 @@ class TitleExtractor:
|
|
54 |
print("lowest Level", lowest_level)
|
55 |
return lowest_level["text"]
|
56 |
|
|
|
57 |
def extract_title_classy_classification(self,event_text):
|
58 |
analyzer = MarkdownAnalyzer(event_text)
|
59 |
identified_headers = analyzer.identify_headers()
|
60 |
|
61 |
classifier = joblib.load(
|
62 |
-
hf_hub_download(repo_id="
|
63 |
)
|
64 |
|
65 |
headers = identified_headers["Header"] if identified_headers else analyzer.identify_emphasis()
|
|
|
54 |
print("lowest Level", lowest_level)
|
55 |
return lowest_level["text"]
|
56 |
|
57 |
+
|
58 |
def extract_title_classy_classification(self,event_text):
|
59 |
analyzer = MarkdownAnalyzer(event_text)
|
60 |
identified_headers = analyzer.identify_headers()
|
61 |
|
62 |
classifier = joblib.load(
|
63 |
+
hf_hub_download(repo_id="adojode/title_classifier", filename="title_classifier" + ".pkl")
|
64 |
)
|
65 |
|
66 |
headers = identified_headers["Header"] if identified_headers else analyzer.identify_emphasis()
|
src/utils/Event.py
CHANGED
@@ -45,29 +45,20 @@ class Schedule:
|
|
45 |
def __len__(self):
|
46 |
return len([element for element in [self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time] if element])
|
47 |
|
48 |
-
def __contains__(self,item):
|
49 |
-
if not isinstance(item, Schedule):
|
50 |
-
return False
|
51 |
-
|
52 |
-
print("
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
# class Address:
|
63 |
-
# def __init__(self, street, house_number, postal_code, city):
|
64 |
-
# self.street = street
|
65 |
-
# self.house_number = house_number
|
66 |
-
# self.postal_code = postal_code
|
67 |
-
# self.city = city
|
68 |
-
#
|
69 |
-
# def __str__(self):
|
70 |
-
# return f"🏠 {self.street if self.street else ''} {self.house_number if self.house_number else ''}, {self.postal_code if self.postal_code else ''} {self.city if self.city else ''}"
|
71 |
|
72 |
|
73 |
class Event:
|
|
|
45 |
def __len__(self):
|
46 |
return len([element for element in [self.start_date, self.end_date, self.start_time, self.end_time, self.admittance_time] if element])
|
47 |
|
48 |
+
def __contains__(self, item):
|
49 |
+
if not isinstance(item, Schedule):
|
50 |
+
return False # Falls item kein Schedule-Objekt ist, direkt False
|
51 |
+
|
52 |
+
print(f"SELF: {self}")
|
53 |
+
print(f"ITEM: {item}")
|
54 |
+
|
55 |
+
return all([
|
56 |
+
item.start_date in {self.start_date, self.end_date, None},
|
57 |
+
item.end_date in {self.end_date, self.start_date, None},
|
58 |
+
item.start_time in {self.start_time, self.end_time, None},
|
59 |
+
item.end_time in {self.end_time, self.start_time, None},
|
60 |
+
item.admittance_time in {self.admittance_time, None}
|
61 |
+
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
|
64 |
class Event:
|