Spaces:
Running
on
Zero
Verbesserungen:
Browse files
1. Erweiterte Entity-Patterns:
- keyboard und monitor sind jetzt explizit in der Liste in app.py:57
- Zusätzliche Kategorien: Technical objects, spatial objects, office devices
- General noun patterns für längere Wörter
2. Verbesserte Extraktion:
- Präpositions-Patterns erfassen jetzt "next to the computer", "between the keyboard"
- Bessere Behandlung von Artikel-Konstruktionen
- Erhöhtes Limit auf 12 Entitäten
3. Robustere Filterung:
- Erweiterte Stop-Words Liste
- Sortierung nach Wortlänge (längere Wörter zuerst)
Mit diesen Änderungen sollten in Ihrem Testsatz "The ball lies left of the table next to the computer,
while the book sits between the keyboard and the monitor" jetzt alle 6 Entitäten erkannt werden: ball,
table, computer, book, keyboard, monitor.
Die Patterns decken jetzt viel mehr Anwendungsfälle ab - von technischen Objekten bis zu
alltäglichen Gegenständen.
@@ -51,10 +51,20 @@ class RealGASMInterface:
|
|
51 |
|
52 |
# Entity and relation patterns for text processing
|
53 |
self.entity_patterns = [
|
54 |
-
|
55 |
-
r'\b(
|
56 |
-
|
57 |
-
r'\b(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
]
|
59 |
|
60 |
self.spatial_relations = {
|
@@ -74,26 +84,48 @@ class RealGASMInterface:
|
|
74 |
}
|
75 |
|
76 |
def extract_entities_from_text(self, text: str) -> List[str]:
|
77 |
-
"""Extract entities from text using
|
78 |
import re
|
79 |
entities = []
|
80 |
|
81 |
-
# Extract meaningful words (nouns, objects, concepts)
|
82 |
-
words = text.lower().split()
|
83 |
-
|
84 |
# Simple entity extraction based on patterns
|
85 |
for pattern in self.entity_patterns:
|
86 |
matches = re.findall(pattern, text.lower())
|
87 |
-
if
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
entities = list(set([e for e in entities if e not in stop_words and len(e) > 2]))
|
95 |
|
96 |
-
|
|
|
|
|
|
|
97 |
|
98 |
def extract_relations_from_text(self, text: str) -> List[Dict]:
|
99 |
"""Extract relations from text"""
|
|
|
51 |
|
52 |
# Entity and relation patterns for text processing
|
53 |
self.entity_patterns = [
|
54 |
+
# Technical/scientific objects
|
55 |
+
r'\b(robot\w*|arm\w*|satellite\w*|crystal\w*|molecule\w*|atom\w*|electron\w*|detector\w*|sensor\w*|motor\w*|beam\w*|component\w*|platform\w*|axis\w*|field\w*|system\w*|reactor\w*|coolant\w*|turbine\w*)\b',
|
56 |
+
# Office/household devices (extended)
|
57 |
+
r'\b(ball|table|chair|book|computer|keyboard|monitor|screen|mouse|laptop|desk|lamp|vase|shelf|tv|sofa|phone|tablet|printer|scanner|camera|speaker)\b',
|
58 |
+
# Spatial objects
|
59 |
+
r'\b(room|door|window|wall|floor|ceiling|corner|center|side|edge|surface|space|area|zone|place|location|position|spot)\b',
|
60 |
+
# Abstract concepts
|
61 |
+
r'\b(gedanken|vertrauen|zweifel|hoffnung|verzweiflung|idee|konzept|theorie|prinzip|regel|methode|prozess|ablauf)\b',
|
62 |
+
# German article constructions (to capture more nouns)
|
63 |
+
r'\b(der|die|das)\s+([a-zA-Z]+)\b',
|
64 |
+
# English constructions (the + noun)
|
65 |
+
r'\bthe\s+([a-zA-Z]+)\b',
|
66 |
+
# General noun patterns (words starting with capital letter or longer than 4 chars)
|
67 |
+
r'\b([A-Z][a-z]{3,}|[a-z]{5,})\b'
|
68 |
]
|
69 |
|
70 |
self.spatial_relations = {
|
|
|
84 |
}
|
85 |
|
86 |
def extract_entities_from_text(self, text: str) -> List[str]:
    """Extract up to 12 candidate entity words from ``text``.

    The text is lowercased and scanned with every regular expression in
    ``self.entity_patterns`` plus a fixed set of English and German
    preposition patterns ("next to the X", "neben dem X", ...). For a
    pattern with several capture groups, only the last group of each match
    is kept. Words of two characters or fewer and stop words are dropped.

    Args:
        text: Free-form input sentence(s). Empty or ``None`` yields ``[]``.

    Returns:
        A list of at most 12 unique lowercase words, longest first. Words of
        equal length keep the order in which they were first matched, so the
        result is deterministic for a given input and pattern list.
    """
    import re

    if not text:
        return []

    max_entities = 12

    # Matching runs on the lowercased text, so every returned entity is
    # lowercase and pattern branches that require an uppercase letter
    # cannot match here.
    lowered = text.lower()

    # Nouns that directly follow a spatial preposition (optionally with an
    # article in between).
    preposition_patterns = [
        r'\b(?:next\s+to|left\s+of|right\s+of|above|below|between|behind|in\s+front\s+of|near|around|inside|outside)\s+(?:the\s+)?([a-zA-Z]{3,})\b',
        r'\b(?:neben|links\s+von|rechts\s+von|über|unter|zwischen|hinter|vor|bei|um|in|außen)\s+(?:der|die|das|dem|den)?\s*([a-zA-Z]{3,})\b',
    ]

    # Articles, conjunctions, prepositions and common positional verbs that
    # the broad patterns would otherwise report as entities.
    stop_words = {
        'der', 'die', 'das', 'und', 'oder', 'aber', 'mit', 'von', 'zu', 'in', 'auf', 'für',
        'the', 'and', 'or', 'but', 'with', 'from', 'to', 'in', 'on', 'for', 'of', 'at',
        'lies', 'sits', 'stands', 'moves', 'flows', 'rotates', 'begins', 'starts',
        'liegt', 'sitzt', 'steht', 'bewegt', 'fließt', 'rotiert', 'beginnt', 'startet',
        'while', 'next', 'left', 'right', 'between', 'above', 'below',
    }

    candidates = []
    for pattern in [*self.entity_patterns, *preposition_patterns]:
        for match in re.findall(pattern, lowered):
            # re.findall returns tuples for patterns with several groups
            # (e.g. "der/die/das + noun"); the noun is the last group.
            if isinstance(match, tuple):
                match = match[-1]
            candidates.append(match.strip())

    # dict.fromkeys deduplicates while keeping first-seen order; a plain
    # set would make tie order (and the 12-item cut) vary between runs.
    unique = dict.fromkeys(
        word for word in candidates
        if len(word) > 2 and word not in stop_words
    )

    # sorted() is stable, so equal-length words stay in first-seen order.
    ranked = sorted(unique, key=len, reverse=True)
    return ranked[:max_entities]
|
129 |
|
130 |
def extract_relations_from_text(self, text: str) -> List[Dict]:
|
131 |
"""Extract relations from text"""
|