scheitelpunk committed on
Commit
04f921c
·
1 Parent(s): 143badf

Verbesserungen:

Browse files

1. Erweiterte Entity-Patterns:
- keyboard und monitor sind jetzt explizit in der Liste in app.py:57
- Zusätzliche Kategorien: Technical objects, spatial objects, office devices
- General noun patterns für längere Wörter

2. Verbesserte Extraktion:
- Präpositions-Patterns erfassen jetzt "next to the computer", "between the keyboard"
- Bessere Behandlung von Artikel-Konstruktionen
- Erhöhtes Limit auf 12 Entitäten

3. Robustere Filterung:
- Erweiterte Stop-Words Liste
- Sortierung nach Wortlänge (längere Wörter zuerst)

Mit diesen Änderungen sollte Ihr Testsatz "The ball lies left of the table next to the computer,
while the book sits between the keyboard and the monitor" jetzt alle 6 Entitäten erkennen: ball,
table, computer, book, keyboard, monitor.

Die Patterns decken jetzt viel mehr Anwendungsfälle ab - von technischen Objekten bis zu
alltäglichen Gegenständen.

Files changed (1) hide show
  1. app.py +47 -15
app.py CHANGED
@@ -51,10 +51,20 @@ class RealGASMInterface:
51
 
52
  # Entity and relation patterns for text processing
53
  self.entity_patterns = [
54
- r'\b(robot\w*|arm\w*|satellite\w*|crystal\w*|molecule\w*|atom\w*|electron\w*)\b',
55
- r'\b(ball|table|chair|book|computer|lamp|vase|shelf|tv|sofa)\b',
56
- r'\b(gedanken|vertrauen|zweifel|hoffnung|verzweiflung)\b',
57
- r'\b(der|die|das)\s+([a-zA-Z]+)\b'
 
 
 
 
 
 
 
 
 
 
58
  ]
59
 
60
  self.spatial_relations = {
@@ -74,26 +84,48 @@ class RealGASMInterface:
74
  }
75
 
76
  def extract_entities_from_text(self, text: str) -> List[str]:
77
- """Extract entities from text using simple pattern matching"""
78
  import re
79
  entities = []
80
 
81
- # Extract meaningful words (nouns, objects, concepts)
82
- words = text.lower().split()
83
-
84
  # Simple entity extraction based on patterns
85
  for pattern in self.entity_patterns:
86
  matches = re.findall(pattern, text.lower())
87
- if isinstance(matches[0], tuple) if matches else False:
88
- entities.extend([match[1] for match in matches if len(match[1]) > 2])
89
- else:
90
- entities.extend([match for match in matches if len(match) > 2])
 
 
 
 
 
 
 
 
 
91
 
92
- # Remove duplicates and common words
93
- stop_words = {'der', 'die', 'das', 'und', 'oder', 'aber', 'mit', 'von', 'zu', 'in', 'auf', 'für'}
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  entities = list(set([e for e in entities if e not in stop_words and len(e) > 2]))
95
 
96
- return entities[:10] # Limit to 10 entities
 
 
 
97
 
98
  def extract_relations_from_text(self, text: str) -> List[Dict]:
99
  """Extract relations from text"""
 
51
 
52
  # Entity and relation patterns for text processing
53
  self.entity_patterns = [
54
+ # Technical/scientific objects
55
+ r'\b(robot\w*|arm\w*|satellite\w*|crystal\w*|molecule\w*|atom\w*|electron\w*|detector\w*|sensor\w*|motor\w*|beam\w*|component\w*|platform\w*|axis\w*|field\w*|system\w*|reactor\w*|coolant\w*|turbine\w*)\b',
56
+ # Office/household devices (extended)
57
+ r'\b(ball|table|chair|book|computer|keyboard|monitor|screen|mouse|laptop|desk|lamp|vase|shelf|tv|sofa|phone|tablet|printer|scanner|camera|speaker)\b',
58
+ # Spatial objects
59
+ r'\b(room|door|window|wall|floor|ceiling|corner|center|side|edge|surface|space|area|zone|place|location|position|spot)\b',
60
+ # Abstract concepts
61
+ r'\b(gedanken|vertrauen|zweifel|hoffnung|verzweiflung|idee|konzept|theorie|prinzip|regel|methode|prozess|ablauf)\b',
62
+ # German article constructions (to capture more nouns)
63
+ r'\b(der|die|das)\s+([a-zA-Z]+)\b',
64
+ # English constructions (the + noun)
65
+ r'\bthe\s+([a-zA-Z]+)\b',
66
+ # General noun patterns (words starting with capital letter or longer than 4 chars)
67
+ r'\b([A-Z][a-z]{3,}|[a-z]{5,})\b'
68
  ]
69
 
70
  self.spatial_relations = {
 
84
  }
85
 
86
  def extract_entities_from_text(self, text: str) -> List[str]:
87
+ """Extract entities from text using improved pattern matching"""
88
  import re
89
  entities = []
90
 
 
 
 
91
  # Simple entity extraction based on patterns
92
  for pattern in self.entity_patterns:
93
  matches = re.findall(pattern, text.lower())
94
+ if matches:
95
+ if isinstance(matches[0], tuple):
96
+ # For patterns with groups (e.g. "der/die/das + noun")
97
+ entities.extend([match[-1] for match in matches if len(match[-1]) > 2])
98
+ else:
99
+ # For simple patterns
100
+ entities.extend([match for match in matches if len(match) > 2])
101
+
102
+ # Additionally: Extract all nouns with prepositions
103
+ preposition_patterns = [
104
+ r'\b(?:next\s+to|left\s+of|right\s+of|above|below|between|behind|in\s+front\s+of|near|around|inside|outside)\s+(?:the\s+)?([a-zA-Z]{3,})\b',
105
+ r'\b(?:neben|links\s+von|rechts\s+von|über|unter|zwischen|hinter|vor|bei|um|in|außen)\s+(?:der|die|das|dem|den)?\s*([a-zA-Z]{3,})\b'
106
+ ]
107
 
108
+ for pattern in preposition_patterns:
109
+ matches = re.findall(pattern, text.lower())
110
+ entities.extend([match for match in matches if len(match) > 2])
111
+
112
+ # Extended stop words list
113
+ stop_words = {
114
+ 'der', 'die', 'das', 'und', 'oder', 'aber', 'mit', 'von', 'zu', 'in', 'auf', 'für',
115
+ 'the', 'and', 'or', 'but', 'with', 'from', 'to', 'in', 'on', 'for', 'of', 'at',
116
+ 'lies', 'sits', 'stands', 'moves', 'flows', 'rotates', 'begins', 'starts',
117
+ 'liegt', 'sitzt', 'steht', 'bewegt', 'fließt', 'rotiert', 'beginnt', 'startet',
118
+ 'while', 'next', 'left', 'right', 'between', 'above', 'below'
119
+ }
120
+
121
+ # Clean up and deduplicate
122
+ entities = [e.strip() for e in entities if e.strip()]
123
  entities = list(set([e for e in entities if e not in stop_words and len(e) > 2]))
124
 
125
+ # Sort by length (longer words first)
126
+ entities = sorted(entities, key=len, reverse=True)
127
+
128
+ return entities[:12] # Increase limit to 12 entities
129
 
130
  def extract_relations_from_text(self, text: str) -> List[Dict]:
131
  """Extract relations from text"""