Spaces:
Running
on
Zero
● Perfekt! 🚀 Wir haben das System komplett umgestellt von statischen Listen zu intelligenter
semantischer Erkennung:
🧠 Was ist jetzt anders:
Statt fester Listen:
# ALT (statisch):
if entity in ['conveyor', 'belt', 'machine']:
    return 'industrial'
Jetzt intelligente Vektoren:
# NEU (intelligent):
similarity = cosine_similarity(
    spacy_vector(entity),
    spacy_vector('manufacturing equipment')
)
if similarity > 0.6:
    return 'industrial'
🔥 Neue Features:
1. 🎯 Semantic Prototypes: Statt Listen verwenden wir Konzept-Kerne wie 'manufacturing',
'production', 'assembly'
2. 📊 Cosine Similarity: Berechnet echte semantische Ähnlichkeit zwischen Wörtern
3. 🧭 Contextual Classification: Wenn "Förderband" neben "Fabrik" steht → automatisch industrial
4. 🌍 Sprachunabhängig: Funktioniert mit jedem Wort in jeder Sprache
5. 🔄 Dreistufiges Fallback:
- Semantic → Contextual → Pattern-based
🎯 Ihr "conveyor" Beispiel:
Vorher:
- conveyor nicht in Liste → unknown
Jetzt:
- spacy_vector('conveyor') ähnlich spacy_vector('manufacturing equipment') → industrial ✅
Das System versteht jetzt Bedeutungen, nicht nur Wortlisten!
|
@@ -74,36 +74,22 @@ class RealGASMInterface:
|
|
| 74 |
self.tokenizer = None
|
| 75 |
self.last_gasm_results = None # Store last results for visualization
|
| 76 |
|
| 77 |
-
#
|
| 78 |
-
self.
|
| 79 |
-
'
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
'
|
| 88 |
-
'robotics': ['robot', 'arm', 'sensor', 'motor', 'actuator', 'controller', 'manipulator', 'gripper', 'joint'],
|
| 89 |
-
'scientific': ['detector', 'microscope', 'telescope', 'spectrometer', 'analyzer', 'probe', 'scanner'],
|
| 90 |
-
'industrial': ['reactor', 'turbine', 'compressor', 'pump', 'valve', 'conveyor', 'assembly', 'platform',
|
| 91 |
-
'machine', 'equipment', 'apparatus', 'device', 'unit', 'system', 'installation',
|
| 92 |
-
'sorting', 'sorter', 'belt', 'line', 'station', 'workstation', 'cell'],
|
| 93 |
-
'electronic': ['circuit', 'processor', 'memory', 'display', 'antenna', 'battery', 'capacitor']
|
| 94 |
-
},
|
| 95 |
-
'spatial_objects': {
|
| 96 |
-
'architectural': ['room', 'door', 'window', 'wall', 'floor', 'ceiling', 'corner'],
|
| 97 |
-
'locations': ['center', 'side', 'edge', 'surface', 'space', 'area', 'zone', 'place', 'position', 'spot'],
|
| 98 |
-
'natural': ['tree', 'rock', 'river', 'mountain', 'field', 'forest', 'lake']
|
| 99 |
-
},
|
| 100 |
-
'scientific_entities': {
|
| 101 |
-
'physics': ['atom', 'electron', 'proton', 'neutron', 'photon', 'molecule', 'particle'],
|
| 102 |
-
'chemistry': ['crystal', 'compound', 'solution', 'reaction', 'catalyst', 'polymer'],
|
| 103 |
-
'astronomy': ['satellite', 'planet', 'star', 'galaxy', 'comet', 'asteroid', 'orbit']
|
| 104 |
-
}
|
| 105 |
}
|
| 106 |
|
|
|
|
|
|
|
|
|
|
| 107 |
# Fallback patterns for when spaCy is not available
|
| 108 |
self.fallback_entity_patterns = [
|
| 109 |
# High-confidence patterns
|
|
@@ -250,18 +236,36 @@ class RealGASMInterface:
|
|
| 250 |
return self._is_in_semantic_categories(text)
|
| 251 |
|
| 252 |
def _is_in_semantic_categories(self, entity: str) -> bool:
|
| 253 |
-
"""Check if entity belongs to any
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
for item in items:
|
| 262 |
-
if item in entity_lower or entity_lower in item:
|
| 263 |
return True
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
def _filter_entities_semantically(self, entities: List[str]) -> List[str]:
|
| 267 |
"""Filter entities based on semantic relevance"""
|
|
@@ -688,12 +692,13 @@ class RealGASMInterface:
|
|
| 688 |
logger.warning(f"Consistency verification failed: {consistency_error}")
|
| 689 |
consistency_results = {'warning': 'verification_failed'}
|
| 690 |
|
| 691 |
-
# Create entity data with real GASM positions
|
|
|
|
| 692 |
real_entities = []
|
| 693 |
-
for i, entity in enumerate(
|
| 694 |
real_entities.append({
|
| 695 |
'name': entity,
|
| 696 |
-
'type': self.classify_entity_type(entity),
|
| 697 |
'position': final_positions[i].tolist(),
|
| 698 |
'confidence': 0.95 # High confidence for real GASM results
|
| 699 |
})
|
|
@@ -718,44 +723,130 @@ class RealGASMInterface:
|
|
| 718 |
logger.error(f"Real GASM forward pass failed: {e}")
|
| 719 |
raise e
|
| 720 |
|
| 721 |
-
def
|
| 722 |
-
"""Classify entity type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
entity_lower = entity.lower()
|
| 724 |
|
| 725 |
-
#
|
| 726 |
-
for
|
| 727 |
-
for subcategory, items in subcategories.items():
|
| 728 |
-
if entity_lower in items:
|
| 729 |
-
if category == 'technical_objects':
|
| 730 |
-
if subcategory == 'robotics':
|
| 731 |
-
return 'robotic'
|
| 732 |
-
elif subcategory == 'industrial':
|
| 733 |
-
return 'industrial'
|
| 734 |
-
elif subcategory == 'scientific':
|
| 735 |
-
return 'scientific'
|
| 736 |
-
else:
|
| 737 |
-
return 'technical'
|
| 738 |
-
elif category == 'physical_objects':
|
| 739 |
-
return 'physical'
|
| 740 |
-
elif category == 'spatial_objects':
|
| 741 |
-
return 'spatial'
|
| 742 |
-
elif category == 'scientific_entities':
|
| 743 |
-
return 'scientific'
|
| 744 |
-
|
| 745 |
-
# Fallback patterns for backwards compatibility
|
| 746 |
-
if any(word in entity_lower for word in ['robot', 'arm', 'sensor', 'motor']):
|
| 747 |
return 'robotic'
|
| 748 |
-
elif any(word in entity_lower for word in ['conveyor', 'machine', 'equipment', 'system']):
|
| 749 |
-
return 'industrial'
|
| 750 |
-
elif any(word in entity_lower for word in ['
|
| 751 |
return 'scientific'
|
| 752 |
-
elif any(word in entity_lower for word in ['
|
| 753 |
-
return '
|
| 754 |
-
elif any(word in entity_lower for word in ['
|
|
|
|
|
|
|
| 755 |
return 'spatial'
|
|
|
|
|
|
|
| 756 |
else:
|
| 757 |
return 'unknown'
|
| 758 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
def process_with_real_gasm(
|
| 760 |
self,
|
| 761 |
text: str,
|
|
@@ -824,9 +915,10 @@ class RealGASMInterface:
|
|
| 824 |
) -> Dict[str, Any]:
|
| 825 |
"""Enhanced simulation when real GASM fails"""
|
| 826 |
try:
|
| 827 |
-
# Create realistic entity data
|
|
|
|
| 828 |
entity_data = []
|
| 829 |
-
for i, entity in enumerate(
|
| 830 |
# Generate more realistic positions based on text analysis
|
| 831 |
angle = (i * 2 * np.pi) / max(len(entities), 3)
|
| 832 |
radius = 2 + i * 0.3
|
|
@@ -839,7 +931,7 @@ class RealGASMInterface:
|
|
| 839 |
|
| 840 |
entity_data.append({
|
| 841 |
'name': entity,
|
| 842 |
-
'type': self.classify_entity_type(entity),
|
| 843 |
'position': position,
|
| 844 |
'confidence': min(0.9, 0.6 + len(entity) * 0.02)
|
| 845 |
})
|
|
|
|
| 74 |
self.tokenizer = None
|
| 75 |
self.last_gasm_results = None # Store last results for visualization
|
| 76 |
|
| 77 |
+
# Semantic prototype words for dynamic classification using word vectors
|
| 78 |
+
self.semantic_prototypes = {
|
| 79 |
+
'industrial': ['machine', 'equipment', 'factory', 'production', 'assembly', 'manufacturing'],
|
| 80 |
+
'robotic': ['robot', 'automation', 'mechanical', 'actuator', 'control', 'artificial'],
|
| 81 |
+
'scientific': ['research', 'analysis', 'measurement', 'laboratory', 'experiment', 'detection'],
|
| 82 |
+
'physical': ['object', 'material', 'substance', 'physical', 'tangible', 'solid'],
|
| 83 |
+
'spatial': ['location', 'position', 'space', 'area', 'place', 'region'],
|
| 84 |
+
'electronic': ['digital', 'electronic', 'circuit', 'computer', 'technology', 'device'],
|
| 85 |
+
'furniture': ['furniture', 'seating', 'desk', 'storage', 'household', 'interior'],
|
| 86 |
+
'tool': ['tool', 'instrument', 'implement', 'equipment', 'utility', 'apparatus'],
|
| 87 |
+
'vehicle': ['transportation', 'vehicle', 'travel', 'mobility', 'transport', 'automotive']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
}
|
| 89 |
|
| 90 |
+
# Similarity threshold for classification
|
| 91 |
+
self.similarity_threshold = 0.6
|
| 92 |
+
|
| 93 |
# Fallback patterns for when spaCy is not available
|
| 94 |
self.fallback_entity_patterns = [
|
| 95 |
# High-confidence patterns
|
|
|
|
| 236 |
return self._is_in_semantic_categories(text)
|
| 237 |
|
| 238 |
def _is_in_semantic_categories(self, entity: str) -> bool:
|
| 239 |
+
"""Check if entity belongs to any semantic category using vector similarity"""
|
| 240 |
+
if not SPACY_AVAILABLE or not nlp:
|
| 241 |
+
# Fallback to simple pattern matching
|
| 242 |
+
entity_lower = entity.lower().strip()
|
| 243 |
+
# Check against all prototype words
|
| 244 |
+
for category, prototypes in self.semantic_prototypes.items():
|
| 245 |
+
for prototype in prototypes:
|
| 246 |
+
if prototype in entity_lower or entity_lower in prototype:
|
|
|
|
|
|
|
| 247 |
return True
|
| 248 |
+
return False
|
| 249 |
+
|
| 250 |
+
try:
|
| 251 |
+
entity_doc = nlp(entity.lower().strip())
|
| 252 |
+
if not entity_doc.has_vector:
|
| 253 |
+
return False
|
| 254 |
+
|
| 255 |
+
# Check similarity with any category
|
| 256 |
+
for category, prototypes in self.semantic_prototypes.items():
|
| 257 |
+
for prototype in prototypes:
|
| 258 |
+
prototype_doc = nlp(prototype)
|
| 259 |
+
if prototype_doc.has_vector:
|
| 260 |
+
similarity = self._cosine_similarity(entity_doc.vector, prototype_doc.vector)
|
| 261 |
+
if similarity > self.similarity_threshold:
|
| 262 |
+
return True
|
| 263 |
+
|
| 264 |
+
return False
|
| 265 |
+
|
| 266 |
+
except Exception as e:
|
| 267 |
+
logger.warning(f"Semantic category check failed for '{entity}': {e}")
|
| 268 |
+
return False
|
| 269 |
|
| 270 |
def _filter_entities_semantically(self, entities: List[str]) -> List[str]:
|
| 271 |
"""Filter entities based on semantic relevance"""
|
|
|
|
| 692 |
logger.warning(f"Consistency verification failed: {consistency_error}")
|
| 693 |
consistency_results = {'warning': 'verification_failed'}
|
| 694 |
|
| 695 |
+
# Create entity data with real GASM positions using contextual classification
|
| 696 |
+
entity_names = [str(e) for e in entities[:len(final_positions)]]
|
| 697 |
real_entities = []
|
| 698 |
+
for i, entity in enumerate(entity_names):
|
| 699 |
real_entities.append({
|
| 700 |
'name': entity,
|
| 701 |
+
'type': self.classify_entity_type(entity, entity_names),
|
| 702 |
'position': final_positions[i].tolist(),
|
| 703 |
'confidence': 0.95 # High confidence for real GASM results
|
| 704 |
})
|
|
|
|
| 723 |
logger.error(f"Real GASM forward pass failed: {e}")
|
| 724 |
raise e
|
| 725 |
|
| 726 |
+
def classify_entity_type_semantic(self, entity: str) -> str:
    """Classify ``entity`` by average vector similarity to category prototypes.

    For each category in ``self.semantic_prototypes`` the entity vector is
    compared (``self._cosine_similarity``) with every prototype that has a
    vector, and the scores are averaged. The category with the highest
    average wins, provided that average exceeds
    ``self.similarity_threshold``.

    Args:
        entity: Entity text to classify.

    Returns:
        The winning category name, or ``'unknown'`` when no category average
        exceeds the threshold. The result of
        ``self.classify_entity_type_fallback(entity)`` is returned instead
        when spaCy is unavailable, the entity has no vector, or the
        vector-based path raises.
    """
    import math

    if not SPACY_AVAILABLE or not nlp:
        return self.classify_entity_type_fallback(entity)

    try:
        # Strip as well as lower-case, matching _is_in_semantic_categories.
        entity_doc = nlp(entity.lower().strip())
        if not entity_doc.has_vector:
            return self.classify_entity_type_fallback(entity)

        entity_vector = entity_doc.vector

        best_category = 'unknown'
        # A category must beat both the threshold and the best average so far.
        best_similarity = max(0.0, self.similarity_threshold)

        for category, prototypes in self.semantic_prototypes.items():
            scores = []
            for prototype in prototypes:
                prototype_doc = nlp(prototype)
                if not prototype_doc.has_vector:
                    continue
                score = self._cosine_similarity(entity_vector, prototype_doc.vector)
                # A non-finite score (e.g. NaN from a zero-length vector)
                # would turn the whole average into NaN and silently drop
                # the category, so it is left out of the average.
                if math.isfinite(score):
                    scores.append(score)

            if not scores:
                continue

            average = sum(scores) / len(scores)
            if average > best_similarity:
                best_similarity = average
                best_category = category

        return best_category

    except Exception as e:
        logger.warning(f"Semantic classification failed for '{entity}': {e}")
        return self.classify_entity_type_fallback(entity)
|
| 765 |
+
|
| 766 |
+
def classify_entity_type_contextual(self, entity: str, context_entities: List[str]) -> str:
    """Classify ``entity``, borrowing the dominant type of its neighbours if needed.

    The semantic classification is tried first and returned when it is not
    ``'unknown'``. Otherwise the other entries of ``context_entities`` are
    classified semantically; if the entity vector has a cosine similarity
    above 0.5 to the vector of the joined neighbour text, the most frequent
    neighbour type is returned.

    Args:
        entity: Entity text to classify.
        context_entities: Entities seen together with ``entity``; entries
            equal to ``entity`` are ignored.

    Returns:
        A category name or ``'unknown'``. If spaCy is unavailable or the
        contextual step raises, the result of
        ``self.classify_entity_type_semantic(entity)`` is returned.
    """
    from collections import Counter

    if not SPACY_AVAILABLE or not nlp:
        return self.classify_entity_type_semantic(entity)

    try:
        # A confident direct classification needs no context.
        base_type = self.classify_entity_type_semantic(entity)
        if base_type != 'unknown':
            return base_type

        entity_doc = nlp(entity.lower())
        if not entity_doc.has_vector:
            return base_type

        neighbours = [name for name in context_entities if name != entity]
        neighbour_types = [
            kind
            for kind in (self.classify_entity_type_semantic(name) for name in neighbours)
            if kind != 'unknown'
        ]
        if not neighbour_types:
            return base_type

        # Counter resolves ties by first occurrence, so the result is
        # reproducible; max() over a set depends on string-hash ordering.
        dominant_type = Counter(neighbour_types).most_common(1)[0][0]

        # Only adopt the dominant type when the entity is semantically
        # related to its surroundings as a whole.
        context_doc = nlp(' '.join(neighbours))
        if context_doc.has_vector:
            similarity = self._cosine_similarity(entity_doc.vector, context_doc.vector)
            if similarity > 0.5:  # Lower threshold for context
                return dominant_type

        return base_type

    except Exception as e:
        logger.warning(f"Contextual classification failed for '{entity}': {e}")
        return self.classify_entity_type_semantic(entity)
|
| 808 |
+
|
| 809 |
+
def classify_entity_type_fallback(self, entity: str) -> str:
    """Classify ``entity`` by keyword substrings, without word vectors.

    The lower-cased entity is checked against an ordered list of keyword
    rules; the first rule with a keyword contained in the entity wins. If no
    rule matches, the prototype words in ``self.semantic_prototypes`` (when
    the attribute exists) are tried the same way, so that every declared
    category, including ``'tool'`` and ``'vehicle'``, is reachable.

    Args:
        entity: Entity text to classify.

    Returns:
        A category name, or ``'unknown'`` when nothing matches.
    """
    entity_lower = entity.lower()

    # Order matters: earlier rules take precedence over later ones.
    keyword_rules = (
        ('robotic', ('robot', 'arm', 'sensor', 'motor', 'actuator')),
        ('industrial', ('conveyor', 'machine', 'equipment', 'system', 'factory', 'production')),
        ('scientific', ('detector', 'microscope', 'analyzer', 'research', 'laboratory')),
        ('electronic', ('computer', 'keyboard', 'monitor', 'screen', 'digital', 'electronic')),
        ('furniture', ('table', 'chair', 'desk', 'bed', 'sofa', 'furniture')),
        ('spatial', ('area', 'zone', 'space', 'place', 'location', 'position')),
        ('physical', ('ball', 'object', 'material', 'substance')),
    )
    for label, keywords in keyword_rules:
        if any(keyword in entity_lower for keyword in keywords):
            return label

    # Last resort: the prototype vocabulary. getattr keeps the method usable
    # on an instance whose prototypes were never initialised.
    prototypes = getattr(self, 'semantic_prototypes', None) or {}
    for label, words in prototypes.items():
        # Skip empty words: "" is a substring of every string.
        if any(word and word in entity_lower for word in words):
            return label

    return 'unknown'
|
| 830 |
|
| 831 |
+
def classify_entity_type(self, entity: str, context_entities: List[str] = None) -> str:
    """Entry point for entity classification.

    Dispatches to ``classify_entity_type_contextual`` when a non-empty
    ``context_entities`` is supplied, and to
    ``classify_entity_type_semantic`` otherwise.

    Args:
        entity: Entity text to classify.
        context_entities: Optional entities seen together with ``entity``.

    Returns:
        The category name produced by the selected classifier.
    """
    # No usable context (None or empty): classify the entity on its own.
    if not context_entities:
        return self.classify_entity_type_semantic(entity)
    return self.classify_entity_type_contextual(entity, context_entities)
|
| 837 |
+
|
| 838 |
+
def _cosine_similarity(self, vec1, vec2):
|
| 839 |
+
"""Compute cosine similarity between two vectors"""
|
| 840 |
+
try:
|
| 841 |
+
import numpy as np
|
| 842 |
+
# Normalize vectors
|
| 843 |
+
vec1_norm = vec1 / np.linalg.norm(vec1)
|
| 844 |
+
vec2_norm = vec2 / np.linalg.norm(vec2)
|
| 845 |
+
# Compute cosine similarity
|
| 846 |
+
return np.dot(vec1_norm, vec2_norm)
|
| 847 |
+
except:
|
| 848 |
+
return 0.0
|
| 849 |
+
|
| 850 |
def process_with_real_gasm(
|
| 851 |
self,
|
| 852 |
text: str,
|
|
|
|
| 915 |
) -> Dict[str, Any]:
|
| 916 |
"""Enhanced simulation when real GASM fails"""
|
| 917 |
try:
|
| 918 |
+
# Create realistic entity data with contextual classification
|
| 919 |
+
entity_names = [str(e) for e in entities]
|
| 920 |
entity_data = []
|
| 921 |
+
for i, entity in enumerate(entity_names):
|
| 922 |
# Generate more realistic positions based on text analysis
|
| 923 |
angle = (i * 2 * np.pi) / max(len(entities), 3)
|
| 924 |
radius = 2 + i * 0.3
|
|
|
|
| 931 |
|
| 932 |
entity_data.append({
|
| 933 |
'name': entity,
|
| 934 |
+
'type': self.classify_entity_type(entity, entity_names),
|
| 935 |
'position': position,
|
| 936 |
'confidence': min(0.9, 0.6 + len(entity) * 0.02)
|
| 937 |
})
|