File size: 8,265 Bytes
c2e327f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from sentence_transformers import SentenceTransformer, util
import json
import time
import pandas as pd
import numpy as np
import pickle

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from chromadb.db.clickhouse import NoDatapointsException

# Maps a lower-cased datatype name to the coarse datatype category used in the metalabels.
# NOTE(review): the upper-case keys ('ENUM_REAL', 'ENUM_RATIONAL', ...) can never match
# because the lookup key is lower-cased first. Left untouched on purpose; confirm against
# the code that builds `metalabel` before normalising them.
_DATATYPE_MAPPING = {
  'boolean': 'BOOLEAN', 'string': 'STRING', 'string_translatable': 'STRING',
  'translatable_string': 'STRING', 'non_translatable_string': 'STRING',
  'date': 'DATE', 'data_time': 'DATE', 'uri': 'URI', 'int': 'INT', 'int_measure': 'INT',
  'int_currency': 'INT', 'integer': 'INT',
  'real': 'REAL', 'real_measure': 'REAL', 'real_currency': 'REAL', 'enum_code': 'ENUM_CODE',
  'enum_int': 'ENUM_CODE',
  'ENUM_REAL': 'ENUM_CODE', 'ENUM_RATIONAL': 'ENUM_CODE', 'ENUM_BOOLEAN': 'ENUM_CODE',
  'ENUM_STRING': 'ENUM_CODE',
  'enum_reference': 'ENUM_CODE', 'enum_instance': 'ENUM_CODE', 'set(b1,b2)': 'SET',
  'constrained_set(b1,b2,cmn,cmx)': 'SET', 'set [0,?]': 'SET', 'set [1,?]': 'SET',
  'set [1, ?]': 'SET', 'nan': 'NaN',
  'media_type': 'LARGE_OBJECT_TYPE',
}

# Maps a lower-cased unit symbol/name to the coarse unit category used in the metalabels.
# NOTE(review): 'w(m²*K)' contains an upper-case 'K' and is therefore unreachable as well.
_UNIT_MAPPING = {
  'nan': 'NaN', 'hertz': 'FREQUENCY', 'hz': 'FREQUENCY', 'pa': 'PRESSURE', 'pascal': 'PRESSURE',
  'n/m²': 'PRESSURE',
  'bar': 'PRESSURE', '%': 'SCALARS_PERC', 'w': 'POWER', 'watt': 'POWER', 'kw': 'POWER',
  'kg/m³': 'CHEMISTRY',
  'm²/s': 'CHEMISTRY', 'pa*s': 'CHEMISTRY', 'v': 'ELECTRICAL', 'volt': 'ELECTRICAL',
  'db': 'ACOUSTICS',
  'db(a)': 'ACOUSTICS', 'k': 'TEMPERATURE', '°c': 'TEMPERATURE', 'n': 'MECHANICS',
  'newton': 'MECHANICS', 'kg/s': 'FLOW',
  'kg/h': 'FLOW', 'm³/s': 'FLOW', 'm³/h': 'FLOW', 'l/s': 'FLOW', 'l/h': 'FLOW', 'µm': 'LENGTH',
  'mm': 'LENGTH', 'cm': 'LENGTH',
  'dm': 'LENGTH', 'm': 'LENGTH', 'meter': 'LENGTH', 'm/s': 'SPEED', 'km/h': 'SPEED',
  's^(-1)': 'FREQUENCY', '1/s': 'FREQUENCY',
  's': 'TIME', 'h': 'TIME', 'min': 'TIME', 'd': 'TIME', 'hours': 'TIME', 'a': 'ELECTRICAL',
  'm³': 'VOLUME',
  'm²': 'AREA', 'rpm': 'FLOW', 'nm': 'MECHANICS', 'm/m': 'MECHANICS', 'm³/m²s': 'MECHANICS',
  'w(m²*K)': 'HEAT_TRANSFER',
  'kwh': 'ELECTRICAL', 'kg/(s*m²)': 'FLOW', 'kg': 'MASS', 'w/(m*k)': 'HEAT_TRANSFER',
  'm²*k/w': 'HEAT_TRANSFER',
  'j/s': 'POWER',
}

_NLP_ALGORITHM = ('Semantic search, k-nearest-neighbor with squared L2 distance '
                  '(euclidean distance), with model gart-labor/eng-distilBERT-se-eclass')


def _has_hits(result):
  """Return True when a query result holds at least one document for the first query."""
  if not result:
    return False
  documents = result.get('documents') or [[]]
  return bool(documents[0])


def _filtered_query(collection, queries, n_results, where):
  """Run a `where`-filtered query; return the result, or None if it fails or is empty."""
  try:
    result = collection.query(
        query_embeddings=queries,
        n_results=n_results,
        where=where,
    )
  except Exception:
    # Deliberately broad, as in the original code: a filter that matches nothing is
    # reported by the store through an exception, and that case simply means "no match".
    return None
  return result if _has_hits(result) else None


def query_right_aas(json_query, collection, metalabel, model):
  """Match a requested property against the elements stored in one collection.

  Args:
    json_query: JSON string with the keys 'Name', 'Definition', 'Unit', 'Datatype',
      'SemanticId' and 'ReturnMatches'.
    collection: Object exposing `query(query_embeddings=..., n_results=..., where=...)`
      and returning a dict with 'documents', 'distances' and 'metadatas'
      (each a list holding one list per query).
    metalabel: Dict mapping a metalabel key to a (unit category, datatype category) pair.
    model: Object exposing `encode(text)`, returning a 1-D embedding.

  Returns:
    A list of result dicts. If an element with the same semantic id exists, the list
    holds exactly that element with distance 0. Otherwise it holds up to 'ReturnMatches'
    nearest neighbours, taken either from the search restricted to the matching
    metalabel or from the unrestricted search, whichever has the closer best hit.
    The list is empty when the collection yields no hits.
  """
  query = json.loads(json_query)
  name = query['Name']
  definition = query["Definition"]
  unit = query["Unit"]
  datatype = query["Datatype"]
  semantic_id = query["SemanticId"]
  return_matches = query["ReturnMatches"]

  # str() keeps a null/NaN unit or datatype from crashing on .lower(); unknown values
  # fall into the 'NaN' category.
  unit_categ = _UNIT_MAPPING.get(str(unit).lower(), 'NaN')
  datatype_categ = _DATATYPE_MAPPING.get(str(datatype).lower(), 'NaN')

  # Metalabel key for this category pair; None when the pair is unknown.
  label = (unit_categ, datatype_categ)
  metadata = next((key for key, value in metalabel.items() if value == label), None)

  # The query vector is the definition embedding followed by the name embedding.
  name_embedding = model.encode(name)
  definition_embedding = model.encode(definition)
  queries = [np.concatenate((definition_embedding, name_embedding), axis=0).tolist()]

  # The query is a semantic search (k-nearest-neighbour).
  # Chroma uses hnswlib for this: https://github.com/nmslib/hnswlib
  # hnswlib supports cosine, squared L2 or inner product as distance.
  # Chroma is configured with L2, cf. https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py (search for "l2")

  # Homogeneous case: an element with the same semantic id is an exact match.
  homogen = _filtered_query(collection, queries, 1, {"SESemanticId": semantic_id})
  if homogen is not None:
    meta = homogen['metadatas'][0][0]
    return [{
      "matching_method": 'Semantic equivalent , same semantic Id',
      "matching_algorithm": 'None',
      "matching_distance": 0,
      "aas_id": meta['AASId'],
      "aas_id_short": meta['AASIdShort'],
      "submodel_id_short": meta['SubmodelName'],
      "submodel_id": meta['SubmodelId'],
      "matched_object": json.loads(homogen['documents'][0][0]),
    }]

  # No matching semantic id: continue with NLP, with and without the metalabel filter.
  with_metadata = None
  if metadata is not None:
    with_metadata = _filtered_query(
        collection, queries, return_matches, {"Metalabel": metadata})

  without_metadata = collection.query(
    query_embeddings=queries,
    n_results=return_matches,
  )

  # Compare the best distances; the unrestricted search wins ties.
  prefer_metadata = with_metadata is not None and (
      not _has_hits(without_metadata)
      or with_metadata['distances'][0][0] < without_metadata['distances'][0][0]
  )
  if prefer_metadata:
    result = with_metadata
    matching_method = 'Semantically not equivalent, NLP with Metadata'
  else:
    result = without_metadata
    matching_method = 'Semantically not equivalent, NLP without Metadata'

  if not _has_hits(result):
    return []

  # Build the final results; the store may return fewer hits than requested.
  rows = zip(
      result['documents'][0][:return_matches],
      result['distances'][0],
      result['metadatas'][0],
  )
  return [
    {
      "matching_method": matching_method,
      "matching_algorithm": _NLP_ALGORITHM,
      "matching_distance": distance,
      "submodel_id_short": meta['SubmodelName'],
      "submodel_id": meta['SubmodelId'],
      "matched_object": json.loads(document),
    }
    for document, distance, meta in rows
  ]

def get_right_collection(collections, aas_id):
  """Return the collections that hold at least one record for the given AAS id.

  Args:
    collections: Iterable of objects exposing `get(where=...)`, which returns a dict
      with a 'metadatas' list.
    aas_id: Value the 'AASId' metadata field is filtered on.

  Returns:
    A list of the matching collection objects, in input order, or the sentinel list
    ['AAS not in database'] when no collection matches.
  """
  right_collection = []
  for collection in collections:
    found = collection.get(where={'AASId': aas_id})
    try:
      # Only the existence of a first record carrying an 'AASId' matters here.
      found['metadatas'][0]['AASId']
    except (IndexError, KeyError, TypeError):
      # No record (or no usable metadata) for this AAS in this collection.
      continue
    right_collection.append(collection)

  if not right_collection:
    right_collection = ['AAS not in database']

  return right_collection

# Query one specific AAS
def query_specific_aas(query, metalabel, model, collections, client_chroma):
  """Run the matching for the AAS named in `query['AASId']`.

  The query dict is serialised to JSON, the collection holding the AAS is looked up
  with `get_right_collection`, re-fetched by name from `client_chroma`, and handed to
  `query_right_aas`. When no collection holds the AAS, the sentinel list
  ['AAS not in database'] is returned instead.
  """
  serialized_query = json.dumps(query, indent = 4)
  candidates = get_right_collection(collections, query['AASId'])

  # Guard clause: nothing to search when the AAS is unknown.
  if candidates == ['AAS not in database']:
    return candidates

  target = client_chroma.get_collection(candidates[0].name)
  return query_right_aas(serialized_query, target, metalabel, model)