Spaces:
Runtime error
Runtime error
from sentence_transformers import SentenceTransformer, util | |
import json | |
import time | |
import pandas as pd | |
import numpy as np | |
import pickle | |
import chromadb | |
from chromadb.config import Settings | |
from chromadb.utils import embedding_functions | |
from chromadb.db.clickhouse import NoDatapointsException | |
# Datatype name -> coarse datatype category.
# Keys are lower case because lookups are done with the lower-cased query
# value; upper-case keys could never match.
# NOTE(review): whatever code builds `metalabel` at indexing time should apply
# the same normalisation - confirm against the indexing side.
# NOTE(review): "data_time" looks like a typo for "date_time"; kept as found.
_DATATYPE_CATEGORIES = {
    "boolean": "BOOLEAN",
    "string": "STRING",
    "string_translatable": "STRING",
    "translatable_string": "STRING",
    "non_translatable_string": "STRING",
    "date": "DATE",
    "data_time": "DATE",
    "uri": "URI",
    "int": "INT",
    "int_measure": "INT",
    "int_currency": "INT",
    "integer": "INT",
    "real": "REAL",
    "real_measure": "REAL",
    "real_currency": "REAL",
    "enum_code": "ENUM_CODE",
    "enum_int": "ENUM_CODE",
    "enum_real": "ENUM_CODE",
    "enum_rational": "ENUM_CODE",
    "enum_boolean": "ENUM_CODE",
    "enum_string": "ENUM_CODE",
    "enum_reference": "ENUM_CODE",
    "enum_instance": "ENUM_CODE",
    "set(b1,b2)": "SET",
    "constrained_set(b1,b2,cmn,cmx)": "SET",
    "set [0,?]": "SET",
    "set [1,?]": "SET",
    "set [1, ?]": "SET",
    "nan": "NaN",
    "media_type": "LARGE_OBJECT_TYPE",
}

# Unit symbol/name (lower case) -> coarse unit category.
_UNIT_CATEGORIES = {
    "nan": "NaN",
    "hertz": "FREQUENCY",
    "hz": "FREQUENCY",
    "pa": "PRESSURE",
    "pascal": "PRESSURE",
    "n/m²": "PRESSURE",
    "bar": "PRESSURE",
    "%": "SCALARS_PERC",
    "w": "POWER",
    "watt": "POWER",
    "kw": "POWER",
    "kg/m³": "CHEMISTRY",
    "m²/s": "CHEMISTRY",
    "pa*s": "CHEMISTRY",
    "v": "ELECTRICAL",
    "volt": "ELECTRICAL",
    "db": "ACOUSTICS",
    "db(a)": "ACOUSTICS",
    "k": "TEMPERATURE",
    "°c": "TEMPERATURE",
    "n": "MECHANICS",
    "newton": "MECHANICS",
    "kg/s": "FLOW",
    "kg/h": "FLOW",
    "m³/s": "FLOW",
    "m³/h": "FLOW",
    "l/s": "FLOW",
    "l/h": "FLOW",
    "µm": "LENGTH",
    "mm": "LENGTH",
    "cm": "LENGTH",
    "dm": "LENGTH",
    "m": "LENGTH",
    "meter": "LENGTH",
    "m/s": "SPEED",
    "km/h": "SPEED",
    "s^(-1)": "FREQUENCY",
    "1/s": "FREQUENCY",
    "s": "TIME",
    "h": "TIME",
    "min": "TIME",
    "d": "TIME",
    "hours": "TIME",
    "a": "ELECTRICAL",
    "m³": "VOLUME",
    "m²": "AREA",
    "rpm": "FLOW",
    "nm": "MECHANICS",
    "m/m": "MECHANICS",
    "m³/m²s": "MECHANICS",
    "w(m²*k)": "HEAT_TRANSFER",
    "kwh": "ELECTRICAL",
    "kg/(s*m²)": "FLOW",
    "kg": "MASS",
    "w/(m*k)": "HEAT_TRANSFER",
    "m²*k/w": "HEAT_TRANSFER",
    "j/s": "POWER",
}

_METHOD_SAME_ID = "Semantic equivalent , same semantic Id"
_METHOD_NLP_WITH_META = "Semantically not equivalent, NLP with Metadata"
_METHOD_NLP_WITHOUT_META = "Semantically not equivalent, NLP without Metadata"
_ALGORITHM_NLP = (
    "Semantic search, k-nearest-neighbor with squared L2 distance "
    "(euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
)


def _hit_count(result):
    """Return how many hits a query result holds for the first (only) query."""
    try:
        first = result["documents"][0]
    except (KeyError, IndexError, TypeError):
        return 0
    return len(first) if first else 0


def _query_or_none(collection, queries, n_results, where):
    """Run a filtered query; return None when it fails or yields no hits."""
    try:
        result = collection.query(
            query_embeddings=queries, n_results=n_results, where=where
        )
    except Exception:
        # Deliberate best effort: the original code expected the store to raise
        # (NoDatapointsException) when nothing matches the filter, and the
        # concrete exception class depends on the installed chromadb version.
        return None
    return result if _hit_count(result) else None


def _build_match(method, algorithm, distance, meta, matched_object):
    """Assemble one entry of the list returned by `query_aas`."""
    return {
        "matching_method": method,
        "matching_algorithm": algorithm,
        "matching_distance": distance,
        "aas_id": meta["AASId"],
        "aas_id_short": meta["AASIdShort"],
        "submodel_id_short": meta["SubmodelName"],
        "submodel_id": meta["SubmodelId"],
        "matched_object": matched_object,
    }


def query_aas(query_json, collection, model, metalabel):
    """Find the entries of one collection that best match a query element.

    Matching happens in two stages:

    1. Homogeneous case: look for an entry whose ``SESemanticId`` metadata
       equals the query's semantic id. If one exists it is returned alone,
       with distance 0 and its stored document as a raw string.
    2. Otherwise an embedding search is run twice - once restricted to
       entries with the same metalabel (unit category / datatype category)
       and once unrestricted - and the result set whose best hit is closer
       wins (ties go to the unrestricted search).

    Args:
        query_json: JSON string with the keys ``Name``, ``Definition``,
            ``Unit``, ``Datatype``, ``SemanticId`` and ``ReturnMatches``.
        collection: Object exposing ``query(query_embeddings=..., n_results=...,
            where=...)`` and returning a mapping with ``documents``,
            ``distances`` and ``metadatas``.
        model: Object exposing ``encode(text)`` that returns a 1-D array.
        metalabel: Mapping of metalabel key -> ``(unit_category,
            datatype_category)``.

    Returns:
        A list of dicts with the keys ``matching_method``,
        ``matching_algorithm``, ``matching_distance``, ``aas_id``,
        ``aas_id_short``, ``submodel_id_short``, ``submodel_id`` and
        ``matched_object``. At most ``ReturnMatches`` entries are returned;
        fewer if the collection yields fewer hits.

    Raises:
        KeyError: If a required key is missing from the query.
        json.JSONDecodeError: If ``query_json`` or a matched document in the
            embedding-search stage is not valid JSON.
    """
    query = json.loads(query_json)
    name = query["Name"]
    definition = query["Definition"]
    semantic_id = query["SemanticId"]
    return_matches = query["ReturnMatches"]

    # str() keeps non-string values (e.g. JSON null) from crashing .lower();
    # anything unknown falls into the "NaN" category.
    unit_categ = _UNIT_CATEGORIES.get(str(query["Unit"]).lower(), "NaN")
    datatype_categ = _DATATYPE_CATEGORIES.get(
        str(query["Datatype"]).lower(), "NaN"
    )
    wanted_label = (unit_categ, datatype_categ)
    # None when the category pair is unknown to the index; the filtered
    # search is then skipped instead of failing on an empty key list.
    metadata = next(
        (key for key, value in metalabel.items() if value == wanted_label),
        None,
    )

    name_embedding = model.encode(name)
    definition_embedding = model.encode(definition)
    queries = [
        np.concatenate((definition_embedding, name_embedding), axis=0).tolist()
    ]

    # The query is a semantic k-nearest-neighbour search. Per the original
    # author's note, Chroma delegates to hnswlib and is configured for
    # squared L2 distance.

    # Homogeneous case: an entry with the same semantic id is an exact match.
    homogen = _query_or_none(
        collection, queries, 1, {"SESemanticId": semantic_id}
    )
    if homogen is not None:
        return [
            _build_match(
                _METHOD_SAME_ID,
                "None",
                0,
                homogen["metadatas"][0][0],
                homogen["documents"][0][0],
            )
        ]

    # No matching semantic id: fall back to NLP, with and without metadata.
    with_metadata = None
    if metadata is not None:
        with_metadata = _query_or_none(
            collection, queries, return_matches, {"Metalabel": metadata}
        )
    without_metadata = collection.query(
        query_embeddings=queries,
        n_results=return_matches,
    )

    result = without_metadata
    method = _METHOD_NLP_WITHOUT_META
    if with_metadata is not None:
        if _hit_count(without_metadata) == 0:
            result = with_metadata
            method = _METHOD_NLP_WITH_META
        else:
            distance_with_meta = with_metadata["distances"][0][0]
            distance_without_meta = without_metadata["distances"][0][0]
            print(distance_with_meta)
            print(distance_without_meta)
            # Compare the best distances of both searches; the filtered
            # result only wins when it is strictly closer.
            if distance_with_meta < distance_without_meta:
                result = with_metadata
                method = _METHOD_NLP_WITH_META

    # Prepare the final result; never index past the hits actually returned.
    final_results = []
    for i in range(min(return_matches, _hit_count(result))):
        final_results.append(
            _build_match(
                method,
                _ALGORITHM_NLP,
                result["distances"][0][i],
                result["metadatas"][0][i],
                json.loads(result["documents"][0][i]),
            )
        )
    return final_results
def ask_database(query, metalabel, model, collections, client_chroma):
    """Run the same query against every collection, one after another.

    Args:
        query: JSON-serialisable query object; it is serialised once and the
            resulting string is handed to ``query_aas`` for each collection.
        metalabel: Passed through unchanged to ``query_aas``.
        model: Passed through unchanged to ``query_aas``.
        collections: Iterable of objects exposing a ``name`` attribute.
        client_chroma: Client whose ``get_collection(name)`` returns the
            collection to query.

    Returns:
        A list with one ``query_aas`` result per entry in ``collections``,
        in iteration order.
    """
    serialized_query = json.dumps(query, indent=4)
    matches_per_collection = []
    for listed in collections:
        collection_name = listed.name
        print(collection_name)
        # Re-fetch the collection through the client by name before querying.
        handle = client_chroma.get_collection(collection_name)
        matches_per_collection.append(
            query_aas(serialized_query, handle, model, metalabel)
        )
    return matches_per_collection