# (Scraped page residue, kept as a comment: "Spaces: Running on T4")
'''
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: MIT-0
'''
# Standard library
import itertools
import json
import os
import time
import uuid
from collections import namedtuple
from datetime import datetime, timedelta

# Third-party
import boto3
import requests
import streamlit as st
from dateutil import tz, parser
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from requests.auth import HTTPBasicAuth
from requests_aws4auth import AWS4Auth

# Local
import utilities.mvectors as cb
# Timestamps captured once at module import (not per request).
current_date_time = datetime.now().isoformat()        # e.g. '2024-05-01T12:34:56.789012'
today_ = datetime.today().strftime('%Y-%m-%d')        # e.g. '2024-05-01'
def handler(input_,session_id):
    '''
    Run one product-search request against the OpenSearch demo domain.

    Builds up to four hybrid sub-queries (keyword, vector, multimodal,
    neural-sparse) from the UI inputs in `input_`, refreshes the hybrid
    search-pipeline weights, executes the search (server-side hybrid
    pipeline, or client-side RRF combination, or a single plain query),
    optionally applies a Cohere rerank pipeline, and returns at most
    `input_["K"]` de-duplicated product dicts for rendering.

    Parameters:
        input_     : dict of parsed UI state. Required keys: "searchType"
                     (list of search-type labels), "text", "image", "K",
                     "imageUpload", "weightage". Optional: "NormType",
                     "CombineType", "weight", "sparse".
        session_id : accepted but never read inside this function
                     (kept for the caller's interface).

    Returns:
        list[dict] of up to K result records (desc/caption/image_url/
        category/price/gender_affinity/style/id/score/title, plus
        optional highlight / sparse / ColBERT fields).
    '''
    # --- Configuration pulled from Streamlit session state ---------------
    DOMAIN_ENDPOINT = st.session_state.OpenSearchDomainEndpoint #"search-opensearchservi-rimlzstyyeih-3zru5p2nxizobaym45e5inuayq.us-west-2.es.amazonaws.com"
    REGION = st.session_state.REGION
    #SAGEMAKER_MODEL_ID = st.session_state.SAGEMAKER_MODEL_ID
    BEDROCK_TEXT_MODEL_ID = st.session_state.BEDROCK_TEXT_MODEL_ID
    BEDROCK_MULTIMODAL_MODEL_ID = st.session_state.BEDROCK_MULTIMODAL_MODEL_ID
    SAGEMAKER_SPARSE_MODEL_ID = st.session_state.SAGEMAKER_SPARSE_MODEL_ID
    # NOTE(review): SAGEMAKER_CrossEncoder_MODEL_ID is read but never used below.
    SAGEMAKER_CrossEncoder_MODEL_ID = st.session_state.SAGEMAKER_CrossEncoder_MODEL_ID
    print("BEDROCK_TEXT_MODEL_ID")
    print(BEDROCK_TEXT_MODEL_ID)

    ####### Hybrid Search weights logic for throwing warning to users for inappropriate weights #######
    # def my_filtering_function(pair):
    #     key, value = pair
    #     if key.split("-")[0] + " Search" in st.session_state["inputs_"]["searchType"]:
    #         return True  # keep pair in the filtered dictionary
    #     else:
    #         return False  # filter pair out of the dictionary
    # filtered_search = dict(filter(my_filtering_function, st.session_state.input_weightage.items()))
    # search_types_used = ", ".join(st.session_state["inputs_"]["searchType"])
    # if((sum(st.session_state.weights_)!=100 or len(st.session_state["inputs_"]["searchType"])!=len(list(filter(lambda a: a >0, st.session_state.weights_)))) and len(st.session_state["inputs_"]["searchType"])!=1):
    #     st.warning('User Input Error for **WEIGHTS** :-\n\nOne or both of the below conditions was not satisfied, \n1. The total weight of all the selected search type(s): "'+search_types_used+'" should be equal to 100 \n 2. The weight of each of the search types, "'+search_types_used+'" should be greater than 0 \n\n Entered input: '+json.dumps(filtered_search)+'\n\n Please re-enter your weights to satisfy the above conditions and try again',icon = "🚨")
    #     refresh = st.button("Re-Enter")
    #     if(refresh):
    #         st.switch_page('pages/1_Semantic_Search.py')
    #     st.stop()

    ####### Auth and connection for OpenSearch domain #######
    # NOTE(review): AWS credentials are fetched but never used — every request
    # below authenticates with HTTP basic auth ('master' + demo secret).
    credentials = boto3.Session().get_credentials()
    awsauth = HTTPBasicAuth('master',st.secrets['ml_search_demo_api_access'])
    host = 'https://'+DOMAIN_ENDPOINT+'/'
    headers = {"Content-Type": "application/json"}

    ####### Parsing Inputs from user #######
    print("*********")
    print(input_)
    search_types = input_["searchType"]
    # Optional knobs fall back to sensible hybrid-search defaults.
    if("NormType" not in input_.keys()):
        norm_type = "min_max"
    else:
        norm_type = input_["NormType"]
    if("CombineType" not in input_.keys()):
        combine_type = "arithmetic_mean"
    else:
        combine_type = input_["CombineType"]
    if("weight" not in input_.keys()):
        semantic_weight = 0.5
    else:
        semantic_weight = input_["weight"]
    query = input_["text"]
    img = input_["image"]
    # NOTE(review): `sparse` and `semantic_weight` are parsed but not used below.
    if("sparse" not in input_.keys()):
        sparse = "disabled"
    else:
        sparse = input_["sparse"]
    k_ = input_["K"]
    image_upload = input_["imageUpload"]
    num_queries = len(search_types)  # NOTE(review): unused
    # Collect the non-zero per-search-type weights (UI gives percentages);
    # order must match the order sub-queries are appended below.
    weights = []
    searches = ['Keyword','Vector','Multimodal','NeuralSparse']
    for i in searches:
        weight = input_['weightage'][i+'-weight']/100
        if(weight>0.0):
            weights.append(weight)

    ######## Updating hybrid Search pipeline #######
    print("Updating Search pipeline with new weights")
    s_pipeline_payload = {"version": 1234}
    s_pipeline_payload["phase_results_processors"] = [
        {
            "normalization-processor": {
                "normalization": {
                    "technique": norm_type
                },
                "combination": {
                    "technique": combine_type,
                    "parameters": {
                        "weights": weights
                    }
                }
            }
        }
    ]
    # Only PUT the new weights if the pipeline already exists on the domain.
    opensearch_search_pipeline = (requests.get(host+'_search/pipeline/hybrid_search_pipeline', auth=awsauth,headers=headers)).text
    if(opensearch_search_pipeline!='{}'):
        path = "_search/pipeline/hybrid_search_pipeline"
        url = host + path
        r = requests.put(url, auth=awsauth, json=s_pipeline_payload, headers=headers)
        print("Hybrid Search Pipeline updated: "+str(r.status_code))

    ######## Combining hybrid+rerank pipeline #######
    # NOTE(review): fetched but never used below.
    opensearch_rerank_pipeline = (requests.get(host+'_search/pipeline/rerank_pipeline', auth=awsauth,headers=headers)).text

    ######## start of Applying LLM filters #######
    # When the LLM query-rewriter produced a filter, lift its bool/must
    # clauses into `filter_` for reuse by the vector/multimodal/sparse arms.
    if(st.session_state.input_rewritten_query!=""):
        filter_ = {"filter": {
            "bool": {
                "must": []}}}
        filter_['filter']['bool']['must'] = st.session_state.input_rewritten_query['query']['bool']['must']
    ######## end of Applying LLM filters #######

    ######### Create the queries for hybrid search #########
    path = "demostore-search-index/_search"
    url = host + path
    hybrid_payload = {
        "_source": {
            "exclude": [
                "product_description_vector","product_multimodal_vector","product_image"
            ]
        },
        "query": {
            "hybrid": {
                "queries": [
                    #1. keyword query
                    #2. vector search query
                    #3. multimodal query
                    #4. Sparse query
                ]
            }
        },"size":k_,
        "highlight": {
            "fields": {
                "product_description": {}
            }
        }}

    # --- 1. Keyword (BM25 match) sub-query -------------------------------
    if('Keyword Search' in search_types):
        keyword_payload = {
            "match": {
                "product_description": {
                    "query": query
                }
            }
        }
        # LLM-rewritten query replaces the plain match outright.
        if(st.session_state.input_rewritten_query !=""):
            keyword_payload = st.session_state.input_rewritten_query['query']
        # Manual UI filters: rebuild as bool filter + must(match).
        # NOTE(review): assumes manual filters and the rewritten query are
        # mutually exclusive — `del keyword_payload['match']` would raise
        # KeyError if both were active.
        if(st.session_state.input_manual_filter == "True"):
            keyword_payload['bool']={'filter':[]}
            if(st.session_state.input_category!=None):
                keyword_payload['bool']['filter'].append({"term": {"category": st.session_state.input_category}})
            if(st.session_state.input_gender!=None):
                keyword_payload['bool']['filter'].append({"term": {"gender_affinity": st.session_state.input_gender}})
            if(st.session_state.input_price!=(0,0)):
                keyword_payload['bool']['filter'].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}})
            keyword_payload['bool']['must'] = [{
                "match": {
                    "product_description": {
                        "query": query
                    }
                }
            }]
            del keyword_payload['match']
        hybrid_payload["query"]["hybrid"]["queries"].append(keyword_payload)

    # --- 2. Dense-vector sub-query ---------------------------------------
    if('Vector Search' in search_types):
        if(st.session_state.input_mvector_rerank):
            # ColBERT-style path: client-side embedding + raw knn query
            # against the token-vector index (reranked later via cb.search).
            query_vector = cb.vectorise(query,False)
            vector_field = "description_vector"
            vector_payload = {"knn": {}}
            vector_payload["knn"][vector_field]= {
                "vector":query_vector,
                "k": k_
            }
            # path3 = "_plugins/_ml/models/"+BEDROCK_TEXT_MODEL_ID+"/_predict"
            # url3 = host+path3
            # payload3 = {
            #     "parameters": {
            #         "inputText": query
            #     }
            # }
            # r3 = requests.post(url3, auth=awsauth, json=payload3, headers=headers)
            # vector_ = json.loads(r3.text)
            # #print(r3.text)
            # query_vector = vector_['inference_results'][0]['output'][0]['data']
            # #print(query_vector)
            # vector_payload = {
            #     "knn": {
            #         "product_description_vector": {
            #             "vector":query_vector,
            #             #"query_text": query,
            #             #"model_id": BEDROCK_TEXT_MODEL_ID,
            #             "k": k_
            #         }
            #     }
            # }
        #using neural query
        else:
            vector_payload = {
                "neural": {
                    "product_description_vector": {
                        "query_text": query,
                        "model_id": BEDROCK_TEXT_MODEL_ID,
                        "k": k_
                    }
                }
            }
            ###### start of efficient filter applying #####
            # NOTE(review): filters reference vector_payload['neural'], so this
            # block can only apply to the neural form above — the knn/ColBERT
            # branch has no filter support; indentation reconstructed accordingly.
            if(st.session_state.input_rewritten_query!=""):
                vector_payload['neural']['product_description_vector']['filter'] = filter_['filter']
            if(st.session_state.input_manual_filter == "True"):
                vector_payload['neural']['product_description_vector']['filter'] = {"bool":{"must":[]}}
                if(st.session_state.input_category!=None):
                    vector_payload['neural']['product_description_vector']['filter']["bool"]["must"].append({"term": {"category": st.session_state.input_category}})
                if(st.session_state.input_gender!=None):
                    vector_payload['neural']['product_description_vector']['filter']["bool"]["must"].append({"term": {"gender_affinity": st.session_state.input_gender}})
                if(st.session_state.input_price!=(0,0)):
                    vector_payload['neural']['product_description_vector']['filter']["bool"]["must"].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}})
            ###### end of efficient filter applying #####
        hybrid_payload["query"]["hybrid"]["queries"].append(vector_payload)

    # --- 3. Multimodal (text and/or image) sub-query ----------------------
    if('Multimodal Search' in search_types):
        multimodal_payload = {
            "neural": {
                "product_multimodal_vector": {
                    "model_id": BEDROCK_MULTIMODAL_MODEL_ID,
                    "k": k_
                }
            }
        }
        # Attach image and/or text depending on what the user supplied.
        if(image_upload == 'yes' and query == ""):
            multimodal_payload["neural"]["product_multimodal_vector"]["query_image"] = img
        if(image_upload == 'no' and query != ""):
            multimodal_payload["neural"]["product_multimodal_vector"]["query_text"] = query
        if(image_upload == 'yes' and query != ""):
            multimodal_payload["neural"]["product_multimodal_vector"]["query_image"] = img
            multimodal_payload["neural"]["product_multimodal_vector"]["query_text"] = query
        ###### start of efficient filter applying #####
        if(st.session_state.input_rewritten_query!=""):
            multimodal_payload['neural']['product_multimodal_vector']['filter'] = filter_['filter']
        if(st.session_state.input_manual_filter == "True"):
            multimodal_payload['neural']['product_multimodal_vector']['filter'] = {"bool":{"must":[]}}
            if(st.session_state.input_category!=None):
                multimodal_payload['neural']['product_multimodal_vector']['filter']["bool"]["must"].append({"term": {"category": st.session_state.input_category}})
            if(st.session_state.input_gender!=None):
                multimodal_payload['neural']['product_multimodal_vector']['filter']["bool"]["must"].append({"term": {"gender_affinity": st.session_state.input_gender}})
            if(st.session_state.input_price!=(0,0)):
                multimodal_payload['neural']['product_multimodal_vector']['filter']["bool"]["must"].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}})
        # print("vector_payload**************")
        # print(vector_payload)
        ###### end of efficient filter applying #####
        hybrid_payload["query"]["hybrid"]["queries"].append(multimodal_payload)

    # --- 4. Neural-sparse sub-query (rank_feature expansion) --------------
    if('NeuralSparse Search' in search_types):
        # Expand the query text into weighted tokens via the sparse model.
        path2 = "_plugins/_ml/models/"+SAGEMAKER_SPARSE_MODEL_ID+"/_predict"
        url2 = host+path2
        payload2 = {
            "parameters": {
                "inputs": query
            }
        }
        r2 = requests.post(url2, auth=awsauth, json=payload2, headers=headers)
        sparse_ = json.loads(r2.text)
        query_sparse = sparse_["inference_results"][0]["output"][0]["dataAsMap"]["response"][0]
        # Sort tokens by weight (desc) so the threshold cut below can break early.
        query_sparse_sorted = {key: value for key,
                               value in sorted(query_sparse.items(),
                                               key=lambda item: item[1],reverse=True)}
        print("text expansion is enabled")
        #print(query_sparse_sorted)
        # Keep only tokens above the UI threshold; each becomes a boosted
        # rank_feature clause against the stored sparse vector fields.
        query_sparse_sorted_filtered = {}
        rank_features = []
        for key_ in query_sparse_sorted.keys():
            if(query_sparse_sorted[key_]>=st.session_state.input_sparse_filter):
                feature = {"rank_feature": {"field": "product_description_sparse_vector."+key_,"boost":query_sparse_sorted[key_]}}
                rank_features.append(feature)
                query_sparse_sorted_filtered[key_]=query_sparse_sorted[key_]
            else:
                break  # sorted desc: everything after is below threshold
        #print(query_sparse_sorted_filtered)
        sparse_payload = {"bool":{"should":rank_features}}
        ###### start of efficient filter applying #####
        if(st.session_state.input_rewritten_query!=""):
            sparse_payload['bool']['must'] = filter_['filter']['bool']['must']
        if(st.session_state.input_manual_filter == "True"):
            sparse_payload['bool']['filter']=[]
            if(st.session_state.input_category!=None):
                sparse_payload['bool']['filter'].append({"term": {"category": st.session_state.input_category}})
            if(st.session_state.input_gender!=None):
                sparse_payload['bool']['filter'].append({"term": {"gender_affinity": st.session_state.input_gender}})
            if(st.session_state.input_price!=(0,0)):
                sparse_payload['bool']['filter'].append({"range": {"price": {"gte": st.session_state.input_price[0],"lte": st.session_state.input_price[1] }}})
        ###### end of efficient filter applying #####
        # sparse_payload = {
        #     "neural_sparse":
        #     {
        #         "desc_embedding_sparse":
        #         {
        #             "query_text": query,
        #             "model_id": SAGEMAKER_SPARSE_MODEL_ID,
        #             #"max_token_score": 2
        #         }
        #     }
        # }
        hybrid_payload["query"]["hybrid"]["queries"].append(sparse_payload)

    # --- Execute ----------------------------------------------------------
    docs = []
    if(st.session_state.input_sql_query!=""):
        # NOTE(review): this SQL branch overwrites `url` (used by the plain
        # single-query and RRF branches below) and its response `r` is
        # discarded — looks like an incomplete feature; confirm intent.
        url = host +"_plugins/_sql?format=json"
        payload = {"query":st.session_state.input_sql_query}
        r = requests.post(url, auth=awsauth, json=payload, headers=headers)

    if(len(hybrid_payload["query"]["hybrid"]["queries"])==1):
        # Single search type selected: no hybrid pipeline needed.
        if(st.session_state.input_mvector_rerank and 'Vector Search' in search_types):
            # ColBERT path: query the token-vector index, then rerank client-side.
            path = "retail-search-colbert-description/_search"
            url = host + path
            r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
            response_ = json.loads(r.text)
            docs = response_['hits']['hits']
            docs = cb.search(docs)
        else:
            # Unwrap the lone sub-query into a plain top-level query.
            single_query = hybrid_payload["query"]["hybrid"]["queries"][0]
            del hybrid_payload["query"]["hybrid"]
            hybrid_payload["query"] = single_query
            if(st.session_state.re_ranker == 'true' and st.session_state.input_reranker == 'Cohere Rerank'):
                path = "demostore-search-index/_search?search_pipeline=rerank_pipeline"
                url = host + path
                hybrid_payload["ext"] = {"rerank": {
                    "query_context": {
                        "query_text": query
                    }
                }}
            r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
            response_ = json.loads(r.text)
            docs = response_['hits']['hits']
    else:
        if( st.session_state.input_hybridType == "OpenSearch Hybrid Query"):
            # Server-side score combination via the search pipeline
            # (weights were PUT above); optional Cohere rerank variant.
            url_ = url + "?search_pipeline=hybrid_search_pipeline"
            if(st.session_state.re_ranker == 'true' and st.session_state.input_reranker == 'Cohere Rerank'):
                url_ = url + "?search_pipeline=hybrid_rerank_pipeline"
                hybrid_payload["ext"] = {"rerank": {
                    "query_context": {
                        "query_text": query
                    }
                }}
            r = requests.get(url_, auth=awsauth, json=hybrid_payload, headers=headers)
            response_ = json.loads(r.text)
            docs = response_['hits']['hits']
        else:
            # Client-side weighted Reciprocal Rank Fusion: run each
            # sub-query separately, then combine scores as
            # sum(weight * 1/rank) per document id.
            all_docs = []
            all_docs_ids = []
            only_hits = []
            rrf_hits = []  # NOTE(review): unused; rrf_hits_unsorted is used instead
            # NOTE(review): this loop shadows the outer `query` (text) variable.
            for i,query in enumerate(hybrid_payload["query"]["hybrid"]["queries"]):
                payload_ = {'_source':
                            {'exclude': ['desc_embedding_bedrock-multimodal', 'desc_embedding_bedrock-text', 'product_description_sparse_vector']},
                            'query': query,
                            'size': k_, 'highlight': {'fields': {'product_description': {}}}}
                r_ = requests.get(url, auth=awsauth, json=payload_, headers=headers)
                resp = json.loads(r_.text)
                all_docs.append({"search":list(query.keys())[0],"results":resp['hits']['hits'],"weight":weights[i]})
                only_hits.append(resp['hits']['hits'])
                for hit in resp['hits']['hits']:
                    all_docs_ids.append(hit["_id"])
            id_scores = []
            rrf_hits_unsorted = []
            for id in all_docs_ids:
                score = 0.0
                for result_set in all_docs:
                    # NOTE(review): substring test against the JSON dump is a
                    # cheap pre-filter but can false-positive when one _id is a
                    # prefix of another; the inner exact _id check guards scoring.
                    if id in json.dumps(result_set['results']):
                        for n,res in enumerate(result_set['results']):
                            if(res["_id"] == id):
                                score += result_set["weight"] * (1.0 / (n+1))
                id_scores.append({"id":id,"score":score})
                # Write the fused score back onto every hit copy for this id.
                for only_hit in only_hits:
                    for i_ in only_hit:
                        if(id == i_["_id"]):
                            i_["_score"] = score
                            rrf_hits_unsorted.append(i_)
            docs = sorted(rrf_hits_unsorted, key=lambda x: x['_score'],reverse=True)

    # --- Shape results for the UI, de-duplicating by image_url ------------
    arr = []
    dup = []
    for doc in docs:
        if(doc['_source']['image_url'] not in dup):
            res_ = {
                "desc":doc['_source']['product_description'],
                "caption":doc['_source']['caption'],
                "image_url":doc['_source']['image_url'],
                "category":doc['_source']['category'],
                "price":doc['_source']['price'],
                "gender_affinity":doc['_source']['gender_affinity'],
                "style":doc['_source']['style'],
            }
            # ColBERT token-match details, when the client-side reranker ran.
            if('max_score_dict_list_sorted' in doc):
                res_['max_score_dict_list_sorted'] = doc['max_score_dict_list_sorted']
            if('highlight' in doc):
                res_['highlight'] = doc['highlight']['product_description']
            if('NeuralSparse Search' in search_types):
                res_['sparse'] = doc['_source']['product_description_sparse_vector']
                res_['query_sparse'] = query_sparse_sorted_filtered
            # if(st.session_state.input_rekog_label !="" or st.session_state.input_is_rewrite_query == 'enabled'):
            #     res_['rekog'] = {'color':doc['_source']['rekog_color'],'category': doc['_source']['rekog_categories'],'objects':doc['_source']['rekog_objects']}
            res_['id'] = doc['_id']
            res_['score'] = doc['_score']
            res_['title'] = doc['_source']['product_description']
            arr.append(res_)
            dup.append(doc['_source']['image_url'])
    return arr[0:k_]