OpenSearch-AI

Running on T4

File size: 13,040 Bytes

2e2dda5
 
 
 
 
 
 
6f4ec47
2e2dda5
 
 
f05a8a8
2e2dda5
 
59c4f4e
2e2dda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66dfc70
 
 
 
 
 
 
 
 
 
 
 
 
 
2e2dda5
66dfc70
 
 
 
 
 
 
 
2e2dda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cde73bf
 
 
 
 
 
 
2c198a6
cde73bf
 
 
 
 
 
 
 
 
 
2e2dda5
 
 
 
 
 
 
 
 
cde73bf
 
 
 
 
 
 
 
 
 
 
 
2e2dda5
 
cde73bf
 
2e2dda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2c6e99
66dfc70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e2dda5
 
 
 
 
 
 
 
c2c6e99
2e2dda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c77dc87
cde73bf
 
 
2e2dda5
97ca84a
c77dc87
 
 
2e2dda5
 
 
 
 
c77dc87
 
 
 
 
 
2e2dda5
 
bbe0e25
2e2dda5
 
 
c77dc87
 
 
 
 
 
2e2dda5
 
 
 
 
1f43c77
 
 
 
 
 
 
 
 
 
 
 
2e2dda5
1f43c77
 
2e2dda5
1f43c77
2e2dda5
1f43c77
2e2dda5
 
1f43c77
2e2dda5
1f43c77
 
 
 
 
 
 
2e2dda5
 
1f43c77
 
2e2dda5
1f43c77
 
 
 
 
2e2dda5
 
1f43c77
2e2dda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddab07b
2e2dda5
 
d8df773
 
 
 
 
8a1e73f
2e2dda5
 
d8df773
 
 
8a1e73f
d8df773
8a1e73f
2e2dda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddab07b
2e2dda5

import boto3
import json
import os
import streamlit as st
import base64
import re
import requests 
#import utilities.re_ranker as re_ranker
import utilities.invoke_models as invoke_models
#import langchain
# HTTP headers passed to every `requests` call below (JSON request bodies).
headers = {"Content-Type": "application/json"}
# Base URL of the search endpoint. It ends with "/" because index names and
# API paths are appended to it by plain string concatenation.
# NOTE(review): hard-coded endpoint — consider moving it to configuration.
host = "https://search-opensearchservi-shjckef2t7wo-iyv6rajdgxg6jas25aupuxev6i.us-west-2.es.amazonaws.com/"

# Directory of this file with its last "/"-separated component removed
# (i.e. the parent of the module's directory); used as the root for the
# "figures" folder that holds the resized images.
parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])

def _combination_weights(num_queries):
    """Return the score-combination weights for `num_queries` hybrid sub-queries.

    Every sub-query except the last gets an equal share truncated to two
    decimals; the last one takes the remainder so the weights add up to 1.
    """
    equal_weight = int(100 / num_queries) / 100
    weights = [equal_weight] * (num_queries - 1)
    weights.append(1 - sum(weights))
    return weights


def _build_sub_queries(question, search_types):
    """Build one query clause per recognised entry in `search_types`.

    Recognised names are 'Keyword Search', 'Vector Search' and
    'Sparse Search'; anything else is ignored. Clauses are returned in that
    fixed order.
    """
    sub_queries = []

    if 'Keyword Search' in search_types:
        sub_queries.append({
            "match": {
                "processed_element": {
                    "query": question
                }
            }
        })

    if 'Vector Search' in search_types:
        embedding = invoke_models.invoke_model(question)
        # Exact k-NN through a scoring script. An approximate "knn" query on
        # the same field is the alternative if exact scoring becomes too slow.
        sub_queries.append({
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "processed_element_embedding",
                        "query_value": embedding,
                        "space_type": "cosinesimil"
                    }
                }
            }
        })

    if 'Sparse Search' in search_types:
        sub_queries.append({
            "neural_sparse": {
                "processed_element_embedding_sparse": {
                    "query_text": question,
                    "model_id": "fkol-ZMBTp0efWqBcO2P"
                }
            }
        })

    return sub_queries


def _search_hits(awsauth, path, payload):
    """Run a search request against `host + path` and return its hit list.

    Raises `requests.HTTPError` on a 4xx/5xx response instead of letting the
    failure surface later as a `KeyError` on the missing 'hits' key.
    """
    r = requests.get(host + path, auth=awsauth, json=payload, headers=headers)
    r.raise_for_status()
    return json.loads(r.text)['hits']['hits']


def query_(awsauth,inputs, session_id,search_types):
    """Answer a question with retrieval-augmented generation.

    Steps:
      1. Search the "<index>_mm" index with a multimodal embedding of the
         question to get one fallback image.
      2. Search the main index with the selected search types (combined into
         a hybrid query when more than one is selected, optionally reranked).
      3. Turn the top hits into context lines (tables first), build the
         prompt from the first three lines and call the LLM.

    The index name and the rerank switch are read from
    `st.session_state.input_index` and `st.session_state.input_is_rerank`.

    Args:
        awsauth: Auth object passed as `auth=` to every `requests` call.
        inputs: Mapping holding the user question under the key 'query'.
        session_id: Unused here; kept for interface compatibility.
        search_types: Collection of search type names; recognised values are
            'Keyword Search', 'Vector Search' and 'Sparse Search'.

    Returns:
        dict with keys 'text' (LLM answer), 'source' (all context lines),
        'image' (images from the hits, or the multimodal fallback when the
        hits carry none) and 'table' (name/text of every table hit).

    Raises:
        ValueError: if `search_types` contains no recognised search type.
        requests.HTTPError: if a search request returns an error status.
    """
    index_name = st.session_state.input_index
    use_rerank = st.session_state.input_is_rerank

    print("using index: "+index_name)

    question = inputs['query']

    # Build the sub-queries first so an empty/unknown selection fails fast,
    # before any search request is sent.
    sub_queries = _build_sub_queries(question, search_types)
    if not sub_queries:
        raise ValueError(
            "search_types must contain at least one of 'Keyword Search', "
            "'Vector Search', 'Sparse Search'; got: " + repr(search_types)
        )

    ####### MULTIMODAL (image) SEARCH ########
    k=1
    mm_embedding = invoke_models.invoke_model_mm(question,"none")
    query_mm = {
        "size": k,
        "_source": {
            "exclude": [
                "processed_element_embedding_bedrock-multimodal","processed_element_embedding_sparse","image_encoding","processed_element_embedding"
            ]
        },
        # Exact k-NN search (an approximate "knn" query is the alternative).
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "processed_element_embedding_bedrock-multimodal",
                        "query_value": mm_embedding,
                        "space_type": "cosinesimil"
                    }
                }
            }
        }
    }
    images = [
        {'file':hit['_source']['image'],'caption':hit['_source']['processed_element']}
        for hit in _search_hits(awsauth, index_name+"_mm/_search", query_mm)
    ]

    ####### SEARCH ########
    SIZE = 5
    if len(sub_queries) > 1:
        # The pipeline is (re)created on every call because its weights and
        # the optional rerank step depend on the current selection. Weights
        # are derived from the sub-queries actually built so both lists
        # always have the same length.
        s_pipeline_payload = {
            "phase_results_processors": [
                {
                    "normalization-processor": {
                        "normalization": {
                            "technique": "min_max"
                        },
                        "combination": {
                            "technique": "arithmetic_mean",
                            "parameters": {
                                "weights": _combination_weights(len(sub_queries))
                            }
                        }
                    }
                }
            ]
        }
        if use_rerank:
            s_pipeline_payload["response_processors"] = [
                {
                    "rerank": {
                        "ml_opensearch": {
                            "model_id": "deBS3pYB5VHEj-qVuPHT"
                        },
                        "context": {
                            "document_fields": [
                                "processed_element"
                            ]
                        }
                    }
                }
            ]
        r = requests.put(host + "_search/pipeline/rag-search-pipeline", auth=awsauth, json=s_pipeline_payload, headers=headers)
        if not r.ok:
            # Best effort: a previously stored pipeline may still exist, so
            # report the failure and let the search itself decide.
            print("failed to update rag-search-pipeline: "+str(r.status_code)+" "+r.text)
        path = index_name+"/_search?search_pipeline=rag-search-pipeline"
        search_query = {"hybrid": {"queries": sub_queries}}
    else:
        # A single clause is sent as a plain (non-hybrid) query.
        if use_rerank:
            path = index_name+"/_search?search_pipeline=rerank_pipeline_rag"
        else:
            path = index_name+"/_search"
        search_query = sub_queries[0]

    search_payload = {
        "_source": {
            "exclude": [
                "processed_element_embedding","processed_element_embedding_sparse"
            ]
        },
        "query": search_query,
        "size": SIZE,
    }
    if use_rerank:
        search_payload["ext"] = {
            "rerank": {
                "query_context": {
                    "query_text": question
                }
            }
        }
    hits = _search_hits(awsauth, path, search_payload)

    ########################### LLM Generation ########################
    prompt_template = """
        The following is a friendly conversation between a human and an AI. 
        The AI is talkative and provides lots of specific details from its context.
        {context}
        Instruction: Based on the above documents, provide a detailed answer for, {question}. Answer "don't know", 
        if not present in the context. 
        Solution:"""

    context = []
    context_tables = []
    images_2 = []
    df = []
    for rank, hit in enumerate(hits[:SIZE], start=1):
        source = hit["_source"]
        has_image = source["image"]!="None"

        if source["raw_element_type"] == 'table':
            # For complex analytical questions invoke_models.read_from_table
            # could be used on source["table"] instead of the raw text.
            df.append({'name':source["table"],'text':source["processed_element"]})
            context_tables.append(str(rank) + " : Reference from a table :" + source["processed_element"])
        elif has_image:
            # NOTE(review): assumes raw_element_type looks like
            # "<prefix>_<name>.jpg" for image hits — confirm against the indexer.
            image_name = source["raw_element_type"].split("_")[1].replace(".jpg","")
            with open(parent_dirname+'/figures/'+index_name+"/"+image_name+"-resized.jpg", "rb") as read_img:
                input_encoded = base64.b64encode(read_img.read()).decode("utf8")
            context.append(str(rank) + " : Reference from a image :" + invoke_models.generate_image_captions_llm(input_encoded,question))
        else:
            context.append(str(rank) + " : Reference from a text chunk :" + source["processed_element"])

        if has_image:
            images_2.append({'file':source["image"],'caption':source["processed_element"]})

    # Table references go first; only the first three lines reach the prompt,
    # while the full list is returned to the caller as 'source'.
    total_context = context_tables + context

    llm_prompt = prompt_template.format(context="\n".join(total_context[0:3]),question=question)
    output = invoke_models.invoke_llm_model( "\n\nHuman: {input}\n\nAssistant:".format(input=llm_prompt) ,False)
    if not images_2:
        # No hit carried an image: fall back to the multimodal search result.
        images_2 = images
    return {'text':output,'source':total_context,'image':images_2,'table':df}