OpenSearch-AI

Running on T4

App Files Files Community

OpenSearch-AI / RAG /rag_DocumentSearcher.py

prasadnu

change ksize in RAG

5195f8b 8 days ago

raw

history blame contribute delete

11.6 kB

	import boto3
	import json
	import os
	import streamlit as st
	import base64
	import re
	import requests
	#import utilities.re_ranker as re_ranker
	import utilities.invoke_models as invoke_models
	#import langchain
	# HTTP headers sent with every OpenSearch request issued from this module.
	headers = {"Content-Type": "application/json"}
	# Base URL of the OpenSearch domain; request paths are appended directly to it.
	host = "https://search-opensearchservi-shjckef2t7wo-iyv6rajdgxg6jas25aupuxev6i.us-west-2.es.amazonaws.com/"

	# Directory one level above the one containing this file: take this file's
	# directory and drop its last "/"-separated component (empty string when
	# that directory contains no "/").
	parent_dirname = os.path.dirname(__file__).rpartition("/")[0]

	# Search-type labels that query_ knows how to turn into an OpenSearch clause.
	_SEARCH_TYPE_NAMES = ('Keyword Search', 'Vector Search', 'Sparse Search')
	# Number of text hits requested from the index and used to build the context.
	_RESULT_SIZE = 5


	def _search_images(awsauth, question, k=1):
	    """Return the top-*k* image hits for *question* from the ``<index>_mm`` index."""
	    embedding = invoke_models.invoke_model_mm(question, "none")
	    query_mm = {
	        "size": k,
	        "_source": {
	            "exclude": [
	                "processed_element_embedding_bedrock-multimodal",
	                "processed_element_embedding_sparse",
	                "image_encoding",
	                "processed_element_embedding",
	            ]
	        },
	        "query": {
	            "knn": {
	                "processed_element_embedding_bedrock-multimodal": {
	                    "vector": embedding,
	                    "k": k,
	                }
	            }
	        },
	    }
	    url = host + st.session_state.input_index + "_mm/_search"
	    r = requests.get(url, auth=awsauth, json=query_mm, headers=headers)
	    hits = json.loads(r.text)['hits']['hits']
	    return [
	        {'file': hit['_source']['image'], 'caption': hit['_source']['processed_element']}
	        for hit in hits
	    ]


	def _build_text_queries(question, search_types):
	    """Build one query clause per recognised label present in *search_types*."""
	    queries = []
	    if 'Keyword Search' in search_types:
	        queries.append({"match": {"processed_element": {"query": question}}})
	    if 'Vector Search' in search_types:
	        embedding = invoke_models.invoke_model(question)
	        queries.append({
	            "knn": {
	                "processed_element_embedding": {
	                    "vector": embedding,
	                    "k": 2,
	                }
	            }
	        })
	    if 'Sparse Search' in search_types:
	        queries.append({
	            "neural_sparse": {
	                "processed_element_embedding_sparse": {
	                    "query_text": question,
	                    "model_id": "fkol-ZMBTp0efWqBcO2P",
	                }
	            }
	        })
	    return queries


	def _equal_weights(count):
	    """Return *count* weights: equal two-decimal shares, the last taking the remainder."""
	    share = int(100 / count) / 100
	    weights = [share] * (count - 1)
	    weights.append(1 - sum(weights))
	    return weights


	def _hybrid_pipeline_payload(num_queries):
	    """Build the body of the ``rag-search-pipeline`` for *num_queries* sub-queries."""
	    payload = {}
	    if st.session_state.input_is_rerank:
	        payload["response_processors"] = [
	            {
	                "rerank": {
	                    "ml_opensearch": {
	                        "model_id": "deBS3pYB5VHEj-qVuPHT"
	                    },
	                    "context": {
	                        "document_fields": [
	                            "processed_element"
	                        ]
	                    }
	                }
	            }
	        ]
	    payload["phase_results_processors"] = [
	        {
	            "normalization-processor": {
	                "normalization": {
	                    "technique": "min_max"
	                },
	                "combination": {
	                    "technique": "arithmetic_mean",
	                    "parameters": {
	                        # One weight per sub-query actually sent.
	                        "weights": _equal_weights(num_queries)
	                    }
	                }
	            }
	        }
	    ]
	    return payload


	def _search_documents(awsauth, question, queries):
	    """Run *queries* against the selected index and return the raw hit list.

	    Several queries are wrapped in a ``hybrid`` query and routed through the
	    ``rag-search-pipeline`` (created/overwritten first); a single query is
	    sent as-is, through ``rerank_pipeline_rag`` when re-ranking is enabled.
	    """
	    index_name = st.session_state.input_index
	    rerank = st.session_state.input_is_rerank

	    if len(queries) > 1:
	        # The PUT response is not inspected, as in the original code.
	        requests.put(
	            host + "_search/pipeline/rag-search-pipeline",
	            auth=awsauth,
	            json=_hybrid_pipeline_payload(len(queries)),
	            headers=headers,
	        )
	        path = index_name + "/_search?search_pipeline=rag-search-pipeline"
	        query = {"hybrid": {"queries": queries}}
	    else:
	        if rerank:
	            path = index_name + "/_search?search_pipeline=rerank_pipeline_rag"
	        else:
	            path = index_name + "/_search"
	        query = queries[0]

	    payload = {
	        "_source": {
	            "exclude": [
	                "processed_element_embedding", "processed_element_embedding_sparse"
	            ]
	        },
	        "query": query,
	        "size": _RESULT_SIZE,
	    }
	    if rerank:
	        # The rerank processor needs the query text passed alongside the search.
	        payload["ext"] = {"rerank": {
	            "query_context": {
	                "query_text": question
	            }
	        }}

	    r = requests.get(host + path, auth=awsauth, json=payload, headers=headers)
	    return json.loads(r.text)['hits']['hits']


	def _collect_context(hits, question):
	    """Turn search hits into ``(contexts, images, tables)``.

	    Table contexts are placed before all other contexts in the returned list.
	    """
	    context = []
	    context_tables = []
	    images = []
	    tables = []
	    for hit in hits:
	        source = hit["_source"]
	        if source["raw_element_type"] == 'table':
	            table_res = invoke_models.read_from_table(source["table"], question)
	            tables.append({'name': source["table"], 'text': source["processed_element"]})
	            context_tables.append(table_res + "\n\n" + source["processed_element"])
	        elif source["image"] != "None":
	            # Image hit: caption the resized figure stored on disk with the LLM.
	            figure_path = (
	                parent_dirname + '/figures/' + st.session_state.input_index + "/"
	                + source["raw_element_type"].split("_")[1].replace(".jpg", "")
	                + "-resized.jpg"
	            )
	            with open(figure_path, "rb") as read_img:
	                input_encoded = base64.b64encode(read_img.read()).decode("utf8")
	            context.append(invoke_models.generate_image_captions_llm(input_encoded, question))
	        else:
	            context.append(source["processed_element"])

	        if source["image"] != "None":
	            images.append({'file': source["image"], 'caption': source["processed_element"]})
	    return context_tables + context, images, tables


	def query_(awsauth, inputs, session_id, search_types):
	    """Answer ``inputs['query']`` with retrieval-augmented generation.

	    Runs a multimodal image search, then a keyword / vector / sparse (or
	    hybrid) text search on the index named by ``st.session_state.input_index``,
	    builds a context from the hits and asks the LLM for an answer. Only the
	    first context entry (tables first) is placed in the prompt.

	    Args:
	        awsauth: Auth object passed to ``requests`` for every OpenSearch call.
	        inputs: Mapping holding the user question under the ``'query'`` key.
	        session_id: Not used by this function; kept for caller compatibility.
	        search_types: Collection of labels among 'Keyword Search',
	            'Vector Search' and 'Sparse Search'.

	    Returns:
	        dict with keys ``'text'`` (LLM output), ``'source'`` (all contexts),
	        ``'image'`` (images from the text hits, or from the multimodal search
	        when the text hits carry none) and ``'table'`` (table references).

	    Raises:
	        ValueError: if ``search_types`` contains none of the known labels
	            (previously a ZeroDivisionError, or a failed empty hybrid query).
	    """
	    print("using index: "+st.session_state.input_index)

	    question = inputs['query']

	    # Fail fast, before any model or network call, when nothing can be searched.
	    if not any(name in search_types for name in _SEARCH_TYPE_NAMES):
	        raise ValueError(
	            "search_types must contain at least one of: "
	            + ", ".join(_SEARCH_TYPE_NAMES)
	        )

	    images = _search_images(awsauth, question)

	    queries = _build_text_queries(question, search_types)
	    hits = _search_documents(awsauth, question, queries)

	    total_context, images_2, df = _collect_context(hits[0:_RESULT_SIZE], question)

	    prompt_template = """
	The following is a friendly conversation between a human and an AI.
	The AI is talkative and provides lots of specific details from its context.
	{context}
	Instruction: Based on the above documents, provide a detailed answer for, {question}. Answer "don't know",
	if not present in the context.
	Solution:"""

	    # A search with no hits yields no context; use an empty one instead of
	    # raising IndexError so the prompt's "don't know" instruction applies.
	    top_context = total_context[0] if total_context else ""
	    llm_prompt = prompt_template.format(context=top_context, question=question)
	    output = invoke_models.invoke_llm_model( "\n\nHuman: {input}\n\nAssistant:".format(input=llm_prompt) ,False)

	    # Fall back to the multimodal image hits when the text hits have no image.
	    if not images_2:
	        images_2 = images
	    return {'text':output,'source':total_context,'image':images_2,'table':df}