prasadnu committed on
Commit
c2c6e99
·
1 Parent(s): 67aeb50

search pipeline updated

Browse files
RAG/rag_DocumentLoader.py CHANGED
@@ -16,7 +16,6 @@ from requests_aws4auth import AWS4Auth
16
  import re_ranker
17
  import utilities.invoke_models as invoke_models
18
  from requests.auth import HTTPBasicAuth
19
-
20
  import generate_csv_for_tables
21
  from pdf2image import convert_from_bytes,convert_from_path
22
  #import langchain
 
16
  import re_ranker
17
  import utilities.invoke_models as invoke_models
18
  from requests.auth import HTTPBasicAuth
 
19
  import generate_csv_for_tables
20
  from pdf2image import convert_from_bytes,convert_from_path
21
  #import langchain
RAG/rag_DocumentSearcher.py CHANGED
@@ -66,11 +66,6 @@ def query_(awsauth,inputs, session_id,search_types):
66
  images.append({'file':hit['_source']['image'],'caption':hit['_source']['processed_element']})
67
 
68
  ####### SEARCH ########
69
-
70
-
71
- path = "_search/pipeline/rag-search-pipeline"
72
- url = host + path
73
-
74
  num_queries = len(search_types)
75
 
76
  weights = []
@@ -89,7 +84,8 @@ def query_(awsauth,inputs, session_id,search_types):
89
 
90
  #print(weights)
91
 
92
-
 
93
  s_pipeline_payload = {
94
  "description": "Post processor for hybrid search",
95
  "phase_results_processors": [
@@ -110,10 +106,6 @@ def query_(awsauth,inputs, session_id,search_types):
110
  }
111
 
112
  r = requests.put(url, auth=awsauth, json=s_pipeline_payload, headers=headers)
113
- #print(r.status_code)
114
- #print(r.text)
115
-
116
-
117
 
118
  SIZE = 5
119
 
@@ -154,7 +146,7 @@ def query_(awsauth,inputs, session_id,search_types):
154
 
155
  if('Vector Search' in search_types):
156
 
157
- embedding = embedding = invoke_models.invoke_model(question)
158
 
159
  vector_payload = {
160
  "knn": {
@@ -172,7 +164,7 @@ def query_(awsauth,inputs, session_id,search_types):
172
  sparse_payload = { "neural_sparse": {
173
  "processed_element_embedding_sparse": {
174
  "query_text": question,
175
- "model_id": "srrJ-owBQhe1aB-khx2n"
176
  }
177
  }}
178
 
@@ -190,13 +182,6 @@ def query_(awsauth,inputs, session_id,search_types):
190
  # sparse_ = json.loads(r2.text)
191
  # query_sparse = sparse_["inference_results"][0]["output"][0]["dataAsMap"]["response"][0]
192
 
193
-
194
-
195
-
196
-
197
- # print("hybrid_payload")
198
- # print("---------------")
199
- #print(hybrid_payload)
200
  hits = []
201
  if(num_queries>1):
202
  path = st.session_state.input_index+"/_search?search_pipeline=rag-search-pipeline"
@@ -208,23 +193,16 @@ def query_(awsauth,inputs, session_id,search_types):
208
  del hybrid_payload["query"]["hybrid"]
209
  hybrid_payload["query"] = single_query
210
  r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
211
- #print(r.status_code)
212
  response_ = json.loads(r.text)
213
- #print("-------------------------------------------------------------------")
214
- #print(r.text)
215
  hits = response_['hits']['hits']
216
 
217
  else:
218
  r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
219
- #print(r.status_code)
220
  response_ = json.loads(r.text)
221
- #print("-------------------------------------------------------------------")
222
- #print(response_)
223
  hits = response_['hits']['hits']
224
 
225
  ##### GET reference tables separately like *_mm index search for images ######
226
  def lazy_get_table():
227
- #print("Forcing table analysis")
228
  table_ref = []
229
  any_table_exists = False
230
  for fname in os.listdir(parent_dirname+"/split_pdf_csv"):
@@ -251,7 +229,7 @@ def query_(awsauth,inputs, session_id,search_types):
251
  payload_tables = {"query":{"neural_sparse": {
252
  "processed_element_embedding_sparse": {
253
  "query_text": question,
254
- "model_id": "srrJ-owBQhe1aB-khx2n"
255
  }
256
  } } }
257
 
 
66
  images.append({'file':hit['_source']['image'],'caption':hit['_source']['processed_element']})
67
 
68
  ####### SEARCH ########
 
 
 
 
 
69
  num_queries = len(search_types)
70
 
71
  weights = []
 
84
 
85
  #print(weights)
86
 
87
+ path = "_search/pipeline/rag-search-pipeline"
88
+ url = host + path
89
  s_pipeline_payload = {
90
  "description": "Post processor for hybrid search",
91
  "phase_results_processors": [
 
106
  }
107
 
108
  r = requests.put(url, auth=awsauth, json=s_pipeline_payload, headers=headers)
 
 
 
 
109
 
110
  SIZE = 5
111
 
 
146
 
147
  if('Vector Search' in search_types):
148
 
149
+ embedding = invoke_models.invoke_model(question)
150
 
151
  vector_payload = {
152
  "knn": {
 
164
  sparse_payload = { "neural_sparse": {
165
  "processed_element_embedding_sparse": {
166
  "query_text": question,
167
+ "model_id": "fkol-ZMBTp0efWqBcO2P"
168
  }
169
  }}
170
 
 
182
  # sparse_ = json.loads(r2.text)
183
  # query_sparse = sparse_["inference_results"][0]["output"][0]["dataAsMap"]["response"][0]
184
 
 
 
 
 
 
 
 
185
  hits = []
186
  if(num_queries>1):
187
  path = st.session_state.input_index+"/_search?search_pipeline=rag-search-pipeline"
 
193
  del hybrid_payload["query"]["hybrid"]
194
  hybrid_payload["query"] = single_query
195
  r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
 
196
  response_ = json.loads(r.text)
 
 
197
  hits = response_['hits']['hits']
198
 
199
  else:
200
  r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
 
201
  response_ = json.loads(r.text)
 
 
202
  hits = response_['hits']['hits']
203
 
204
  ##### GET reference tables separately like *_mm index search for images ######
205
  def lazy_get_table():
 
206
  table_ref = []
207
  any_table_exists = False
208
  for fname in os.listdir(parent_dirname+"/split_pdf_csv"):
 
229
  payload_tables = {"query":{"neural_sparse": {
230
  "processed_element_embedding_sparse": {
231
  "query_text": question,
232
+ "model_id": "fkol-ZMBTp0efWqBcO2P"
233
  }
234
  } } }
235
 
pages/Multimodal_Conversational_Search.py CHANGED
@@ -145,11 +145,6 @@ if clear:
145
  st.session_state.questions_ = []
146
  st.session_state.answers_ = []
147
  st.session_state.input_query=""
148
- # st.session_state.input_searchType="Conversational Search (RAG)"
149
- # st.session_state.input_temperature = "0.001"
150
- # st.session_state.input_topK = 200
151
- # st.session_state.input_topP = 0.95
152
- # st.session_state.input_maxTokens = 1024
153
 
154
 
155
  def handle_input():
@@ -163,11 +158,6 @@ def handle_input():
163
  if key.startswith('input_'):
164
  inputs[key.removeprefix('input_')] = st.session_state[key]
165
  st.session_state.inputs_ = inputs
166
-
167
- #######
168
-
169
-
170
- #st.write(inputs)
171
  question_with_id = {
172
  'question': inputs["query"],
173
  'id': len(st.session_state.questions_)
@@ -175,7 +165,7 @@ def handle_input():
175
  st.session_state.questions_.append(question_with_id)
176
  if(st.session_state.input_is_colpali):
177
  out_ = colpali.colpali_search_rerank(st.session_state.input_query)
178
- #print(out_)
179
  else:
180
  out_ = rag_DocumentSearcher.query_(awsauth, inputs, st.session_state['session_id'],st.session_state.input_rag_searchType)
181
  st.session_state.answers_.append({
 
145
  st.session_state.questions_ = []
146
  st.session_state.answers_ = []
147
  st.session_state.input_query=""
 
 
 
 
 
148
 
149
 
150
  def handle_input():
 
158
  if key.startswith('input_'):
159
  inputs[key.removeprefix('input_')] = st.session_state[key]
160
  st.session_state.inputs_ = inputs
 
 
 
 
 
161
  question_with_id = {
162
  'question': inputs["query"],
163
  'id': len(st.session_state.questions_)
 
165
  st.session_state.questions_.append(question_with_id)
166
  if(st.session_state.input_is_colpali):
167
  out_ = colpali.colpali_search_rerank(st.session_state.input_query)
168
+
169
  else:
170
  out_ = rag_DocumentSearcher.query_(awsauth, inputs, st.session_state['session_id'],st.session_state.input_rag_searchType)
171
  st.session_state.answers_.append({
utilities/re_ranker.py CHANGED
@@ -6,16 +6,15 @@ import streamlit as st
6
  from sentence_transformers import CrossEncoder
7
 
8
  model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
9
- kendra_ranking = boto3.client("kendra-ranking",region_name = 'us-east-1')
10
-
11
-
12
- print("Create a rescore execution plan.")
13
 
14
  # Provide a name for the rescore execution plan
15
- name = "MyRescoreExecutionPlan"
16
  # Set your required additional capacity units
17
  # Don't set capacity units if you don't require more than 1 unit given by default
18
- capacity_units = 2
19
 
20
  # try:
21
  # rescore_execution_plan_response = kendra_ranking.create_rescore_execution_plan(
@@ -45,7 +44,7 @@ capacity_units = 2
45
  # print("%s" % e)
46
 
47
  # print("Program ends.")
48
-
49
 
50
 
51
  def re_rank(self_, rerank_type, search_type, question, answers):
@@ -78,52 +77,29 @@ def re_rank(self_, rerank_type, search_type, question, answers):
78
 
79
 
80
  re_ranked = [{}]
81
-
82
-
83
-
84
-
85
-
86
- if(rerank_type == 'Kendra Rescore'):
87
-
88
-
89
-
90
-
91
- rescore_response = kendra_ranking.rescore(
92
- RescoreExecutionPlanId = 'b2a4d4f3-98ff-4e17-8b69-4c61ed7d91eb',
93
- SearchQuery = query,
94
- Documents = ans
95
- )
96
-
97
-
98
- #[{'DocumentId': 'DocId1', 'Score': 2.0}, {'DocumentId': 'DocId2', 'Score': 1.0}]
99
-
100
 
101
- re_ranked[0]['answer']=[]
102
- for result in rescore_response["ResultItems"]:
103
-
104
- pos_ = ids.index(result['DocumentId'])
105
-
106
- re_ranked[0]['answer'].append(answers[0]['answer'][pos_])
107
- re_ranked[0]['search_type']=search_type,
108
- re_ranked[0]['id'] = len(question)
109
-
110
- #st.session_state.answers_none_rank = st.session_state.answers
111
- return re_ranked
112
-
113
-
114
- # if(rerank_type == 'None'):
115
-
116
- # st.session_state.answers = st.session_state.answers_none_rank
117
-
118
-
119
  if(rerank_type == 'Cross Encoder'):
120
 
121
  scores = model.predict(
122
  ques_ans
123
  )
124
 
125
- print("scores")
126
- print(scores)
127
  index__ = 0
128
  for i in ans:
129
  i['new_score'] = scores[index__]
@@ -148,9 +124,6 @@ def re_rank(self_, rerank_type, search_type, question, answers):
148
  return re_ranked
149
 
150
 
151
-
152
-
153
- #return st.session_state.answers
154
 
155
 
156
 
 
6
  from sentence_transformers import CrossEncoder
7
 
8
  model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
9
+ ####### Add this Kendra Rescore ranking
10
+ #kendra_ranking = boto3.client("kendra-ranking",region_name = 'us-east-1')
11
+ #print("Create a rescore execution plan.")
 
12
 
13
  # Provide a name for the rescore execution plan
14
+ #name = "MyRescoreExecutionPlan"
15
  # Set your required additional capacity units
16
  # Don't set capacity units if you don't require more than 1 unit given by default
17
+ #capacity_units = 2
18
 
19
  # try:
20
  # rescore_execution_plan_response = kendra_ranking.create_rescore_execution_plan(
 
44
  # print("%s" % e)
45
 
46
  # print("Program ends.")
47
+ #########################
48
 
49
 
50
  def re_rank(self_, rerank_type, search_type, question, answers):
 
77
 
78
 
79
  re_ranked = [{}]
80
+ ####### Add this Kendra Rescore ranking
81
+ # if(rerank_type == 'Kendra Rescore'):
82
+ # rescore_response = kendra_ranking.rescore(
83
+ # RescoreExecutionPlanId = 'b2a4d4f3-98ff-4e17-8b69-4c61ed7d91eb',
84
+ # SearchQuery = query,
85
+ # Documents = ans
86
+ # )
87
+ # re_ranked[0]['answer']=[]
88
+ # for result in rescore_response["ResultItems"]:
89
+
90
+ # pos_ = ids.index(result['DocumentId'])
91
+
92
+ # re_ranked[0]['answer'].append(answers[0]['answer'][pos_])
93
+ # re_ranked[0]['search_type']=search_type,
94
+ # re_ranked[0]['id'] = len(question)
95
+ # return re_ranked
 
 
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  if(rerank_type == 'Cross Encoder'):
98
 
99
  scores = model.predict(
100
  ques_ans
101
  )
102
 
 
 
103
  index__ = 0
104
  for i in ans:
105
  i['new_score'] = scores[index__]
 
124
  return re_ranked
125
 
126
 
 
 
 
127
 
128
 
129