import os
import io
import sys
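# Add the repository's utilities directory to the import path.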
sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities")
import json
import glob
import boto3
import base64
import logging
import requests
import numpy as np
import pandas as pd
from typing import List
from botocore.auth import SigV4Auth
#from langchain.llms.bedrock import Bedrock
from botocore.awsrequest import AWSRequest
import streamlit as st
import re
from sklearn.metrics import ndcg_score, dcg_score
from sklearn import preprocessing as pre
import invoke_models as llm  # invoke_llm_model
# bedrock_ = boto3.client(
# 'bedrock-runtime',
# aws_access_key_id=st.secrets['user_access_key'],
# aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
# )
# inference_modifier = {
# "max_tokens_to_sample": 4096,
# "temperature": 0,
# "top_k": 250,
# "top_p": 1,
# "stop_sequences": ["\n\nHuman"],
# }
# textgen_llm = Bedrock(
# model_id="anthropic.claude-v2:1",
# client=bedrock_,
# model_kwargs=inference_modifier,
# )
#@st.cache_data
def eval(question, answers):
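    """Grade each retrieved product description against the user question with an LLM judge,
    flag every hit as relevant or not, and compare the LLM scores with the current retrieval
    scores via NDCG, storing the results in Streamlit session state.
    """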
    search_results: str = ""
    prompt: str = """Human: You are a grader assessing the relevance of a retrieved document to a user question. \n
The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
It does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
<User question>
{}
</User question>
<Retrieved document>
{}
</Retrieved document>
Now, based on the information provided above, for every given Retrieved document, provide the index of the document, its score out of 5 based on relevance to the User question, whether it is relevant or not (true or false), and the reason why it is or is not relevant, in JSON format.
Answer:
"""
    # Finally, as the last line of your response, write the relevant indexes as a comma separated list in a line.
    query = question[0]['question']
    # Build an indexed list of the retrieved product descriptions to grade.
    index_ = 0
    for i in answers[0]['answer']:
        desc = i['caption'] + "." + i['desc']
        search_results += f"Index: {index_}, Description: {desc}\n\n"
        index_ += 1
    prompt = prompt.format(query, search_results)
    response = json.loads(llm.invoke_llm_model(prompt, False))
    #response = textgen_llm(prompt)
    #print("Response from LLM: ", response)
    # inter_trim = response.split("[")[1]
    # final_out = json.loads('{"results":['+inter_trim.split("]")[0]+']}')
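    # The parsed response is expected to be a JSON object keyed by the document index
    # as a string, e.g. {"0": {"score": 4, "relevant": true, ...}, ...} (shape inferred
    # from the lookups below; the field names come from the grading prompt).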
    llm_scores = []
    current_scores = []
    # Record the LLM's relevance verdict and score for every retrieved document,
    # defaulting to relevant=True and score=0.0 when a field is missing.
    for idx, i in enumerate(answers[0]['answer']):
        idx_str = str(idx)
        relevance = response[idx_str].get('relevant', True)
        score_ = response[idx_str].get('score', 0.0)
        i['relevant'] = relevance
        llm_scores.append(score_)
        current_scores.append(i['score'])
    #print("LLM Scores: ", llm_scores)
    #print("Current Scores: ", current_scores)
    x = np.array(llm_scores).reshape(-1, 1)
    x_norm = pre.MinMaxScaler().fit_transform(x).flatten().tolist()
    y = np.array(current_scores).reshape(-1, 1)
    y_norm = pre.MinMaxScaler().fit_transform(y).flatten().tolist()
    st.session_state.answers = answers
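    # NDCG of the current ranking, treating the LLM scores as relevance judgements:
    # dcg_score ranks the documents by the retrieval scores and takes the gains from
    # the LLM scores; IDCG is the DCG of the ideal (LLM-score-sorted) ranking.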
    dcg = dcg_score(np.asarray([llm_scores]), np.asarray([current_scores]))
    # IDCG score
    idcg = dcg_score(np.asarray([llm_scores]), np.asarray([llm_scores]))
    # Normalized DCG score (guard against a zero IDCG when all LLM scores are 0)
    ndcg = dcg / idcg if idcg else 0.0
    print("NDCG: ", ndcg)
    # Track the change in NDCG relative to the previous run for display in the UI.
    if ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg != 0.0:
        st.session_state.ndcg_increase = "&uarr;~" + ('%.3f' % (ndcg - st.session_state.input_ndcg))
    elif ndcg < st.session_state.input_ndcg:
        st.session_state.ndcg_increase = "&darr;~" + ('%.3f' % (st.session_state.input_ndcg - ndcg))
    else:
        st.session_state.ndcg_increase = " ~ "
    print(st.session_state.ndcg_increase)
    print(st.session_state.input_ndcg)
    st.session_state.input_ndcg = ndcg
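
# Example (hypothetical) input shapes, inferred from the accesses above; the real
# structures are built elsewhere in the app and may carry additional keys:
#   question = [{'question': 'red trail running shoes'}]
#   answers  = [{'answer': [{'caption': 'Trail Runner X', 'desc': 'Lightweight red shoe', 'score': 11.2}]}]
# eval(question, answers) annotates each hit with 'relevant', stores the annotated answers in
# st.session_state.answers, and updates st.session_state.input_ndcg and ndcg_increase.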