import os
import io
import sys
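# Add the repository's utilities directory to the import path.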
sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities")
import json
import glob
import boto3
import base64
import logging
import requests
import numpy as np
import pandas as pd
from typing import List
from botocore.auth import SigV4Auth
#from langchain.llms.bedrock import Bedrock
from botocore.awsrequest import AWSRequest
import streamlit as st
import re
from sklearn.metrics import ndcg_score, dcg_score
from sklearn import preprocessing as pre
import invoke_models as llm  # invoke_llm_model
# bedrock_ = boto3.client(
# 'bedrock-runtime',
# aws_access_key_id=st.secrets['user_access_key'],
# aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
# )
# inference_modifier = {
# "max_tokens_to_sample": 4096,
# "temperature": 0,
# "top_k": 250,
# "top_p": 1,
# "stop_sequences": ["\n\nHuman"],
# }
# textgen_llm = Bedrock(
# model_id="anthropic.claude-v2:1",
# client=bedrock_,
# model_kwargs=inference_modifier,
# )
#@st.cache_data
def eval(question, answers):
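    """Grade each retrieved product description against the user question with an LLM judge,
    flag every hit as relevant or not, and compare the LLM scores with the current retrieval
    scores via NDCG, storing the results in Streamlit session state.
    """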
    search_results: str = ""
    prompt: str = """Human: You are a grader assessing the relevance of a retrieved document to a user question. \n
The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
It does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
<User question>
{}
</User question>
<Retrieved document>
{}
</Retrieved document>
Now, based on the information provided above, for every given Retrieved document, provide the index of the document, its score out of 5 based on relevance to the User question, whether it is relevant or not (true or false), and the reason why it is or is not relevant, in JSON format.
Answer:
"""
    # Finally, as the last line of your response, write the relevant indexes as a comma separated list in a line.
    query = question[0]['question']
    # Build an indexed list of the retrieved product descriptions to grade.
    index_ = 0
    for i in answers[0]['answer']:
        desc = i['caption'] + "." + i['desc']
        search_results += f"Index: {index_}, Description: {desc}\n\n"
        index_ += 1
    prompt = prompt.format(query, search_results)
    response = json.loads(llm.invoke_llm_model(prompt, False))
    #response = textgen_llm(prompt)
    #print("Response from LLM: ", response)
    # inter_trim = response.split("[")[1]
    # final_out = json.loads('{"results":['+inter_trim.split("]")[0]+']}')
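    # The parsed response is expected to be a JSON object keyed by the document index
    # as a string, e.g. {"0": {"score": 4, "relevant": true, ...}, ...} (shape inferred
    # from the lookups below; the field names come from the grading prompt).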
    llm_scores = []
    current_scores = []
    # Record the LLM's relevance verdict and score for every retrieved document,
    # defaulting to relevant=True and score=0.0 when a field is missing.
    for idx, i in enumerate(answers[0]['answer']):
        idx_str = str(idx)
        relevance = response[idx_str].get('relevant', True)
        score_ = response[idx_str].get('score', 0.0)
        i['relevant'] = relevance
        llm_scores.append(score_)
        current_scores.append(i['score'])
    #print("LLM Scores: ", llm_scores)
    #print("Current Scores: ", current_scores)
    x = np.array(llm_scores).reshape(-1, 1)
    x_norm = pre.MinMaxScaler().fit_transform(x).flatten().tolist()
    y = np.array(current_scores).reshape(-1, 1)
    y_norm = pre.MinMaxScaler().fit_transform(y).flatten().tolist()
    st.session_state.answers = answers
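    # NDCG of the current ranking, treating the LLM scores as relevance judgements:
    # dcg_score ranks the documents by the retrieval scores and takes the gains from
    # the LLM scores; IDCG is the DCG of the ideal (LLM-score-sorted) ranking.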
    dcg = dcg_score(np.asarray([llm_scores]), np.asarray([current_scores]))
    # IDCG score
    idcg = dcg_score(np.asarray([llm_scores]), np.asarray([llm_scores]))
    # Normalized DCG score (guard against a zero IDCG when all LLM scores are 0)
    ndcg = dcg / idcg if idcg else 0.0
    print("NDCG: ", ndcg)
    # Track the change in NDCG relative to the previous run for display in the UI.
    if ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg != 0.0:
        st.session_state.ndcg_increase = "&uarr;~" + ('%.3f' % (ndcg - st.session_state.input_ndcg))
    elif ndcg < st.session_state.input_ndcg:
        st.session_state.ndcg_increase = "&darr;~" + ('%.3f' % (st.session_state.input_ndcg - ndcg))
    else:
        st.session_state.ndcg_increase = " ~ "
    print(st.session_state.ndcg_increase)
    print(st.session_state.input_ndcg)
    st.session_state.input_ndcg = ndcg
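
# Example (hypothetical) input shapes, inferred from the accesses above; the real
# structures are built elsewhere in the app and may carry additional keys:
#   question = [{'question': 'red trail running shoes'}]
#   answers  = [{'answer': [{'caption': 'Trail Runner X', 'desc': 'Lightweight red shoe', 'score': 11.2}]}]
# eval(question, answers) annotates each hit with 'relevant', stores the annotated answers in
# st.session_state.answers, and updates st.session_state.input_ndcg and ndcg_increase.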