import os
import io
import sys
sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2]) + "/utilities")
import json
import glob
import boto3
import base64
import logging
import requests
import numpy as np
import pandas as pd
from typing import List
from botocore.auth import SigV4Auth
#from langchain.llms.bedrock import Bedrock
from botocore.awsrequest import AWSRequest
import streamlit as st
import re
from sklearn.metrics import ndcg_score, dcg_score
from sklearn import preprocessing as pre
import invoke_models as llm  # invoke_llm_model

# Earlier direct-Bedrock client setup, kept commented out for reference:
# bedrock_ = boto3.client(
#     'bedrock-runtime',
#     aws_access_key_id=st.secrets['user_access_key'],
#     aws_secret_access_key=st.secrets['user_secret_key'],
#     region_name='us-east-1'
# )
#
# inference_modifier = {
#     "max_tokens_to_sample": 4096,
#     "temperature": 0,
#     "top_k": 250,
#     "top_p": 1,
#     "stop_sequences": ["\n\nHuman"],
# }
#
# textgen_llm = Bedrock(
#     model_id="anthropic.claude-v2:1",
#     client=bedrock_,
#     model_kwargs=inference_modifier,
# )


#@st.cache_data
def eval(question, answers):
    """Grade the relevance of retrieved product documents against the user question
    with an LLM, then compare the LLM grading to the current search ranking via NDCG."""
    search_results: str = ""
    prompt: str = """Human: You are a grader assessing the relevance of retrieved documents to a user question. \n
    The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
    This does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n
    If a document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.

    User question: {}

    Retrieved documents:
    {}

    Now, based on the information provided above, return a JSON object keyed by the index of every Retrieved document. For each document, provide its score out of 5 based on relevance to the User question, whether it is relevant as true or false, and the reason why it is or is not relevant.

    Answer:
    """
    #Finally, as the last line of your response, write the relevant indexes as a comma separated list in a line.
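    # Hypothetical sketch of the JSON the model is expected to return, matching what the
    # parsing below looks for (keys are the document indices as strings; each value carries
    # 'score', 'relevant' and a free-text reason). The concrete values are illustrative only:
    # {
    #   "0": {"score": 4, "relevant": true,  "reason": "Matches the query closely"},
    #   "1": {"score": 1, "relevant": false, "reason": "Unrelated product category"}
    # }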
    query = question[0]['question']
    index_ = 0
    # Build the numbered list of retrieved documents that goes into the prompt.
    for i in answers[0]['answer']:
        desc = i['caption'] + "." + i['desc']
        search_results += f"Index: {index_}, Description: {desc}\n\n"
        index_ = index_ + 1
    prompt = prompt.format(query, search_results)
    response = json.loads(llm.invoke_llm_model(prompt, False))
    #response = textgen_llm(prompt)
    #print("Response from LLM: ", response)
    # inter_trim = response.split("[")[1]
    # final_out = json.loads('{"results":[' + inter_trim.split("]")[0] + ']}')

    # Collect the LLM relevance scores alongside the search engine's own scores,
    # defaulting to relevant / 0.0 when the model omits a field.
    llm_scores = []
    current_scores = []
    for idx, i in enumerate(answers[0]['answer']):
        idx_str = str(idx)
        if 'relevant' in response[idx_str]:
            relevance = response[idx_str]['relevant']
        else:
            relevance = True
        if 'score' in response[idx_str]:
            score_ = response[idx_str]['score']
        else:
            score_ = 0.0
        i['relevant'] = relevance
        llm_scores.append(score_)
        current_scores.append(i['score'])
    #print("LLM Scores: ", llm_scores)
    #print("Current Scores: ", current_scores)

    # Min-max normalised copies of both score lists (currently unused downstream).
    x = np.array(llm_scores).reshape(-1, 1)
    x_norm = pre.MinMaxScaler().fit_transform(x).flatten().tolist()
    y = np.array(current_scores).reshape(-1, 1)
    y_norm = pre.MinMaxScaler().fit_transform(y).flatten().tolist()

    st.session_state.answers = answers

    # DCG of the LLM relevance grades in the order given by the current search ranking
    dcg = dcg_score(np.asarray([llm_scores]), np.asarray([current_scores]))
    # IDCG: DCG of the LLM grades in their own ideal order
    idcg = dcg_score(np.asarray([llm_scores]), np.asarray([llm_scores]))
    # Normalized DCG score
    ndcg = dcg / idcg if idcg > 0 else 0.0
    print("NDCG: ", ndcg)

    # Track whether NDCG improved or regressed relative to the previous run.
    if ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg != 0.0:
        st.session_state.ndcg_increase = "↑~" + str('%.3f' % (ndcg - st.session_state.input_ndcg))
    elif ndcg < st.session_state.input_ndcg:
        st.session_state.ndcg_increase = "↓~" + str('%.3f' % (st.session_state.input_ndcg - ndcg))
    else:
        st.session_state.ndcg_increase = " ~ "
    print(st.session_state.ndcg_increase)
    print(st.session_state.input_ndcg)
    st.session_state.input_ndcg = ndcg
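

# A minimal, hypothetical usage sketch (kept commented out because eval() reads and writes
# st.session_state, so it is meant to run inside the Streamlit app rather than as a script).
# The field names mirror what eval() expects; the concrete values below are made up.
# sample_question = [{"question": "red leather handbag"}]
# sample_answers = [{"answer": [
#     {"caption": "Red leather tote", "desc": "Genuine leather handbag in red.", "score": 12.4},
#     {"caption": "Blue cotton scarf", "desc": "Lightweight scarf for summer.", "score": 7.1},
# ]}]
# eval(sample_question, sample_answers)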