# Hugging Face Spaces: running on a T4 GPU instance.
import os | |
import io | |
import sys | |
sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities") | |
import json | |
import glob | |
import boto3 | |
import base64 | |
import logging | |
import requests | |
import numpy as np | |
import pandas as pd | |
from typing import List | |
from botocore.auth import SigV4Auth | |
#from langchain.llms.bedrock import Bedrock | |
from botocore.awsrequest import AWSRequest | |
import streamlit as st | |
import re | |
from sklearn.metrics import ndcg_score,dcg_score | |
from sklearn import preprocessing as pre | |
import invoke_models as llm#invoke_llm_model | |
# bedrock_ = boto3.client( | |
# 'bedrock-runtime', | |
# aws_access_key_id=st.secrets['user_access_key'], | |
# aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1' | |
# ) | |
# inference_modifier = { | |
# "max_tokens_to_sample": 4096, | |
# "temperature": 0, | |
# "top_k": 250, | |
# "top_p": 1, | |
# "stop_sequences": ["\n\nHuman"], | |
# } | |
# textgen_llm = Bedrock( | |
# model_id="anthropic.claude-v2:1", | |
# client=bedrock_, | |
# model_kwargs=inference_modifier, | |
# ) | |
#@st.cache_data
def eval(question, answers):
    """Grade retrieved documents with an LLM and track ranking quality.

    Asks the LLM to score every retrieved product description for relevance
    to the user question, tags each hit with a boolean 'relevant' flag, then
    computes the NDCG of the current ranking (treating the LLM scores as the
    ground-truth relevance) and records the delta versus the previous query
    in Streamlit session state.

    Args:
        question: list whose first element is a dict holding the user query
            under the 'question' key.
        answers: list whose first element is a dict holding the retrieved
            hits under the 'answer' key; each hit is a dict with at least
            'caption', 'desc' and 'score' keys.

    Side effects:
        Mutates each hit (adds a 'relevant' key) and updates
        st.session_state.answers, st.session_state.ndcg_increase and
        st.session_state.input_ndcg.
    """
    search_results: str = ""
    prompt: str = """Human: You are a grader assessing relevance of a retrieved document to a user question. \n
The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
It does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
<User question>
{}
</User question>
<Retrieved document>
{}
</Retrieved document>
Now based on the information provided above, for every given Retrieved document, provide the index of the document, it's score out of 5 based on relevance with the User question, is it relevant or not as true or false, reason why you this is relevant or not, in json format,
Answer:
"""
    query = question[0]['question']
    # Enumerate the hits so the LLM can refer back to them by index.
    for index_, hit in enumerate(answers[0]['answer']):
        desc = hit['caption'] + "." + hit['desc']
        search_results += f"Index: {index_}, Description: {desc}\n\n"
    prompt = prompt.format(query, search_results)
    # invoke_llm_model is expected to return a JSON object keyed by the
    # stringified document index, e.g. {"0": {"score": 4, "relevant": true}}
    # — TODO confirm against invoke_models.invoke_llm_model.
    response = json.loads(llm.invoke_llm_model(prompt, False))
    llm_scores = []
    current_scores = []
    for idx, hit in enumerate(answers[0]['answer']):
        # Missing indexes/keys fall back to permissive defaults (relevant,
        # score 0) so one malformed grade does not break the evaluation.
        grade = response.get(str(idx), {})
        hit['relevant'] = grade.get('relevant', True)
        llm_scores.append(grade.get('score', 0.0))
        current_scores.append(hit['score'])
    st.session_state.answers = answers
    # DCG of the current ranking, scored against the LLM relevance grades.
    dcg = dcg_score(np.asarray([llm_scores]), np.asarray([current_scores]))
    # IDCG: DCG of the ideal ranking (ordered by the LLM scores themselves).
    idcg = dcg_score(np.asarray([llm_scores]), np.asarray([llm_scores]))
    # Bug fix: the original assigned the raw DCG to `ndcg` and never used
    # `idcg`; normalized DCG is DCG / IDCG (guarded against a zero IDCG).
    ndcg = dcg / idcg if idcg > 0 else 0.0
    print("NDCG: ", ndcg)
    # Report how the score moved relative to the previous query, if any.
    if ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg != 0.0:
        st.session_state.ndcg_increase = "↑~" + str('%.3f' % (ndcg - st.session_state.input_ndcg))
    elif ndcg < st.session_state.input_ndcg:
        st.session_state.ndcg_increase = "↓~" + str('%.3f' % (st.session_state.input_ndcg - ndcg))
    else:
        st.session_state.ndcg_increase = " ~ "
    print(st.session_state.ndcg_increase)
    print(st.session_state.input_ndcg)
    st.session_state.input_ndcg = ndcg