Spaces:
Running
on
T4
Running
on
T4
File size: 4,357 Bytes
2e2dda5 d0de3b9 2e2dda5 2e1b45a 2e2dda5 d0de3b9 2e2dda5 d0de3b9 2e2dda5 e4bf383 d0de3b9 a2b72ed 9b25384 2e2dda5 b49c8d4 903032c b49c8d4 30a19c6 903032c 2e2dda5 a2b72ed 2e2dda5 d0de3b9 2e2dda5 98d18bd 2e2dda5 98d18bd d0de3b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import os
import io
import sys
sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities")
import json
import glob
import boto3
import base64
import logging
import requests
import numpy as np
import pandas as pd
from typing import List
from botocore.auth import SigV4Auth
#from langchain.llms.bedrock import Bedrock
from botocore.awsrequest import AWSRequest
import streamlit as st
import re
from sklearn.metrics import ndcg_score,dcg_score
from sklearn import preprocessing as pre
import invoke_models as llm#invoke_llm_model
# bedrock_ = boto3.client(
# 'bedrock-runtime',
# aws_access_key_id=st.secrets['user_access_key'],
# aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
# )
# inference_modifier = {
# "max_tokens_to_sample": 4096,
# "temperature": 0,
# "top_k": 250,
# "top_p": 1,
# "stop_sequences": ["\n\nHuman"],
# }
# textgen_llm = Bedrock(
# model_id="anthropic.claude-v2:1",
# client=bedrock_,
# model_kwargs=inference_modifier,
# )
#@st.cache_data
def eval(question, answers):
    """Grade retrieved documents with an LLM judge and compute NDCG vs. engine scores.

    Builds a grading prompt from the user question and the retrieved product
    descriptions, asks the LLM (via ``llm.invoke_llm_model``) for a per-document
    relevance verdict and 0-5 score, writes the verdict back onto each answer
    item, then compares the LLM scores (treated as ground-truth relevance)
    against the search engine's own scores (treated as the ranking) using NDCG.
    The NDCG delta versus the previous run is rendered into
    ``st.session_state.ndcg_increase``.

    Args:
        question: list whose first element is a dict with a 'question' key
            (assumed shape — confirm against caller).
        answers: list whose first element is a dict with an 'answer' key
            holding result dicts with 'caption', 'desc' and 'score' keys.

    Side effects:
        Mutates each answer item (adds a 'relevant' bool) and updates
        ``st.session_state.answers``, ``st.session_state.ndcg_increase`` and
        ``st.session_state.input_ndcg``.
    """
    prompt: str = """Human: You are a grader assessing relevance of a retrieved document to a user question. \n
    The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
    It does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
    <User question>
    {}
    </User question>
    <Retrieved document>
    {}
    </Retrieved document>
    Now based on the information provided above, for every given Retrieved document, provide the index of the document, it's score out of 5 based on relevance with the User question, is it relevant or not as true or false, reason why you this is relevant or not, in json format,
    Answer:
    """
    #Finally, as the last line of your response, write the relevant indexes as a comma separated list in a line.
    query = question[0]['question']
    # One "Index: i, Description: ..." entry per retrieved document.
    search_results: str = ""
    for index_, item in enumerate(answers[0]['answer']):
        desc = item['caption'] + "." + item['desc']
        search_results += f"Index: {index_}, Description: {desc}\n\n"
    prompt = prompt.format(query, search_results)
    # The model is expected to return a JSON object keyed by document index.
    response = json.loads(llm.invoke_llm_model(prompt, False))
    llm_scores = []
    current_scores = []
    for idx, item in enumerate(answers[0]['answer']):
        # Be lenient if the LLM omits an index or a field: default to
        # relevant=True / score=0.0 (same defaults as before, but without
        # raising KeyError on a missing index).
        verdict = response.get(str(idx), {})
        item['relevant'] = verdict.get('relevant', True)
        llm_scores.append(verdict.get('score', 0.0))
        current_scores.append(item['score'])
    st.session_state.answers = answers
    # DCG of the engine's ranking scored by the LLM's relevance judgments.
    dcg = dcg_score(np.asarray([llm_scores]), np.asarray([current_scores]))
    # IDCG: best achievable DCG (ranking by the LLM scores themselves).
    idcg = dcg_score(np.asarray([llm_scores]), np.asarray([llm_scores]))
    # Normalized DCG. Bug fix: previously ndcg was set to the raw DCG and the
    # computed IDCG was discarded; divide to actually normalize into [0, 1].
    # Guard against IDCG == 0 (all LLM scores zero).
    ndcg = dcg / idcg if idcg > 0 else 0.0
    print("NDCG: ", ndcg)
    if(ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg!=0.0):
        st.session_state.ndcg_increase = "↑~"+str('%.3f'%(ndcg-st.session_state.input_ndcg ))
    elif(ndcg < st.session_state.input_ndcg):
        st.session_state.ndcg_increase = "↓~"+str('%.3f'%(st.session_state.input_ndcg - ndcg))
    else:
        st.session_state.ndcg_increase = " ~ "
    print(st.session_state.ndcg_increase)
    print(st.session_state.input_ndcg)
    st.session_state.input_ndcg = ndcg