import os
import io
import sys
sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])+"/utilities")
import json
import glob
import boto3
import base64
import logging
import requests
import numpy as np
import pandas as pd
from typing import List
from botocore.auth import SigV4Auth
#from langchain.llms.bedrock import Bedrock
from botocore.awsrequest import AWSRequest
import streamlit as st
import re
from sklearn.metrics import ndcg_score, dcg_score
from sklearn import preprocessing as pre
import invoke_models as llm  # invoke_llm_model

# bedrock_ = boto3.client(
#     'bedrock-runtime',
#     aws_access_key_id=st.secrets['user_access_key'],
#     aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
# )

# inference_modifier = {
#     "max_tokens_to_sample": 4096,
#     "temperature": 0,
#     "top_k": 250,
#     "top_p": 1,
#     "stop_sequences": ["\n\nHuman"],
# }
# textgen_llm = Bedrock(
#     model_id="anthropic.claude-v2:1",
#     client=bedrock_,
#     model_kwargs=inference_modifier,
# )


#@st.cache_data
def eval(question, answers):
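    """Grade each retrieved document with an LLM and refresh the NDCG metric in Streamlit session state.

    Expected shapes (inferred from the code below, not a documented contract):
    `question` is a one-element list with a 'question' key; `answers` is a
    one-element list whose 'answer' value is a list of hits, each carrying
    'caption', 'desc' and 'score'.
    """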
    search_results: str = ""
    prompt: str = """Human: You are a grader assessing relevance of a retrieved document to a user question. \n 
    The User question and Retrieved documents are provided below. The Retrieved documents are retail product descriptions that the human is looking for. \n
    It does not need to be a stringent test. The goal is to filter out totally irrelevant product retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. 
    
    <User question>
    {}
    </User question>

    <Retrieved document>
    {}
    </Retrieved document>

    Now based on the information provided above, for every given Retrieved document, provide the index of the document, it's score out of 5 based on relevance with the User question, is it relevant or not as true or false, reason why you this is relevant or not, in json format,
    
    Answer:
    """
    #Finally, as the last line of your response, write the relevant indexes as a comma separated list in a line.


    query = question[0]['question']
    for index_, i in enumerate(answers[0]['answer']):
        desc = i['caption'] + "." + i['desc']
        search_results += f"Index: {index_}, Description: {desc}\n\n"
    prompt = prompt.format(query, search_results)
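    # The grader is expected to return JSON keyed by document index, e.g.
    # {"0": {"score": 4, "relevant": true, "reason": "..."}, ...}. This shape is
    # inferred from the parsing loop below; it is not strictly enforced by the prompt.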
    response = json.loads(llm.invoke_llm_model(prompt,False))
    #response = textgen_llm(prompt)
    #print("Response from LLM: ", response)
    # inter_trim =response.split("[")[1]
    # final_out = json.loads('{"results":['+inter_trim.split("]")[0]+']}')
    llm_scores = []
    current_scores = []
    for idx, i in enumerate(answers[0]['answer']):
        grade = response[str(idx)]
        # Fall back to permissive defaults when the model omits a field.
        relevance = grade.get('relevant', True)
        score_ = grade.get('score', 0.0)

        i['relevant'] = relevance
        llm_scores.append(score_)
        current_scores.append(i['score'])
        
        
    #print("LLM Scores: ", llm_scores)
    #print("Current Scores: ", current_scores)
    # Min-max normalize both score lists to [0, 1]; note that x_norm and y_norm
    # are computed here but not used in the NDCG calculation below.
    x = np.array(llm_scores).reshape(-1, 1)
    x_norm = pre.MinMaxScaler().fit_transform(x).flatten().tolist()

    y = np.array(current_scores).reshape(-1, 1)
    y_norm = pre.MinMaxScaler().fit_transform(y).flatten().tolist()

   
    st.session_state.answers = answers
    dcg = dcg_score(np.asarray([llm_scores]), np.asarray([current_scores]))

    # IDCG score: the best achievable DCG, i.e. documents ordered by the LLM grades
    idcg = dcg_score(np.asarray([llm_scores]), np.asarray([llm_scores]))

    # Normalized DCG score (guard against division by zero when all grades are 0)
    ndcg = dcg / idcg if idcg > 0 else 0.0
    print("NDCG: ", ndcg)
    # Compare against the NDCG from the previous query to show an up/down indicator.
    if ndcg > st.session_state.input_ndcg and st.session_state.input_ndcg != 0.0:
        st.session_state.ndcg_increase = "&uarr;~" + '%.3f' % (ndcg - st.session_state.input_ndcg)
    elif ndcg < st.session_state.input_ndcg:
        st.session_state.ndcg_increase = "&darr;~" + '%.3f' % (st.session_state.input_ndcg - ndcg)
    else:
        st.session_state.ndcg_increase = " ~ "

    print(st.session_state.ndcg_increase)
    print(st.session_state.input_ndcg)
    st.session_state.input_ndcg = ndcg
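

# Hypothetical usage sketch (illustration only, not part of the original module):
# it shows the input shapes eval() expects. The product data, and the assumption
# that st.session_state.input_ndcg is initialised elsewhere in the app, are made
# up for this example.
#
# question = [{'question': "waterproof hiking boots"}]
# answers = [{'answer': [
#     {'caption': "Trail boots", 'desc': "Waterproof leather hiking boots", 'score': 12.3},
#     {'caption': "Running shoes", 'desc': "Lightweight road running shoes", 'score': 10.1},
# ]}]
# eval(question, answers)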