|
import os |
|
import json |
|
import pandas as pd |
|
import time |
|
|
|
import phoenix as px |
|
from phoenix.trace.langchain import OpenInferenceTracer, LangChainInstrumentor |
|
|
|
|
|
|
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.chains.question_answering import load_qa_chain |
|
from langchain import HuggingFaceHub |
|
from langchain.prompts import PromptTemplate |
|
|
|
from langchain.chains import RetrievalQA |
|
from langchain.callbacks import StdOutCallbackHandler |
|
|
|
|
|
from langchain.storage import LocalFileStore |
|
from langchain.embeddings import CacheBackedEmbeddings |
|
from langchain.vectorstores import FAISS |
|
|
|
|
|
from langchain.document_loaders import WebBaseLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
|
|
import numpy as np |
|
import streamlit as st |
|
|
|
|
|
|
|
|
from PIL import Image |
|
|
|
|
|
global trace_df |
|
|
|
|
|
st.set_page_config(page_title="RAG PoC", layout="wide") |
|
st.sidebar.image(Image.open("./test-logo.png"), use_column_width=True) |
|
|
|
@st.cache_resource |
|
def tracer_config(): |
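    # Launch an in-process Phoenix session and instrument LangChain so every chain, LLM and retriever
    # call is captured as a trace span; @st.cache_resource keeps this to a single launch per Streamlit session.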
|
|
|
session = px.launch_app() |
|
|
|
tracer = OpenInferenceTracer() |
|
|
|
LangChainInstrumentor(tracer).instrument() |
|
time.sleep(3) |
|
print(session.url) |
|
|
|
tracer_config() |
|
|
|
|
|
|
|
|
|
tab1, tab2, tab3 = st.tabs(["**RAG**", "FactVsHallucinate", "**RAG Scoring**"])
|
|
|
|
|
|
|
# Supply your own Hugging Face API token here; avoid committing real tokens to source control.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "<your-huggingfacehub-api-token>"
|
|
|
|
|
|
|
|
|
|
|
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") |
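# flan-t5-xxl served through the Hugging Face Hub is used both for RAG generation and for the
# LLM-as-a-judge evaluations below; StdOutCallbackHandler prints chain execution logs to the console.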
|
|
|
|
|
|
|
|
|
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":1, "max_length":1000000}) |
|
|
|
handler = StdOutCallbackHandler() |
|
|
|
|
|
|
|
|
|
|
|
|
|
class HallucinatePromptContext: |
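    # Prompt template for the hallucination check: given a query, the retrieved reference text and the
    # generated answer, the judge LLM must reply with a single word, "factual" or "hallucinated".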
|
def __init__(self): |
|
self.variables_list = ["query","answer","context"] |
|
        self.base_template = """In this task, you will be presented with a query, a reference text and an answer. The answer is
        generated for the query based on the reference text. The answer may contain false information; you
        must use the reference text to determine whether the answer to the query contains false information,
        i.e. whether the answer is a hallucination of facts. Your objective is to determine whether the answer
        contains factual information and is not a hallucination. A 'hallucination' in this context refers to
        an answer that is not based on the reference text or assumes information that is not available in
        the reference text. Your response should be a single word: either "factual" or "hallucinated", and
        it should not include any other text or characters. "hallucinated" indicates that the answer
        provides factually inaccurate information to the query based on the reference text. "factual"
        indicates that the answer to the query is correct relative to the reference text, and does not
        contain made up information. Please read the query and reference text carefully before determining
        your response.
|
|
|
# Query: {query} |
|
# Reference text: {context} |
|
# Answer: {answer} |
|
Is the answer above factual or hallucinated based on the query and reference text?""" |
|
|
|
|
|
|
|
class HallucinationEvaluator:
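    # Wraps the hallucination prompt and a judge LLM; evaluate() formats the prompt with the
    # question/answer/context triple and returns the model's one-word verdict.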
|
def __init__(self, item): |
|
self.question = item["question"] |
|
self.answer = item["answer"] |
|
|
|
self.context = item["context"] |
|
self.llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":1, "max_length":1000000}) |
|
|
|
def get_prompt_template(self): |
|
prompt = HallucinatePromptContext() |
|
template = prompt.base_template |
|
        variables = prompt.variables_list

        eval_template = PromptTemplate(input_variables=variables, template=template)
|
return eval_template |
|
|
|
def evaluate(self): |
|
prompt = self.get_prompt_template().format(query = self.question, answer = self.answer, context = self.context) |
|
score = self.llm(prompt) |
|
return score |
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
|
def initialize_vectorstore(): |
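    # Load a sample case-study page, split it into chunks, embed them into a FAISS index and expose it
    # as a retriever; the vector store and a 'docadd' flag are also kept in Streamlit session state for reuse.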
|
|
|
webpage_loader = WebBaseLoader("https://www.tredence.com/case-studies/forecasting-app-installs-for-a-large-retailer-in-the-us").load() |
|
webpage_chunks = _text_splitter(webpage_loader) |
|
|
|
global vectorstore |
|
global retriever |
|
|
|
|
|
vectorstore = FAISS.from_documents(webpage_chunks, embedder) |
|
print("vector store initialized with sample doc") |
|
|
|
|
|
retriever = vectorstore.as_retriever() |
|
st.session_state['vectorstore'] = vectorstore |
|
st.session_state['docadd'] = 0 |
|
print("st.session_state['docadd'] ", st.session_state['docadd']) |
|
|
|
return retriever |
|
|
|
|
|
def _text_splitter(doc): |
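    # Recursively split documents into ~600-character chunks with a 50-character overlap.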
|
text_splitter = RecursiveCharacterTextSplitter( |
|
chunk_size=600, |
|
chunk_overlap=50, |
|
length_function=len, |
|
) |
|
return text_splitter.transform_documents(doc) |
|
|
|
def _load_docs(path: str): |
|
load_doc = WebBaseLoader(path).load() |
|
doc = _text_splitter(load_doc) |
|
return doc |
|
|
|
|
|
|
|
|
|
|
|
|
|
def rag_response(response): |
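    # Render the user's question and the RAG answer as styled HTML blocks.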
|
|
|
|
|
|
|
|
|
|
|
st.markdown('<h1 style="color:#100170;font-size:32px;text-align:center;padding:0;">RAG Response</h1>', unsafe_allow_html=True) |
|
|
|
question_title = '<h2 style="color:#100170;font-size:18px;">Question</h2>' |
|
question = f"<div style='background-color:#f0f0f0; padding:10px; border-radius:10px;'>{response['query']}</div>" |
|
st.markdown(question_title, unsafe_allow_html=True) |
|
st.markdown(question, unsafe_allow_html=True) |
|
|
|
rag_output_title = '<h2 style="color:#100170;font-size:18px;">RAG Output</h2>' |
|
rag_output = f"<div style='background-color:#f0f0f0; padding:10px; border-radius:10px;'>{response['result']}</div>" |
|
st.markdown(rag_output_title, unsafe_allow_html=True) |
|
st.markdown(rag_output, unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _create_hallucination_scenario(item): |
|
    score = HallucinationEvaluator(item).evaluate()
|
return score |
|
|
|
def hallu_eval(question: str, answer: str, context: str): |
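    # Run the hallucination judge on a (question, answer, context) triple and display the verdict.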
|
print("in hallu eval") |
|
hallucination_score = _create_hallucination_scenario({ |
|
"question": question, |
|
"answer": answer, |
|
"context": context |
|
} |
|
) |
|
print("got hallu score") |
|
st.markdown('<h1 style="color:#100170;font-size:24px;">Hallucinated?</h1>', unsafe_allow_html=True) |
|
st.text_area(label=" ", value=hallucination_score, height=30) |
|
|
|
|
|
|
|
|
|
def scoring_eval(question: str, answer: str, context: str): |
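    # Run the correctness-scoring judge on a (question, answer, context) triple and display the score.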
|
print("in scoring eval") |
|
score = _create_evaluation_scenario({ |
|
"question": question, |
|
"answer": answer, |
|
"context": context |
|
} |
|
) |
|
print("got score") |
|
st.markdown('<h1 style="color:#100170;font-size:24px;">Completion Score</h1>', unsafe_allow_html=True) |
|
st.text_area(label=" ", value=score, height=30) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def click_button(response): |
|
|
|
|
|
|
|
hallu_eval(response["query"], response["result"], "blah blah") |
|
|
|
|
|
class BasePromptContext: |
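    # Prompt template for LLM-as-a-judge correctness scoring: the judge grades the answer 0/50/75/100
    # against the rubric below and is asked to return a JSON-style score plus one-line reasoning.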
|
def __init__(self): |
|
self.variables_list = ["question","answer","context"] |
|
        self.base_template = """Please act as an impartial judge and evaluate the quality of the provided answer, which attempts to answer the provided question based on a provided context.
        You'll need to submit your grading for the correctness, comprehensiveness and readability of the answer, using JSON format with the 2 items in parentheses:
        ("score": [your score number for the correctness of the answer], "reasoning": [your one line step by step reasoning about the correctness of the answer])
        Below is your grading rubric:
        - Correctness: Does the answer correctly answer the question? Below are the details for different scores:
          - Score 0: the answer is completely incorrect, doesn't mention anything about the question or is completely contrary to the correct answer.
            - For example, when asked "How to terminate a databricks cluster", the answer is an empty string, or content that's completely irrelevant, or "sorry I don't know the answer".
          - Score 50: the answer provides some relevance to the question and answers one aspect of the question correctly.
            - Example:
              - Question: How to terminate a databricks cluster
              - Answer: Databricks cluster is a cloud-based computing environment that allows users to process big data and run distributed data processing tasks efficiently.
              - Or answer: In the Databricks workspace, navigate to the "Clusters" tab. And then this is a hard question that I need to think more about it
          - Score 75: the answer mostly answers the question but is missing or hallucinating on one critical aspect.
            - Example:
              - Question: How to terminate a databricks cluster
              - Answer: "In the Databricks workspace, navigate to the "Clusters" tab.
                Find the cluster you want to terminate from the list of active clusters.
                And then you'll find a button to terminate all clusters at once"
          - Score 100: the answer correctly answers the question and is not missing any major aspect.
            - Example:
              - Question: How to terminate a databricks cluster
              - Answer: In the Databricks workspace, navigate to the "Clusters" tab.
                Find the cluster you want to terminate from the list of active clusters.
                Click on the down-arrow next to the cluster name to open the cluster details.
                Click on the "Terminate" button. A confirmation dialog will appear. Click "Terminate" again to confirm the action.
        Provided question:
        {question}
        Provided answer:
        {answer}
        Provided context:
        {context}
        Please provide your grading for the correctness and explain why you gave the particular grading"""
|
|
|
class Evaluator:
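    # Wraps the scoring prompt and a judge LLM; evaluate() formats the prompt with the
    # question/answer/context triple and returns the raw grading text.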
|
def __init__(self, item): |
|
self.question = item["question"] |
|
self.answer = item["answer"] |
|
|
|
self.context = item["context"] |
|
self.llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":1, "max_length":1000000}) |
|
|
|
def get_prompt_template(self): |
|
prompt = BasePromptContext() |
|
template = prompt.base_template |
|
        variables = prompt.variables_list

        eval_template = PromptTemplate(input_variables=variables, template=template)
|
return eval_template |
|
|
|
def evaluate(self): |
|
prompt = self.get_prompt_template().format(question = self.question, answer = self.answer, context = self.context) |
|
score = self.llm(prompt) |
|
return score |
|
|
|
|
|
def _create_evaluation_scenario(item): |
|
    score = Evaluator(item).evaluate()
|
return score |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tab1: |
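    # Tab 1: full RAG flow. Build or reuse the retriever, run RetrievalQA on the user's question,
    # recover the retrieved context from the Phoenix trace, then optionally score and hallucination-check the answer.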
|
|
|
with st.form(" RAG with evaluation - scoring & hallucination "): |
|
|
|
initialize_vectorstore() |
|
time.sleep(2) |
|
try: |
|
if st.session_state['docadd'] == 1: |
|
retriever = st.session_state['retriever'] |
|
else: |
|
retriever = initialize_vectorstore() |
|
        except KeyError:
|
st.session_state['docadd'] = 0 |
|
retriever = initialize_vectorstore() |
|
|
|
|
|
options = ["true", "false"] |
|
|
|
st.markdown('<h1 style="color:#100170;font-size:24px;margin:0;padding:0">User Query</h1>', unsafe_allow_html=True) |
|
question = st.text_input(label="", value="", placeholder="Type in question", label_visibility="visible", disabled=False) |
|
st.markdown("<h1 style='color:#100170;font-size:24px;margin-bottom:0;padding:0;'>Perform Evaluation</h1>", unsafe_allow_html=True) |
|
evaluate = st.radio("", ["True", "False"]) |
|
m = st.markdown(""" |
|
<style> |
|
div.stButton > button:first-child { |
|
background-color: #100170; |
|
color:#ffffff; |
|
} |
|
        /*
        div.stButton > button:hover {
            background-color: #00ff00;
            color:#ff0000;
        }
        */
        </style>""", unsafe_allow_html=True)
|
|
|
|
|
columns = st.columns([2,1,2]) |
|
|
|
if columns[1].form_submit_button(" Start RAG "): |
|
|
|
st.markdown("""<hr style="height:10px;border:none;color:#333;background-color: #100170;" /> """, unsafe_allow_html=True) |
|
|
|
            print("retriever:", retriever)
|
chain = RetrievalQA.from_chain_type( |
|
llm=llm, |
|
retriever=retriever, |
|
callbacks=[handler], |
|
return_source_documents=True |
|
) |
|
|
|
|
|
response = chain(question) |
|
print(response["result"]) |
|
|
|
|
|
rag_response(response) |
|
|
|
|
|
|
|
time.sleep(4) |
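            # Recover the retrieved context from the Phoenix trace: the most recent "LLMChain" span's
            # input payload contains the stuffed context that was passed to the LLM, and it is reused
            # below as the reference text for the evaluation prompts.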
|
|
|
df = px.active_session().get_spans_dataframe() |
|
|
|
|
|
print(df.count()) |
|
df_sorted = df.sort_values(by='end_time',ascending=False) |
|
|
|
            model_input = json.loads(df_sorted[df_sorted["name"] == "LLMChain"]["attributes.input.value"].iloc[0])
|
context = model_input["context"] |
|
|
|
print(context) |
|
|
|
            if evaluate == "True":
|
score = _create_evaluation_scenario({ |
|
"question": question, |
|
"answer": response['result'], |
|
"context": context |
|
}) |
|
hallucination_score = _create_hallucination_scenario({ |
|
"question": question, |
|
"answer": response['result'], |
|
"context": context |
|
} |
|
) |
|
            else:
                score = "Evaluation is Turned OFF"
                hallucination_score = "Evaluation is Turned OFF"
|
|
|
st.markdown('<h2 style="color:#100170;font-size:18px">Confidence Score</h2>', unsafe_allow_html=True) |
|
st.markdown(f'<div style="max-height: 150px; overflow-y: auto; background-color:#f0f0f0; padding:10px; border-radius:10px;">{score}</div>', unsafe_allow_html=True) |
|
|
|
|
|
st.markdown('<h2 style="color:#100170;font-size:18px">Hallucinated?</h2>', unsafe_allow_html=True) |
|
st.markdown(f'<div style="max-height: 150px; overflow-y: auto; background-color:#f0f0f0; padding:10px; border-radius:10px;">{hallucination_score}</div>', unsafe_allow_html=True) |
|
|
|
|
|
cleaned_context = "\n".join(line.strip() for line in context.splitlines() if line.strip()) |
|
st.markdown('<h2 style="color:#100170;font-size:18px">Context</h2>', unsafe_allow_html=True) |
|
st.markdown(f'<div style="max-height: 300px; overflow-y: auto; background-color:#f0f0f0; padding:10px; border-radius:10px;">{cleaned_context}</div>', unsafe_allow_html=True) |
|
|
|
|
|
|
|
metadata_header = '<h2 style="color:#100170;font-size:18px">Augmented Knowledge Metadata</h2>' |
|
metadata_container = '<div style="max-height: 300px; overflow-y: auto; background-color:#f0f0f0; padding:10px; border-radius:10px;">{}</div>' |
|
metadata_list = [doc.metadata for doc in response["source_documents"]] |
|
formatted_metadata_list = [] |
|
for i, metadata in enumerate(metadata_list, start=1): |
|
                formatted_metadata = f"<h2 style='color:#3366ff; font-size:16px;padding:5px 0px;'>Metadata {i}:</h2>"
|
source = metadata.get('source', '').replace('\n', '') |
|
title = metadata.get('title', '').replace('\n', '') |
|
description = metadata.get('description', '').replace('\n', '') |
|
|
|
formatted_metadata += f"<span style='color:#ff9900; font-weight:bold;'>Source:</span> <span style='color:#009900;'>{source}</span><br>" |
|
formatted_metadata += f"<span style='color:#ff9900; font-weight:bold;'>Title:</span> <span style='color:#cc00cc;'>{title}</span><br>" |
|
formatted_metadata += f"<span style='color:#ff9900; font-weight:bold;'>Description:</span> <span style='color:#ff0000;'>{description}</span><br><br>" |
|
|
|
formatted_metadata_list.append(formatted_metadata) |
|
|
|
metadata_text = '\n'.join(formatted_metadata_list) |
|
formatted_metadata_section = metadata_header + metadata_container.format(metadata_text) |
|
|
|
st.markdown(formatted_metadata_section, unsafe_allow_html=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with tab2: |
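    # Tab 2: manually supply a question, answer and context and run only the hallucination check.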
|
|
|
|
|
|
|
    with st.form(" LLM-assisted evaluation of Hallucination"):
|
|
|
|
|
|
|
question = st.text_input(label="**Question**", value="", label_visibility="visible", disabled=False) |
|
answer = st.text_input(label="**answer**", value="", label_visibility="visible", disabled=False) |
|
context = st.text_input(label="**context**", value="", label_visibility="visible", disabled=False) |
|
|
|
|
|
if st.form_submit_button("Evaluate"): |
|
hallu_eval(question, answer, context) |
|
|
|
|
|
with tab3: |
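    # Tab 3: manually supply a question, answer and context and run only the correctness scoring.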
|
|
|
|
|
with st.form("RAG scoring"): |
|
|
|
|
|
|
|
question = st.text_input(label="**Question**", value="", label_visibility="visible", disabled=False) |
|
answer = st.text_input(label="**answer**", value="", label_visibility="visible", disabled=False) |
|
context = st.text_input(label="**context**", value="", label_visibility="visible", disabled=False) |
|
|
|
|
|
if st.form_submit_button("Evaluate"): |
|
scoring_eval(question, answer, context) |
|
|
|
|
|
|
|
print("activ session: ", px.active_session().get_spans_dataframe()) |
|
trace_df = px.active_session().get_spans_dataframe() |
|
|
|
st.session_state['trace_df'] = trace_df |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rag(): |
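    # Standalone RAG helper mirroring the tab-1 flow (question input, RetrievalQA, optional follow-up evaluation);
    # it does not appear to be invoked from the tabs above.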
|
print("in rag") |
|
options = ["true", "false"] |
|
question = st.text_input(label="user question", value="", label_visibility="visible", disabled=False) |
|
evaluate = st.selectbox(label="select evaluation",options=options, index=0, placeholder="Choose an option", disabled=False, label_visibility="visible") |
|
|
|
|
|
|
|
if st.button("do RAG"): |
|
chain = RetrievalQA.from_chain_type( |
|
llm=llm, |
|
retriever=retriever, |
|
callbacks=[handler], |
|
return_source_documents=True |
|
) |
|
|
|
|
|
response = chain(question) |
|
print(response["result"]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rag_response(response) |
|
|
|
|
|
|
|
|
|
click = st.button("Do you want to see more?") |
|
if click: |
|
st.session_state.more_stuff = True |
|
|
|
        if st.session_state.get("more_stuff", False):
|
click_button(response) |
|
|
|
|
|
|
|
return(response) |
|
|
|
|
|
a = st.markdown(""" |
|
<style> |
|
div.stTextArea > textarea { |
|
background-color: #0099ff; |
|
height: 1400px; |
|
width: 800px; |
|
} |
|
</style>""", unsafe_allow_html=True) |