|
import json |
|
|
|
import pandas as pd |
|
import streamlit as st |
|
from util.evaluator import evaluator, write_evaluation_commentary |
|
import os |
|
|
|
|
|
|
|
# Predefined demo conversations, keyed by the label offered in the UI.
# Each conversation is written as (role, content) pairs and expanded into
# chat-style message dicts; every message is a fresh dict, so the two
# conversations share no mutable state.
examples = {
    label: [{"role": role, "content": content} for role, content in turns]
    for label, turns in (
        (
            'good',
            (
                ("system", "You are a helpful assistant."),
                ("user", "What causes rainbows to appear in the sky?"),
                (
                    "assistant",
                    "Rainbows appear when sunlight is refracted, dispersed, and reflected inside water droplets in the atmosphere, resulting in a spectrum of light appearing in the sky.",
                ),
                ("user", "That's interesting! Why does it create so many colors?"),
            ),
        ),
        (
            'bad',
            (
                ("system", "You are a helpful assistant."),
                ("user", "What causes rainbows to appear in the sky?"),
                (
                    "assistant",
                    "Rainbows happen because light in the sky gets mixed up and sometimes shows colors when it's raining or when there is water around.",
                ),
                ("user", "That doesn't seem very clear."),
            ),
        ),
    )
}
|
|
|
|
|
|
|
def check_password():
    """Render the password prompt and record a successful login.

    Draws a masked text input and a "Submit" button. When the button is
    clicked, the submitted value is compared with the ``PASSWORD``
    environment variable; on a match ``st.session_state['password_correct']``
    is set to ``True``. Nothing is returned - callers inspect that session
    state flag.

    If ``PASSWORD`` is not set in the environment, no password is accepted.
    """
    # Local import keeps this block self-contained (stdlib only).
    import hmac

    def password_entered():
        """on_click callback: validate the submitted password."""
        # Read the value through the widget key instead of a closure
        # variable: Streamlit runs callbacks before the script body reruns,
        # and session state is where the just-submitted widget value lives.
        submitted = st.session_state.get('password_input', '')
        expected = os.getenv('PASSWORD')

        # compare_digest avoids leaking how many leading characters matched
        # through response timing; encode so non-ASCII passwords are accepted.
        if expected is not None and hmac.compare_digest(
            submitted.encode('utf-8'), expected.encode('utf-8')
        ):
            st.session_state['password_correct'] = True
            # Do not keep the plaintext password around in session state.
            st.session_state.pop('password_input', None)
        else:
            st.error("Incorrect Password, please try again.")

    st.text_input("Enter Password:", type="password", key='password_input')
    submit_button = st.button("Submit", on_click=password_entered)

    if submit_button and not st.session_state.get('password_correct', False):
        st.error("Please enter a valid password to access the demo.")
|
|
|
|
|
|
|
# Markdown shown in the sidebar: a short welcome blurb followed by the five
# principles the evaluation refers to. Kept as named strings so the rendering
# calls below read as a plain sequence.
welcome_markdown = """
### Welcome to the Single Evaluation of Conversations Demo
This application allows you to evaluate the quality of conversations generated for various contexts using different language models. You can either use predefined examples or input your own conversations and contexts.
"""

principles_markdown = """
### Explanation Principles
When evaluating conversations, consider the following principles mapped to user empowerment and regulatory compliance outcomes:

1. **Factually Correct**: The information should be accurate and relevant to empower users and meet external audit requirements.
2. **Useful**: Explanations should be clear and meaningful, helping users make informed decisions.
3. **Context Specific**: Explanations should be tailored to the context of use, enhancing their relevance and utility.
4. **User Specific**: Explanations should address the needs and preferences of the user, enabling better decision-making.
5. **Provide Pluralism**: Explanations should present diverse perspectives, allowing users to understand different viewpoints and make well-rounded decisions.
"""

# Main-column heading first, then the two sidebar sections in reading order.
st.title('Single Evaluation of Conversations')
st.sidebar.write(welcome_markdown)
st.sidebar.write(principles_markdown)
|
|
|
|
|
def _parse_conversation(raw_text):
    """Parse user-entered JSON into a list of chat messages.

    Args:
        raw_text: The text typed into the conversation text area.

    Returns:
        A ``(conversation, error_message)`` tuple. On success
        ``error_message`` is ``None``; on failure ``conversation`` is
        ``None`` and ``error_message`` is a user-facing description.
    """
    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        return None, "Invalid JSON format for conversation."

    # Syntactically valid JSON is not enough: the rendering loop below indexes
    # every item by 'role' and 'content' and calls str.capitalize() on the role.
    well_formed = isinstance(parsed, list) and all(
        isinstance(message, dict)
        and isinstance(message.get('role'), str)
        and 'content' in message
        for message in parsed
    )
    if not well_formed:
        return None, (
            'Conversation must be a JSON list of objects, each with a string '
            '"role" and a "content" field.'
        )
    return parsed, None


# Gate the whole demo behind the password prompt; the flag is set in
# st.session_state by check_password() once the right password is submitted.
if not st.session_state.get('password_correct', False):
    check_password()
else:
    st.sidebar.success("Password Verified. Proceed with the demo.")
    model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])

    input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))

    if input_type == 'Use predefined example':
        example_type = st.radio("Select an example type:", ('good', 'bad'))
        conversation = examples[example_type]
        context = "Example context"
    else:
        conversation_input = st.text_area(
            'Enter your conversation (JSON format):',
            '[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who won the world series in 2020?"}, {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}]',
        )
        context_input = st.text_input('Enter your context:', 'general user')

        conversation, parse_error = _parse_conversation(conversation_input)
        if parse_error is None:
            context = context_input
        else:
            # An unusable conversation also clears the context so the
            # evaluation step below refuses to run.
            st.error(parse_error)
            context = None

    st.write('### Conversation')
    if conversation:
        for exchange in conversation:
            role = exchange['role'].capitalize()
            content = exchange['content']
            st.markdown(f"**{role}:** {content}")
    else:
        st.write('No conversation entered yet.')

    st.write('### Context')
    if context:
        st.write(context)
    else:
        st.write('No context entered yet.')

    # NOTE(review): st.button is only True on the rerun triggered by its own
    # click, so the scores and download button below presumably disappear on
    # the next interaction (including a click on the download button itself).
    # Consider persisting the results in st.session_state - confirm desired UX.
    if st.button('Evaluate Conversation'):
        if conversation and context:
            # Named to avoid shadowing the builtin eval().
            conversation_evaluator = evaluator(model_name)
            scores = conversation_evaluator.evaluate_conversation(conversation, context)

            st.write('### Scores')
            details = write_evaluation_commentary(scores["aggregate_scores"])
            st.write(pd.DataFrame(details))

            # One CSV row: the inputs plus one column per principle score.
            # The conversation is serialised as JSON (not Python repr) so the
            # exported cell can be parsed again or pasted back into the app.
            export_row = {
                'Conversation': json.dumps(conversation, ensure_ascii=False),
                'Context': context,
                **{detail['Principle']: detail['Score'] for detail in details},
            }
            csv_payload = pd.DataFrame([export_row]).to_csv(index=False)

            st.download_button(
                label="Download evaluation as CSV",
                data=csv_payload,
                file_name='evaluation.csv',
                mime='text/csv',
            )
        else:
            st.error('Please enter both a conversation and a context to evaluate.')
|
|