# Imports

In [5]:
# pip install -U sentence-transformers

In [1]:
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import Ollama
from langchain.evaluation import load_evaluator
import faiss
import pandas as pd
import numpy as np
import pickle
import time
from tqdm import tqdm

  from tqdm.autonotebook import tqdm, trange


# Initialization

In [2]:
# Load the FAISS index from disk; the search helpers below query it via `index.search`.
index = faiss.read_index("database/pdf_sections_index.faiss")

In [3]:
# Sentence-embedding model used to encode each query before the FAISS lookup.
# NOTE(review): presumably the same model that embedded the indexed sections — confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
# Load the section records that the search helpers index by FAISS label
# (each record is read for its 'content' and 'metadata' keys).
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files produced by this project or another trusted source.
with open('database/pdf_sections_data.pkl', 'rb') as f:
    sections_data = pickle.load(f)

# RAG functions

In [5]:
def search_faiss(query, k=3):
    """Return up to `k` indexed sections nearest to `query`.

    The query is embedded with the module-level `model`, looked up in the
    module-level FAISS `index`, and every hit is joined with its record in
    `sections_data`.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A list of dicts with keys 'distance', 'content' and 'metadata', in the
        order the index returned them. The list can be shorter than `k` when
        the index has fewer than `k` vectors to offer.
    """
    # encode() receives a one-element batch; take that single vector and
    # reshape it to the (1, dim) float32 matrix that index.search expects.
    query_vector = model.encode([query])[0].astype('float32')
    query_vector = np.expand_dims(query_vector, axis=0)
    distances, indices = index.search(query_vector, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with label -1. Indexing
        # sections_data[-1] would silently return the *last* section as if it
        # were a real hit, so padded slots are skipped.
        if idx < 0:
            continue
        record = sections_data[idx]
        results.append({
            'distance': dist,
            'content': record['content'],
            'metadata': record['metadata']
        })

    return results

In [15]:
# Baseline prompt: answer from the supplied context and admit ignorance otherwise.
# The {context} and {question} placeholders are the template's two input variables.
prompt_template = """
You are an AI assistant specialized in Mental Health guidelines. 
Use the following pieces of context to answer the question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}

Answer:"""

# Wrap the raw text in a PromptTemplate, declaring the variables it expects.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Answering model, accessed through the Ollama LLM wrapper.
llm = Ollama(
    model="llama3"
)

# Create the chain that pairs `prompt` with `llm`.
# `answer_question` below reads this global `chain` when it is called.
chain = LLMChain(llm=llm, prompt=prompt)

def answer_question(query):
    """Answer `query` with the LLM chain, grounded on retrieved sections.

    Sections are fetched with `search_faiss`, their 'content' fields are
    joined with blank lines into one context string, and that context plus
    the question is passed to the module-level `chain`.

    Note: `chain` is resolved when the function is called, so rebinding the
    global `chain` later in the notebook changes what this function runs.

    Returns:
        Whatever `chain.run` returns for the given context and question.
    """
    retrieved = search_faiss(query)
    passages = [hit['content'] for hit in retrieved]
    return chain.run(context="\n\n".join(passages), question=query)

# Reading ground truth (GT)

In [16]:
# Ground-truth evaluation set; later cells read its 'Questions' and 'Answers' columns.
# NOTE(review): the previewed frame carries an 'Unnamed: 0' column, which suggests the
# CSV was written with its index — consider index_col=0 after confirming.
df = pd.read_csv('data/MentalHealth_Dataset.csv')

In [17]:
# Run every ground-truth question through the baseline pipeline, keeping the
# answer and the elapsed time (in seconds) of each call.
time_list = []
response_list = []
for i in tqdm(range(len(df))):
    query = df['Questions'].values[i]
    # perf_counter() is monotonic and high-resolution; differences of
    # time.time() can be skewed by system clock adjustments.
    start = time.perf_counter()
    response = answer_question(query)
    end = time.perf_counter()
    time_list.append(end - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [01:45<00:00, 10.55s/it]


In [18]:
# Attach the measured latency (seconds) and the generated answer to each ground-truth row.
df['latency'] = time_list
df['response'] = response_list

# Evaluation

In [29]:
# Judge model passed to the criteria evaluators; a different model ("phi3")
# from the "llama3" model that produces the answers.
eval_llm = Ollama(
    model="phi3"
)

In [30]:
# Criterion names; each one is passed in turn as `criteria=` to `load_evaluator`.
metrics = [
    'correctness',
    'relevance',
    'coherence',
    'conciseness',
]

In [31]:
# Grade every baseline response once per criterion with the judge LLM and
# store the judge's reasoning, value and score as three new columns of `df`
# named "<metric>_reasoning", "<metric>_value" and "<metric>_score".
for metric in metrics:
    evaluator = load_evaluator("labeled_criteria", criteria=metric, llm=eval_llm)

    # One list per result field; insertion order fixes the column order.
    collected = {'reasoning': [], 'value': [], 'score': []}

    for row in tqdm(range(len(df))):
        verdict = evaluator.evaluate_strings(
            prediction=df.response.values[row],
            input=df.Questions.values[row],
            reference=df.Answers.values[row]
        )
        for field, bucket in collected.items():
            bucket.append(verdict[field])

    for field, bucket in collected.items():
        df[metric + '_' + field] = bucket

100%|███████████████████████████████████████████| 10/10 [01:15<00:00,  7.51s/it]
100%|███████████████████████████████████████████| 10/10 [00:59<00:00,  5.99s/it]
100%|███████████████████████████████████████████| 10/10 [00:50<00:00,  5.10s/it]
100%|███████████████████████████████████████████| 10/10 [00:48<00:00,  4.88s/it]


In [78]:
# Preview the first rows of the baseline frame, including the evaluation columns added above.
df.head()

Unnamed: 0,Questions,Answers,latency,response,correctness_reasoning,correctness_value,correctness_score,relevance_reasoning,relevance_value,relevance_score,coherence_reasoning,coherence_value,coherence_score,conciseness_reasoning,conciseness_value,conciseness_score
0,What is Mental Health,"Mental Health is a "" state of well-being in wh...",11.974234,"Based on the provided context, specifically fr...",The submission refers to the provided input wh...,Y,1,Step 1: Evaluate relevance criterion\nThe subm...,Y,1,Step 1: Assess coherence\nThe submission direc...,Y,1,1. The submission directly answers the questio...,Y,1
1,What are the most common mental disorders ment...,The most common mental disorders include depre...,5.863329,"Based on the provided context, the mental diso...",Step 1: Check if the submission is factually a...,Y,1,Step 1: Analyze the relevance criterion\nThe s...,Y,1,The submission begins with an appropriate ques...,Y,1,Step 1: Review conciseness criterion\nThe subm...,Y,1
2,What are the early warning signs and symptoms ...,Early warning signs and symptoms of depression...,13.434543,"Based on the provided context, I found a refer...",Step 1: Evaluate Correctness\nThe submission a...,Y,1,Step 1: Identify the relevant criterion from t...,Y,1,Step 1: Evaluate coherence\nThe submission is ...,Y,1,Step 1: Evaluate conciseness - The submission ...,Y,1
3,How can someone help a person who suffers from...,"To help someone with anxiety, one can support ...",13.838464,"According to the provided context, specificall...",Step 1: Correctness\nThe submission accurately...,Y,1,Step 1: Analyze relevance criterion\nThe submi...,Y,1,Step 1: Evaluate coherence\nThe submission dis...,Y,1,Step 1: Evaluate conciseness - The submission ...,N,0
4,What are the causes of mental illness listed i...,Causes of mental illness include abnormal func...,6.871735,"According to the provided context, the causes ...",The submission lists factors that align with t...,N,0,Step 1: Review relevance criterion - Check if ...,Y,1,Step 1: Compare the submission with the provid...,Y,1,Step 1: Assess conciseness\nThe submission is ...,Y,1


In [32]:
# Mean of each criterion score and of the latency across all ground-truth questions.
df[['correctness_score','relevance_score','coherence_score','conciseness_score','latency']].mean()

correctness_score     0.800000
relevance_score       0.900000
coherence_score       1.000000
conciseness_score     0.800000
latency              10.544803
dtype: float64

In [34]:
# Questions unrelated to the indexed material; later cells check whether the
# responses contain "I don't know".
irr_q=pd.read_csv('data/Unrelated_questions.csv')

In [35]:
# Run every unrelated question through the baseline pipeline, keeping the
# answer and the elapsed time (in seconds) of each call.
time_list = []
response_list = []
for i in tqdm(range(len(irr_q))):
    query = irr_q['Questions'].values[i]
    # perf_counter() is monotonic and high-resolution; differences of
    # time.time() can be skewed by system clock adjustments.
    start = time.perf_counter()
    response = answer_question(query)
    end = time.perf_counter()
    time_list.append(end - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [01:02<00:00,  6.30s/it]


In [36]:
# Attach the generated answer and measured latency (seconds) to each unrelated question.
irr_q['response']=response_list
irr_q['latency']=time_list

In [79]:
# Preview the unrelated-question results.
# NOTE: the 'irrelevant_score' column in the saved output is created by a later
# cell; a fresh top-to-bottom run will not show it at this point.
irr_q.head()

Unnamed: 0,Questions,response,latency,irrelevant_score
0,What is the capital of Mars?,I don't know. The provided context does not se...,12.207266,True
1,How many unicorns live in New York City?,I don't know. The information provided does no...,2.368774,True
2,What is the color of happiness?,I don't know! The provided context only talks ...,5.480067,True
3,Can cats fly on Tuesdays?,I don't know the answer to this question as it...,5.272529,True
4,How much does a thought weigh?,I don't know. The context provided is about me...,5.253224,True


In [37]:
# Show the per-question latency (seconds) for the unrelated questions.
irr_q['latency']

0    12.207266
1     2.368774
2     5.480067
3     5.272529
4     5.253224
5     5.351224
6     8.118429
7     7.288261
8     3.856500
9     7.745016
Name: latency, dtype: float64

In [39]:
# Flag responses in which the model declined to answer.
# regex=False: the phrase is a literal substring, not a pattern.
# na=False: a missing response counts as "did not decline" and keeps the column
# boolean, so the mean computed afterwards is a plain rate.
irr_q['irrelevant_score'] = irr_q['response'].str.contains("I don't know", regex=False, na=False)

In [40]:
# Share of unrelated questions answered with "I don't know", and the mean latency.
irr_q[['irrelevant_score','latency']].mean()

irrelevant_score    0.900000
latency             6.294129
dtype: float64

# Improvement

In [48]:
# Revised prompt: asks for a short answer and fixes the exact refusal wording.
# The {context} and {question} placeholders are the template's two input variables.
new_prompt_template = """
You are an AI assistant specialized in Mental Health guidelines.
Use the provided context to answer the question short and accurately. 
If you don't know the answer, simply say, "I don't know."

Context:
{context}

Question: {question}

Answer:"""

# NOTE: this rebinds the globals `prompt`, `llm` and `chain` defined earlier.
# `answer_question` reads the global `chain` at call time, so after this cell
# runs it also uses the revised prompt.
prompt = PromptTemplate(template=new_prompt_template, input_variables=["context", "question"])

# Same answering model name as before ("llama3"), re-created here.
llm = Ollama(
    model="llama3"
)

# Create the chain that pairs the revised `prompt` with `llm`.
chain = LLMChain(llm=llm, prompt=prompt)

def answer_question_new(query):
    """Answer `query` with whatever chain the global `chain` currently names.

    The steps match `answer_question`: fetch sections with `search_faiss`,
    join their 'content' fields with blank lines, and run the chain on that
    context and the question. Any difference in output comes from the global
    `chain` having been rebuilt with a different prompt.

    Returns:
        Whatever `chain.run` returns for the given context and question.
    """
    context_parts = []
    for hit in search_faiss(query):
        context_parts.append(hit['content'])
    joined_context = "\n\n".join(context_parts)

    answer = chain.run(context=joined_context, question=query)
    return answer

In [49]:
# Work on a copy so the baseline results in `df` are preserved. The copy still
# carries the baseline latency/response/score columns until later cells overwrite them.
df2=df.copy()

In [50]:
# Re-run the ground-truth questions with the revised prompt, keeping the
# answer and the elapsed time (in seconds) of each call.
time_list = []
response_list = []
for i in tqdm(range(len(df2))):
    query = df2['Questions'].values[i]
    # perf_counter() is monotonic and high-resolution; differences of
    # time.time() can be skewed by system clock adjustments.
    start = time.perf_counter()
    # Call the function defined for this experiment. The baseline
    # `answer_question` produced revised-prompt answers here only because the
    # global `chain` had been rebound, which breaks if cells are re-run in a
    # different order.
    response = answer_question_new(query)
    end = time.perf_counter()
    time_list.append(end - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [01:34<00:00,  9.40s/it]


In [51]:
# Overwrite the copied baseline values with the revised-prompt latency (seconds) and answers.
df2['latency'] = time_list
df2['response'] = response_list

In [52]:
# Grade every revised-prompt response once per criterion with the judge LLM,
# overwriting the "<metric>_reasoning", "<metric>_value" and "<metric>_score"
# columns that `df2` inherited from the baseline frame.
for metric in metrics:
    evaluator = load_evaluator("labeled_criteria", criteria=metric, llm=eval_llm)

    # Parallel lists keyed by result field; insertion order fixes column order.
    per_field = {'reasoning': [], 'value': [], 'score': []}

    for position in tqdm(range(len(df2))):
        outcome = evaluator.evaluate_strings(
            prediction=df2.response.values[position],
            input=df2.Questions.values[position],
            reference=df2.Answers.values[position]
        )
        for field_name in per_field:
            per_field[field_name].append(outcome[field_name])

    for field_name, column_values in per_field.items():
        df2[metric + '_' + field_name] = column_values

100%|███████████████████████████████████████████| 10/10 [01:00<00:00,  6.01s/it]
100%|███████████████████████████████████████████| 10/10 [00:53<00:00,  5.35s/it]
100%|███████████████████████████████████████████| 10/10 [00:47<00:00,  4.77s/it]
100%|███████████████████████████████████████████| 10/10 [00:55<00:00,  5.60s/it]


In [77]:
# Preview the first rows of the revised-prompt results.
df2.head()

Unnamed: 0,Questions,Answers,latency,response,correctness_reasoning,correctness_value,correctness_score,relevance_reasoning,relevance_value,relevance_score,coherence_reasoning,coherence_value,coherence_score,conciseness_reasoning,conciseness_value,conciseness_score
0,What is Mental Health,"Mental Health is a "" state of well-being in wh...",11.046327,"Based on the context provided, mental health r...",Step 1: Evaluate if the submission is factuall...,N,0,Step 1: Analyze the relevance criterion\nThe s...,N,0,The submission discusses mental health in rela...,Y,1,Step 1: Analyze conciseness criterion\nThe sub...,Y,1
1,What are the most common mental disorders ment...,The most common mental disorders include depre...,4.509713,The handbook mentions several mental illnesses...,The submission mentions depression and schizop...,N,0,Step 1: Analyze relevance criterion - Check if...,Y,1,Step 1: Assess coherence\nThe submission menti...,N,0,Step 1: Analyze conciseness criterion\nThe sub...,N,0
2,What are the early warning signs and symptoms ...,Early warning signs and symptoms of depression...,8.50118,"According to the provided context, specificall...",The submission matches the reference data in t...,Y,1,The submission refers directly to information ...,Y,1,Step 1: Evaluate coherence - The submission is...,Y,1,The submission is concise and includes most of...,Y,1
3,How can someone help a person who suffers from...,"To help someone with anxiety, one can support ...",10.611402,"According to the Mental Health Handbook, when ...",The submission seems consistent with the refer...,Y,1,Step 1: Review relevance criterion\nThe submis...,Y,1,"The submission is coherent, well-structured, a...",Y,1,The submission is relatively concise and cover...,Y,1
4,What are the causes of mental illness listed i...,Causes of mental illness include abnormal func...,6.299272,"According to the context, the causes of mental...","The submission lists causes such as neglect, s...",N,0,The submission mentions factors that are part ...,N,0,The submission is coherent and well-structured...,Y,1,Step 1: Read and understand both the input dat...,N,0


In [47]:
# Mean of each criterion score and of the latency for the revised-prompt run.
# NOTE(review): this cell's execution count (47) is lower than that of the cells
# that build `df2` above, so the saved output may come from an earlier run — re-run to confirm.
df2[['correctness_score','relevance_score','coherence_score','conciseness_score','latency']].mean()

correctness_score    0.500000
relevance_score      0.888889
coherence_score      0.888889
conciseness_score    0.900000
latency              8.190205
dtype: float64

# Query relevance

In [66]:
def new_search_faiss(query, k=3, threshold=1.5):
    """Return the indexed sections near `query` whose distance is below `threshold`.

    The query is embedded with the module-level `model`, looked up in the
    module-level FAISS `index`, and every accepted hit is joined with its
    record in `sections_data`.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours to request from the index.
        threshold: Exclusive upper bound on the returned distance; hits at or
            above it are dropped.
            NOTE(review): "smaller is closer" assumes a distance-based index
            (e.g. L2) — confirm against how the index was built.

    Returns:
        A list of dicts with keys 'distance', 'content' and 'metadata', in the
        order the index returned them. The list is empty when nothing is
        close enough.
    """
    # encode() receives a one-element batch; take that single vector and
    # reshape it to the (1, dim) float32 matrix that index.search expects.
    query_vector = model.encode([query])[0].astype('float32')
    query_vector = np.expand_dims(query_vector, axis=0)
    distances, indices = index.search(query_vector, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with label -1. The distance stored in
        # a padded slot is not guaranteed to fail the threshold test, and
        # sections_data[-1] would silently return the last section.
        if idx < 0:
            continue
        if dist < threshold:  # Only include results within the threshold distance
            record = sections_data[idx]
            results.append({
                'distance': dist,
                'content': record['content'],
                'metadata': record['metadata']
            })

    return results

In [70]:
# Revised prompt: asks for a short answer and fixes the exact refusal wording.
# The {context} and {question} placeholders are the template's two input variables.
new_prompt_template = """
You are an AI assistant specialized in Mental Health guidelines.
Use the provided context to answer the question short and accurately. 
If you don't know the answer, simply say, "I don't know."

Context:
{context}

Question: {question}

Answer:"""

# Build the prompt from the template defined in this cell. The original passed
# the baseline `prompt_template`, so the revised wording above was never used.
# NOTE: this rebinds the globals `prompt`, `llm` and `chain`; functions that
# read the global `chain` at call time pick up this chain from here on.
prompt = PromptTemplate(template=new_prompt_template, input_variables=["context", "question"])

# Same answering model name as before ("llama3"), re-created here.
llm = Ollama(
    model="llama3"
)

# Create the chain that pairs the revised `prompt` with `llm`.
chain = LLMChain(llm=llm, prompt=prompt)

def new_answer_question(query):
    """Answer `query`, skipping the LLM when retrieval finds nothing relevant.

    Sections are fetched with `new_search_faiss`, which drops hits beyond its
    distance threshold. If no section survives, a fixed refusal string is
    returned without calling the chain; otherwise the sections' 'content'
    fields are joined with blank lines and passed to the module-level `chain`.

    Returns:
        The fixed refusal string, or whatever `chain.run` returns.
    """
    hits = new_search_faiss(query)

    # Guard clause: nothing relevant retrieved, so answer without the LLM.
    if not hits:
        return "I don't know, sorry"

    passages = [hit['content'] for hit in hits]
    return chain.run(context="\n\n".join(passages), question=query)

In [71]:
# Work on a copy so the baseline unrelated-question results in `irr_q` are preserved.
irr_q2=irr_q.copy()

In [72]:
# Run every unrelated question through the threshold-filtered pipeline,
# keeping the answer and the elapsed time (in seconds) of each call.
time_list = []
response_list = []
for i in tqdm(range(len(irr_q2))):
    # Read from the same frame whose length drives the loop and that receives
    # the results afterwards (the original read from `irr_q`).
    query = irr_q2['Questions'].values[i]
    # perf_counter() is monotonic and high-resolution; differences of
    # time.time() can be skewed by system clock adjustments.
    start = time.perf_counter()
    response = new_answer_question(query)
    end = time.perf_counter()
    time_list.append(end - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 61.93it/s]


In [73]:
# Overwrite the copied baseline values with the thresholded pipeline's answers and latency (seconds).
irr_q2['response']=response_list
irr_q2['latency']=time_list

In [80]:
# Preview the thresholded pipeline's results for the unrelated questions.
irr_q2.head()

Unnamed: 0,Questions,response,latency,irrelevant_score
0,What is the capital of Mars?,"I don't know, sorry",0.061378,True
1,How many unicorns live in New York City?,"I don't know, sorry",0.012511,True
2,What is the color of happiness?,"I don't know, sorry",0.0119,True
3,Can cats fly on Tuesdays?,"I don't know, sorry",0.011438,True
4,How much does a thought weigh?,"I don't know, sorry",0.010644,True


In [74]:
# Flag responses in which the pipeline declined to answer.
# regex=False: the phrase is a literal substring, not a pattern.
# na=False: a missing response counts as "did not decline" and keeps the column
# boolean, so the mean computed afterwards is a plain rate.
irr_q2['irrelevant_score'] = irr_q2['response'].str.contains("I don't know", regex=False, na=False)

In [75]:
# Share of unrelated questions answered with "I don't know", and the mean latency, for the thresholded pipeline.
irr_q2[['irrelevant_score','latency']].mean()

irrelevant_score    1.000000
latency             0.016068
dtype: float64