# Install necessary libraries in Colab
# !pip install datasets langchain langchain_community smolagents litellm chardet gradio pandas nltk scikit-learn

# Import required modules
import os
import getpass
import pandas as pd
import chardet
import re
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool, CodeAgent, LiteLLMModel
import gradio as gr
import logging
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Prompt for the Groq API key if it is not already set in the environment
if 'GROQ_API_KEY' not in os.environ or not os.environ['GROQ_API_KEY']:
    os.environ['GROQ_API_KEY'] = getpass.getpass('Enter GROQ_API_KEY: ')
else:
    print("GROQ_API_KEY is already set.")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load NLTK word list for valid word checks
try:
    english_words = set(words.words())
except LookupError:
    import nltk
    nltk.download('words')
    english_words = set(words.words())

# Define allowed topics for health and wellness.
# NOTE: this list is not referenced anywhere else in the script; off-topic
# queries are currently filtered by the TF-IDF similarity check against the
# dataset questions. It is kept here as documentation of the intended scope.
ALLOWED_TOPICS = [
    "mental health",
    "physical health",
    "fitness",
    "nutrition",
    "exercise",
    "mindfulness",
    "sleep",
    "stress management",
    "wellness",
    "relaxation",
    "healthy lifestyle",
    "self-care",
    "meditation",
    "diet",
    "hydration",
    "breathing techniques",
    "yoga",
    "stress relief",
    "emotional health",
    "spiritual health",
    "healthy habits"
]

def is_valid_input(query):
    """
    Validate the user's input question.
    """
    if not query or query.strip() == "":
        return False, "Input cannot be empty. Please provide a meaningful question."

    if len(query.strip()) < 2:
        return False, "Input is too short. Please provide more context or details."

    # Check for valid words
    words_in_text = re.findall(r'\b\w+\b', query.lower())
    recognized_words = [word for word in words_in_text if word in english_words]

    if not recognized_words:
        return False, "Input appears unclear. Please use valid words in your question."

    return True, "Valid input."
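
# Illustrative behavior of the validator above (comments only, not executed):
#   is_valid_input("")                      -> (False, "Input cannot be empty. ...")
#   is_valid_input("zzqx vbnm")             -> (False, "Input appears unclear. ...")
#   is_valid_input("How can I sleep well?") -> (True, "Valid input.")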

def similarity_search(query, corpus, threshold=0.2):
    """
    Perform similarity search using TF-IDF and cosine similarity.

    Returns (matched, most_similar_question, max_similarity).
    """
    if not corpus:
        return False, None, 0.0
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus + [query])
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1]).flatten()
    max_similarity = similarities.max()
    if max_similarity >= threshold:
        most_similar_idx = similarities.argmax()
        return True, corpus[most_similar_idx], max_similarity
    return False, None, max_similarity
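
# Quick sanity check (hypothetical corpus, safe to delete):
#   ok, match, score = similarity_search(
#       "tips for better sleep", ["How can I sleep better?"]
#   )
#   `ok` is True only when the best cosine similarity clears the threshold.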

# Helper to load and process a CSV file
def load_csv(file_path):
    """
    Load and process a CSV file into (documents, questions).
    Expects a 'Question' column; each full row becomes one Document.
    """
    try:
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
            encoding = result['encoding']
        data = pd.read_csv(file_path, encoding=encoding)
        questions = data['Question'].dropna().tolist()
        documents = [
            Document(page_content=row.to_string(index=False), metadata={"source": file_path})
            for _, row in data.iterrows()
        ]
        logger.info(f"Loaded {len(documents)} documents from {file_path}")
        return documents, questions
    except Exception as e:
        logger.error(f"Error loading CSV file: {e}")
        return [], []
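
# Expected CSV shape (assumed from the code above): at least a "Question"
# column, e.g.
#   Question,Answer
#   "How do I start meditating?","Begin with five minutes of focused breathing."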

# Load the AIChatbot.csv file
file_path = "AIChatbot.csv"  # Ensure this file is uploaded to your environment
source_docs, corpus_questions = load_csv(file_path)
if not source_docs:
    raise ValueError(f"Failed to load documents from {file_path}. Please check the file.")

# Split documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)
docs_processed = text_splitter.split_documents(source_docs)
logger.info(f"Split documents into {len(docs_processed)} chunks.")

# Define the retriever tool
class RetrieverTool(Tool):
    name = "retriever"
    description = "Uses semantic search to retrieve the parts of chatbot documentation most relevant to the query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. Use an affirmative tone rather than a question."
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)
        self.retriever = BM25Retriever.from_documents(docs, k=10)

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Search query must be a string."
        docs = self.retriever.invoke(query)
        # k=10 candidates are retrieved, but only the single best match is returned
        if docs:
            return docs[0].page_content.strip()
        return "No relevant information found."

retriever_tool = RetrieverTool(docs_processed)
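
# Optional standalone check of the retriever (no LLM call involved):
# print(retriever_tool.forward("benefits of meditation"))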

# Define the custom prompt prepended to every query
custom_prompt = """
You are a friendly and knowledgeable AI assistant for a daily wellness company. Your goal is to provide clear, concise, and actionable answers to the user's health and wellness-related questions. Use a warm, approachable tone to make the user feel at ease.

When answering:
1. Focus on brevity without sacrificing accuracy or helpfulness.
2. Highlight key points in an easy-to-understand manner.
3. Include examples, tips, or short step-by-step guides where relevant.
4. Format lists or steps using markdown for better readability (e.g., numbered lists, bullet points).
5. Ensure your response is self-contained, engaging, and ends with a polite closing remark.

Answer each question in a similar concise, helpful, and friendly way.
"""

# Define the agent using smolagents
# LiteLLM routes this model id to Groq; GROQ_API_KEY must be set (see above)
model = LiteLLMModel("groq/llama3-8b-8192")
# Note: newer smolagents releases name these arguments `max_steps` and
# `verbosity_level`; rename them if your installed version rejects these.
agent = CodeAgent(
    tools=[retriever_tool], model=model, max_iterations=4, verbose=True
)
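
# Optional smoke test before launching the UI (makes one Groq API call):
# print(agent.run(f"{custom_prompt}\n\nQuestion: What is box breathing?"))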

# Gradio interface for interacting with the RAG pipeline
def gradio_interface(query):
    try:
        is_valid, message = is_valid_input(query)
        if not is_valid:
            return message

        # Perform similarity search to verify the query's viability
        similar, similar_question, similarity_score = similarity_search(query, corpus_questions, threshold=0.2)
        if not similar:
            return (
                "I'm here to assist with health and wellness-related topics. "
                "However, I couldn't find a closely related question in the dataset. "
                "Please refine your query."
            )
        
        # Directly query the agent if the question is valid
        return str(agent.run(f"{custom_prompt}\n\nQuestion: {query}")).strip()
    except Exception as e:
        logger.error(f"Error during query processing: {e}")
        return "**An error occurred while processing your request. Please try again later.**"

interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Enter your question", placeholder="e.g., How does box breathing help reduce anxiety?"),
    outputs=gr.Markdown(label="Answer"),
    title="AI Chatbot for Wellness",
    description="Ask questions based on the AIChatbot.csv file. Focus on health and wellness topics.",
    theme="compact"
)

if __name__ == "__main__":
    interface.launch(debug=True)