# chatbot / app.py
# Phoenix21's picture
# Update app.py
# 9afbd21 verified
# raw
# history blame
# 7.79 kB
# Install necessary libraries in Colab
# !pip install datasets langchain_community rank_bm25 smolagents chardet gradio pandas nltk scikit-learn
# Import required modules
import os
import getpass
import pandas as pd
import chardet
import re
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool , Tool ,LiteLLMModel
import gradio as gr
import logging
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Application-wide logging at INFO level, under a named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Daily Wellness AI Guru")

# Make sure a GROQ API key is available: prompt for it (input is not echoed)
# only when the environment does not already hold a non-empty value.
if os.environ.get('GROQ_API_KEY'):
    print("GROQ_API_KEY is already set.")
else:
    os.environ['GROQ_API_KEY'] = getpass.getpass('Enter GROQ_API_KEY: ')
# Vocabulary used by the input validator to recognise real words.
def _load_english_words():
    """Return the NLTK ``words`` corpus as a set, downloading it if it is missing."""
    try:
        return set(words.words())
    except LookupError:
        # The corpus is not available locally yet: download it, then retry.
        import nltk
        nltk.download('words')
        return set(words.words())


english_words = _load_english_words()
# Health-and-wellness subject areas the assistant is intended to cover.
ALLOWED_TOPICS = [
    "mental health", "physical health", "fitness",
    "nutrition", "exercise", "mindfulness",
    "sleep", "stress management", "wellness",
    "relaxation", "healthy lifestyle", "self-care",
    "meditation", "diet", "hydration",
    "breathing techniques", "yoga", "stress relief",
    "emotional health", "spiritual health", "healthy habits",
]
def is_valid_input(query):
    """
    Decide whether a user question is usable.

    Returns a ``(is_valid, message)`` tuple. The query is rejected when it
    is empty or blank, when it is shorter than two characters once trimmed,
    or when none of its lower-cased word tokens appears in
    ``english_words``; the message states the reason.
    """
    trimmed = query.strip() if query else ""
    if trimmed == "":
        return False, "Input cannot be empty. Please provide a meaningful question."
    if len(trimmed) < 2:
        return False, "Input is too short. Please provide more context or details."

    # One recognised word is enough for the query to be accepted.
    tokens = re.findall(r'\b\w+\b', query.lower())
    if any(token in english_words for token in tokens):
        return True, "Valid input."
    return False, "Input appears unclear. Please use valid words in your question."
def similarity_search(query, corpus, threshold=0.2):
    """
    Find the corpus entry most similar to ``query`` using TF-IDF vectors
    and cosine similarity.

    Args:
        query: The text to look up.
        corpus: List of reference strings to compare the query against.
        threshold: Minimum cosine similarity for a match to count.

    Returns:
        A ``(found, best_match, score)`` tuple. When the highest similarity
        reaches ``threshold``, ``found`` is True and ``best_match`` is the
        corresponding corpus entry; otherwise ``found`` is False and
        ``best_match`` is None. ``score`` is always the highest similarity
        seen. An empty corpus yields ``(False, None, 0.0)``.
    """
    # Nothing to compare against: report "no match" instead of letting the
    # similarity computation fail on an empty matrix.
    if not corpus:
        return False, None, 0.0

    # Fit on corpus and query together so they share one vocabulary; the
    # query is appended last and therefore ends up as the final row.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus + [query])
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1]).flatten()

    # Locate the best match once and reuse the index for both score and text.
    best_idx = int(similarities.argmax())
    best_score = similarities[best_idx]
    if best_score >= threshold:
        return True, corpus[best_idx], best_score
    return False, None, best_score
# Load and process the AIChatbot.csv file
def load_csv(file_path):
    """
    Read a CSV file and convert it into documents for retrieval.

    The file's encoding is guessed with chardet from its raw bytes before
    pandas parses it. Each row becomes one ``Document`` whose content is the
    row rendered as text (without the index) and whose metadata records the
    source path. The non-null values of the ``Question`` column are
    collected separately.

    Returns:
        ``(documents, questions)`` on success. If any step raises, the
        error is logged and ``([], [])`` is returned instead.
    """
    try:
        with open(file_path, 'rb') as raw_file:
            detected = chardet.detect(raw_file.read())
        frame = pd.read_csv(file_path, encoding=detected['encoding'])
        questions = frame['Question'].dropna().tolist()

        documents = []
        for _, row in frame.iterrows():
            documents.append(
                Document(page_content=row.to_string(index=False), metadata={"source": file_path})
            )

        logger.info(f"Loaded {len(documents)} documents from {file_path}")
        return documents, questions
    except Exception as e:
        logger.error(f"Error loading CSV file: {e}")
        return [], []
# Load the knowledge base. AIChatbot.csv must be uploaded to the environment.
file_path = "AIChatbot.csv"
source_docs, corpus_questions = load_csv(file_path)
if not source_docs:
    # Without documents there is nothing to retrieve from, so stop at startup.
    raise ValueError(f"Failed to load documents from {file_path}. Please check the file.")

# Split the documents into overlapping chunks (size 500, overlap 50). The
# splitter tries the separators in the order listed: blank line, newline,
# full stop, space, and finally the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    strip_whitespace=True,
    add_start_index=True,
)
docs_processed = text_splitter.split_documents(source_docs)
logger.info(f"Split documents into {len(docs_processed)} chunks.")
# Define the retriever tool
class RetrieverTool(Tool):
    """Agent tool that looks up knowledge-base text through a BM25 retriever."""

    # Metadata attributes declared for the Tool base class.
    name = "retriever"
    description = "Uses semantic search to retrieve the parts of chatbot documentation most relevant to the query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. Use an affirmative tone rather than a question."
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        """Build a BM25 retriever (``k=10``) over ``docs``; extra kwargs go to ``Tool``."""
        super().__init__(**kwargs)
        self.retriever = BM25Retriever.from_documents(docs, k=10)

    def forward(self, query: str) -> str:
        """Return the stripped content of the first retrieved document, or a fallback message."""
        assert isinstance(query, str), "Search query must be a string."
        hits = self.retriever.invoke(query)
        if not hits:
            return "No relevant information found."
        # Only the first hit is handed back to the agent.
        return hits[0].page_content.strip()
# Tools available to the agent: web search plus retrieval over the CSV chunks.
duckduckgo_search_tool = DuckDuckGoSearchTool()
retriever_tool = RetrieverTool(docs_processed)
# Persona and answer-format instructions for the assistant. This text is
# prepended to the user's question when the agent is run (see
# gradio_interface), so any edit here changes what is sent to the model.
custom_prompt = """
You are Daily Wellness AI Guru, a friendly and knowledgeable assistant here to simplify wellness. Your goal is to provide clear, concise, and actionable answers to the user's health and wellness-related questions. Mention how Daily Wellness AI offers tailored solutions for day-to-day wellness tasks. Use a warm and friendly tone to make the user feel at ease.
When answering:
1. Address the user warmly with "Hello! This is Daily Wellness AI Guru."
2. Highlight the key points in an easy-to-understand manner.
3. Include practical examples, tips, or short guides where relevant.
4. Format the response for clarity using markdown (e.g., numbered lists, bullet points).
5. Reinforce how Daily Wellness AI helps simplify wellness through AI-powered solutions.
6. End with an engaging and polite closing remark that invites further questions.
"""
# Language model served through LiteLLM (the model must be available to the
# configured GROQ account), wrapped in a smolagents CodeAgent that may call
# the retriever and the web-search tool.
model = LiteLLMModel("groq/llama3-8b-8192")
agent = CodeAgent(
    tools=[retriever_tool, duckduckgo_search_tool],
    model=model,
    max_iterations=4,
    verbose=True,
)
# Gradio interface for interacting with the RAG pipeline
def gradio_interface(query):
    """
    Answer one user question for the Gradio UI.

    Steps:
      1. Validate the text with ``is_valid_input``; an invalid query returns
         the validator's message unchanged.
      2. Compare the query with the questions loaded from the CSV.
      3. If a sufficiently similar question exists, run the agent on the
         wellness prompt plus the question; otherwise fall back to a
         DuckDuckGo web search and append a short closing note.

    Args:
        query: Raw text entered by the user.

    Returns:
        The text to display. Any exception raised along the way is logged
        with its traceback and replaced by a generic error message.
    """
    try:
        is_valid, message = is_valid_input(query)
        if not is_valid:
            return message

        # Only the match flag is needed; the matched text and score are unused.
        similar, _, _ = similarity_search(query, corpus_questions, threshold=0.2)
        if similar:
            response = agent.run(f"{custom_prompt}\n\nQuestion: {query}")
            # The agent's final answer is not guaranteed to be a string.
            return str(response).strip()

        # smolagents tools are run by calling the tool object; they do not
        # provide an `.invoke()` method, so the previous call always raised
        # and this fallback branch only ever produced the generic error.
        search_text = str(duckduckgo_search_tool(query)).strip()
        return f"{search_text}\n\nRemember, Daily Wellness AI is here to simplify wellness with AI-powered solutions. Feel free to ask more questions!"
    except Exception as e:
        # Same message as before, now with the traceback attached.
        logger.exception(f"Error during query processing: {e}")
        return "**An error occurred while processing your request. Please try again later.**"
# Gradio front end: a single text box feeds gradio_interface and the
# returned text is rendered as Markdown.
interface = gr.Interface(
    title="Daily Wellness AI Guru Chatbot",
    description="Ask health and wellness questions. Get actionable, friendly advice from your wellness companion.",
    theme="compact",
    fn=gradio_interface,
    inputs=gr.Textbox(
        label="Enter your question",
        placeholder="e.g., How does box breathing help reduce anxiety?",
    ),
    outputs=gr.Markdown(label="Answer"),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch(debug=True)