import os
import getpass
import re
import logging

import pandas as pd
import chardet
import gradio as gr
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever

from smolagents import CodeAgent, Tool, LiteLLMModel

# Prompt for the Groq API key if it is not already set in the environment.
if 'GROQ_API_KEY' not in os.environ or not os.environ['GROQ_API_KEY']:
    os.environ['GROQ_API_KEY'] = getpass.getpass('Enter GROQ_API_KEY: ')
else:
    print("GROQ_API_KEY is already set.")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the NLTK English word list, downloading it on first run if it is missing.
try:
    english_words = set(words.words())
except LookupError:
    import nltk
    nltk.download('words')
    english_words = set(words.words())

# Health and wellness topics this assistant is intended to cover.
ALLOWED_TOPICS = [
    "mental health",
    "physical health",
    "fitness",
    "nutrition",
    "exercise",
    "mindfulness",
    "sleep",
    "stress management",
    "wellness",
    "relaxation",
    "healthy lifestyle",
    "self-care",
    "meditation",
    "diet",
    "hydration",
    "breathing techniques",
    "yoga",
    "stress relief",
    "emotional health",
    "spiritual health",
    "healthy habits",
]


def is_valid_input(query):
    """
    Validate the user's input question.

    Returns a (is_valid, message) tuple.
    """
    if not query or query.strip() == "":
        return False, "Input cannot be empty. Please provide a meaningful question."

    if len(query.strip()) < 2:
        return False, "Input is too short. Please provide more context or details."

    # Require at least one recognizable English word.
    words_in_text = re.findall(r'\b\w+\b', query.lower())
    recognized_words = [word for word in words_in_text if word in english_words]

    if not recognized_words:
        return False, "Input appears unclear. Please use valid words in your question."

    return True, "Valid input."


def similarity_search(query, corpus, threshold=0.2):
    """
    Compare the query against a corpus of questions using TF-IDF vectors
    and cosine similarity.

    Returns a (match_found, most_similar_question, max_similarity) tuple.
    """
    if not corpus:
        return False, None, 0.0

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus + [query])
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1]).flatten()
    max_similarity = similarities.max()
    if max_similarity >= threshold:
        most_similar_idx = similarities.argmax()
        return True, corpus[most_similar_idx], max_similarity
    return False, None, max_similarity


def load_csv(file_path):
    """
    Load a CSV file into LangChain documents and extract the 'Question' column.

    Returns (documents, questions); both are empty lists on failure.
    """
    try:
        # Detect the file encoding first, since the CSV may not be UTF-8.
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
            encoding = result['encoding']
        data = pd.read_csv(file_path, encoding=encoding)
        questions = data['Question'].dropna().tolist()
        documents = [
            Document(page_content=row.to_string(index=False), metadata={"source": file_path})
            for _, row in data.iterrows()
        ]
        logger.info(f"Loaded {len(documents)} documents from {file_path}")
        return documents, questions
    except Exception as e:
        logger.error(f"Error loading CSV file: {e}")
        return [], []


# Load the source documents and the reference questions used for similarity checks.
file_path = "AIChatbot.csv"
source_docs, corpus_questions = load_csv(file_path)
if not source_docs:
    raise ValueError(f"Failed to load documents from {file_path}. Please check the file.")

# Split the documents into overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)
docs_processed = text_splitter.split_documents(source_docs)
logger.info(f"Split documents into {len(docs_processed)} chunks.")


class RetrieverTool(Tool):
    name = "retriever"
    description = "Uses BM25 retrieval to return the parts of the chatbot documentation most relevant to the query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. Use an affirmative tone rather than a question."
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)
        # BM25 is a keyword-based (lexical) retriever; k controls how many documents are returned.
        self.retriever = BM25Retriever.from_documents(docs, k=10)

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Search query must be a string."
        docs = self.retriever.invoke(query)

        if docs:
            return docs[0].page_content.strip()
        else:
            return "No relevant information found."


retriever_tool = RetrieverTool(docs_processed)

# System-style prompt prepended to every user question.
custom_prompt = """
You are a friendly and knowledgeable AI assistant for a daily wellness company. Your goal is to provide clear, concise, and actionable answers to the user's health and wellness-related questions. Use a warm, approachable tone to make the user feel at ease.

When answering:
1. Focus on brevity without sacrificing accuracy or helpfulness.
2. Highlight key points in an easy-to-understand manner.
3. Include examples, tips, or short step-by-step guides where relevant.
4. Format lists or steps using markdown for better readability (e.g., numbered lists, bullet points).
5. Ensure your response is self-contained, engaging, and ends with a polite closing remark.

Answer each question in a similar concise, helpful, and friendly way.
"""

# Groq-hosted Llama 3 model accessed through LiteLLM; the agent can call the retriever tool.
# Note: newer smolagents releases rename these arguments to max_steps and verbosity_level.
model = LiteLLMModel("groq/llama3-8b-8192")
agent = CodeAgent(
    tools=[retriever_tool], model=model, max_iterations=4, verbose=True
)


def gradio_interface(query):
    """Validate the query, check it against the dataset, and run the agent."""
    try:
        is_valid, message = is_valid_input(query)
        if not is_valid:
            return message

        # Only answer questions that are reasonably close to one in the dataset.
        similar, similar_question, similarity_score = similarity_search(query, corpus_questions, threshold=0.2)
        if not similar:
            return (
                "I'm here to assist with health and wellness-related topics. "
                "However, I couldn't find a closely related question in the dataset. "
                "Please refine your query."
            )

        return str(agent.run(f"{custom_prompt}\n\nQuestion: {query}")).strip()
    except Exception as e:
        logger.error(f"Error during query processing: {e}")
        return "**An error occurred while processing your request. Please try again later.**"


# Gradio UI: a single text box in, a markdown-formatted answer out.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Enter your question", placeholder="e.g., How does box breathing help reduce anxiety?"),
    outputs=gr.Markdown(label="Answer"),
    title="AI Chatbot for Wellness",
    description="Ask questions based on the AIChatbot.csv file. Focus on health and wellness topics.",
    theme="compact"
)

if __name__ == "__main__":
    interface.launch(debug=True)