Spaces:

Phoenix21
/

chatbot

Sleeping

App Files Files Community

chatbot / app.py

Phoenix21

Create app.py

5ec7b71 verified 3 months ago

raw

history blame

7.79 kB

	# Install necessary libraries in Colab
	# !pip install datasets langchain langchain_community rank_bm25 smolagents litellm chardet gradio pandas nltk scikit-learn

	# Import required modules
	import os
	import getpass
	import pandas as pd
	import chardet
	import re
	from langchain.docstore.document import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.retrievers import BM25Retriever
	# from smolagents import Tool, HfApiModel, CodeAgent
	from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool, ManagedAgent
	from smolagents.agents import ToolCallingAgent
	from smolagents import Tool, HfApiModel, TransformersModel, LiteLLMModel
	from typing import Optional
	import gradio as gr
	import logging
	from nltk.corpus import words
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity


	# Make sure a Groq API key is present in the environment; prompt for one
	# interactively only when the variable is missing or empty.
	if os.environ.get('GROQ_API_KEY'):
	    print("GROQ_API_KEY is already set.")
	else:
	    os.environ['GROQ_API_KEY'] = getpass.getpass('Enter GROQ_API_KEY: ')

	# Module-level logger; root logging is configured at INFO.
	logger = logging.getLogger(__name__)
	logging.basicConfig(level=logging.INFO)

	# Build the set of known English words used by is_valid_input().
	# If the NLTK "words" corpus is not installed yet, download it and retry once.
	try:
	    _word_list = words.words()
	except LookupError:
	    import nltk
	    nltk.download('words')
	    _word_list = words.words()
	english_words = set(_word_list)
	del _word_list  # only the set is needed from here on

	# Health and wellness topics this assistant is meant to cover.
	# NOTE(review): nothing else in this file reads ALLOWED_TOPICS - confirm
	# whether it is still needed.
	ALLOWED_TOPICS = [
	    "mental health", "physical health", "fitness", "nutrition",
	    "exercise", "mindfulness", "sleep", "stress management",
	    "wellness", "relaxation", "healthy lifestyle", "self-care",
	    "meditation", "diet", "hydration", "breathing techniques",
	    "yoga", "stress relief", "emotional health", "spiritual health",
	    "healthy habits",
	]

	def is_valid_input(query):
	    """
	    Check whether a user question is usable.

	    Returns a ``(is_valid, message)`` tuple. The input is rejected when it
	    is empty or whitespace-only, shorter than two characters after
	    trimming, or contains no token found in ``english_words``.
	    """
	    trimmed = query.strip() if query else ""
	    if not trimmed:
	        return False, "Input cannot be empty. Please provide a meaningful question."

	    if len(trimmed) < 2:
	        return False, "Input is too short. Please provide more context or details."

	    # One recognised English word is enough to accept the question.
	    tokens = re.findall(r'\b\w+\b', query.lower())
	    if any(token in english_words for token in tokens):
	        return True, "Valid input."

	    return False, "Input appears unclear. Please use valid words in your question."

	def similarity_search(query, corpus, threshold=0.2):
	    """
	    Find the corpus entry most similar to ``query`` using TF-IDF and cosine similarity.

	    Args:
	        query: The text to look up.
	        corpus: Iterable of candidate strings (``None`` is treated as empty).
	        threshold: Minimum cosine similarity for a match to count.

	    Returns:
	        A ``(found, best_match, score)`` tuple. ``best_match`` is the most
	        similar corpus entry when ``score >= threshold``, otherwise ``None``.
	        With an empty corpus the result is ``(False, None, 0.0)``.
	    """
	    candidates = [] if corpus is None else list(corpus)
	    if not candidates:
	        # Nothing to compare against: report "no match" instead of letting
	        # the vectorizer / max() fail on zero rows.
	        return False, None, 0.0

	    # Fit on corpus + query together so both share one vocabulary; the
	    # query is the last row of the resulting matrix.
	    vectorizer = TfidfVectorizer()
	    tfidf_matrix = vectorizer.fit_transform(candidates + [query])
	    query_vector = tfidf_matrix[-1]
	    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1]).flatten()

	    best_idx = int(similarities.argmax())
	    best_score = float(similarities[best_idx])
	    if best_score >= threshold:
	        return True, candidates[best_idx], best_score
	    return False, None, best_score

	# Load and process the AIChatbot.csv file
	def load_csv(file_path):
	    """
	    Read a CSV file and turn every row into a ``Document``.

	    The file's encoding is detected with chardet before pandas parses it.
	    Returns ``(documents, questions)`` where ``questions`` holds the
	    non-null values of the ``Question`` column. Any failure is logged and
	    reported as ``([], [])``.
	    """
	    try:
	        with open(file_path, 'rb') as raw_file:
	            detected = chardet.detect(raw_file.read())
	        frame = pd.read_csv(file_path, encoding=detected['encoding'])
	        questions = frame['Question'].dropna().tolist()

	        # One Document per row, holding the row rendered as plain text.
	        documents = []
	        for _, record in frame.iterrows():
	            documents.append(
	                Document(page_content=record.to_string(index=False), metadata={"source": file_path})
	            )

	        logger.info(f"Loaded {len(documents)} documents from {file_path}")
	        return documents, questions
	    except Exception as e:
	        logger.error(f"Error loading CSV file: {e}")
	        return [], []

	# Read the knowledge base; start-up is aborted when no documents come back.
	file_path = "AIChatbot.csv" # Ensure this file is uploaded to your environment
	source_docs, corpus_questions = load_csv(file_path)
	if not source_docs:
	    raise ValueError(f"Failed to load documents from {file_path}. Please check the file.")

	# Break the row documents into overlapping chunks of at most 500 characters.
	text_splitter = RecursiveCharacterTextSplitter(
	    separators=["\n\n", "\n", ".", " ", ""],
	    chunk_size=500,
	    chunk_overlap=50,
	    strip_whitespace=True,
	    add_start_index=True,
	)
	docs_processed = text_splitter.split_documents(source_docs)
	logger.info(f"Split documents into {len(docs_processed)} chunks.")

	# Define the retriever tool
	class RetrieverTool(Tool):
	    """Agent tool that returns the best BM25 match for a query from the indexed chunks."""

	    name = "retriever"
	    description = "Uses semantic search to retrieve the parts of chatbot documentation most relevant to the query."
	    inputs = {
	        "query": {
	            "type": "string",
	            "description": "The query to perform. Use an affirmative tone rather than a question."
	        }
	    }
	    output_type = "string"

	    def __init__(self, docs, **kwargs):
	        """Index ``docs`` with a BM25 retriever configured for ten results."""
	        super().__init__(**kwargs)
	        self.retriever = BM25Retriever.from_documents(docs, k=10)

	    def forward(self, query: str) -> str:
	        """Return the stripped text of the top-ranked chunk, or a fallback message."""
	        assert isinstance(query, str), "Search query must be a string."
	        matches = self.retriever.invoke(query)
	        if not matches:
	            return "No relevant information found."
	        # Only the first (highest-ranked) hit is handed back to the agent.
	        return matches[0].page_content.strip()

	# Single retriever tool instance; constructing it builds the BM25 index
	# over the chunked documents at import time.
	retriever_tool = RetrieverTool(docs_processed)

	# Define the improved custom prompt
	# gradio_interface() prepends this text to every user question before
	# handing it to the agent.
	custom_prompt = """
	You are a friendly and knowledgeable AI assistant for a daily wellness company. Your goal is to provide clear, concise, and actionable answers to the user's health and wellness-related questions. Use a warm, approachable tone to make the user feel at ease.

	When answering:
	1. Focus on brevity without sacrificing accuracy or helpfulness.
	2. Highlight key points in an easy-to-understand manner.
	3. Include examples, tips, or short step-by-step guides where relevant.
	4. Format lists or steps using markdown for better readability (e.g., numbered lists, bullet points).
	5. Ensure your response is self-contained, engaging, and ends with a polite closing remark.

	Answer each question in a similar concise, helpful, and friendly way.
	"""

	# Define the agent using smolagents
	# NOTE(review): presumably LiteLLM picks up GROQ_API_KEY from the
	# environment for the "groq/..." model id - confirm.
	model = LiteLLMModel("groq/llama3-8b-8192") # Ensure the model is available
	# NOTE(review): the `max_iterations` / `verbose` keyword names depend on the
	# installed smolagents version - confirm they are still accepted.
	agent = CodeAgent(
	tools=[retriever_tool], model=model, max_iterations=4, verbose=True
	)

	# Gradio interface for interacting with the RAG pipeline
	def gradio_interface(query):
	    """
	    Answer one user question for the Gradio UI.

	    The question is first validated with ``is_valid_input``, then checked
	    against ``corpus_questions`` with ``similarity_search`` (threshold 0.2).
	    Only questions that pass both checks are sent to the agent, prefixed
	    with ``custom_prompt``.

	    Returns:
	        A string for display: the validation message, the off-topic notice,
	        the agent's answer, or a generic error message if anything raises.
	    """
	    try:
	        is_valid, message = is_valid_input(query)
	        if not is_valid:
	            return message

	        # Perform similarity search to verify the query's viability; only
	        # the yes/no outcome is used here.
	        similar, _, _ = similarity_search(query, corpus_questions, threshold=0.2)
	        if not similar:
	            return (
	                "I'm here to assist with health and wellness-related topics. "
	                "However, I couldn't find a closely related question in the dataset. "
	                "Please refine your query."
	            )

	        # Directly query the agent if the question is valid
	        answer = agent.run(f"{custom_prompt}\n\nQuestion: {query}")
	        # The agent's final answer may not be a str (e.g. a number or a
	        # list), so coerce before stripping instead of failing on .strip().
	        return str(answer).strip()
	    except Exception as e:
	        # Top-level UI boundary: keep the traceback in the log and show the
	        # user a generic message.
	        logger.exception("Error during query processing: %s", e)
	        return "An error occurred while processing your request. Please try again later."

	# Build the Gradio UI: one question textbox in, one Markdown answer out.
	_question_box = gr.Textbox(
	    label="Enter your question",
	    placeholder="e.g., How does box breathing help reduce anxiety?",
	)
	_answer_view = gr.Markdown(label="Answer")

	interface = gr.Interface(
	    fn=gradio_interface,
	    inputs=_question_box,
	    outputs=_answer_view,
	    title="AI Chatbot for Wellness",
	    description="Ask questions based on the AIChatbot.csv file. Focus on health and wellness topics.",
	    theme="compact",
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    interface.launch(debug=True)