Sam committed on
Commit
03a9af9
·
1 Parent(s): f3a52d4

Add application file

Browse files
Files changed (2) hide show
  1. app.py +46 -9
  2. requirements.txt +26 -16
app.py CHANGED
@@ -12,15 +12,15 @@ import chainlit as cl
12
  import tiktoken
13
 
14
  # Specific imports from the libraries
15
- from langchain.document_loaders import PyMuPDFLoader
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
- from langchain.embeddings import OpenAIEmbeddings #Note: Old import was - from langchain_openai import OpenAIEmbeddings
18
  from langchain_community.vectorstores import Qdrant
19
  from langchain.prompts import ChatPromptTemplate
20
- from langchain.chat_models import ChatOpenAI #Note: Old import was - from langchain_openai import ChatOpenAI
21
  from operator import itemgetter
22
  from langchain.schema.output_parser import StrOutputParser
23
  from langchain.schema.runnable import RunnablePassthrough
 
24
 
25
  #-----Set Environment Variables-----#
26
  load_dotenv()
@@ -32,11 +32,18 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
32
  openai.api_key = OPENAI_API_KEY
33
 
34
  #-----Document Loading and Processing -----#
35
- loader = PyMuPDFLoader("/home/user/app/data/airbnb_q1_2024.pdf")
36
- documents = loader.load()
37
 
38
- #Note: I changed the loader file path from one that worked locally only to one that worked with Docker. The old file path is loader = PyMuPDFLoader("/Users/sampazar/AIE3-Midterm/data/airbnb_q1_2024.pdf")
 
39
 
 
 
 
 
 
 
40
  def tiktoken_len(text):
41
  tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
42
  return len(tokens)
@@ -54,12 +61,26 @@ split_chunks = text_splitter.split_documents(documents)
54
  # Load OpenAI Embeddings Model
55
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
56
 
 
 
 
 
 
 
 
 
 
57
  # Creating a Qdrant Vector Store
 
 
 
 
 
58
  qdrant_vector_store = Qdrant.from_documents(
59
  split_chunks,
60
  embeddings,
61
  location=":memory:",
62
- collection_name="Airbnb_Q1_2024",
63
  )
64
 
65
  # Create a Retriever
@@ -67,7 +88,23 @@ retriever = qdrant_vector_store.as_retriever()
67
 
68
  #-----Prompt Template and Language Model Setup-----#
69
  # Define the prompt template
70
- template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  Context:
73
  {context}
@@ -108,7 +145,7 @@ async def start_chat():
108
  settings = {
109
  "model": "gpt-4o",
110
  "temperature": 0,
111
- "max_tokens": 500,
112
  "top_p": 1,
113
  "frequency_penalty": 0,
114
  "presence_penalty": 0,
 
12
  import tiktoken
13
 
14
  # Specific imports from the libraries
15
+ from langchain_community.document_loaders import PyMuPDFLoader
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
18
  from langchain_community.vectorstores import Qdrant
19
  from langchain.prompts import ChatPromptTemplate
 
20
  from operator import itemgetter
21
  from langchain.schema.output_parser import StrOutputParser
22
  from langchain.schema.runnable import RunnablePassthrough
23
+ import glob
24
 
25
  #-----Set Environment Variables-----#
26
  load_dotenv()
 
32
  openai.api_key = OPENAI_API_KEY
33
 
34
  #-----Document Loading and Processing -----#
35
+ # Load all PDF files from the specified directory
36
+ pdf_files = glob.glob("/home/user/app/data/*.pdf")
37
 
38
+ # Initialize an empty list to hold all documents
39
+ documents = []
40
 
41
+ # Load each PDF file and append its documents to the list
42
+ for pdf_file in pdf_files:
43
+ loader = PyMuPDFLoader(pdf_file)
44
+ documents.extend(loader.load())
45
+
46
+ # Split the documents into chunks
47
  def tiktoken_len(text):
48
  tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
49
  return len(tokens)
 
61
  # Load OpenAI Embeddings Model
62
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
63
 
64
+ # Check that the embeddings model works as expected
65
+ try:
66
+ test_text = "Sample text for embedding."
67
+ test_embedding = embeddings.embed_query(test_text)
68
+ print(f"Test embedding generated successfully: {test_embedding[:5]}...") # Print a part of the embedding
69
+ except Exception as e:
70
+ print(f"Error generating test embedding: {e}")
71
+ exit()
72
+
73
  # Creating a Qdrant Vector Store
74
+ print(f"Number of split chunks: {len(split_chunks)}")
75
+ if len(split_chunks) == 0:
76
+ print("Error: No split chunks found. Please check the document loading and splitting process.")
77
+ exit()
78
+
79
  qdrant_vector_store = Qdrant.from_documents(
80
  split_chunks,
81
  embeddings,
82
  location=":memory:",
83
+ collection_name="HUD_FSS_Rules_and_Regs",
84
  )
85
 
86
  # Create a Retriever
 
88
 
89
  #-----Prompt Template and Language Model Setup-----#
90
  # Define the prompt template
91
+ template = """You are a helpful AI chatbot for HUD Family Self Sufficiency (FSS) Program Managers and FSS Coordinators. You answer questions about HUD FSS rules and regulations and help guide program managers and FSS Coordinators to lead FSS programs that are participant-centered and draw insights from the Compass Working Capital program model.
92
+
93
+ Draw from your knowledge base wherever possible to answer questions. Your knowledge base includes:
94
+ 1. Relevant HUD regulations from the Code of Federal Regulations (CFR). This includes CFR Part 887 and CFR Part 984.
95
+ 2. The FSS Final Rule from 7/13/2023, which also includes Q&A with answers from HUD.
96
+ 3. The FSS Program Guidebook created by HUD.
97
+
98
+ You use these resources to help FSS Coordinators with their questions. When communicating with FSS Program Managers and FSS Coordinators, follow these guidelines:
99
+ 1. Be Client-Centered: Your goal is to help the FSS client be successful and benefit from the FSS program. Write in a way that emphasizes what the client is able to do and how the user can support the client. If the FSS coordinator or FSS program manager can choose to interpret rules and regulations in a way that is advantageous to the FSS client, encourage them to do so. Do not suggest options that are strictly adhering to the rules in a way that is disadvantageous to the FSS client when there are options to interpret the rules in a way that is advantageous to the FSS client.
100
+ 2. Cite Your Sources: When you reference the Code of Federal Regulations (CFR) documents from the knowledge base, include the Part, Subpart, Section, and other identifying information for what you are referencing so the user can learn more. Those documents will have clear labels for Parts, Subparts, and Sections such as § 984.305 (a) (2) (ii). When you pull information from these documents, include those section labels and a quote of the actual text formatted in a way that makes it clear that it's a quote. For other documents, include quotes if they're very relevant and be sure to include the name of the document it's from. If you don't know the name of the document, do not include the quote.
101
+ 3. Making the Complex Simple: FSS program manager questions are often quite complex and embedded within a specific client scenario. Provide relevant context from the knowledge base and then adapt it to the specific client scenario. Be clear, concise, but still friendly and supportive in tone.
102
+
103
+ Generally, a good answer will:
104
+ 1. Defer first to the content in the HUD regulations and make direct references to them whenever possible. Sometimes questions are worded in a way that suggests that the FSS program has discretion in an area where there is none. Review the regulations first to see what is clearly allowed or not allowed before consulting other sources.
105
+ 2. Defer second to the program Action Plan. You will not have access to individual programs Action Plans, but the answer should prompt the user to review their policies on whatever topic they asked about. You could also make reference to specific, required Action Plan sections using HUD’s Sample Action Plan. If the question asked is related to an area governed by a local policy decision, encourage the user to consider adopting a flexible, client-centered approach. Remind the user that Action Plan policies can be updated and changed. Revised Action Plans need to be approved by HUD.
106
+ 3. Defer third to other applicable HUD sources like the Guidebook and the FAQs in the FSS Final Rule. If content in the Guidebook and FAQs differs from the HUD regulations, the regulations should be considered correct.
107
+ 4. Infuse client-centered responses throughout. If the policy in question includes a local policy decision, encourage the user to take a client-centered approach.
108
 
109
  Context:
110
  {context}
 
145
  settings = {
146
  "model": "gpt-4o",
147
  "temperature": 0,
148
+ "max_tokens": 750,
149
  "top_p": 1,
150
  "frequency_penalty": 0,
151
  "presence_penalty": 0,
requirements.txt CHANGED
@@ -1,21 +1,31 @@
1
- chainlit==0.7.700
2
- langchain==0.2.5
3
- langchain_community==0.2.5
4
- langchain_core==0.2.9
5
- langchain_text_splitters==0.2.1
6
- python-dotenv==1.0.1
 
 
 
 
 
 
7
 
8
- #Adding OpenAI API client and Qdrant client
9
- openai==1.35.3 #Be sure to use the latest version 'pip show openai'
10
- qdrant-client==1.9.2 #Be sure to use the latest version 'pip show qdrant-client'
11
 
12
- # Adding PyMuPDF for PDF processing
13
- PyMuPDF==1.24.5 #Be sure to use the latest version 'pip show pymupdf'
14
 
 
15
  tiktoken==0.7.0
16
- #cohere==4.37
17
- transformers==4.37.0
 
18
  pandas==2.0.3
19
- #Removed Hugging Face and FAISS dependencies
20
- #langchain_huggingface==0.0.3
21
- #faiss-cpu
 
 
 
 
1
+ # Core packages
2
+ fastapi==0.100.1
3
+ uvicorn==0.23.2
4
+
5
+ # OpenAI & LangChain dependencies
6
+ openai==1.51.0
7
+ langchain==0.3.1
8
+ langchain-openai==0.2.1
9
+ langchain-core==0.3.8
10
+ langchain-text-splitters==0.3.0
11
+ langchain-huggingface==0.1.0
12
+ langchain-community==0.3.1
13
 
14
+ # Document processing
15
+ PyMuPDF==1.24.5
 
16
 
17
+ # Qdrant for vector store
18
+ qdrant-client==1.9.2
19
 
20
+ # Tokenization and transformers
21
  tiktoken==0.7.0
22
+ transformers==4.45.1
23
+
24
+ # Data processing
25
  pandas==2.0.3
26
+
27
+ # Chainlit for chat interface
28
+ chainlit==0.7.700
29
+
30
+ # Other necessary libraries
31
+ python-dotenv==1.0.1 # For environment variables