Sam committed on
Commit
03a9af9
·
1 Parent(s): f3a52d4

Add application file

Browse files
Files changed (2) hide show
  1. app.py +46 -9
  2. requirements.txt +26 -16
app.py CHANGED
@@ -12,15 +12,15 @@ import chainlit as cl
12
  import tiktoken
13
 
14
  # Specific imports from the libraries
15
- from langchain.document_loaders import PyMuPDFLoader
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
- from langchain.embeddings import OpenAIEmbeddings #Note: Old import was - from langchain_openai import OpenAIEmbeddings
18
  from langchain_community.vectorstores import Qdrant
19
  from langchain.prompts import ChatPromptTemplate
20
- from langchain.chat_models import ChatOpenAI #Note: Old import was - from langchain_openai import ChatOpenAI
21
  from operator import itemgetter
22
  from langchain.schema.output_parser import StrOutputParser
23
  from langchain.schema.runnable import RunnablePassthrough
 
24
 
25
  #-----Set Environment Variables-----#
26
  load_dotenv()
@@ -32,11 +32,18 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
32
  openai.api_key = OPENAI_API_KEY
33
 
34
  #-----Document Loading and Processing -----#
35
- loader = PyMuPDFLoader("/home/user/app/data/airbnb_q1_2024.pdf")
36
- documents = loader.load()
37
 
38
- #Note: I changed the loader file path from one that worked locally only to one that worked with Docker. The old file path is loader = PyMuPDFLoader("/Users/sampazar/AIE3-Midterm/data/airbnb_q1_2024.pdf")
 
39
 
 
 
 
 
 
 
40
  def tiktoken_len(text):
41
  tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
42
  return len(tokens)
@@ -54,12 +61,26 @@ split_chunks = text_splitter.split_documents(documents)
54
  # Load OpenAI Embeddings Model
55
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
56
 
 
 
 
 
 
 
 
 
 
57
  # Creating a Qdrant Vector Store
 
 
 
 
 
58
  qdrant_vector_store = Qdrant.from_documents(
59
  split_chunks,
60
  embeddings,
61
  location=":memory:",
62
- collection_name="Airbnb_Q1_2024",
63
  )
64
 
65
  # Create a Retriever
@@ -67,7 +88,23 @@ retriever = qdrant_vector_store.as_retriever()
67
 
68
  #-----Prompt Template and Language Model Setup-----#
69
  # Define the prompt template
70
- template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  Context:
73
  {context}
@@ -108,7 +145,7 @@ async def start_chat():
108
  settings = {
109
  "model": "gpt-4o",
110
  "temperature": 0,
111
- "max_tokens": 500,
112
  "top_p": 1,
113
  "frequency_penalty": 0,
114
  "presence_penalty": 0,
 
12
  import tiktoken
13
 
14
  # Specific imports from the libraries
15
+ from langchain_community.document_loaders import PyMuPDFLoader
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
18
  from langchain_community.vectorstores import Qdrant
19
  from langchain.prompts import ChatPromptTemplate
 
20
  from operator import itemgetter
21
  from langchain.schema.output_parser import StrOutputParser
22
  from langchain.schema.runnable import RunnablePassthrough
23
+ import glob
24
 
25
  #-----Set Environment Variables-----#
26
  load_dotenv()
 
32
  openai.api_key = OPENAI_API_KEY
33
 
34
  #-----Document Loading and Processing -----#
35
+ # Load all PDF files from the specified directory
36
+ pdf_files = glob.glob("/home/user/app/data/*.pdf")
37
 
38
+ # Initialize an empty list to hold all documents
39
+ documents = []
40
 
41
+ # Load each PDF file and append its documents to the list
42
+ for pdf_file in pdf_files:
43
+ loader = PyMuPDFLoader(pdf_file)
44
+ documents.extend(loader.load())
45
+
46
+ # Split the documents into chunks
47
  def tiktoken_len(text):
48
  tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
49
  return len(tokens)
 
61
  # Load OpenAI Embeddings Model
62
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
63
 
64
+ # Check that the embeddings model works as expected
65
+ try:
66
+ test_text = "Sample text for embedding."
67
+ test_embedding = embeddings.embed_query(test_text)
68
+ print(f"Test embedding generated successfully: {test_embedding[:5]}...") # Print a part of the embedding
69
+ except Exception as e:
70
+ print(f"Error generating test embedding: {e}")
71
+ exit()
72
+
73
  # Creating a Qdrant Vector Store
74
+ print(f"Number of split chunks: {len(split_chunks)}")
75
+ if len(split_chunks) == 0:
76
+ print("Error: No split chunks found. Please check the document loading and splitting process.")
77
+ exit()
78
+
79
  qdrant_vector_store = Qdrant.from_documents(
80
  split_chunks,
81
  embeddings,
82
  location=":memory:",
83
+ collection_name="HUD_FSS_Rules_and_Regs",
84
  )
85
 
86
  # Create a Retriever
 
88
 
89
  #-----Prompt Template and Language Model Setup-----#
90
  # Define the prompt template
91
+ template = """You are a helpful AI chatbot for HUD Family Self Sufficiency (FSS) Program Managers and FSS Coordinators. You answer questions about HUD FSS rules and regulations and help guide program managers and FSS Coordinators to lead FSS programs that are participant-centered and draw insights from the Compass Working Capital program model.
92
+
93
+ Draw from your knowledge base wherever possible to answer questions. Your knowledge base includes:
94
+ 1. Relevant HUD regulations from the Code of Federal Regulations (CFR). This includes CFR Part 887 and CFR Part 984.
95
+ 2. The FSS Final Rule from 7/13/2023, which also includes Q&A with answers from HUD.
96
+ 3. The FSS Program Guidebook created by HUD.
97
+
98
+ You use these resources to help FSS Coordinators with their questions. When communicating with FSS Program Managers and FSS Coordinators, follow these guidelines:
99
+ 1. Be Client-Centered: Your goal is to help the FSS client be successful and benefit from the FSS program. Write in a way that emphasizes what the client is able to do and how the user can support the client. If the FSS coordinator or FSS program manager can choose to interpret rules and regulations in a way that is advantageous to the FSS client, encourage them to do so. Do not suggest options that are strictly adhering to the rules in a way that is disadvantageous to the FSS client when there are options to interpret the rules in a way that is advantageous to the FSS client.
100
+ 2. Cite Your Sources: When you reference the Code of Federal Regulations (CFR) documents from the knowledge base, include the Part, Subpart, Section, and other identifying information for what you are referencing so the user can learn more. Those documents will have clear labels for Parts, Subparts, and Sections such as § 984.305 (a) (2) (ii). When you pull information from these documents, include those section labels and a quote of the actual text formatted in a way that makes it clear that it's a quote. For other documents, include quotes if they're very relevant and be sure to include the name of the document it's from. If you don't know the name of the document, do not include the quote.
101
+ 3. Making the Complex Simple: FSS program manager questions are often quite complex and embedded within a specific client scenario. Provide relevant context from the knowledge base and then adapt it to the specific client scenario. Be clear, concise, but still friendly and supportive in tone.
102
+
103
+ Generally, a good answer will:
104
+ 1. Defer first to the content in the HUD regulations and make direct references to them whenever possible. Sometimes questions are worded in a way that suggests that the FSS program has discretion in an area where there is none. Review the regulations first to see what is clearly allowed or not allowed before consulting other sources.
105
+ 2. Defer second to the program Action Plan. You will not have access to individual programs Action Plans, but the answer should prompt the user to review their policies on whatever topic they asked about. You could also make reference to specific, required Action Plan sections using HUD’s Sample Action Plan. If the question asked is related to an area governed by a local policy decision, encourage the user to consider adopting a flexible, client-centered approach. Remind the user that Action Plan policies can be updated and changed. Revised Action Plans need to be approved by HUD.
106
+ 3. Defer third to other applicable HUD sources like the Guidebook and the FAQs in the FSS Final Rule. If content in the Guidebook and FAQs differs from the HUD regulations, the regulations should be considered correct.
107
+ 4. Infuse client-centered responses throughout. If the policy in question includes a local policy decision, encourage the user to take a client-centered approach.
108
 
109
  Context:
110
  {context}
 
145
  settings = {
146
  "model": "gpt-4o",
147
  "temperature": 0,
148
+ "max_tokens": 750,
149
  "top_p": 1,
150
  "frequency_penalty": 0,
151
  "presence_penalty": 0,
requirements.txt CHANGED
@@ -1,21 +1,31 @@
1
- chainlit==0.7.700
2
- langchain==0.2.5
3
- langchain_community==0.2.5
4
- langchain_core==0.2.9
5
- langchain_text_splitters==0.2.1
6
- python-dotenv==1.0.1
 
 
 
 
 
 
7
 
8
- #Adding OpenAI API client and Qdrant client
9
- openai==1.35.3 #Be sure to use the latest version 'pip show openai'
10
- qdrant-client==1.9.2 #Be sure to use the latest version 'pip show qdrant-client'
11
 
12
- # Adding PyMuPDF for PDF processing
13
- PyMuPDF==1.24.5 #Be sure to use the latest version 'pip show pymupdf'
14
 
 
15
  tiktoken==0.7.0
16
- #cohere==4.37
17
- transformers==4.37.0
 
18
  pandas==2.0.3
19
- #Removed Hugging Face and FAISS dependencies
20
- #langchain_huggingface==0.0.3
21
- #faiss-cpu
 
 
 
 
1
+ # Core packages
2
+ fastapi==0.100.1
3
+ uvicorn==0.23.2
4
+
5
+ # OpenAI & LangChain dependencies
6
+ openai==1.51.0
7
+ langchain==0.3.1
8
+ langchain-openai==0.2.1
9
+ langchain-core==0.3.8
10
+ langchain-text-splitters==0.3.0
11
+ langchain-huggingface==0.1.0
12
+ langchain-community==0.3.1
13
 
14
+ # Document processing
15
+ PyMuPDF==1.24.5
 
16
 
17
+ # Qdrant for vector store
18
+ qdrant-client==1.9.2
19
 
20
+ # Tokenization and transformers
21
  tiktoken==0.7.0
22
+ transformers==4.45.1
23
+
24
+ # Data processing
25
  pandas==2.0.3
26
+
27
+ # Chainlit for chat interface
28
+ chainlit==0.7.700
29
+
30
+ # Other necessary libraries
31
+ python-dotenv==1.0.1 # For environment variables