Spaces:
Sleeping
Sleeping
Sam
committed on
Commit
·
03a9af9
1
Parent(s):
f3a52d4
Add application file
Browse files
- app.py +46 -9
- requirements.txt +26 -16
app.py
CHANGED
@@ -12,15 +12,15 @@ import chainlit as cl
|
|
12 |
import tiktoken
|
13 |
|
14 |
# Specific imports from the libraries
|
15 |
-
from
|
16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
17 |
-
from
|
18 |
from langchain_community.vectorstores import Qdrant
|
19 |
from langchain.prompts import ChatPromptTemplate
|
20 |
-
from langchain.chat_models import ChatOpenAI #Note: Old import was - from langchain_openai import ChatOpenAI
|
21 |
from operator import itemgetter
|
22 |
from langchain.schema.output_parser import StrOutputParser
|
23 |
from langchain.schema.runnable import RunnablePassthrough
|
|
|
24 |
|
25 |
#-----Set Environment Variables-----#
|
26 |
load_dotenv()
|
@@ -32,11 +32,18 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
32 |
openai.api_key = OPENAI_API_KEY
|
33 |
|
34 |
#-----Document Loading and Processing -----#
|
35 |
-
|
36 |
-
|
37 |
|
38 |
-
#
|
|
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def tiktoken_len(text):
|
41 |
tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
|
42 |
return len(tokens)
|
@@ -54,12 +61,26 @@ split_chunks = text_splitter.split_documents(documents)
|
|
54 |
# Load OpenAI Embeddings Model
|
55 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
# Creating a Qdrant Vector Store
|
|
|
|
|
|
|
|
|
|
|
58 |
qdrant_vector_store = Qdrant.from_documents(
|
59 |
split_chunks,
|
60 |
embeddings,
|
61 |
location=":memory:",
|
62 |
-
collection_name="
|
63 |
)
|
64 |
|
65 |
# Create a Retriever
|
@@ -67,7 +88,23 @@ retriever = qdrant_vector_store.as_retriever()
|
|
67 |
|
68 |
#-----Prompt Template and Language Model Setup-----#
|
69 |
# Define the prompt template
|
70 |
-
template = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
Context:
|
73 |
{context}
|
@@ -108,7 +145,7 @@ async def start_chat():
|
|
108 |
settings = {
|
109 |
"model": "gpt-4o",
|
110 |
"temperature": 0,
|
111 |
-
"max_tokens":
|
112 |
"top_p": 1,
|
113 |
"frequency_penalty": 0,
|
114 |
"presence_penalty": 0,
|
|
|
12 |
import tiktoken
|
13 |
|
14 |
# Specific imports from the libraries
|
15 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
17 |
+
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
18 |
from langchain_community.vectorstores import Qdrant
|
19 |
from langchain.prompts import ChatPromptTemplate
|
|
|
20 |
from operator import itemgetter
|
21 |
from langchain.schema.output_parser import StrOutputParser
|
22 |
from langchain.schema.runnable import RunnablePassthrough
|
23 |
+
import glob
|
24 |
|
25 |
#-----Set Environment Variables-----#
|
26 |
load_dotenv()
|
|
|
32 |
openai.api_key = OPENAI_API_KEY
|
33 |
|
34 |
#-----Document Loading and Processing -----#
|
35 |
+
# Load all PDF files from the specified directory
|
36 |
+
pdf_files = glob.glob("/home/user/app/data/*.pdf")
|
37 |
|
38 |
+
# Initialize an empty list to hold all documents
|
39 |
+
documents = []
|
40 |
|
41 |
+
# Load each PDF file and append its documents to the list
|
42 |
+
for pdf_file in pdf_files:
|
43 |
+
loader = PyMuPDFLoader(pdf_file)
|
44 |
+
documents.extend(loader.load())
|
45 |
+
|
46 |
+
# Split the documents into chunks
|
47 |
def tiktoken_len(text):
|
48 |
tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
|
49 |
return len(tokens)
|
|
|
61 |
# Load OpenAI Embeddings Model
|
62 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
63 |
|
64 |
+
# Check that the embeddings model works as expected
|
65 |
+
try:
|
66 |
+
test_text = "Sample text for embedding."
|
67 |
+
test_embedding = embeddings.embed_query(test_text)
|
68 |
+
print(f"Test embedding generated successfully: {test_embedding[:5]}...") # Print a part of the embedding
|
69 |
+
except Exception as e:
|
70 |
+
print(f"Error generating test embedding: {e}")
|
71 |
+
exit()
|
72 |
+
|
73 |
# Creating a Qdrant Vector Store
|
74 |
+
print(f"Number of split chunks: {len(split_chunks)}")
|
75 |
+
if len(split_chunks) == 0:
|
76 |
+
print("Error: No split chunks found. Please check the document loading and splitting process.")
|
77 |
+
exit()
|
78 |
+
|
79 |
qdrant_vector_store = Qdrant.from_documents(
|
80 |
split_chunks,
|
81 |
embeddings,
|
82 |
location=":memory:",
|
83 |
+
collection_name="HUD_FSS_Rules_and_Regs",
|
84 |
)
|
85 |
|
86 |
# Create a Retriever
|
|
|
88 |
|
89 |
#-----Prompt Template and Language Model Setup-----#
|
90 |
# Define the prompt template
|
91 |
+
template = """You are a helpful AI chatbot for HUD Family Self Sufficiency (FSS) Program Managers and FSS Coordinators. You answer questions about HUD FSS rules and regulations and help guide program managers and FSS Coordinators to lead FSS programs that are participant-centered and draw insights from the Compass Working Capital program model.
|
92 |
+
|
93 |
+
Draw from your knowledge base wherever possible to answer questions. Your knowledge base includes:
|
94 |
+
1. Relevant HUD regulations from the Code of Federal Regulations (CFR). This includes CFR Part 887 and CFR Part 984.
|
95 |
+
2. The FSS Final Rule from 7/13/2023, which also includes Q&A with answers from HUD.
|
96 |
+
3. The FSS Program Guidebook created by HUD.
|
97 |
+
|
98 |
+
You use these resources to help FSS Coordinators with their questions. When communicating with FSS Program Managers and FSS Coordinators, follow these guidelines:
|
99 |
+
1. Be Client-Centered: Your goal is to help the FSS client be successful and benefit from the FSS program. Write in a way that emphasizes what the client is able to do and how the user can support the client. If the FSS coordinator or FSS program manager can choose to interpret rules and regulations in a way that is advantageous to the FSS client, encourage them to do so. Do not suggest options that are strictly adhering to the rules in a way that is disadvantageous to the FSS client when there are options to interpret the rules in a way that is advantageous to the FSS client.
|
100 |
+
2. Cite Your Sources: When you reference the Code of Federal Regulations (CFR) documents from the knowledge base, include the Part, Subpart, Section, and other identifying information for what you are referencing so the user can learn more. Those documents will have clear labels for Parts, Subparts, and Sections such as § 984.305 (a) (2) (ii). When you pull information from these documents, include those section labels and a quote of the actual text formatted in a way that makes it clear that it's a quote. For other documents, include quotes if they're very relevant and be sure to include the name of the document it's from. If you don't know the name of the document, do not include the quote.
|
101 |
+
3. Making the Complex Simple: FSS program manager questions are often quite complex and embedded within a specific client scenario. Provide relevant context from the knowledge base and then adapt it to the specific client scenario. Be clear, concise, but still friendly and supportive in tone.
|
102 |
+
|
103 |
+
Generally, a good answer will:
|
104 |
+
1. Defer first to the content in the HUD regulations and make direct references to them whenever possible. Sometimes questions are worded in a way that suggests that the FSS program has discretion in an area where there is none. Review the regulations first to see what is clearly allowed or not allowed before consulting other sources.
|
105 |
+
2. Defer second to the program Action Plan. You will not have access to individual programs Action Plans, but the answer should prompt the user to review their policies on whatever topic they asked about. You could also make reference to specific, required Action Plan sections using HUD’s Sample Action Plan. If the question asked is related to an area governed by a local policy decision, encourage the user to consider adopting a flexible, client-centered approach. Remind the user that Action Plan policies can be updated and changed. Revised Action Plans need to be approved by HUD.
|
106 |
+
3. Defer third to other applicable HUD sources like the Guidebook and the FAQs in the FSS Final Rule. If content in the Guidebook and FAQs differs from the HUD regulations, the regulations should be considered correct.
|
107 |
+
4. Infuse client-centered responses throughout. If the policy in question includes a local policy decision, encourage the user to take a client-centered approach.
|
108 |
|
109 |
Context:
|
110 |
{context}
|
|
|
145 |
settings = {
|
146 |
"model": "gpt-4o",
|
147 |
"temperature": 0,
|
148 |
+
"max_tokens": 750,
|
149 |
"top_p": 1,
|
150 |
"frequency_penalty": 0,
|
151 |
"presence_penalty": 0,
|
requirements.txt
CHANGED
@@ -1,21 +1,31 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
#
|
9 |
-
|
10 |
-
qdrant-client==1.9.2 #Be sure to use the latest version 'pip show qdrant-client'
|
11 |
|
12 |
-
#
|
13 |
-
|
14 |
|
|
|
15 |
tiktoken==0.7.0
|
16 |
-
|
17 |
-
|
|
|
18 |
pandas==2.0.3
|
19 |
-
|
20 |
-
#
|
21 |
-
|
|
|
|
|
|
|
|
1 |
+
# Core packages
|
2 |
+
fastapi==0.100.1
|
3 |
+
uvicorn==0.23.2
|
4 |
+
|
5 |
+
# OpenAI & LangChain dependencies
|
6 |
+
openai==1.51.0
|
7 |
+
langchain==0.3.1
|
8 |
+
langchain-openai==0.2.1
|
9 |
+
langchain-core==0.3.8
|
10 |
+
langchain-text-splitters==0.3.0
|
11 |
+
langchain-huggingface==0.1.0
|
12 |
+
langchain-community==0.3.1
|
13 |
|
14 |
+
# Document processing
|
15 |
+
PyMuPDF==1.24.5
|
|
|
16 |
|
17 |
+
# Qdrant for vector store
|
18 |
+
qdrant-client==1.9.2
|
19 |
|
20 |
+
# Tokenization and transformers
|
21 |
tiktoken==0.7.0
|
22 |
+
transformers==4.45.1
|
23 |
+
|
24 |
+
# Data processing
|
25 |
pandas==2.0.3
|
26 |
+
|
27 |
+
# Chainlit for chat interface
|
28 |
+
chainlit==0.7.700
|
29 |
+
|
30 |
+
# Other necessary libraries
|
31 |
+
python-dotenv==1.0.1 # For environment variables
|