HanLee committed
Commit 53edd39 Β· 1 Parent(s): 313a2ae

feat: update to streamlit, lcel, and deployable to huggingface

.chainlit/config.toml DELETED
@@ -1,78 +0,0 @@
- [project]
- # Whether to enable telemetry (default: true). No personal data is collected.
- enable_telemetry = true
-
- # List of environment variables to be provided by each user to use the app.
- user_env = []
-
- # Duration (in seconds) during which the session is saved when the connection is lost
- session_timeout = 3600
-
- # Enable third parties caching (e.g LangChain cache)
- cache = false
-
- # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
- # follow_symlink = false
-
- [features]
- # Show the prompt playground
- prompt_playground = true
-
- # Authorize users to upload files with messages
- multi_modal = true
-
- # Allows user to use speech to text
- [features.speech_to_text]
- enabled = false
- # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
- # language = "en-US"
-
- [UI]
- # Name of the app and chatbot.
- name = "Chatbot"
-
- # Show the readme while the conversation is empty.
- show_readme_as_default = true
-
- # Description of the app and chatbot. This is used for HTML tags.
- # description = ""
-
- # Large size content are by default collapsed for a cleaner ui
- default_collapse_content = true
-
- # The default value for the expand messages settings.
- default_expand_messages = false
-
- # Hide the chain of thought details from the user in the UI.
- hide_cot = false
-
- # Link to your github repo. This will add a github button in the UI's header.
- github = "https://github.com/LinkedInLearning/hands-on-ai-building-and-deploying-llm-powered-apps-4511409"
-
- # Specify a CSS file that can be used to customize the user interface.
- # The CSS file can be served from the public directory or via an external link.
- # custom_css = "/public/test.css"
-
- # Override default MUI light theme. (Check theme.ts)
- [UI.theme.light]
-     #background = "#FAFAFA"
-     #paper = "#FFFFFF"
-
-     [UI.theme.light.primary]
-         #main = "#F80061"
-         #dark = "#980039"
-         #light = "#FFE7EB"
-
- # Override default MUI dark theme. (Check theme.ts)
- [UI.theme.dark]
-     #background = "#FAFAFA"
-     #paper = "#FFFFFF"
-
-     [UI.theme.dark.primary]
-         #main = "#F80061"
-         #dark = "#980039"
-         #light = "#FFE7EB"
-
-
- [meta]
- generated_by = "0.7.501"
.devcontainer/devcontainer.json CHANGED
@@ -1,11 +1,10 @@
  {
+     "image": "mcr.microsoft.com/devcontainers/python:3.11",
      "extensions": [
          "GitHub.github-vscode-theme",
          "ms-toolsai.jupyter",
          "ms-python.python"
-         // Additional Extensions Here
      ],
-     "onCreateCommand" : "[ -f requirements.txt ] && pip install -r requirements.txt; echo PS1='\"$ \"' >> ~/.bashrc", //Set Terminal Prompt to $
+     "onCreateCommand": "bash .devcontainer/setup.sh"
  }
-
  // DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference
.devcontainer/setup.sh ADDED
@@ -0,0 +1,93 @@
+ #!/bin/bash
+ set -euo pipefail
+
+ echo "Upgrading pip..."
+ pip install --upgrade pip || {
+     echo "Failed to upgrade pip"
+     exit 1
+ }
+
+ echo "πŸ”§ Installing NVM..."
+ export NVM_DIR="$HOME/.nvm"
+ mkdir -p "$NVM_DIR"
+
+ # Download and install NVM
+ curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash || {
+     echo "Failed to download NVM installer"
+     exit 1
+ }
+
+ # Add NVM to bashrc for future sessions
+ echo 'export NVM_DIR="$HOME/.nvm"' >> ~/.bashrc
+ echo '[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"' >> ~/.bashrc
+ echo '[ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion"' >> ~/.bashrc
+
+ # Load NVM for current session
+ if [ -s "$NVM_DIR/nvm.sh" ]; then
+     \. "$NVM_DIR/nvm.sh"
+     echo "NVM loaded successfully"
+ else
+     echo "NVM script not found at $NVM_DIR/nvm.sh"
+     exit 1
+ fi
+
+ # Verify NVM is available
+ if ! command -v nvm &> /dev/null; then
+     echo "NVM command not found after sourcing. Trying alternative approach..."
+     # Try to source it with bash explicitly
+     bash -c "source $NVM_DIR/nvm.sh && nvm --version" || {
+         echo "Failed to verify NVM installation"
+         exit 1
+     }
+ fi
+
+ echo "πŸ“¦ Installing Node.js LTS..."
+ # Run nvm commands in a bash subshell to ensure proper environment
+ bash -c "source $NVM_DIR/nvm.sh && nvm install --lts" || {
+     echo "Failed to install Node.js"
+     exit 1
+ }
+
+ # Run nvm use in a bash subshell
+ bash -c "source $NVM_DIR/nvm.sh && nvm use --lts" || {
+     echo "Failed to use Node.js LTS"
+     exit 1
+ }
+
+ echo "🧰 Installing latest npm..."
+ # Run npm in a bash subshell to ensure node is available
+ bash -c "source $NVM_DIR/nvm.sh && nvm use --lts && npm install -g npm@latest" || {
+     echo "Failed to update npm"
+     exit 1
+ }
+
+ echo "βœ… NVM, Node.js, and npm installed successfully."
+
+ if [ -f requirements.txt ]; then
+     echo "Installing requirements..."
+     pip install -r requirements.txt || {
+         echo "Failed to install requirements"
+         exit 1
+     }
+ else
+     echo "No requirements.txt found, skipping package installation"
+ fi
+
+ echo "Setting up terminal prompt..."
+ cat << 'EOF' >> ~/.bashrc
+ # Function to get git branch
+ parse_git_branch() {
+     git branch 2> /dev/null | sed -e '/^[^*]/d' -e 's/* \(.*\)/ (\1)/'
+ }
+
+ # Color definitions
+ BLUE='\[\033[34m\]'
+ GREEN='\[\033[32m\]'
+ YELLOW='\[\033[33m\]'
+ RESET='\[\033[00m\]'
+
+ # Set prompt with current directory and git branch
+ export PS1="${BLUE}\W${RESET}${YELLOW}\$(parse_git_branch)${RESET}${GREEN} $ ${RESET}"
+ EOF
+
+ echo "Setup completed successfully!"
.gitignore CHANGED
@@ -4,6 +4,11 @@
  # Chainlit
  .chainlit/.langchain.db

+ # Claude settings (local only)
+ CLAUDE.md
+ .claude/settings.json
+ .claude/settings.local.json
+
  # Chroma
  .chromadb/

.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [server]
+ runOnSave = true
+ fileWatcherType = "auto"
.vscode/settings.json CHANGED
@@ -3,21 +3,23 @@
      "editor.cursorBlinking": "solid",
      "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace",
      "editor.fontLigatures": false,
-     "editor.fontSize": 22,
+     "editor.fontSize": 14,
      "editor.formatOnPaste": true,
      "editor.formatOnSave": true,
      "editor.lineNumbers": "on",
      "editor.matchBrackets": "always",
      "editor.minimap.enabled": false,
      "editor.smoothScrolling": true,
-     "editor.tabSize": 2,
+     "editor.tabSize": 4,
      "editor.useTabStops": true,
      "emmet.triggerExpansionOnTab": true,
-     "explorer.openEditors.visible": 0,
      "files.autoSave": "afterDelay",
      "screencastMode.onlyKeyboardShortcuts": true,
-     "terminal.integrated.fontSize": 18,
+     "terminal.integrated.fontSize": 14,
      "workbench.colorTheme": "Visual Studio Dark",
      "workbench.fontAliasing": "antialiased",
-     "workbench.statusBar.visible": true
- }
+     "workbench.statusBar.visible": true,
+     "workbench.tree.indent": 8,
+     "workbench.tree.renderIndentGuides": "always",
+     "workbench.fontSize": 14
+ }
README.md CHANGED
@@ -1,7 +1,28 @@
+ ---
+ title: Test
+ emoji: πŸš€
+ colorFrom: green
+ colorTo: green
+ sdk: streamlit
+ sdk_version: 1.46.0
+ pinned: false
+ app_file: app/app.py
+ license: other
+ ---
+
  # Hands-On AI: Building and Deploying LLM-Powered Apps
  This is the repository for the LinkedIn Learning course `Hands-On AI: Building and Deploying LLM-Powered Apps`. The full course is available from [LinkedIn Learning][lil-course-url].

+ ![lil-thumbnail-url]
+
+ Are you ready to start building applications with large language models (LLMs), but not sure where to begin? This course, which is designed uniquely for beginners with no experience in the LLM space, offers an overview of the fundamentals of LLMs with hands-on challenges to boost your skills along the way.
+
+ Explore the essentials of retrieval-augmented generation including search engine basics, embedding model limitations, and how to build a chat-with-PDF application. Along the way, instructor Han Lee shows you how to get up and running with prompt engineering, using the prompt playground for LLM apps.
+
+ This course is integrated with GitHub Codespaces, an instant cloud developer environment that offers all the functionality of your favorite IDE without the need for any local machine setup. With GitHub Codespaces, you can get hands-on practice from any machine, at any timeβ€”all while using a tool that you’ll likely encounter in the workplace. Check out the β€œUsing GitHub Codespaces with this course” video to learn how to get started.
+
  _See the readme file in the main branch for updated instructions and information._
+
  ## Instructions
  This repository has branches for each of the videos in the course. You can use the branch pop up menu in github to switch to a specific branch and take a look at the course at that stage, or you can add `/tree/BRANCH_NAME` to the URL to go to the branch you want to access.

@@ -20,15 +41,19 @@
  Add changes to git using this command: git add .
  Commit changes using this command: git commit -m "some message"

- ## Installing
- 1. To use these exercise files, you must have the following installed:
-     - [list of requirements for course]
- 2. Clone this repository into your local machine using the terminal (Mac), CMD (Windows), or a GUI tool like SourceTree.
- 3. [Course-specific instructions]
+ ### Instructor
+
+ Han-chung Lee
+
+ Machine Learning Engineer in NLP, Search, and Recommendation Systems
+
+
+
+ Check out my other courses on [LinkedIn Learning](https://www.linkedin.com/learning/instructors/han-chung-lee?u=104).


  [0]: # (Replace these placeholder URLs with actual course URLs)

- [lil-course-url]: https://www.linkedin.com/learning/
- [lil-thumbnail-url]: http://
+ [lil-course-url]: https://www.linkedin.com/learning/hands-on-ai-building-llm-powered-apps
+ [lil-thumbnail-url]: https://media.licdn.com/dms/image/D560DAQGRl8C0MWSlTg/learning-public-crop_675_1200/0/1708734970228?e=2147483647&v=beta&t=415ypTLk6X2GXE5io0I1Ejc9vFT6EHEYEOclgbRB5aM
app/app.py CHANGED
@@ -1,171 +1,313 @@
- # Chroma compatibility issue resolution
- # https://docs.trychroma.com/troubleshooting#sqlite
- __import__('pysqlite3')
- import sys
- sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
-
- from tempfile import NamedTemporaryFile
-
- import chainlit as cl
- from chainlit.types import AskFileResponse
-
- import chromadb
- from chromadb.config import Settings
- from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
- from langchain.chains.base import Chain
- from langchain.chat_models import ChatOpenAI
- from langchain.document_loaders import PDFPlumberLoader
- from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.vectorstores import Chroma
- from langchain.vectorstores.base import VectorStore
-
- from prompt import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
-
-
- namespaces = set()
-
-
- def process_file(*, file: AskFileResponse) -> list:
-     if file.type != "application/pdf":
-         raise TypeError("Only PDF files are supported")
-
-     with NamedTemporaryFile() as tempfile:
-         tempfile.write(file.content)
-
-         loader = PDFPlumberLoader(tempfile.name)
-
-         documents = loader.load()
-
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=3000,
-             chunk_overlap=100
-         )
-
-         docs = text_splitter.split_documents(documents)
-
-         for i, doc in enumerate(docs):
-             doc.metadata["source"] = f"source_{i}"
-
-         if not docs:
-             raise ValueError("PDF file parsing failed.")
-
-         return docs
-
-
- def create_search_engine(*, file: AskFileResponse) -> VectorStore:
-
-     # Process and save data in the user session
-     docs = process_file(file=file)
-     cl.user_session.set("docs", docs)
-
-     encoder = OpenAIEmbeddings(
-         model="text-embedding-ada-002"
-     )
-
-     # Initialize Chromadb client and settings, reset to ensure we get a clean
-     # search engine
-     client = chromadb.EphemeralClient()
-     client_settings = Settings(
-         allow_reset=True,
-         anonymized_telemetry=False
-     )
-     search_engine = Chroma(
-         client=client,
-         client_settings=client_settings
-     )
-     search_engine._client.reset()
-
-     search_engine = Chroma.from_documents(
-         client=client,
-         documents=docs,
-         embedding=encoder,
-         client_settings=client_settings
-     )
-
-     return search_engine
-
-
- @cl.on_chat_start
- async def start():
-
-     files = None
-     while files is None:
-         files = await cl.AskFileMessage(
-             content=WELCOME_MESSAGE,
-             accept=["application/pdf"],
-             max_size_mb=20,
-         ).send()
-
-     file = files[0]
-     msg = cl.Message(content=f"Processing `{file.name}`...")
-     await msg.send()
-
-     try:
-         search_engine = await cl.make_async(create_search_engine)(file=file)
-     except Exception as e:
-         await cl.Message(content=f"Error: {e}").send()
-         raise SystemError
-
-     llm = ChatOpenAI(
-         model='gpt-3.5-turbo-16k-0613',
-         temperature=0,
-         streaming=True
-     )
-
-     chain = RetrievalQAWithSourcesChain.from_chain_type(
-         llm=llm,
-         chain_type="stuff",
-         retriever=search_engine.as_retriever(max_tokens_limit=4097),
-         chain_type_kwargs={
-             "prompt": PROMPT,
-             "document_prompt": EXAMPLE_PROMPT
-         },
-     )
-
-     msg.content = f"`{file.name}` processed. You can now ask questions!"
-     await msg.update()
-
-     cl.user_session.set("chain", chain)
-
-
- @cl.on_message
- async def main(message: cl.Message):
-
-     chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
-     cb = cl.AsyncLangchainCallbackHandler()
-     response = await chain.acall(message.content, callbacks=[cb])
-     answer = response["answer"]
-     sources = response["sources"].strip()
-     source_elements = []
-
-     # Get the documents from the user session
-     docs = cl.user_session.get("docs")
-     metadatas = [doc.metadata for doc in docs]
-     all_sources = [m["source"] for m in metadatas]
-
-     # Adding sources to the answer
-     if sources:
-         found_sources = []
-
-         # Add the sources to the message
-         for source in sources.split(","):
-             source_name = source.strip().replace(".", "")
-             # Get the index of the source
-             try:
-                 index = all_sources.index(source_name)
-             except ValueError:
-                 continue
-             text = docs[index].page_content
-             found_sources.append(source_name)
-             # Create the text element referenced in the message
-             source_elements.append(cl.Text(content=text, name=source_name))
-
-         if found_sources:
-             answer += f"\nSources: {', '.join(found_sources)}"
-         else:
-             answer += "\nNo sources found"
-
-     await cl.Message(content=answer, elements=source_elements).send()
+ from typing import List, Dict, Any, Tuple
+
+ from dotenv import load_dotenv
+ from langchain.schema import Document
+ from langchain_openai import ChatOpenAI
+ from langchain.vectorstores.base import VectorStore
+
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.messages import HumanMessage, AIMessage
+ import streamlit as st
+
+ from utils import process_file, create_search_engine
+ from prompt import PROMPT, WELCOME_MESSAGE
+
+
+ load_dotenv()
+
+
+ # Page configuration
+ st.set_page_config(
+     page_title="PDF Q&A Assistant",
+     page_icon="πŸ“š",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # Initialize session state
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+ if "chain" not in st.session_state:
+     st.session_state.chain = None
+ if "vector_store" not in st.session_state:
+     st.session_state.vector_store = None
+ if "retriever" not in st.session_state:
+     st.session_state.retriever = None
+ if "docs" not in st.session_state:
+     st.session_state.docs = None
+ if "processed_file" not in st.session_state:
+     st.session_state.processed_file = None
+ if "openai_api_key" not in st.session_state:
+     st.session_state.openai_api_key = None
+
+
+ def create_qa_chain(vector_store: VectorStore, api_key: str) -> Tuple[Any, Any]:
+     """Create the QA chain with the vector store using LCEL.
+
+     Args:
+         vector_store: The vector store containing document embeddings
+         api_key: OpenAI API key
+
+     Returns:
+         Tuple containing:
+             - chain: The LCEL chain for question answering
+             - retriever: The document retriever
+     """
+     llm = ChatOpenAI(
+         model='gpt-4.1-mini',
+         temperature=0,
+         streaming=True,
+         max_tokens=8192,
+         api_key=api_key
+     )
+
+     # Create retriever
+     retriever = vector_store.as_retriever(search_kwargs={"k": 5})
+
+     def format_docs(docs: List[Document]) -> str:
+         """Format retrieved documents for the prompt.
+
+         Args:
+             docs: List of retrieved documents
+
+         Returns:
+             Formatted string containing document content and sources
+         """
+         formatted = []
+         for doc in docs:
+             content = doc.page_content
+             source = doc.metadata.get("source", "unknown")
+             formatted.append(f"Content: {content}\nSource: {source}")
+         return "\n\n".join(formatted)
+
+     def get_question(inputs: Dict[str, Any]) -> str:
+         return inputs["question"]
+
+     def get_chat_history(inputs: Dict[str, Any]) -> List[Any]:
+         return inputs["chat_history"]
+
+     chain = (
+         {
+             "context": get_question | retriever | format_docs,
+             "question": get_question,
+             "chat_history": get_chat_history
+         }
+         | PROMPT
+         | llm
+         | StrOutputParser()
+     )
+
+     return chain, retriever
+
+
+ def format_answer_with_sources(response: str, retrieved_docs: List[Document]) -> Tuple[str, List[Dict[str, str]]]:
+     """Format the answer with source information.
+
+     Args:
+         response: The LLM response containing the answer
+         retrieved_docs: List of documents retrieved from the vector store
+
+     Returns:
+         Tuple containing:
+             - answer: The formatted answer string
+             - source_contents: List of source dictionaries with name and content
+     """
+     answer = response
+     source_contents = []
+
+     sources_text = ""
+     if "SOURCES:" in answer:
+         parts = answer.split("SOURCES:")
+         if len(parts) > 1:
+             sources_text = parts[1].strip()
+
+     if sources_text and retrieved_docs:
+         source_map = {}
+         for doc in retrieved_docs:
+             source_name = doc.metadata.get("source", "unknown")
+             source_map[source_name] = doc.page_content
+
+         found_sources = []
+         for source in sources_text.split(","):
+             source_name = source.strip().replace(".", "")
+             if source_name in source_map:
+                 found_sources.append(source_name)
+                 source_contents.append({
+                     "name": source_name,
+                     "content": source_map[source_name]
+                 })
+
+     return answer, source_contents
+
+
+ def get_chat_history_messages(messages: List[Dict[str, str]]) -> List[Any]:
+     """Convert Streamlit messages to LangChain message format.
+
+     Args:
+         messages: List of Streamlit message dictionaries with 'role' and 'content' keys
+
+     Returns:
+         List of LangChain message objects (HumanMessage or AIMessage)
+     """
+     chat_history = []
+     for msg in messages:
+         if msg["role"] == "user":
+             chat_history.append(HumanMessage(content=msg["content"]))
+         elif msg["role"] == "assistant":
+             chat_history.append(AIMessage(content=msg["content"]))
+     return chat_history
+
+
+ def main() -> None:
+     """Main Streamlit application function for PDF Q&A Assistant.
+
+     Handles file upload, processing, and chat interface for asking questions
+     about uploaded PDF documents using RAG (Retrieval Augmented Generation).
+     """
+     st.title("πŸ“š PDF Q&A Assistant")
+     st.markdown(WELCOME_MESSAGE)
+
+     # Sidebar for file upload
+     with st.sidebar:
+         st.header("πŸ”‘ API Configuration")
+
+         api_key = st.text_input(
+             "OpenAI API Key",
+             type="password",
+             value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
+             help="Enter your OpenAI API key to use the application"
+         )
+
+         if api_key:
+             st.session_state.openai_api_key = api_key
+             st.success("βœ… API Key configured")
+         else:
+             st.warning("⚠️ Please enter your OpenAI API key to continue")
+
+         st.divider()
+
+         st.header("πŸ“€ Upload PDF")
+         uploaded_file = st.file_uploader(
+             "Choose a PDF file",
+             type=["pdf"],
+             help="Upload a PDF file to ask questions about its content",
+             disabled=not st.session_state.openai_api_key
+         )
+
+         if uploaded_file is not None and st.session_state.openai_api_key:
+             if st.session_state.processed_file != uploaded_file.name:
+                 with st.status("Processing PDF...", expanded=True) as status:
+                     st.write("πŸ“„ Reading PDF content...")
+
+                     try:
+                         docs = process_file(
+                             uploaded_file.getvalue(), "application/pdf")
+                         st.write(f"βœ… Extracted {len(docs)} text chunks")
+
+                         st.write("πŸ” Creating vector store...")
+                         vector_store, _ = create_search_engine(
+                             uploaded_file.getvalue(), "application/pdf", api_key=st.session_state.openai_api_key)
+
+                         st.session_state.vector_store = vector_store
+                         st.session_state.docs = docs
+                         st.session_state.processed_file = uploaded_file.name
+
+                         status.update(
+                             label="βœ… PDF processed successfully!", state="complete")
+
+                     except Exception as e:
+                         status.update(
+                             label="❌ Error processing PDF", state="error")
+                         st.error(f"Error: {str(e)}")
+                         return
+
+             st.success(f"πŸ“„ **{uploaded_file.name}** is ready for questions!")
+
+     if st.session_state.vector_store is not None and st.session_state.openai_api_key:
+         st.write("🧠 Setting up Q&A chain...")
+         chain, retriever = create_qa_chain(
+             st.session_state.vector_store, st.session_state.openai_api_key)
+
+         # Store in session state
+         st.session_state.chain = chain
+         st.session_state.retriever = retriever
+
+     # Chat interface
+     if st.session_state.chain is not None:
+         # Display chat messages
+         for message in st.session_state.messages:
+             with st.chat_message(message["role"]):
+                 st.text(message["content"])
+
+                 # Display sources if available
+                 if "sources" in message and message["sources"]:
+                     for source in message["sources"]:
+                         with st.expander(f"πŸ“„ Source: {source['name']}"):
+                             st.text(source["content"])
+
+         # Chat input
+         if prompt := st.chat_input("Ask a question about the PDF..."):
+             # Add user message to chat history
+             st.session_state.messages.append(
+                 {"role": "user", "content": prompt})
+
+             # Display user message
+             with st.chat_message("user"):
+                 st.text(prompt)
+
+             # Generate response
+             with st.chat_message("assistant"):
+                 with st.spinner("Thinking..."):
+                     try:
+                         chat_history = get_chat_history_messages(
+                             st.session_state.messages)
+
+                         # Get retrieved documents for source processing
+                         retrieved_docs = st.session_state.retriever.invoke(
+                             prompt)
+
+                         # Invoke the LCEL chain
+                         response = st.session_state.chain.invoke({
+                             "question": prompt,
+                             "chat_history": chat_history
+                         })
+
+                         answer, source_contents = format_answer_with_sources(
+                             response, retrieved_docs
+                         )
+
+                         st.text(answer)
+
+                         # Display sources
+                         if source_contents:
+                             for source in source_contents:
+                                 with st.expander(f"πŸ“„ Source: {source['name']}"):
+                                     st.text(source["content"])
+
+                         # Add assistant response to chat history
+                         st.session_state.messages.append({
+                             "role": "assistant",
+                             "content": answer,
+                             "sources": source_contents
+                         })
+
+                     except Exception as e:
+                         error_msg = f"Error generating response: {str(e)}"
+                         import logging
+                         logging.error(e, exc_info=True)
+                         st.error(error_msg)
+                         st.session_state.messages.append({
+                             "role": "assistant",
+                             "content": error_msg
+                         })
+
+     else:
+         if not st.session_state.openai_api_key:
+             st.info(
+                 "πŸ”‘ Please enter your OpenAI API key in the sidebar to get started!")
+         else:
+             st.info("πŸ‘† Please upload a PDF file to get started!")
+
+
+ if __name__ == "__main__":
+     main()
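The heart of this rewrite is the LCEL composition in `create_qa_chain`: a dict of input-extractors is piped into the prompt, the model, and a string parser. Below is a minimal sketch of that same pattern; the `fake_llm` stage is a stand-in of my own (not part of the commit) so the snippet runs without an OpenAI key.

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda

prompt = ChatPromptTemplate.from_messages(
    [("system", "Context: {context}"), ("human", "{question}")]
)

# Stand-in for ChatOpenAI: echoes the formatted prompt back as a string.
fake_llm = RunnableLambda(lambda prompt_value: prompt_value.to_string())

chain = (
    {
        # In app.py this slot is `get_question | retriever | format_docs`.
        "context": lambda inputs: "retrieved chunks would go here",
        "question": lambda inputs: inputs["question"],
    }
    | prompt
    | fake_llm
    | StrOutputParser()
)

print(chain.invoke({"question": "What is operating margin?"}))
```

The dict and the plain lambdas are coerced into runnables by LCEL's `|` operator, which is why the same shape works with the real retriever and `ChatOpenAI` in `create_qa_chain`.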
app/prompt.py CHANGED
@@ -1,5 +1,4 @@
- # flake8: noqa
- from langchain.prompts import PromptTemplate
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

  WELCOME_MESSAGE = """\
  Welcome to Introduction to LLM App Development Sample PDF QA Application!
@@ -8,20 +7,20 @@
  2. Ask any question about the file!
  """

- template = """Please act as an expert financial analyst when you answer the questions and pay special attention to the financial statements. Operating margin is also known as op margin and is calculated by dividing operating income by revenue.
- Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
- If you don't know the answer, just say that you don't know. Don't try to make up an answer.
- ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
-
- QUESTION: {question}
- =========
- {summaries}
- =========
- FINAL ANSWER:"""
-
- PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
-
- EXAMPLE_PROMPT = PromptTemplate(
-     template="Content: {page_content}\nSource: {source}",
-     input_variables=["page_content", "source"],
+ PROMPT = ChatPromptTemplate.from_messages(
+     [
+         (
+             "system",
+             """Please act as an expert financial analyst when you answer the questions and pay special attention to the financial statements. Operating margin is also known as op margin and is calculated by dividing operating income by revenue.
+
+ Given the following extracted parts of a long document and the conversation history, create a final answer with references ("SOURCES"). If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+
+ ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
+
+ Context from documents:
+ {context}"""
+         ),
+         MessagesPlaceholder(variable_name="chat_history"),
+         ("human", "{question}")
+     ]
  )
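Because the new `PROMPT` carries a `MessagesPlaceholder`, prior turns are spliced into the message list at format time. A small sketch of that expansion, assuming the `PROMPT` defined above (the sample history values are made up):

```python
from langchain_core.messages import AIMessage, HumanMessage

from prompt import PROMPT

messages = PROMPT.format_messages(
    context="Content: revenue was $10M\nSource: source_0",
    chat_history=[
        HumanMessage(content="What was revenue?"),
        AIMessage(content="Revenue was $10M.\nSOURCES: source_0"),
    ],
    question="And what was the operating margin?",
)

# SystemMessage, HumanMessage, AIMessage, HumanMessage: the history lands
# between the system instructions and the new question.
for m in messages:
    print(type(m).__name__)
```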
app/utils.py ADDED
@@ -0,0 +1,106 @@
+ import chromadb
+ import tempfile
+ import os
+ from chromadb.config import Settings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PDFPlumberLoader
+ from langchain_chroma import Chroma
+ from langchain.vectorstores.base import VectorStore
+ from langchain_openai import OpenAIEmbeddings
+
+
+ def process_file(file_data, file_type: str = None) -> list:
+     """
+     Process a PDF file and split it into documents.
+
+     Args:
+         file_data: Either a file path (str) or file bytes
+         file_type: Optional file type, defaults to checking if PDF
+
+     Returns:
+         List of processed documents
+
+     Raises:
+         TypeError: If file is not a PDF
+         ValueError: If PDF parsing fails
+     """
+     if file_type and file_type != "application/pdf":
+         raise TypeError("Only PDF files are supported")
+
+     # Handle both file path and file bytes
+     if isinstance(file_data, bytes):
+         # Create a temporary file for the PDF bytes
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+             tmp_file.write(file_data)
+             tmp_file_path = tmp_file.name
+
+         try:
+             loader = PDFPlumberLoader(tmp_file_path)
+             documents = loader.load()
+         finally:
+             # Clean up the temporary file
+             os.unlink(tmp_file_path)
+     else:
+         # Assume it's a file path
+         loader = PDFPlumberLoader(file_data)
+         documents = loader.load()
+
+     # Clean up extracted text to fix common PDF extraction issues
+     for doc in documents:
+         # Fix common spacing issues from PDF extraction
+         doc.page_content = doc.page_content.replace('\n', ' ')  # Replace newlines with spaces
+         doc.page_content = ' '.join(doc.page_content.split())  # Normalize whitespace
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=3000,
+         chunk_overlap=100,
+         separators=["\n\n", "\n", " ", ""]
+     )
+     docs = text_splitter.split_documents(documents)
+     for i, doc in enumerate(docs):
+         doc.metadata["source"] = f"source_{i}"
+     if not docs:
+         raise ValueError("PDF file parsing failed.")
+     return docs
+
+
+ def create_search_engine(file_data, file_type: str = None, api_key: str = None) -> tuple[VectorStore, list]:
+     """
+     Create a vector store search engine from a PDF file.
+
+     Args:
+         file_data: Either a file path (str) or file bytes
+         file_type: Optional file type for validation
+         api_key: OpenAI API key for embeddings
+
+     Returns:
+         Tuple of (search_engine, docs) where:
+             - search_engine: The Chroma vector store
+             - docs: The processed documents
+     """
+     # Process the file
+     docs = process_file(file_data, file_type)
+
+     encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)
+
+     # Initialize Chromadb client and settings, reset to ensure we get a clean
+     # search engine
+     client = chromadb.EphemeralClient()
+     client_settings = Settings(
+         allow_reset=True,
+         anonymized_telemetry=False
+     )
+     search_engine = Chroma(
+         client=client,
+         client_settings=client_settings
+     )
+     search_engine._client.reset()
+
+     search_engine = Chroma.from_documents(
+         client=client,
+         documents=docs,
+         embedding=encoder,
+         client_settings=client_settings
+     )
+
+     return search_engine, docs
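For reference, a usage sketch of the two helpers from a plain script. The `sample.pdf` path and the API key are placeholders of my own; the embedding call needs a real OpenAI key and network access.

```python
from utils import create_search_engine

# create_search_engine accepts a file path or raw bytes, per process_file above.
engine, docs = create_search_engine(
    "sample.pdf", "application/pdf", api_key="sk-placeholder"
)
print(f"Indexed {len(docs)} chunks")

retriever = engine.as_retriever(search_kwargs={"k": 5})
hits = retriever.invoke("What was the operating margin?")
print(hits[0].metadata["source"])
```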
chainlit.md DELETED
@@ -1,8 +0,0 @@
- # Welcome to your PDF QA Sample Application! πŸš€πŸ€–
-
- Hi Team! πŸ‘‹ Congratulations on launching your first LLM Application. This application is build using OpenAI, Langchain, Chainlit, and Chroma. The goal of this application is to provite a quick overview of the most basic archetype of LLM application and the prototyping and debugging environment.
-
- ## Useful Links πŸ”—
-
- - **Langchain Documentation:** Get started with [Langchain Documentation](https://python.langchain.com/) πŸ”—
- - **Chainlit Documentation:** Get started with [Chainlit Documentation](https://docs.chainlit.io) πŸ“š
requirements.txt CHANGED
@@ -1,9 +1,10 @@
  # Specify Python package requirements for your project here (e.g., Mako==1.1.1). If your project doesn't require these, you can leave this file unchanged or delete it.
- openai==1.2.3
- langchain==0.0.334
- chainlit==0.7.700
- tiktoken==0.5.1
- pdfplumber==0.10.3
- chromadb==0.4.17
- pysqlite3-binary==0.5.2.post1
- ruff==0.1.5
+ langchain>=0.3.25,<1.0.0
+ langchain-openai>=0.0.5,<1.0.0
+ langchain-chroma>=0.2.4,<1.0.0
+ langchain_community>=0.3.26,<1.0.0
+ streamlit>=1.31.0
+ pdfplumber>=0.11.6
+ chromadb>=1.0.10
+ ruff==0.11.11
+ python-dotenv>=1.0.0