HanLee committed
Commit 53edd39 Β· 1 Parent(s): 313a2ae

feat: update to streamlit, lcel, and deployable to huggingface

.chainlit/config.toml DELETED
@@ -1,78 +0,0 @@
- [project]
- # Whether to enable telemetry (default: true). No personal data is collected.
- enable_telemetry = true
-
- # List of environment variables to be provided by each user to use the app.
- user_env = []
-
- # Duration (in seconds) during which the session is saved when the connection is lost
- session_timeout = 3600
-
- # Enable third parties caching (e.g LangChain cache)
- cache = false
-
- # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
- # follow_symlink = false
-
- [features]
- # Show the prompt playground
- prompt_playground = true
-
- # Authorize users to upload files with messages
- multi_modal = true
-
- # Allows user to use speech to text
- [features.speech_to_text]
- enabled = false
- # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
- # language = "en-US"
-
- [UI]
- # Name of the app and chatbot.
- name = "Chatbot"
-
- # Show the readme while the conversation is empty.
- show_readme_as_default = true
-
- # Description of the app and chatbot. This is used for HTML tags.
- # description = ""
-
- # Large size content are by default collapsed for a cleaner ui
- default_collapse_content = true
-
- # The default value for the expand messages settings.
- default_expand_messages = false
-
- # Hide the chain of thought details from the user in the UI.
- hide_cot = false
-
- # Link to your github repo. This will add a github button in the UI's header.
- github = "https://github.com/LinkedInLearning/hands-on-ai-building-and-deploying-llm-powered-apps-4511409"
-
- # Specify a CSS file that can be used to customize the user interface.
- # The CSS file can be served from the public directory or via an external link.
- # custom_css = "/public/test.css"
-
- # Override default MUI light theme. (Check theme.ts)
- [UI.theme.light]
-     #background = "#FAFAFA"
-     #paper = "#FFFFFF"
-
-     [UI.theme.light.primary]
-         #main = "#F80061"
-         #dark = "#980039"
-         #light = "#FFE7EB"
-
- # Override default MUI dark theme. (Check theme.ts)
- [UI.theme.dark]
-     #background = "#FAFAFA"
-     #paper = "#FFFFFF"
-
-     [UI.theme.dark.primary]
-         #main = "#F80061"
-         #dark = "#980039"
-         #light = "#FFE7EB"
-
-
- [meta]
- generated_by = "0.7.501"
.devcontainer/devcontainer.json CHANGED
@@ -1,11 +1,10 @@
  {
+     "image": "mcr.microsoft.com/devcontainers/python:3.11",
      "extensions": [
          "GitHub.github-vscode-theme",
          "ms-toolsai.jupyter",
          "ms-python.python"
-         // Additional Extensions Here
      ],
-     "onCreateCommand" : "[ -f requirements.txt ] && pip install -r requirements.txt; echo PS1='\"$ \"' >> ~/.bashrc", //Set Terminal Prompt to $
+     "onCreateCommand": "bash .devcontainer/setup.sh"
  }
-
  // DevContainer Reference: https://code.visualstudio.com/docs/remote/devcontainerjson-reference
.devcontainer/setup.sh ADDED
@@ -0,0 +1,93 @@
+ #!/bin/bash
+ set -euo pipefail
+
+ echo "Upgrading pip..."
+ pip install --upgrade pip || {
+     echo "Failed to upgrade pip"
+     exit 1
+ }
+
+ echo "πŸ”§ Installing NVM..."
+ export NVM_DIR="$HOME/.nvm"
+ mkdir -p "$NVM_DIR"
+
+ # Download and install NVM
+ curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash || {
+     echo "Failed to download NVM installer"
+     exit 1
+ }
+
+ # Add NVM to bashrc for future sessions
+ echo 'export NVM_DIR="$HOME/.nvm"' >> ~/.bashrc
+ echo '[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"' >> ~/.bashrc
+ echo '[ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion"' >> ~/.bashrc
+
+ # Load NVM for current session
+ if [ -s "$NVM_DIR/nvm.sh" ]; then
+     \. "$NVM_DIR/nvm.sh"
+     echo "NVM loaded successfully"
+ else
+     echo "NVM script not found at $NVM_DIR/nvm.sh"
+     exit 1
+ fi
+
+ # Verify NVM is available
+ if ! command -v nvm &> /dev/null; then
+     echo "NVM command not found after sourcing. Trying alternative approach..."
+     # Try to source it with bash explicitly
+     bash -c "source $NVM_DIR/nvm.sh && nvm --version" || {
+         echo "Failed to verify NVM installation"
+         exit 1
+     }
+ fi
+
+ echo "πŸ“¦ Installing Node.js LTS..."
+ # Run nvm commands in a bash subshell to ensure proper environment
+ bash -c "source $NVM_DIR/nvm.sh && nvm install --lts" || {
+     echo "Failed to install Node.js"
+     exit 1
+ }
+
+ # Run nvm use in a bash subshell
+ bash -c "source $NVM_DIR/nvm.sh && nvm use --lts" || {
+     echo "Failed to use Node.js LTS"
+     exit 1
+ }
+
+ echo "🧰 Installing latest npm..."
+ # Run npm in a bash subshell to ensure node is available
+ bash -c "source $NVM_DIR/nvm.sh && nvm use --lts && npm install -g npm@latest" || {
+     echo "Failed to update npm"
+     exit 1
+ }
+
+ echo "βœ… NVM, Node.js, and npm installed successfully."
+
+ if [ -f requirements.txt ]; then
+     echo "Installing requirements..."
+     pip install -r requirements.txt || {
+         echo "Failed to install requirements"
+         exit 1
+     }
+ else
+     echo "No requirements.txt found, skipping package installation"
+ fi
+
+ echo "Setting up terminal prompt..."
+ cat << 'EOF' >> ~/.bashrc
+ # Function to get git branch
+ parse_git_branch() {
+     git branch 2> /dev/null | sed -e '/^[^*]/d' -e 's/* \(.*\)/ (\1)/'
+ }
+
+ # Color definitions
+ BLUE='\[\033[34m\]'
+ GREEN='\[\033[32m\]'
+ YELLOW='\[\033[33m\]'
+ RESET='\[\033[00m\]'
+
+ # Set prompt with current directory and git branch
+ export PS1="${BLUE}\W${RESET}${YELLOW}\$(parse_git_branch)${RESET}${GREEN} $ ${RESET}"
+ EOF
+
+ echo "Setup completed successfully!"
.gitignore CHANGED
@@ -4,6 +4,11 @@
  # Chainlit
  .chainlit/.langchain.db

+ # Claude settings (local only)
+ CLAUDE.md
+ .claude/settings.json
+ .claude/settings.local.json
+
  # Chroma
  .chromadb/

.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
+ [server]
+ runOnSave = true
+ fileWatcherType = "auto"
.vscode/settings.json CHANGED
@@ -3,21 +3,23 @@
      "editor.cursorBlinking": "solid",
      "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace",
      "editor.fontLigatures": false,
-     "editor.fontSize": 22,
+     "editor.fontSize": 14,
      "editor.formatOnPaste": true,
      "editor.formatOnSave": true,
      "editor.lineNumbers": "on",
      "editor.matchBrackets": "always",
      "editor.minimap.enabled": false,
      "editor.smoothScrolling": true,
-     "editor.tabSize": 2,
+     "editor.tabSize": 4,
      "editor.useTabStops": true,
      "emmet.triggerExpansionOnTab": true,
-     "explorer.openEditors.visible": 0,
      "files.autoSave": "afterDelay",
      "screencastMode.onlyKeyboardShortcuts": true,
-     "terminal.integrated.fontSize": 18,
+     "terminal.integrated.fontSize": 14,
      "workbench.colorTheme": "Visual Studio Dark",
      "workbench.fontAliasing": "antialiased",
-     "workbench.statusBar.visible": true
- }
+     "workbench.statusBar.visible": true,
+     "workbench.tree.indent": 8,
+     "workbench.tree.renderIndentGuides": "always",
+     "workbench.fontSize": 14
+ }
README.md CHANGED
@@ -1,7 +1,28 @@
+ ---
+ title: Test
+ emoji: πŸš€
+ colorFrom: green
+ colorTo: green
+ sdk: streamlit
+ sdk_version: 1.46.0
+ pinned: false
+ app_file: app/app.py
+ license: other
+ ---
+
  # Hands-On AI: Building and Deploying LLM-Powered Apps
  This is the repository for the LinkedIn Learning course `Hands-On AI: Building and Deploying LLM-Powered Apps`. The full course is available from [LinkedIn Learning][lil-course-url].

+ ![lil-thumbnail-url]
+
+ Are you ready to start building applications with large language models (LLMs), but not sure where to begin? This course, which is designed uniquely for beginners with no experience in the LLM space, offers an overview of the fundamentals of LLMs with hands-on challenges to boost your skills along the way.
+
+ Explore the essentials of retrieval-augmented generation including search engine basics, embedding model limitations, and how to build a chat-with-PDF application. Along the way, instructor Han Lee shows you how to get up and running with prompt engineering, using the prompt playground for LLM apps.
+
+ This course is integrated with GitHub Codespaces, an instant cloud developer environment that offers all the functionality of your favorite IDE without the need for any local machine setup. With GitHub Codespaces, you can get hands-on practice from any machine, at any timeβ€”all while using a tool that you’ll likely encounter in the workplace. Check out the β€œUsing GitHub Codespaces with this course” video to learn how to get started.
+
  _See the readme file in the main branch for updated instructions and information._
+
  ## Instructions
  This repository has branches for each of the videos in the course. You can use the branch pop up menu in github to switch to a specific branch and take a look at the course at that stage, or you can add `/tree/BRANCH_NAME` to the URL to go to the branch you want to access.

@@ -20,15 +41,19 @@
  Add changes to git using this command: git add .
  Commit changes using this command: git commit -m "some message"

- ## Installing
- 1. To use these exercise files, you must have the following installed:
-     - [list of requirements for course]
- 2. Clone this repository into your local machine using the terminal (Mac), CMD (Windows), or a GUI tool like SourceTree.
- 3. [Course-specific instructions]
+ ### Instructor
+
+ Han-chung Lee
+
+ Machine Learning Engineer in NLP, Search, and Recommendation Systems
+
+
+
+ Check out my other courses on [LinkedIn Learning](https://www.linkedin.com/learning/instructors/han-chung-lee?u=104).


  [0]: # (Replace these placeholder URLs with actual course URLs)

- [lil-course-url]: https://www.linkedin.com/learning/
- [lil-thumbnail-url]: http://
+ [lil-course-url]: https://www.linkedin.com/learning/hands-on-ai-building-llm-powered-apps
+ [lil-thumbnail-url]: https://media.licdn.com/dms/image/D560DAQGRl8C0MWSlTg/learning-public-crop_675_1200/0/1708734970228?e=2147483647&v=beta&t=415ypTLk6X2GXE5io0I1Ejc9vFT6EHEYEOclgbRB5aM
app/app.py CHANGED
@@ -1,171 +1,313 @@
- # Chroma compatibility issue resolution
- # https://docs.trychroma.com/troubleshooting#sqlite
- __import__('pysqlite3')
- import sys
- sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
-
- from tempfile import NamedTemporaryFile
-
- import chainlit as cl
- from chainlit.types import AskFileResponse
-
- import chromadb
- from chromadb.config import Settings
- from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
- from langchain.chains.base import Chain
- from langchain.chat_models import ChatOpenAI
- from langchain.document_loaders import PDFPlumberLoader
- from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.vectorstores import Chroma
- from langchain.vectorstores.base import VectorStore
-
- from prompt import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
-
-
- namespaces = set()
-
-
- def process_file(*, file: AskFileResponse) -> list:
-     if file.type != "application/pdf":
-         raise TypeError("Only PDF files are supported")
-
-     with NamedTemporaryFile() as tempfile:
-         tempfile.write(file.content)
-
-         loader = PDFPlumberLoader(tempfile.name)
-
-         documents = loader.load()
-
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=3000,
-             chunk_overlap=100
-         )
-
-         docs = text_splitter.split_documents(documents)
-
-         for i, doc in enumerate(docs):
-             doc.metadata["source"] = f"source_{i}"
-
-         if not docs:
-             raise ValueError("PDF file parsing failed.")
-
-         return docs
-
-
- def create_search_engine(*, file: AskFileResponse) -> VectorStore:
-
-     # Process and save data in the user session
-     docs = process_file(file=file)
-     cl.user_session.set("docs", docs)
-
-     encoder = OpenAIEmbeddings(
-         model="text-embedding-ada-002"
-     )
-
-     # Initialize Chromadb client and settings, reset to ensure we get a clean
-     # search engine
-     client = chromadb.EphemeralClient()
-     client_settings = Settings(
-         allow_reset=True,
-         anonymized_telemetry=False
-     )
-     search_engine = Chroma(
-         client=client,
-         client_settings=client_settings
-     )
-     search_engine._client.reset()
-
-     search_engine = Chroma.from_documents(
-         client=client,
-         documents=docs,
-         embedding=encoder,
-         client_settings=client_settings
-     )
-
-     return search_engine
-
-
- @cl.on_chat_start
- async def start():
-
-     files = None
-     while files is None:
-         files = await cl.AskFileMessage(
-             content=WELCOME_MESSAGE,
-             accept=["application/pdf"],
-             max_size_mb=20,
-         ).send()
-
-     file = files[0]
-     msg = cl.Message(content=f"Processing `{file.name}`...")
-     await msg.send()
-
-     try:
-         search_engine = await cl.make_async(create_search_engine)(file=file)
-     except Exception as e:
-         await cl.Message(content=f"Error: {e}").send()
-         raise SystemError
-
-     llm = ChatOpenAI(
-         model='gpt-3.5-turbo-16k-0613',
-         temperature=0,
-         streaming=True
-     )
-
-     chain = RetrievalQAWithSourcesChain.from_chain_type(
-         llm=llm,
-         chain_type="stuff",
-         retriever=search_engine.as_retriever(max_tokens_limit=4097),
-         chain_type_kwargs={
-             "prompt": PROMPT,
-             "document_prompt": EXAMPLE_PROMPT
-         },
-     )
-
-     msg.content = f"`{file.name}` processed. You can now ask questions!"
-     await msg.update()
-
-     cl.user_session.set("chain", chain)
-
-
- @cl.on_message
- async def main(message: cl.Message):
-
-     chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
-     cb = cl.AsyncLangchainCallbackHandler()
-     response = await chain.acall(message.content, callbacks=[cb])
-     answer = response["answer"]
-     sources = response["sources"].strip()
-     source_elements = []
-
-     # Get the documents from the user session
-     docs = cl.user_session.get("docs")
-     metadatas = [doc.metadata for doc in docs]
-     all_sources = [m["source"] for m in metadatas]
-
-     # Adding sources to the answer
-     if sources:
-         found_sources = []
-
-         # Add the sources to the message
-         for source in sources.split(","):
-             source_name = source.strip().replace(".", "")
-             # Get the index of the source
-             try:
-                 index = all_sources.index(source_name)
-             except ValueError:
-                 continue
-             text = docs[index].page_content
-             found_sources.append(source_name)
-             # Create the text element referenced in the message
-             source_elements.append(cl.Text(content=text, name=source_name))
-
-         if found_sources:
-             answer += f"\nSources: {', '.join(found_sources)}"
-         else:
-             answer += "\nNo sources found"
-
-     await cl.Message(content=answer, elements=source_elements).send()
+ from typing import List, Dict, Any, Tuple
+
+ from dotenv import load_dotenv
+ from langchain.schema import Document
+ from langchain_openai import ChatOpenAI
+ from langchain.vectorstores.base import VectorStore
+
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.messages import HumanMessage, AIMessage
+ import streamlit as st
+
+ from utils import process_file, create_search_engine
+ from prompt import PROMPT, WELCOME_MESSAGE
+
+
+ load_dotenv()
+
+
+ # Page configuration
+ st.set_page_config(
+     page_title="PDF Q&A Assistant",
+     page_icon="πŸ“š",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # Initialize session state
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+ if "chain" not in st.session_state:
+     st.session_state.chain = None
+ if "vector_store" not in st.session_state:
+     st.session_state.vector_store = None
+ if "retriever" not in st.session_state:
+     st.session_state.retriever = None
+ if "docs" not in st.session_state:
+     st.session_state.docs = None
+ if "processed_file" not in st.session_state:
+     st.session_state.processed_file = None
+ if "openai_api_key" not in st.session_state:
+     st.session_state.openai_api_key = None
+
+
+ def create_qa_chain(vector_store: VectorStore, api_key: str) -> Tuple[Any, Any]:
+     """Create the QA chain with the vector store using LCEL.
+
+     Args:
+         vector_store: The vector store containing document embeddings
+         api_key: OpenAI API key
+
+     Returns:
+         Tuple containing:
+             - chain: The LCEL chain for question answering
+             - retriever: The document retriever
+     """
+     llm = ChatOpenAI(
+         model='gpt-4.1-mini',
+         temperature=0,
+         streaming=True,
+         max_tokens=8192,
+         api_key=api_key
+     )
+
+     # Create retriever
+     retriever = vector_store.as_retriever(search_kwargs={"k": 5})
+
+     def format_docs(docs: List[Document]) -> str:
+         """Format retrieved documents for the prompt.
+
+         Args:
+             docs: List of retrieved documents
+
+         Returns:
+             Formatted string containing document content and sources
+         """
+         formatted = []
+         for doc in docs:
+             content = doc.page_content
+             source = doc.metadata.get("source", "unknown")
+             formatted.append(f"Content: {content}\nSource: {source}")
+         return "\n\n".join(formatted)
+
+     def get_question(inputs: Dict[str, Any]) -> str:
+         return inputs["question"]
+
+     def get_chat_history(inputs: Dict[str, Any]) -> List[Any]:
+         return inputs["chat_history"]
+
+     chain = (
+         {
+             "context": get_question | retriever | format_docs,
+             "question": get_question,
+             "chat_history": get_chat_history
+         }
+         | PROMPT
+         | llm
+         | StrOutputParser()
+     )
+
+     return chain, retriever
+
+
+ def format_answer_with_sources(response: str, retrieved_docs: List[Document]) -> Tuple[str, List[Dict[str, str]]]:
+     """Format the answer with source information.
+
+     Args:
+         response: The LLM response containing the answer
+         retrieved_docs: List of documents retrieved from the vector store
+
+     Returns:
+         Tuple containing:
+             - answer: The formatted answer string
+             - source_contents: List of source dictionaries with name and content
+     """
+     answer = response
+     source_contents = []
+
+     sources_text = ""
+     if "SOURCES:" in answer:
+         parts = answer.split("SOURCES:")
+         if len(parts) > 1:
+             sources_text = parts[1].strip()
+
+     if sources_text and retrieved_docs:
+         source_map = {}
+         for doc in retrieved_docs:
+             source_name = doc.metadata.get("source", "unknown")
+             source_map[source_name] = doc.page_content
+
+         found_sources = []
+         for source in sources_text.split(","):
+             source_name = source.strip().replace(".", "")
+             if source_name in source_map:
+                 found_sources.append(source_name)
+                 source_contents.append({
+                     "name": source_name,
+                     "content": source_map[source_name]
+                 })
+
+     return answer, source_contents
+
+
+ def get_chat_history_messages(messages: List[Dict[str, str]]) -> List[Any]:
+     """Convert Streamlit messages to LangChain message format.
+
+     Args:
+         messages: List of Streamlit message dictionaries with 'role' and 'content' keys
+
+     Returns:
+         List of LangChain message objects (HumanMessage or AIMessage)
+     """
+     chat_history = []
+     for msg in messages:
+         if msg["role"] == "user":
+             chat_history.append(HumanMessage(content=msg["content"]))
+         elif msg["role"] == "assistant":
+             chat_history.append(AIMessage(content=msg["content"]))
+     return chat_history
+
+
+ def main() -> None:
+     """Main Streamlit application function for PDF Q&A Assistant.
+
+     Handles file upload, processing, and chat interface for asking questions
+     about uploaded PDF documents using RAG (Retrieval Augmented Generation).
+     """
+     st.title("πŸ“š PDF Q&A Assistant")
+     st.markdown(WELCOME_MESSAGE)
+
+     # Sidebar for file upload
+     with st.sidebar:
+         st.header("πŸ”‘ API Configuration")
+
+         api_key = st.text_input(
+             "OpenAI API Key",
+             type="password",
+             value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
+             help="Enter your OpenAI API key to use the application"
+         )
+
+         if api_key:
+             st.session_state.openai_api_key = api_key
+             st.success("βœ… API Key configured")
+         else:
+             st.warning("⚠️ Please enter your OpenAI API key to continue")
+
+         st.divider()
+
+         st.header("πŸ“€ Upload PDF")
+         uploaded_file = st.file_uploader(
+             "Choose a PDF file",
+             type=["pdf"],
+             help="Upload a PDF file to ask questions about its content",
+             disabled=not st.session_state.openai_api_key
+         )
+
+         if uploaded_file is not None and st.session_state.openai_api_key:
+             if st.session_state.processed_file != uploaded_file.name:
+                 with st.status("Processing PDF...", expanded=True) as status:
+                     st.write("πŸ“„ Reading PDF content...")
+
+                     try:
+                         docs = process_file(
+                             uploaded_file.getvalue(), "application/pdf")
+                         st.write(f"βœ… Extracted {len(docs)} text chunks")
+
+                         st.write("πŸ” Creating vector store...")
+                         vector_store, _ = create_search_engine(
+                             uploaded_file.getvalue(), "application/pdf", api_key=st.session_state.openai_api_key)
+
+                         st.session_state.vector_store = vector_store
+                         st.session_state.docs = docs
+                         st.session_state.processed_file = uploaded_file.name
+
+                         status.update(
+                             label="βœ… PDF processed successfully!", state="complete")
+
+                     except Exception as e:
+                         status.update(
+                             label="❌ Error processing PDF", state="error")
+                         st.error(f"Error: {str(e)}")
+                         return
+
+             st.success(f"πŸ“„ **{uploaded_file.name}** is ready for questions!")
+
+     if st.session_state.vector_store is not None and st.session_state.openai_api_key:
+         st.write("🧠 Setting up Q&A chain...")
+         chain, retriever = create_qa_chain(
+             st.session_state.vector_store, st.session_state.openai_api_key)
+
+         # Store in session state
+         st.session_state.chain = chain
+         st.session_state.retriever = retriever
+
+     # Chat interface
+     if st.session_state.chain is not None:
+         # Display chat messages
+         for message in st.session_state.messages:
+             with st.chat_message(message["role"]):
+                 st.text(message["content"])
+
+                 # Display sources if available
+                 if "sources" in message and message["sources"]:
+                     for source in message["sources"]:
+                         with st.expander(f"πŸ“„ Source: {source['name']}"):
+                             st.text(source["content"])
+
+         # Chat input
+         if prompt := st.chat_input("Ask a question about the PDF..."):
+             # Add user message to chat history
+             st.session_state.messages.append(
+                 {"role": "user", "content": prompt})
+
+             # Display user message
+             with st.chat_message("user"):
+                 st.text(prompt)
+
+             # Generate response
+             with st.chat_message("assistant"):
+                 with st.spinner("Thinking..."):
+                     try:
+                         chat_history = get_chat_history_messages(
+                             st.session_state.messages)
+
+                         # Get retrieved documents for source processing
+                         retrieved_docs = st.session_state.retriever.invoke(
+                             prompt)
+
+                         # Invoke the LCEL chain
+                         response = st.session_state.chain.invoke({
+                             "question": prompt,
+                             "chat_history": chat_history
+                         })
+
+                         answer, source_contents = format_answer_with_sources(
+                             response, retrieved_docs
+                         )
+
+                         st.text(answer)
+
+                         # Display sources
+                         if source_contents:
+                             for source in source_contents:
+                                 with st.expander(f"πŸ“„ Source: {source['name']}"):
+                                     st.text(source["content"])
+
+                         # Add assistant response to chat history
+                         st.session_state.messages.append({
+                             "role": "assistant",
+                             "content": answer,
+                             "sources": source_contents
+                         })
+
+                     except Exception as e:
+                         error_msg = f"Error generating response: {str(e)}"
+                         import logging
+                         logging.error(e, exc_info=True)
+                         st.error(error_msg)
+                         st.session_state.messages.append({
+                             "role": "assistant",
+                             "content": error_msg
+                         })
+
+     else:
+         if not st.session_state.openai_api_key:
+             st.info(
+                 "πŸ”‘ Please enter your OpenAI API key in the sidebar to get started!")
+         else:
+             st.info("πŸ‘† Please upload a PDF file to get started!")
+
+
+ if __name__ == "__main__":
+     main()
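The heart of this rewrite is the LCEL composition in `create_qa_chain`: a dict of input-extractors is piped into the prompt, the model, and a string parser. Below is a minimal sketch of that same pattern; the `fake_llm` stage is a stand-in of my own (not part of the commit) so the snippet runs without an OpenAI key.

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda

prompt = ChatPromptTemplate.from_messages(
    [("system", "Context: {context}"), ("human", "{question}")]
)

# Stand-in for ChatOpenAI: echoes the formatted prompt back as a string.
fake_llm = RunnableLambda(lambda prompt_value: prompt_value.to_string())

chain = (
    {
        # In app.py this slot is `get_question | retriever | format_docs`.
        "context": lambda inputs: "retrieved chunks would go here",
        "question": lambda inputs: inputs["question"],
    }
    | prompt
    | fake_llm
    | StrOutputParser()
)

print(chain.invoke({"question": "What is operating margin?"}))
```

The dict and the plain lambdas are coerced into runnables by LCEL's `|` operator, which is why the same shape works with the real retriever and `ChatOpenAI` in `create_qa_chain`.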
app/prompt.py CHANGED
@@ -1,5 +1,4 @@
- # flake8: noqa
- from langchain.prompts import PromptTemplate
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

  WELCOME_MESSAGE = """\
  Welcome to Introduction to LLM App Development Sample PDF QA Application!
@@ -8,20 +7,20 @@
  2. Ask any question about the file!
  """

- template = """Please act as an expert financial analyst when you answer the questions and pay special attention to the financial statements. Operating margin is also known as op margin and is calculated by dividing operating income by revenue.
- Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
- If you don't know the answer, just say that you don't know. Don't try to make up an answer.
- ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
-
- QUESTION: {question}
- =========
- {summaries}
- =========
- FINAL ANSWER:"""
-
- PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
-
- EXAMPLE_PROMPT = PromptTemplate(
-     template="Content: {page_content}\nSource: {source}",
-     input_variables=["page_content", "source"],
+ PROMPT = ChatPromptTemplate.from_messages(
+     [
+         (
+             "system",
+             """Please act as an expert financial analyst when you answer the questions and pay special attention to the financial statements. Operating margin is also known as op margin and is calculated by dividing operating income by revenue.
+
+ Given the following extracted parts of a long document and the conversation history, create a final answer with references ("SOURCES"). If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+
+ ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
+
+ Context from documents:
+ {context}"""
+         ),
+         MessagesPlaceholder(variable_name="chat_history"),
+         ("human", "{question}")
+     ]
  )
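Because the new `PROMPT` carries a `MessagesPlaceholder`, prior turns are spliced into the message list at format time. A small sketch of that expansion, assuming the `PROMPT` defined above (the sample history values are made up):

```python
from langchain_core.messages import AIMessage, HumanMessage

from prompt import PROMPT

messages = PROMPT.format_messages(
    context="Content: revenue was $10M\nSource: source_0",
    chat_history=[
        HumanMessage(content="What was revenue?"),
        AIMessage(content="Revenue was $10M.\nSOURCES: source_0"),
    ],
    question="And what was the operating margin?",
)

# SystemMessage, HumanMessage, AIMessage, HumanMessage: the history lands
# between the system instructions and the new question.
for m in messages:
    print(type(m).__name__)
```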
app/utils.py ADDED
@@ -0,0 +1,106 @@
+ import chromadb
+ import tempfile
+ import os
+ from chromadb.config import Settings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PDFPlumberLoader
+ from langchain_chroma import Chroma
+ from langchain.vectorstores.base import VectorStore
+ from langchain_openai import OpenAIEmbeddings
+
+
+ def process_file(file_data, file_type: str = None) -> list:
+     """
+     Process a PDF file and split it into documents.
+
+     Args:
+         file_data: Either a file path (str) or file bytes
+         file_type: Optional file type, defaults to checking if PDF
+
+     Returns:
+         List of processed documents
+
+     Raises:
+         TypeError: If file is not a PDF
+         ValueError: If PDF parsing fails
+     """
+     if file_type and file_type != "application/pdf":
+         raise TypeError("Only PDF files are supported")
+
+     # Handle both file path and file bytes
+     if isinstance(file_data, bytes):
+         # Create a temporary file for the PDF bytes
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+             tmp_file.write(file_data)
+             tmp_file_path = tmp_file.name
+
+         try:
+             loader = PDFPlumberLoader(tmp_file_path)
+             documents = loader.load()
+         finally:
+             # Clean up the temporary file
+             os.unlink(tmp_file_path)
+     else:
+         # Assume it's a file path
+         loader = PDFPlumberLoader(file_data)
+         documents = loader.load()
+
+     # Clean up extracted text to fix common PDF extraction issues
+     for doc in documents:
+         # Fix common spacing issues from PDF extraction
+         doc.page_content = doc.page_content.replace('\n', ' ')  # Replace newlines with spaces
+         doc.page_content = ' '.join(doc.page_content.split())  # Normalize whitespace
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=3000,
+         chunk_overlap=100,
+         separators=["\n\n", "\n", " ", ""]
+     )
+     docs = text_splitter.split_documents(documents)
+     for i, doc in enumerate(docs):
+         doc.metadata["source"] = f"source_{i}"
+     if not docs:
+         raise ValueError("PDF file parsing failed.")
+     return docs
+
+
+ def create_search_engine(file_data, file_type: str = None, api_key: str = None) -> tuple[VectorStore, list]:
+     """
+     Create a vector store search engine from a PDF file.
+
+     Args:
+         file_data: Either a file path (str) or file bytes
+         file_type: Optional file type for validation
+         api_key: OpenAI API key for embeddings
+
+     Returns:
+         Tuple of (search_engine, docs) where:
+             - search_engine: The Chroma vector store
+             - docs: The processed documents
+     """
+     # Process the file
+     docs = process_file(file_data, file_type)
+
+     encoder = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)
+
+     # Initialize Chromadb client and settings, reset to ensure we get a clean
+     # search engine
+     client = chromadb.EphemeralClient()
+     client_settings = Settings(
+         allow_reset=True,
+         anonymized_telemetry=False
+     )
+     search_engine = Chroma(
+         client=client,
+         client_settings=client_settings
+     )
+     search_engine._client.reset()
+
+     search_engine = Chroma.from_documents(
+         client=client,
+         documents=docs,
+         embedding=encoder,
+         client_settings=client_settings
+     )
+
+     return search_engine, docs
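For reference, a usage sketch of the two helpers from a plain script. The `sample.pdf` path and the API key are placeholders of my own; the embedding call needs a real OpenAI key and network access.

```python
from utils import create_search_engine

# create_search_engine accepts a file path or raw bytes, per process_file above.
engine, docs = create_search_engine(
    "sample.pdf", "application/pdf", api_key="sk-placeholder"
)
print(f"Indexed {len(docs)} chunks")

retriever = engine.as_retriever(search_kwargs={"k": 5})
hits = retriever.invoke("What was the operating margin?")
print(hits[0].metadata["source"])
```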
chainlit.md DELETED
@@ -1,8 +0,0 @@
- # Welcome to your PDF QA Sample Application! πŸš€πŸ€–
-
- Hi Team! πŸ‘‹ Congratulations on launching your first LLM Application. This application is build using OpenAI, Langchain, Chainlit, and Chroma. The goal of this application is to provite a quick overview of the most basic archetype of LLM application and the prototyping and debugging environment.
-
- ## Useful Links πŸ”—
-
- - **Langchain Documentation:** Get started with [Langchain Documentation](https://python.langchain.com/) πŸ”—
- - **Chainlit Documentation:** Get started with [Chainlit Documentation](https://docs.chainlit.io) πŸ“š
requirements.txt CHANGED
@@ -1,9 +1,10 @@
  # Specify Python package requirements for your project here (e.g., Mako==1.1.1). If your project doesn't require these, you can leave this file unchanged or delete it.
- openai==1.2.3
- langchain==0.0.334
- chainlit==0.7.700
- tiktoken==0.5.1
- pdfplumber==0.10.3
- chromadb==0.4.17
- pysqlite3-binary==0.5.2.post1
- ruff==0.1.5
+ langchain>=0.3.25,<1.0.0
+ langchain-openai>=0.0.5,<1.0.0
+ langchain-chroma>=0.2.4,<1.0.0
+ langchain_community>=0.3.26,<1.0.0
+ streamlit>=1.31.0
+ pdfplumber>=0.11.6
+ chromadb>=1.0.10
+ ruff==0.11.11
+ python-dotenv>=1.0.0