{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG question answering over an Airbnb SEC filing\n",
    "\n",
    "Loads a PDF filing, splits it into chunks, embeds the chunks into an in-memory Qdrant collection, and answers a question with `gpt-4o` using only the retrieved context.\n",
    "\n",
    "**Requires** `OPENAI_API_KEY` (read from the environment, optionally via a local `.env` file) and the PDF at `./data/AirBNB10kfilingsq12024.pdf`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# chainlit, python-dotenv and langchain-huggingface are imported below, so they are installed here too.\n",
    "%pip install -qU qdrant-client pymupdf pandas chainlit python-dotenv langchain-huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from operator import itemgetter\n",
    "\n",
    "import openai\n",
    "import chainlit as cl\n",
    "from dotenv import load_dotenv\n",
    "from langchain.prompts import ChatPromptTemplate\n",
    "from langchain.schema.output_parser import StrOutputParser\n",
    "from langchain.schema.runnable import RunnablePassthrough\n",
    "from langchain.schema.runnable.config import RunnableConfig\n",
    "from langchain_community.document_loaders import PyMuPDFLoader\n",
    "from langchain_community.document_loaders import TextLoader\n",
    "from langchain_community.vectorstores import FAISS\n",
    "from langchain_community.vectorstores import Qdrant\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from langchain_huggingface import HuggingFaceEndpoint\n",
    "from langchain_huggingface import HuggingFaceEndpointEmbeddings\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "# Imported once: the notebook previously also imported this name from the legacy\n",
    "# langchain.text_splitter path, which shadowed this binding.\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables (from a local .env file, if present)\n",
    "load_dotenv()\n",
    "\n",
    "# Raises KeyError right here if the key is not set.\n",
    "OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]\n",
    "\n",
    "# Everything a reader might want to tune lives in this cell.\n",
    "PDF_PATH = \"./data/AirBNB10kfilingsq12024.pdf\"\n",
    "COLLECTION_NAME = \"AirBNB10k\"\n",
    "EMBEDDING_MODEL = \"text-embedding-3-small\"\n",
    "CHAT_MODEL = \"gpt-4o\"\n",
    "CHUNK_SIZE = 1000\n",
    "CHUNK_OVERLAP = 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and chunk the filing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the PDF and split it into overlapping chunks.\n",
    "# NOTE: the file name says \"10k\", but the PDF's own metadata describes a Form 10-Q (period ending 2024-03-31).\n",
    "loader = PyMuPDFLoader(PDF_PATH)\n",
    "raw_documents = loader.load()\n",
    "\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=CHUNK_SIZE,\n",
    "    chunk_overlap=CHUNK_OVERLAP,\n",
    ")\n",
    "\n",
    "# Kept under a separate name so the loaded documents are not overwritten by their own chunks.\n",
    "document_chunks = text_splitter.split_documents(raw_documents)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Embed and index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embeddings model - OpenAI's text-embedding-3-small\n",
    "embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)\n",
    "\n",
    "# In-memory Qdrant collection: nothing is persisted, so this cell re-embeds the chunks every time it runs.\n",
    "qdrant_vector_store = Qdrant.from_documents(\n",
    "    document_chunks,\n",
    "    embeddings,\n",
    "    location=\":memory:\",\n",
    "    collection_name=COLLECTION_NAME,\n",
    ")\n",
    "\n",
    "retriever = qdrant_vector_store.as_retriever()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prompt and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "template = \"\"\"Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':\n",
    "\n",
    "Context:\n",
    "{context}\n",
    "\n",
    "Question:\n",
    "{question}\n",
    "\"\"\"\n",
    "\n",
    "prompt = ChatPromptTemplate.from_template(template)\n",
    "\n",
    "# Defined once here and reused by the chain below.\n",
    "primary_llm = ChatOpenAI(model_name=CHAT_MODEL, temperature=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sanity-check retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single definition of the question, shared by the retrieval check and the chain invocation.\n",
    "question = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
    "\n",
    "retrieved_documents = retriever.invoke(question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for doc in retrieved_documents:\n",
    "    print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build and run the RAG chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retrieval_augmented_chain = (\n",
    "    # INVOKE CHAIN WITH: {\"question\" : \"<>\"}\n",
    "    # \"question\" : populated by getting the value of the \"question\" key\n",
    "    # \"context\"  : populated by getting the value of the \"question\" key and chaining it into the retriever\n",
    "    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
    "    | prompt\n",
    "    | primary_llm\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The chain ends at the chat model, so invoke() returns an AIMessage rather than a dict;\n",
    "# indexing it with [\"response\"] raised TypeError. The answer text is on .content.\n",
    "result = retrieval_augmented_chain.invoke({\"question\": question})\n",
    "\n",
    "print(result.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full message, including token usage and the other response metadata\n",
    "print(result)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llmops-course",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}