Spaces:
Runtime error
Runtime error
File size: 15,536 Bytes
6024b4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Core LangChain packages plus the OpenAI SDK, upgraded quietly.\n",
"!pip install --quiet --upgrade langchain langchain-openai langchain_core langchain-community langchainhub openai"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Vector store, PDF parsing and data tooling, plus the packages the import cell\n",
"# relies on (chainlit, python-dotenv, langchain-huggingface) so that a fresh\n",
"# environment does not fail with ModuleNotFoundError.\n",
"!pip install -qU qdrant-client pymupdf pandas chainlit python-dotenv langchain-huggingface"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-06-20 23:18:02 - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
"2024-06-20 23:18:03 - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
"2024-06-20 23:18:04 - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
"2024-06-20 23:18:05 - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
"2024-06-20 23:18:06 - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
]
}
],
"source": [
"import os\n",
"import openai\n",
"import chainlit as cl\n",
"from langchain_community.document_loaders import PyMuPDFLoader\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_community.vectorstores import Qdrant\n",
"from langchain.prompts import ChatPromptTemplate\n",
"\n",
"from dotenv import load_dotenv\n",
"from operator import itemgetter\n",
"from langchain_huggingface import HuggingFaceEndpoint\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_community.vectorstores import FAISS\n",
"from langchain_huggingface import HuggingFaceEndpointEmbeddings\n",
"from langchain_core.prompts import PromptTemplate\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.schema.runnable import RunnablePassthrough\n",
"from langchain.schema.runnable.config import RunnableConfig\n",
"\n",
"# Load environment variables from a local .env file.\n",
"load_dotenv()\n",
"# Indexing os.environ (instead of .get) raises KeyError immediately if the key is missing.\n",
"OPENAI_API_KEY = os.environ[\"OPENAI_API_KEY\"]\n",
"\n",
"# Load the Airbnb Form 10-Q PDF (Q1 2024 filing; the file name says \"10k\" but the\n",
"# document itself is a 10-Q).\n",
"loader = PyMuPDFLoader(\"./data/AirBNB10kfilingsq12024.pdf\")\n",
"pages = loader.load()\n",
"\n",
"# Split the loaded pages into overlapping chunks. The raw pages keep their own\n",
"# name so `documents` always means \"chunks ready for embedding\".\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
"    chunk_size=1000,\n",
"    chunk_overlap=100,\n",
")\n",
"documents = text_splitter.split_documents(pages)\n",
"\n",
"# Load embeddings model - we'll use OpenAI's text-embedding-3-small.\n",
"embeddings = OpenAIEmbeddings(\n",
"    model=\"text-embedding-3-small\"\n",
")\n",
"\n",
"# Create an in-memory Qdrant vector store; the index is rebuilt on every run.\n",
"qdrant_vector_store = Qdrant.from_documents(\n",
"    documents,\n",
"    embeddings,\n",
"    location=\":memory:\",\n",
"    collection_name=\"AirBNB10k\",\n",
")\n",
"\n",
"# Create the retriever used by the RAG chain.\n",
"retriever = qdrant_vector_store.as_retriever()\n",
"\n",
"# Create the prompt template: answer strictly from the retrieved context.\n",
"template = \"\"\"Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':\n",
"\n",
"Context:\n",
"{context}\n",
"\n",
"Question:\n",
"{question}\n",
"\"\"\"\n",
"\n",
"prompt = ChatPromptTemplate.from_template(template)\n",
"\n",
"# Choose LLM - we'll use gpt-4o.\n",
"primary_llm = ChatOpenAI(model_name=\"gpt-4o\", temperature=0)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-06-20 23:18:10 - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
]
}
],
"source": [
"# Query the retriever on its own, before wiring it into a chain.\n",
"retrieved_documents = retriever.invoke(\n",
"    \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='Table of Contents\\nAirbnb, Inc.\\nNotes to Condensed Consolidated Financial Statements (unaudited)\\nNote 3. Supplemental Financial Statement Information\\nCash, Cash Equivalents, and Restricted Cash\\nThe following table reconciles cash, cash equivalents, and restricted cash reported on the Company’s unaudited condensed consolidated balance sheets to the total amount\\npresented in the unaudited condensed consolidated statements of cash flows (in millions):\\nDecember 31,\\n2023\\nMarch 31,\\n2024\\nCash and cash equivalents\\n$\\n6,874\\xa0 $\\n7,829\\xa0\\nCash and cash equivalents included in funds receivable and amounts held on behalf of customers\\n5,769\\xa0\\n8,665\\xa0\\nRestricted cash included in prepaids and other current assets\\n24\\xa0\\n35\\xa0\\nTotal cash, cash equivalents, and restricted cash presented in the unaudited condensed consolidated statements of cash flows\\n$\\n12,667\\xa0 $\\n16,529\\xa0\\nSupplemental disclosures of balance sheet information\\nSupplemental balance sheet information consisted of the following (in millions):\\nDecember 31,' metadata={'source': './data/AirBNB10kfilingsq12024.pdf', 'file_path': './data/AirBNB10kfilingsq12024.pdf', 'page': 10, 'total_pages': 54, 'format': 'PDF 1.4', 'title': '0001559720-24-000017', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2024-05-08 for the period ending 2024-03-31', 'keywords': '0001559720-24-000017; ; 10-Q', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': \"D:20240508161757-04'00'\", 'modDate': \"D:20240508161807-04'00'\", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', '_id': 'aeb9683f378a493bab25395753cc9fb5', '_collection_name': 'AirBNB10k'}\n",
"page_content='liabilities. We believe that our existing cash, cash equivalents, and short-term investments balances in the United States are sufficient to fund our working capital needs in the United\\nStates.\\nWe have access to $1.0 billion of commitments and a $200\\xa0million sub-limit for the issuance of letters of credit under the 2022 Credit Facility. As of March\\xa031, 2024, no amounts were\\ndrawn under the 2022 Credit Facility and outstanding letters of credit totaled $25 million.\\nMaterial Cash Requirements\\nAs of March\\xa031, 2024, we had outstanding $2.0\\xa0billion in aggregate principal amount of indebtedness of our 0% convertible senior notes due in 2026. On March 3, 2021, in\\nconnection with the pricing of the 2026 Notes, we entered into privately negotiated capped call transactions (the “Capped Calls”) with certain of the initial purchasers and other' metadata={'source': './data/AirBNB10kfilingsq12024.pdf', 'file_path': './data/AirBNB10kfilingsq12024.pdf', 'page': 28, 'total_pages': 54, 'format': 'PDF 1.4', 'title': '0001559720-24-000017', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2024-05-08 for the period ending 2024-03-31', 'keywords': '0001559720-24-000017; ; 10-Q', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': \"D:20240508161757-04'00'\", 'modDate': \"D:20240508161807-04'00'\", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', '_id': '19d099a757a5473c9b1eceafc54bc0cf', '_collection_name': 'AirBNB10k'}\n",
"page_content='unrealized loss position for more than twelve months as of December\\xa031, 2023 and March\\xa031, 2024, respectively.\\nThe following table summarizes the contractual maturities of the Company’s available-for-sale debt securities (in millions):\\nMarch 31, 2024\\nAmortized\\nCost\\nEstimated\\nFair Value\\nDue within one year\\n$\\n1,489\\xa0 $\\n1,489\\xa0\\nDue after one year through five years\\n957\\xa0\\n947\\xa0\\nDue after five years\\n96\\xa0\\n92\\xa0\\nTotal\\n$\\n2,542\\xa0 $\\n2,528\\xa0\\nNote 5. Fair Value Measurements and Financial Instruments\\nThe following table summarizes the Company’s financial assets and liabilities measured at fair value on a recurring basis (in millions):\\nDecember 31, 2023\\nLevel\\xa01\\nLevel\\xa02\\nLevel\\xa03\\nTotal\\nAssets\\nCash and cash equivalents:\\nMoney market funds\\n$\\n2,018\\xa0 $\\n—\\xa0 $\\n—\\xa0 $\\n2,018\\xa0\\nCertificates of deposit\\n—\\xa0\\n1\\xa0\\n—\\xa0\\n1\\xa0\\nGovernment bonds\\n—\\xa0\\n115\\xa0\\n—\\xa0\\n115\\xa0\\nCommercial paper\\n—\\xa0\\n223\\xa0\\n—\\xa0\\n223\\xa0\\nCorporate debt securities\\n—\\xa0\\n12\\xa0\\n—\\xa0\\n12\\xa0\\n2,018\\xa0\\n351\\xa0\\n—\\xa0\\n2,369\\xa0\\nShort-term investments:\\nCertificates of deposit\\n—\\xa0\\n172\\xa0\\n—\\xa0\\n172\\xa0\\nGovernment bonds\\n—' metadata={'source': './data/AirBNB10kfilingsq12024.pdf', 'file_path': './data/AirBNB10kfilingsq12024.pdf', 'page': 12, 'total_pages': 54, 'format': 'PDF 1.4', 'title': '0001559720-24-000017', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2024-05-08 for the period ending 2024-03-31', 'keywords': '0001559720-24-000017; ; 10-Q', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': \"D:20240508161757-04'00'\", 'modDate': \"D:20240508161807-04'00'\", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', '_id': '3b5048de80644fc59fd621def77d795d', '_collection_name': 
'AirBNB10k'}\n",
"page_content='and mortgage-backed and asset-backed securities. These amounts do not include funds of $8.7 billion as of March\\xa031, 2024, that we held for bookings in advance of guests\\ncompleting check-ins that we record separately on our unaudited condensed consolidated balance sheet in funds receivable and amounts held on behalf of customers with a\\ncorresponding liability in funds payable and amounts payable to customers.\\nOur cash and cash equivalents are generally held at large global systemically important banks (or “G-SIBs”) which are subject to high capital requirements and are required to\\nregularly perform stringent stress tests related to their ability to absorb capital losses. Our cash, cash equivalents, and short-term investments held outside the United States may be\\nrepatriated, subject to certain limitations, and would be available to be used to fund our domestic operations. However, repatriation of such funds may result in additional tax' metadata={'source': './data/AirBNB10kfilingsq12024.pdf', 'file_path': './data/AirBNB10kfilingsq12024.pdf', 'page': 28, 'total_pages': 54, 'format': 'PDF 1.4', 'title': '0001559720-24-000017', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-Q filed on 2024-05-08 for the period ending 2024-03-31', 'keywords': '0001559720-24-000017; ; 10-Q', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationDate': \"D:20240508161757-04'00'\", 'modDate': \"D:20240508161807-04'00'\", 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', '_id': 'c4fd17ee80e44fa2babcfc76951eef77', '_collection_name': 'AirBNB10k'}\n"
]
}
],
"source": [
"for doc in retrieved_documents:\n",
" print(doc)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# `retriever`, `prompt` and `primary_llm` all come from the setup cell; the LLM is\n",
"# not re-instantiated here so there is a single definition to edit.\n",
"retrieval_augmented_chain = (\n",
"    # INVOKE CHAIN WITH: {\"question\": \"<<SOME USER QUESTION>>\"}\n",
"    # \"context\"  : the value of the \"question\" key piped into `retriever`\n",
"    # \"question\" : the value of the \"question\" key, passed through unchanged\n",
"    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
"    # Fill the prompt template, then call the chat model. The chain therefore\n",
"    # returns the model's AIMessage, not a dict.\n",
"    | prompt\n",
"    | primary_llm\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"question = \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
"\n",
"result = retrieval_augmented_chain.invoke({\"question\": question})\n",
"\n",
"# The chain ends in the chat model, so `result` is an AIMessage rather than a dict.\n",
"# Subscripting it (result[\"response\"]) raises TypeError; the answer text lives on\n",
"# its `.content` attribute.\n",
"print(result.content)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"content=\"The total value of 'Cash and cash equivalents' as of December 31, 2023, was $6,874 million.\" response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 2129, 'total_tokens': 2156}, 'model_name': 'gpt-4o', 'system_fingerprint': 'fp_3e7d703517', 'finish_reason': 'stop', 'logprobs': None} id='run-1a2044fd-54bb-4f2d-bb88-cfafd050ee3c-0' usage_metadata={'input_tokens': 2129, 'output_tokens': 27, 'total_tokens': 2156}\n"
]
}
],
"source": [
"# Show the whole response message (content plus token-usage metadata), not just the text.\n",
"print(result)"
]
},
],
"metadata": {
"kernelspec": {
"display_name": "llmops-course",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|