{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'langchain'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvectorstores\u001b[39;00m \u001b[39mimport\u001b[39;00m Chroma\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39membeddings\u001b[39;00m \u001b[39mimport\u001b[39;00m OpenAIEmbeddings\n\u001b[1;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtext_splitter\u001b[39;00m \u001b[39mimport\u001b[39;00m RecursiveCharacterTextSplitter\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain'" ] } ], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from langchain.vectorstores import Chroma\n", "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.llms import OpenAI\n", "from langchain.chains import VectorDBQA\n", "from langchain.document_loaders import TextLoader\n", "from langchain.document_loaders import OnlinePDFLoader\n", "from langchain.document_loaders import OnlinePDFLoader, PagedPDFSplitter, TextLoader, UnstructuredPDFLoader\n", "\n", "def get_context(file_path: str, prompt: str) -> str:\n", "\n", " # Load the document\n", " loader = PagedPDFSplitter(file_path)\n", " doc = loader.load()\n", "\n", " paragraphs = []\n", " for page in doc:\n", " paragraphs.extend(page.paragraphs)\n", "\n", " # Split the document into sentences\n", " splitter = RecursiveCharacterTextSplitter()\n", " sentences = splitter.split(doc)\n", "\n", " # Embed the sentences\n", " embeddings = OpenAIEmbeddings()\n", " embedded_sentences = embeddings.embed(sentences)\n", "\n", " # Create a vector store\n", " store = Chroma()\n", "\n", " # Create a language model\n", " lm = OpenAI()\n", "\n", " # Create a QA chain\n", " chain = VectorDBQA(store, lm)\n", "\n", " # Add the embedded sentences to the vector store\n", " for sentence, embedding in zip(sentences, embedded_sentences):\n", " store.add(sentence, embedding)\n", "\n", " # Ask the QA chain a question\n", " return chain.ask(prompt)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_path = \"1706.03762.pdf\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the document\n", "loader = PagedPDFSplitter(file_path)\n", "doc = loader.load()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(paragraphs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "paragraphs = [p for p in doc.split(\"1706.03762.pdf\") if len(p) > 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "loader = PagedPDFSplitter(\"1706.03762.pdf\")\n", "pages = loader.load_and_split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from langchain.vectorstores import Chroma" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Chroma()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "store = Chroma()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "store.add_documents(pages)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings = OpenAIEmbeddings()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from chromadb.utils import embedding_functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "openai_ef = embedding_functions.OpenAIEmbeddingFunction(\n", " api_key=os.environ[\"OPENAI_API_KEY\"],\n", " model_name=\"\"\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "openai_ef(\"Hello world\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Chroma(embedding_function=openai_ef)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "\n", "faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n", "docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n", "for doc in docs:\n", " print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data[0].paragraphs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "loader = PagedPDFSplitter(\"1706.03762.pdf\")\n", "data = loader.load()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(data[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_context(file_path=\"1706.03762.pdf\", prompt='What is the main idea of the paper?')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "texts = text_splitter.split_documents(data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings = OpenAIEmbeddings()\n", "vectordb = Chroma.from_documents(texts, embeddings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type=\"stuff\", vectorstore=vectordb)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", "qa.run(query)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" } } }, "nbformat": 4, "nbformat_minor": 2 }