Spaces:

gaia-mistral
/

chatbot-g-rag

Sleeping

File size: 17,424 Bytes

68ed2d8

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b880d1ed-3db0-45a1-807e-1b47e9ce1320",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Package names are space-separated; a trailing comma makes pip reject the requirement.\n",
    "# mistralai is held below 1.0 because this notebook uses the legacy MistralClient API.\n",
    "%pip install faiss-cpu \"mistralai<1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "851612c3-ee93-42e3-a1fb-481f89c9410f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from mistralai.client import MistralClient, ChatMessage\n",
    "import requests\n",
    "import numpy as np\n",
    "import faiss\n",
    "import os\n",
    "\n",
    "# Read the key from the environment so it is never written into the notebook.\n",
    "api_key = os.environ[\"MISTRAL_API_KEY\"]\n",
    "client = MistralClient(api_key=api_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01b27964-b40f-41d5-ba20-cec93ca25dc5",
   "metadata": {},
   "source": [
    "# 1. RAG from scratch"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe8609d5-9f27-4202-b0be-36db34412998",
   "metadata": {},
   "source": [
    "## Get data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c4c01740-72b4-482c-b61e-e272a734f01f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the essay that serves as the corpus for this section.\n",
    "response = requests.get(\n",
    "    'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt',\n",
    "    timeout=30,  # do not hang the kernel forever on a stalled connection\n",
    ")\n",
    "# Stop here on a 4xx/5xx instead of silently chunking and embedding an error page.\n",
    "response.raise_for_status()\n",
    "text = response.text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "f03f47af-a20b-4122-a114-74b9748ff543",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "75014"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Size of the downloaded essay, in characters.\n",
    "len(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aad1aa61-9e1c-46c8-ae5e-61855df440f9",
   "metadata": {},
   "source": [
    "## Split document into chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8494655e-bd87-49de-8f1d-69ffbc1c256e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cut the essay into consecutive, non-overlapping windows of `chunk_size` characters;\n",
    "# the final window may be shorter.\n",
    "chunk_size = 512\n",
    "chunks = [\n",
    "    text[start:start + chunk_size]\n",
    "    for start in range(0, len(text), chunk_size)\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4176cbe3-9b15-4d17-afb1-665011d09bb7",
   "metadata": {},
   "source": [
    "## Create embeddings for each text chunk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e77d9805-7a53-4210-9f80-f4de52285588",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text_embedding(input):\n",
    "    \"\"\"Embed `input` with the `mistral-embed` model.\n",
    "\n",
    "    Returns the `embedding` of the first item in the response's `data` list.\n",
    "    \"\"\"\n",
    "    response = client.embeddings(model=\"mistral-embed\", input=input)\n",
    "    first_item = response.data[0]\n",
    "    return first_item.embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "46503830-6ad5-493e-a629-152721e2d88e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One API call per chunk. FAISS works on float32 vectors, so build the matrix in that\n",
    "# dtype up front (np.array on Python floats would otherwise default to float64).\n",
    "text_embeddings = np.array(\n",
    "    [get_text_embedding(chunk) for chunk in chunks],\n",
    "    dtype=np.float32,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ca875993-fe6d-42df-811e-a43891cd0350",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(147, 1024)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (number of chunks, embedding dimension)\n",
    "text_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "55396758-c3f3-45b3-b6e7-d4912c0899f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.04849243,  0.07305908,  0.01568604, ..., -0.0234375 ,\n",
       "        -0.02072144, -0.01068115],\n",
       "       [-0.04660034,  0.04846191, -0.00045729, ..., -0.00754929,\n",
       "        -0.00577545,  0.01355743],\n",
       "       [-0.02139282,  0.0625    ,  0.00907898, ..., -0.02233887,\n",
       "        -0.00765228, -0.00793457],\n",
       "       ...,\n",
       "       [-0.02787781,  0.04260254,  0.00785828, ..., -0.00067568,\n",
       "        -0.01176453, -0.02828979],\n",
       "       [-0.02966309,  0.06292725,  0.03979492, ..., -0.01296997,\n",
       "        -0.00264549, -0.03845215],\n",
       "       [-0.06185913,  0.05847168,  0.03988647, ..., -0.04724121,\n",
       "        -0.01289368, -0.02728271]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the embedding matrix; NumPy elides the middle of large arrays when displaying.\n",
    "text_embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1cba33c7-9d1d-44d8-a01e-e30f16be1aac",
   "metadata": {},
   "source": [
    "## Load into a vector database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6a5b1877-b113-4527-9055-cae9049fef08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a flat L2 index whose dimensionality matches the embedding width,\n",
    "# then load every chunk embedding into it.\n",
    "d = text_embeddings.shape[-1]\n",
    "index = faiss.IndexFlatL2(d)\n",
    "index.add(text_embeddings)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ee023ab-b26c-4df5-8a7b-7dd660bfad86",
   "metadata": {},
   "source": [
    "## Create embeddings for a question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "894d9764-9da9-4629-8f2a-c9dcaf6ceb8d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 1024)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question = \"What were the two main things the author worked on before college?\"\n",
    "# Wrap the single embedding in a list to get a (1, dim) batch; float32 is the dtype\n",
    "# FAISS searches with, so no float64 matrix is created only to be converted.\n",
    "question_embeddings = np.array([get_text_embedding(question)], dtype=np.float32)\n",
    "question_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9c4948cc-6d8b-449f-bc00-abb3591c7222",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.05456543,  0.03518677,  0.03723145, ..., -0.02763367,\n",
       "        -0.00327873,  0.00323677]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the single-row question embedding.\n",
    "question_embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15989e10-d0ec-41be-b6be-fa317565a926",
   "metadata": {},
   "source": [
    "## Retrieve similar chunks from the vector database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "c930b378-7aac-434c-881b-ab69d3edb93d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0 90]]\n"
     ]
    }
   ],
   "source": [
    "# Ask the index for the 2 nearest chunks; I receives their positions, D the matching distances.\n",
    "D, I = index.search(question_embeddings, k=2)\n",
    "print(I)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "73aab584-1dbf-4532-b41e-0403eeeeb567",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['\\n\\nWhat I Worked On\\n\\nFebruary 2021\\n\\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\\n\\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called \"data processing.\" This wa', \"king on things that aren't prestigious doesn't guarantee you're on the right track, it at least guarantees you're not on the most common type of wrong one.\\n\\nOver the next several years I wrote lots of essays about all kinds of different topics. O'Reilly reprinted a collection of them as a book, called Hackers & Painters after one of the essays in it. I also worked on spam filters, and did some more painting. I used to have dinners for a group of friends every thursday night, which taught me how to cook for \"]\n"
     ]
    }
   ],
   "source": [
    "# I has one row per query; row 0 holds the chunk positions for our single question.\n",
    "retrieved_chunk = [chunks[position] for position in I[0].tolist()]\n",
    "print(retrieved_chunk)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b417a59-021a-411d-a491-cb31815192cd",
   "metadata": {},
   "source": [
    "## Combine context and question in a prompt and generate response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "da042a53-4564-4057-9a60-9b57dffff6a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join the chunks as plain text first: interpolating the list itself would put its\n",
    "# Python repr (brackets, quotes, escaped newlines) into the prompt.\n",
    "context_text = \"\\n\\n\".join(retrieved_chunk)\n",
    "prompt = f\"\"\"\n",
    "Context information is below.\n",
    "---------------------\n",
    "{context_text}\n",
    "---------------------\n",
    "Given the context information and not prior knowledge, answer the query.\n",
    "Query: {question}\n",
    "Answer:\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "e77d975b-5f69-4e9c-8b94-97214517eac7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_mistral(user_message, model=\"mistral-medium\", temperature=1):\n",
    "    \"\"\"Send `user_message` as a single user turn and return the reply text.\n",
    "\n",
    "    Args:\n",
    "        user_message: Content of the user message.\n",
    "        model: Name of the chat model passed to `client.chat`.\n",
    "        temperature: Sampling temperature passed to `client.chat`; defaults to 1,\n",
    "            the value this function previously hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The `content` of the first choice in the chat response.\n",
    "    \"\"\"\n",
    "    messages = [ChatMessage(role=\"user\", content=user_message)]\n",
    "    chat_response = client.chat(\n",
    "        model=model,\n",
    "        messages=messages,\n",
    "        temperature=temperature,\n",
    "        # max_tokens=100\n",
    "    )\n",
    "    return chat_response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "1c5c20aa-6673-4105-9c10-886a1e18da8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The two main things the author worked on before college were writing and programming. Specifically, the author wrote short stories and tried writing programs on an IBM 1401.'"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Send the context + question prompt to the model and display its answer.\n",
    "run_mistral(prompt)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e3b531c-4730-4108-ae8a-8de6563e085b",
   "metadata": {},
   "source": [
    "# 2. LlamaIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "152c2a1e-9564-459c-9ea9-5208da519a90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the running kernel's environment.\n",
    "# Held below 0.10: the `from llama_index import ...` / ServiceContext imports used in the\n",
    "# next cell are the pre-0.10 package layout.\n",
    "%pip install \"llama-index<0.10\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "96003762-acac-4886-964b-2d6a67f6f724",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-01-16 18:40:06--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 75042 (73K) [text/plain]\n",
      "Saving to: ‘pg_essay.txt’\n",
      "\n",
      "pg_essay.txt        100%[===================>]  73,28K  --.-KB/s    in 0,01s   \n",
      "\n",
      "2024-01-16 18:40:07 (5,45 MB/s) - ‘pg_essay.txt’ saved [75042/75042]\n",
      "\n",
      "The two main things the author worked on before college, outside of school, were writing and programming. In terms of writing, they wrote short stories, which they described as having hardly any plot and mainly featuring characters with strong feelings. As for programming, they tried writing programs on an IBM 1401 in 9th grade using an early version of Fortran. They typed programs on punch cards and stacked them in the card reader to load the program into memory and run it. However, they couldn't figure out what to do with the 1401 and didn't have any data stored on punched cards, so they didn't do much with it.\n"
     ]
    }
   ],
   "source": [
    "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.llms import MistralAI\n",
    "from llama_index.embeddings import MistralAIEmbedding\n",
    "from llama_index import ServiceContext\n",
    "from llama_index.query_engine import RetrieverQueryEngine\n",
    "\n",
    "# Load data: save the essay as pg_essay.txt, the file the reader opens below.\n",
    "# (A single -O is given; the earlier duplicate -O pointed at a path that was never used.)\n",
    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O pg_essay.txt\n",
    "reader = SimpleDirectoryReader(input_files=[\"pg_essay.txt\"])\n",
    "documents = reader.load_data()\n",
    "\n",
    "# Define the LLM and the embedding model.\n",
    "llm = MistralAI(api_key=api_key, model=\"mistral-medium\")\n",
    "embed_model = MistralAIEmbedding(model_name='mistral-embed', api_key=api_key)\n",
    "service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)\n",
    "\n",
    "# Create the vector store index.\n",
    "# NOTE: this rebinds `index`, the name section 1 uses for the FAISS index.\n",
    "index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
    "\n",
    "# Create the query engine and ask the same question as in section 1.\n",
    "query_engine = index.as_query_engine(similarity_top_k=2)\n",
    "response = query_engine.query(\n",
    "    \"What were the two main things the author worked on before college?\"\n",
    ")\n",
    "print(str(response))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3c8f1701-897f-43ff-8101-6ec503995e23",
   "metadata": {},
   "source": [
    "# 3. LangChain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "edcb2ef1-f7aa-4b49-96a6-850cda03bf6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Package names are space-separated; a trailing comma makes pip reject the requirement.\n",
    "# langchain-community is imported directly by the next cell; openai and tiktoken are\n",
    "# needed by the OpenAIEmbeddings class that cell uses.\n",
    "%pip install langchain langchain-community langchain_mistralai openai tiktoken"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "2e9a4f96-7bcf-452d-85b8-fe89990a5dbc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-01-19 17:46:24--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 75042 (73K) [text/plain]\n",
      "Saving to: ‘pg_essay.txt’\n",
      "\n",
      "pg_essay.txt        100%[===================>]  73,28K  --.-KB/s    in 0,009s  \n",
      "\n",
      "2024-01-19 17:46:25 (7,57 MB/s) - ‘pg_essay.txt’ saved [75042/75042]\n",
      "\n",
      "The two main things the author worked on before college were writing and programming. Specifically, they wrote short stories and created spam filters, and they also did some painting.\n"
     ]
    }
   ],
   "source": [
    "from langchain_community.document_loaders import TextLoader\n",
    "from langchain_mistralai.chat_models import ChatMistralAI\n",
    "from langchain_mistralai.embeddings import MistralAIEmbeddings\n",
    "from langchain_community.vectorstores import FAISS\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "from langchain.chains import create_retrieval_chain\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "\n",
    "# This cell uses OpenAIEmbeddings but no cell imported it, so a fresh kernel\n",
    "# raised NameError; import it here.\n",
    "from langchain_community.embeddings import OpenAIEmbeddings\n",
    "\n",
    "# Save the essay as pg_essay.txt, the file the loader opens below.\n",
    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O pg_essay.txt\n",
    "loader = TextLoader(\"pg_essay.txt\")\n",
    "docs = loader.load()\n",
    "\n",
    "# Split on blank lines into overlapping chunks.\n",
    "# Alternative splitter: text_splitter = RecursiveCharacterTextSplitter()\n",
    "text_splitter = CharacterTextSplitter(\n",
    "    separator=\"\\n\\n\",\n",
    "    chunk_size=500,\n",
    "    chunk_overlap=200,\n",
    "    length_function=len,\n",
    "    is_separator_regex=False,\n",
    ")\n",
    "documents = text_splitter.split_documents(docs)\n",
    "\n",
    "# The Mistral embedding model did not work here, so OpenAI embeddings are used instead.\n",
    "# embeddings = MistralAIEmbeddings(model=\"mistral-embed\", mistral_api_key=api_key)\n",
    "# NOTE(review): OpenAIEmbeddings needs OpenAI credentials - presumably OPENAI_API_KEY in\n",
    "# the environment; confirm before running.\n",
    "embeddings = OpenAIEmbeddings()\n",
    "\n",
    "vector = FAISS.from_documents(documents, embeddings)\n",
    "retriever = vector.as_retriever()\n",
    "model = ChatMistralAI(mistral_api_key=api_key)\n",
    "\n",
    "# NOTE: this rebinds `prompt`, the name section 1 uses for its f-string prompt.\n",
    "prompt = ChatPromptTemplate.from_template(\"\"\"Answer the following question based only on the provided context:\n",
    "\n",
    "<context>\n",
    "{context}\n",
    "</context>\n",
    "\n",
    "Question: {input}\"\"\")\n",
    "\n",
    "# Stuff the retrieved documents into the prompt, then wire retriever and LLM together.\n",
    "document_chain = create_stuff_documents_chain(model, prompt)\n",
    "retrieval_chain = create_retrieval_chain(retriever, document_chain)\n",
    "response = retrieval_chain.invoke({\"input\": \"What were the two main things the author worked on before college?\"})\n",
    "print(response[\"answer\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec63c4a9-9c04-4707-a92f-e339b14054fd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}