{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Used this to migrate vectors to pinecone from our faiss indices. I recommend you use our scripts to ingest your data directly into Pinecone. For this, direct it to a folder containing the index.faiss and index.pkl files that you want to ingest into pinecone."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\dfole\\Desktop\\CS549\\pinecone_venv\\Lib\\site-packages\\pinecone\\data\\index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from tqdm.autonotebook import tqdm\n"
     ]
    }
   ],
   "source": [
    "import getpass\n",
    "import os\n",
    "import time\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "\n",
    "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
    "\n",
    "pc = Pinecone(api_key=pinecone_api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4685/4685 [1:57:28<00:00, 1.50s/it] \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully migrated 468455 documents to Pinecone index 'bpl-rag'\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from langchain_community.vectorstores import FAISS\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "from langchain_community.embeddings import OpenAIEmbeddings\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "from tqdm import tqdm\n",
    "from langchain_pinecone import PineconeVectorStore\n",
    "\n",
    "def migrate_faiss_to_pinecone(\n",
    "    faiss_index_path: str,\n",
    "    pinecone_api_key: str,\n",
    "    index_name: str,\n",
    "    batch_size: int = 100\n",
    "):\n",
    "    \"\"\"\n",
    "    Migrate a local FAISS index to an existing Pinecone index.\n",
    "\n",
    "    Only documents whose metadata 'field' is one of the abstract/title fields\n",
    "    listed in `wanted_fields` and whose text is longer than 3 characters are\n",
    "    uploaded; every other document in the FAISS docstore is skipped.\n",
    "\n",
    "    Args:\n",
    "        faiss_index_path: Folder holding the saved FAISS index\n",
    "        pinecone_api_key: Your Pinecone API key\n",
    "        index_name: Name of the Pinecone index to write to (it is opened, not created)\n",
    "        batch_size: Number of documents to upload in each batch\n",
    "\n",
    "    Returns:\n",
    "        The PineconeVectorStore wrapping the target index.\n",
    "    \"\"\"\n",
    "    # Load the local FAISS index.\n",
    "    # NOTE: allow_dangerous_deserialization=True unpickles data from disk, which can\n",
    "    # execute arbitrary code. Only point this at index folders you trust.\n",
    "    embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "    faiss_vectorstore = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)\n",
    "    pc = Pinecone(api_key=pinecone_api_key)\n",
    "\n",
    "    index = pc.Index(index_name)\n",
    "\n",
    "    wanted_fields = (\n",
    "        'abstract_tsi',\n",
    "        'title_info_primary_tsi',\n",
    "        'title_info_primary_subtitle_tsi',\n",
    "        'title_info_alternative_tsim',\n",
    "    )\n",
    "\n",
    "    # Walk the FAISS position -> docstore id mapping rather than the docstore alone,\n",
    "    # so each kept document remembers the position of its own vector. Filtering\n",
    "    # drops documents, which means the n-th kept document is generally NOT the\n",
    "    # n-th vector in the FAISS index.\n",
    "    docstore = faiss_vectorstore.docstore._dict\n",
    "    selected = []  # (faiss_position, doc_id, doc)\n",
    "    for position, doc_id in sorted(faiss_vectorstore.index_to_docstore_id.items()):\n",
    "        doc = docstore.get(doc_id)\n",
    "        if doc is None:\n",
    "            continue\n",
    "        # .get() so documents without a 'field' entry are skipped instead of raising KeyError\n",
    "        if doc.metadata.get('field') in wanted_fields and len(doc.page_content) > 3:\n",
    "            selected.append((position, doc_id, doc))\n",
    "\n",
    "    total_docs = len(selected)\n",
    "\n",
    "    pinecone_vectorstore = PineconeVectorStore(index=index, embedding=embeddings)\n",
    "\n",
    "    # Batch processing\n",
    "    for start in tqdm(range(0, total_docs, batch_size)):\n",
    "        batch = selected[start:start + batch_size]\n",
    "        batch_ids = [doc_id for _, doc_id, _ in batch]\n",
    "        batch_embeddings = [\n",
    "            faiss_vectorstore.index.reconstruct(int(position)).tolist()\n",
    "            for position, _, _ in batch\n",
    "        ]\n",
    "\n",
    "        # Text and metadata for each document in the batch\n",
    "        metadatas = [doc.metadata for _, _, doc in batch]\n",
    "        texts = [doc.page_content for _, _, doc in batch]\n",
    "\n",
    "        # Add vectors to Pinecone\n",
    "        # NOTE(review): add_texts may ignore `embeddings` and re-embed `texts` with the\n",
    "        # store's embedding model - confirm against the installed langchain_pinecone version.\n",
    "        pinecone_vectorstore.add_texts(\n",
    "            texts=texts,\n",
    "            metadatas=metadatas,\n",
    "            embeddings=batch_embeddings,\n",
    "            ids=batch_ids\n",
    "        )\n",
    "\n",
    "    print(f\"Successfully migrated {total_docs} documents to Pinecone index '{index_name}'\")\n",
    "    return pinecone_vectorstore\n",
    "\n",
    "# Example usage:\n",
    "if __name__ == \"__main__\":\n",
    "    # Set your credentials and paths\n",
    "    FAISS_INDEX_PATH = \"faiss_900_1200\"\n",
    "    # Read the key from the environment; never hardcode it in the notebook\n",
    "    PINECONE_API_KEY = os.environ.get(\"PINECONE_API_KEY\")\n",
    "    INDEX_NAME = \"bpl-rag\"\n",
    "\n",
    "    if not PINECONE_API_KEY:\n",
    "        raise RuntimeError(\"Set the PINECONE_API_KEY environment variable before running the migration.\")\n",
    "\n",
    "    # Perform migration\n",
    "    pinecone_vs = migrate_faiss_to_pinecone(\n",
    "        faiss_index_path=FAISS_INDEX_PATH,\n",
    "        pinecone_api_key=PINECONE_API_KEY,\n",
    "        index_name=INDEX_NAME,\n",
    "        batch_size=100\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pinecone_venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}