"""Build and query a Pinecone vector index over document chunks.

Reads chunked text from a CSV (manual or automatic chunking per config.yml),
embeds it with a sentence-transformers model via LangChain, and upserts the
embeddings into a Pinecone index.
"""

import os
import time
from pathlib import Path
from typing import List

import pandas as pd
import pinecone
import yaml
from dotenv import load_dotenv
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.pinecone import Pinecone


class PinceconeIndex:
    """Thin wrapper around a Pinecone index backed by a HuggingFace embedding model.

    NOTE(review): the class name is a typo for "PineconeIndex"; it is kept
    unchanged for backward compatibility with existing callers. Prefer the
    correctly spelled ``PineconeIndex`` alias defined below.
    """

    def __init__(self, index_name: str, model_name: str):
        # Name of the Pinecone index to connect to / create.
        self.index_name = index_name
        # Embedding model used both for upserting and for querying.
        self._embeddingModel = HuggingFaceEmbeddings(model_name=model_name)

    def connect_index(self, embedding_dimension: int,
                      delete_existing: bool = False) -> None:
        """Connect to the Pinecone index, creating it if it does not exist.

        Args:
            embedding_dimension: Dimension of the embedding vectors; used
                only when the index has to be created.
            delete_existing: If True, drop and recreate an existing index.
        """
        index_name = self.index_name

        # Load Pinecone credentials from a Colab-local .env file when the
        # environment variables are not already set (Google Colab workflow).
        if (not os.getenv('PINECONE_KEY')) or (not os.getenv('PINECONE_ENV')):
            dotenv_path = Path('/content/gt-policy-bot/config.env')
            load_dotenv(dotenv_path=dotenv_path)

        pinecone.init(
            api_key=os.getenv('PINECONE_KEY'),
            environment=os.getenv('PINECONE_ENV'),
        )

        if delete_existing and index_name in pinecone.list_indexes():
            pinecone.delete_index(index_name)

        if index_name not in pinecone.list_indexes():
            pinecone.create_index(index_name, dimension=embedding_dimension)

        self._index = pinecone.Index(index_name)

    def upsert_docs(self, df: pd.DataFrame, text_col: str) -> None:
        """Embed and upsert every row of *df* into the index.

        Args:
            df: DataFrame whose rows become documents; columns other than
                *text_col* are stored as metadata by DataFrameLoader.
            text_col: Name of the column holding the document text.
        """
        loader = DataFrameLoader(df, page_content_column=text_col)
        docs = loader.load()
        Pinecone.from_documents(docs, self._embeddingModel,
                                index_name=self.index_name)

    def get_embedding_model(self):
        """Return the wrapped HuggingFaceEmbeddings instance."""
        return self._embeddingModel

    def get_index_name(self) -> str:
        """Return the name of the Pinecone index."""
        return self.index_name

    def query(self, query: str, top_k: int = 5) -> List[str]:
        """Return the page contents of the *top_k* most similar documents."""
        docsearch = Pinecone.from_existing_index(self.index_name,
                                                 self._embeddingModel)
        res = docsearch.similarity_search(query, k=top_k)
        return [doc.page_content for doc in res]


# Correctly spelled, backward-compatible alias for the typo'd class name.
PineconeIndex = PinceconeIndex


if __name__ == '__main__':
    config_path = 'config.yml'
    # Use the config_path variable instead of repeating the literal path.
    with open(config_path, 'r') as file:
        config = yaml.safe_load(file)
    print(config)

    index_name = config['pinecone']['index-name']
    embedding_model = config['sentence-transformers']['model-name']
    embedding_dimension = config['sentence-transformers']['embedding-dimension']
    delete_existing = True

    if config['paths']['chunking'] == 'manual':
        print("Using manual chunking")
        file_path_embedding = config['paths']['manual_chunk_file']
        # Manual chunk files have no header: one chunk per line.
        df = pd.read_csv(file_path_embedding, header=None, names=['chunks'])
    else:
        print("Using automatic chunking")
        file_path_embedding = config['paths']['auto_chunk_file']
        df = pd.read_csv(file_path_embedding, index_col=0)
    print(df)

    start_time = time.time()
    index = PinceconeIndex(index_name, embedding_model)
    index.connect_index(embedding_dimension, delete_existing)
    index.upsert_docs(df, 'chunks')
    end_time = time.time()
    print(f'Indexing took {end_time - start_time} seconds')

    # Re-connect (without deleting) to verify the index is queryable.
    index = PinceconeIndex(index_name, embedding_model)
    index.connect_index(embedding_dimension, delete_existing=False)
    query = "When was the student code of conduct last revised?"
    res = index.query(query, top_k=5)
    # assert len(res) == 5
    print(res)