import os
from functools import lru_cache

import pandas as pd
import pinecone
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import \
    SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from pypdf import PdfReader
from sklearn.model_selection import train_test_split

# Single source of truth for the sentence-transformer model used by both
# embedding factories below, so they cannot drift apart.
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"


#**********Functions to help you load documents to PINECONE***********

#Read PDF data
def read_pdf_data(pdf_file):
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        pdf_file: Anything ``pypdf.PdfReader`` accepts; it is passed through
            unchanged.

    Returns:
        One string holding the text of all pages concatenated with no
        separator between pages.
    """
    reader = PdfReader(pdf_file)
    # Join once instead of growing a string with ``+=`` per page. ``or ""``
    # keeps a page whose extraction yields nothing (empty or None) from
    # breaking the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)


#Split data into chunks
def split_data(text, chunk_size=1000, chunk_overlap=50):
    """Split *text* into chunks and wrap them via ``create_documents``.

    Args:
        text: The raw text to split.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
            Defaults to 50.

    Returns:
        The result of ``create_documents`` applied to the split chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(text)
    return text_splitter.create_documents(chunks)


#Create embeddings instance
def create_embeddings_load_data():
    """Return a ``SentenceTransformerEmbeddings`` instance for loading data.

    NOTE: ``OpenAIEmbeddings()`` was previously tried here as an alternative;
    the sentence-transformer model is the one in use.
    """
    return SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME)


@lru_cache
def pine_cone_index(pinecone_index_name: str | None):
    """Initialise the Pinecone client and resolve the index name to use.

    Loads variables from a ``.env`` file, calls ``pinecone.init`` with
    ``PINECONE_API_KEY`` and ``PINECONE_ENV`` from the environment, and then
    picks the index name: the explicit argument if truthy, otherwise the
    ``PINECONE_INDEX_NAME`` environment variable.

    Successful results are memoised per argument by ``lru_cache``, so the
    initialisation runs once per distinct ``pinecone_index_name``.

    Args:
        pinecone_index_name: Explicit index name, or ``None`` to fall back to
            the environment variable.

    Returns:
        The resolved, non-empty index name.

    Raises:
        ValueError: If neither the argument nor ``PINECONE_INDEX_NAME``
            provides a non-empty name.
    """
    load_dotenv()
    pinecone.init(
        api_key=os.getenv('PINECONE_API_KEY'),
        environment=os.getenv('PINECONE_ENV'),
    )
    index_name = pinecone_index_name or os.getenv('PINECONE_INDEX_NAME')
    # A falsy check (not ``is None``) also rejects an empty
    # ``PINECONE_INDEX_NAME=`` entry, which would otherwise be handed on as
    # '' and only fail later inside the Pinecone client.
    if not index_name:
        raise ValueError('PINECONE_INDEX_NAME is not set')
    return index_name


def push_to_pinecone(embeddings, docs, pinecone_index_name: str | None = None):
    """Send *docs* to a Pinecone index through ``Pinecone.from_documents``.

    Args:
        embeddings: Embeddings object forwarded to ``Pinecone.from_documents``.
        docs: Documents forwarded to ``Pinecone.from_documents``.
        pinecone_index_name: Optional index name; when ``None`` the name is
            taken from the ``PINECONE_INDEX_NAME`` environment variable.

    Returns:
        Whatever ``Pinecone.from_documents`` returns.

    Raises:
        ValueError: If no index name can be resolved.
    """
    index_name = pine_cone_index(pinecone_index_name)
    return Pinecone.from_documents(docs, embeddings, index_name=index_name)


#*********Functions for dealing with Model related tasks...************

#Read dataset for model creation
def read_data(data):
    """Read a comma-delimited, header-less CSV into a DataFrame.

    Args:
        data: Path or file-like object passed to ``pandas.read_csv``.

    Returns:
        The parsed DataFrame; because ``header=None`` is used, columns are
        labelled with integers starting at 0.
    """
    return pd.read_csv(data, delimiter=',', header=None)


#Create embeddings instance
def get_embeddings():
    """Return a ``SentenceTransformerEmbeddings`` instance for model tasks."""
    return SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME)
#Generating embeddings for our input dataset
def create_embeddings(df, embeddings):
    """Embed every value of column 0 and store the result in column 2.

    ``embeddings.embed_query`` is called once per row. The frame is modified
    in place and the same object is returned.
    """
    vectors = df[0].apply(embeddings.embed_query)
    df[2] = vectors
    return df


#Splitting the data into train & test
def split_train_test__data(df_sample):
    """Split embedded samples and their labels into train and test sets.

    Column 2 of *df_sample* supplies the inputs and column 1 the labels.
    A quarter of the rows go to the test split, with ``random_state=0``.
    The size of the training split is printed before returning.

    Returns:
        ``(sentences_train, sentences_test, labels_train, labels_test)``.
    """
    features = list(df_sample[2])
    targets = list(df_sample[1])
    sentences_train, sentences_test, labels_train, labels_test = train_test_split(
        features,
        targets,
        test_size=0.25,
        random_state=0,
    )
    print(len(sentences_train))
    return sentences_train, sentences_test, labels_train, labels_test


#Get the accuracy score on test data
def get_score(svm_classifier, sentences_test, labels_test):
    """Return ``svm_classifier.score(sentences_test, labels_test)``."""
    return svm_classifier.score(sentences_test, labels_test)