Somnath3570 committed on
Commit
433a37f
·
verified ·
1 Parent(s): ddf0729

Create create_memory_for_llm.py

Browse files
Files changed (1) hide show
  1. create_memory_for_llm.py +45 -0
create_memory_for_llm.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party imports: document loading, chunking, embeddings, and vector store.
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from dotenv import load_dotenv, find_dotenv
# Load environment variables from the nearest .env file
# (presumably API tokens/credentials — confirm what the .env provides).
load_dotenv(find_dotenv())


# Step 1: Load raw PDF(s)
# Directory containing the source PDF documents to index.
DATA_PATH="data/"
12
def load_pdf_files(data):
    """Load all PDFs found directly inside *data* and return them as documents.

    Non-recursive: only files matching ``*.pdf`` at the top level of the
    directory are picked up. Each file is parsed with PyPDFLoader.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

documents=load_pdf_files(data=DATA_PATH)
#print("Length of PDF pages: ", len(documents))
24
# Step 2: Create Chunks
def create_chunks(extracted_data):
    """Split the loaded documents into overlapping text chunks.

    Uses a recursive character splitter with 500-char chunks and a 50-char
    overlap so context is preserved across chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(extracted_data)

text_chunks=create_chunks(extracted_data=documents)
print("Length of Text Chunks: ", len(text_chunks))
34
# Step 3: Create Vector Embeddings

def get_embedding_model():
    """Return the HuggingFace sentence-transformers MiniLM embedding model."""
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embedding_model=get_embedding_model()
42
# Step 4: Store embeddings in FAISS
# On-disk location where the FAISS index is persisted.
DB_FAISS_PATH="vectorstore/db_faiss"
# Embed every chunk with the model above and save the resulting index locally
# so a retrieval/QA script can reload it without re-processing the PDFs.
db=FAISS.from_documents(text_chunks, embedding_model)
db.save_local(DB_FAISS_PATH)