DrishtiSharma committed on
Commit
a72c887
·
verified ·
1 Parent(s): d2af2e8

Create vectorize_documents.py

Browse files
Files changed (1) hide show
  1. vectorize_documents.py +26 -0
vectorize_documents.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Embed the PDF files found under ``data`` and store them in a Chroma index.

Running this script loads the PDFs, splits them into overlapping chunks,
embeds every chunk, and writes the resulting vector store to
``vector_db_dir``.
"""

from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    DirectoryLoader,
)
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Loading the embedding model. No arguments are passed, so whatever defaults
# HuggingFaceEmbeddings ships with are used.
embeddings = HuggingFaceEmbeddings()

# Gather the files matching "./*.pdf" under "data"; each match is read
# through UnstructuredFileLoader.
loader = DirectoryLoader(
    path="data",
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Break the loaded documents into chunks (chunk_size=2000) that overlap
# their neighbours (chunk_overlap=500).
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
text_chunks = text_splitter.split_documents(documents)

# Embed every chunk and build the Chroma store, using "vector_db_dir" as
# its persistence directory.
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory="vector_db_dir",
)

print("Documents Vectorized")