Hemasagar committed on
Commit
74addc9
·
verified ·
1 Parent(s): dfccb9d

Upload ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +34 -0
ingest.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import box
2
+ import yaml
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.document_loaders import PyPDFDirectoryLoader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain_community.embeddings.sentence_transformer import (
10
+ SentenceTransformerEmbeddings,
11
+ )
12
+ from langchain.vectorstores import Chroma
13
+
14
# Load the project settings from config.yml once, at import time.
# Wrapping the parsed YAML in a Box lets the rest of the module read
# settings as attributes (for example cfg.DATA_PATH).
with open('config.yml', 'r', encoding='utf8') as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))
17
+
18
+
19
def run_ingest():
    """Ingest the PDFs under ``cfg.DATA_PATH`` into a Chroma vector store.

    Steps, in order:

    1. Load every file matching ``*.pdf`` in the configured directory with
       ``PyPDFLoader`` and print the loaded documents.
    2. Split the documents into chunks with ``CharacterTextSplitter``
       (``chunk_size=1000``, ``chunk_overlap=20``, measured with ``len``,
       recording each chunk's start index).
    3. Embed the chunks with the ``all-MiniLM-L6-v2`` sentence-transformer
       model on the CPU and hand them to ``Chroma.from_documents`` with
       ``persist_directory="./vectorestore/chroma"``.

    Returns:
        None. The function is run for its side effects only.
    """
    pdf_loader = DirectoryLoader(
        cfg.DATA_PATH,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    documents = pdf_loader.load()
    print("documents", documents)

    # Splitter settings gathered in one place so they are easy to scan.
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 20,
        "length_function": len,
        "add_start_index": True,
    }
    chunks = CharacterTextSplitter(**splitter_options).split_documents(documents)

    embedder = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )

    # Build the Chroma collection from the chunks; persist_directory is the
    # on-disk location the store is asked to save to.
    vector_store = Chroma.from_documents(
        chunks,
        embedder,
        persist_directory="./vectorestore/chroma",
    )
32
+
33
# Only run the ingestion when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    run_ingest()