GovindRaj committed on
Commit
23f558c
·
1 Parent(s): 7526b0b

added changes

Browse files
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pypdf
2
+ langchain
3
+ torch
4
+ accelerate
5
+ bitsandbytes
6
+ ctransformers
7
+ sentence_transformers
8
+ faiss_cpu
9
+ chainlit
10
+ huggingface_hub
11
+ langchain_community
upload.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from langchain_community.vectorstores import FAISS
4
+ from langchain_community.document_loaders import PyPDFLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ import os
7
+ import tempfile
8
+
9
# Directory where create_vector_db() persists the FAISS index.
DB_FAISS_PATH = 'vectorstore/db_faiss'


def create_vector_db(uploaded_files):
    """Build a FAISS vector store from uploaded PDFs and save it to disk.

    Every upload whose name ends in ``.pdf`` (case-insensitive) is written to
    a temporary directory and parsed with ``PyPDFLoader``. The pages are split
    into 500-character chunks with a 50-character overlap, embedded with
    ``sentence-transformers/all-MiniLM-L6-v2`` on CPU, and the resulting index
    is saved under ``DB_FAISS_PATH``.

    Args:
        uploaded_files: Iterable of upload objects exposing a ``name``
            attribute and a ``getvalue()`` method returning the file's bytes
            (presumably Streamlit ``UploadedFile`` objects, as passed by
            ``main``). ``None`` is treated like an empty iterable.

    Returns:
        ``True`` once the index has been saved.

    Raises:
        ValueError: If no PDF upload was supplied, or if no text chunks could
            be produced from the supplied PDFs.
    """
    # Compare case-insensitively so names such as "REPORT.PDF" are not
    # silently dropped.
    pdf_uploads = [
        upload
        for upload in (uploaded_files or [])
        if upload.name.lower().endswith('.pdf')
    ]
    if not pdf_uploads:
        raise ValueError("No PDF files were provided.")

    documents = []
    with tempfile.TemporaryDirectory() as temp_dir:
        for upload in pdf_uploads:
            # Keep only the final path component so a crafted upload name
            # cannot make us write outside the temporary directory.
            safe_name = os.path.basename(upload.name)
            temp_path = os.path.join(temp_dir, safe_name)
            with open(temp_path, "wb") as f:
                f.write(upload.getvalue())

            # Load each PDF right after writing it: two uploads sharing a
            # name would otherwise overwrite one another before being read.
            documents.extend(PyPDFLoader(temp_path).load())

    # The parsed documents live in memory, so the temporary files are no
    # longer needed from here on.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Fail with a clear message instead of an opaque error from FAISS
        # when the PDFs yield nothing to index (e.g. image-only scans).
        raise ValueError("No extractable text was found in the uploaded PDFs.")

    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    return True
46
+
47
def main():
    """Render the Streamlit page for turning uploaded PDFs into a FAISS index.

    Shows a multi-file PDF uploader and a "Create Vector Database" button.
    When the button is pressed with at least one upload, the files are handed
    to ``create_vector_db`` and the outcome is reported on the page. Pressing
    the button without any upload shows a warning instead of doing nothing.
    """
    st.title("PDF to Vector Database Converter")

    uploaded_files = st.file_uploader(
        "Upload PDF files",
        type=['pdf'],
        accept_multiple_files=True,
    )

    if not st.button("Create Vector Database"):
        return

    if not uploaded_files:
        # Give feedback rather than silently ignoring the click.
        st.warning("Please upload at least one PDF file first.")
        return

    with st.spinner("Creating vector database..."):
        try:
            success = create_vector_db(uploaded_files)
        except Exception as e:
            # UI boundary: surface any failure to the user instead of
            # letting the script crash with a traceback.
            st.error(f"An error occurred: {e}")
        else:
            if success:
                st.success("Vector database created successfully!")


if __name__ == "__main__":
    main()
vectorstore/db_faiss/requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pypdf
2
+ langchain
3
+ torch
4
+ accelerate
5
+ bitsandbytes
6
+ ctransformers
7
+ sentence_transformers
8
+ faiss_cpu
9
+ chainlit
10
+ huggingface_hub
11
+ langchain_community