GovindRaj committed on
Commit
29d7f82
·
verified ·
1 Parent(s): c5fbeef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -18
app.py CHANGED
@@ -5,9 +5,12 @@ from langchain_community.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  import os
7
  import tempfile
 
8
 
9
- DB_FAISS_PATH = './vectorstore/db_faiss'
 
10
 
 
11
  def create_vector_db(uploaded_files):
12
  # Create a temporary directory
13
  with tempfile.TemporaryDirectory() as temp_dir:
@@ -26,7 +29,7 @@ def create_vector_db(uploaded_files):
26
  loader = PyPDFLoader(pdf_path)
27
  documents.extend(loader.load())
28
 
29
- # Split documents
30
  text_splitter = RecursiveCharacterTextSplitter(
31
  chunk_size=500,
32
  chunk_overlap=50
@@ -39,24 +42,30 @@ def create_vector_db(uploaded_files):
39
  model_kwargs={'device': 'cpu'}
40
  )
41
 
42
- # Create the vectorstore directory if it doesn't exist
43
- if not os.path.exists(DB_FAISS_PATH):
44
- try:
45
- os.makedirs(DB_FAISS_PATH)
46
- print(f"Created directory {DB_FAISS_PATH}")
47
- except Exception as e:
48
- print(f"Error creating directory: {e}")
49
-
50
- # Create and save FAISS database
51
  db = FAISS.from_documents(texts, embeddings)
52
- try:
53
- db.save_local(DB_FAISS_PATH)
54
- print(f"FAISS database saved to {DB_FAISS_PATH}")
55
- except Exception as e:
56
- print(f"Error saving FAISS database: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  return True
59
 
 
60
  def main():
61
  st.title("PDF to Vector Database Converter")
62
 
@@ -71,9 +80,9 @@ def main():
71
  try:
72
  success = create_vector_db(uploaded_files)
73
  if success:
74
- st.success("Vector database created successfully!")
75
  except Exception as e:
76
  st.error(f"An error occurred: {str(e)}")
77
 
78
  if __name__ == "__main__":
79
- main()
 
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  import os
7
  import tempfile
8
+ from huggingface_hub import HfApi, HfFolder
9
 
10
+ DB_FAISS_PATH = 'vectorstore/db_faiss'
11
+ DATASET_REPO = "GovindRaj/faiss-vectorstore" # Your Hugging Face Dataset ID
12
 
13
+ # Function to create FAISS vector DB and upload to Hugging Face
14
  def create_vector_db(uploaded_files):
15
  # Create a temporary directory
16
  with tempfile.TemporaryDirectory() as temp_dir:
 
29
  loader = PyPDFLoader(pdf_path)
30
  documents.extend(loader.load())
31
 
32
+ # Split documents into chunks
33
  text_splitter = RecursiveCharacterTextSplitter(
34
  chunk_size=500,
35
  chunk_overlap=50
 
42
  model_kwargs={'device': 'cpu'}
43
  )
44
 
45
+ # Create and save FAISS database locally
 
 
 
 
 
 
 
 
46
  db = FAISS.from_documents(texts, embeddings)
47
+ db.save_local(DB_FAISS_PATH)
48
+
49
+ # Retrieve the token from environment variables (Hugging Face Secrets)
50
+ hf_token = os.getenv("HF_TOKEN")
51
+
52
+ if not hf_token:
53
+ raise ValueError("Hugging Face token not found. Please set the token in Hugging Face secrets.")
54
+
55
+ # Push the vector database to Hugging Face Dataset
56
+ HfFolder.save_token(hf_token)
57
+ api = HfApi()
58
+ api.upload_folder(
59
+ folder_path=DB_FAISS_PATH, # Local path to the FAISS folder
60
+ path_in_repo="faiss_data", # Save under the faiss_data/ folder of the dataset repo
61
+ repo_id=DATASET_REPO, # Hugging Face Dataset ID
62
+ repo_type="dataset", # Specify this is a dataset
63
+ token=hf_token # Use the token from secrets
64
+ )
65
 
66
  return True
67
 
68
+ # Streamlit app
69
  def main():
70
  st.title("PDF to Vector Database Converter")
71
 
 
80
  try:
81
  success = create_vector_db(uploaded_files)
82
  if success:
83
+ st.success("Vector database created and uploaded successfully!")
84
  except Exception as e:
85
  st.error(f"An error occurred: {str(e)}")
86
 
87
  if __name__ == "__main__":
88
+ main()