Raghuan committed on
Commit 536d130 · verified · 1 Parent(s): 7ead06e

Upload 2 files

Files changed (2)
  1. app.py +63 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,63 @@
+ # app.py
+
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import pickle
+ import os
+ import docx2txt
+ from PyPDF2 import PdfReader
+
+ st.title("File Upload and Vector Database Creation")
+
+ dataset = st.selectbox("Select Dataset", ["Sales", "Marketing", "HR"])
+ uploaded_file = st.file_uploader("Upload your file", type=["txt", "pdf", "docx"])
+
+ # Extract text from every page of an uploaded PDF
+ def extract_text_from_pdf(file):
+     reader = PdfReader(file)
+     text = ""
+     for page in reader.pages:
+         text += page.extract_text() or ""
+     return text
+
+ if uploaded_file is not None:
+     if uploaded_file.type == "application/pdf":
+         text = extract_text_from_pdf(uploaded_file)
+     elif uploaded_file.type == "text/plain":
+         text = uploaded_file.read().decode("utf-8")
+     elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+         text = docx2txt.process(uploaded_file)
+     else:
+         st.error("Unsupported file type.")
+         st.stop()
+
+     st.write("File uploaded successfully!")
+
+     # Load pre-trained model for embeddings
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     embeddings = model.encode([text])
+
+     # Create or load the existing FAISS index for this dataset
+     dimension = 384  # embedding size of the all-MiniLM-L6-v2 model
+     index_file = f'vector_db_{dataset}.index'
+
+     if os.path.exists(index_file):
+         index = faiss.read_index(index_file)
+     else:
+         index = faiss.IndexFlatL2(dimension)
+
+     # Add embeddings to the index
+     index.add(embeddings)
+
+     # Save the index
+     faiss.write_index(index, index_file)
+
+     # Save metadata (raw texts, in the same order as the index rows)
+     metadata_file = f'metadata_{dataset}.pkl'
+     if os.path.exists(metadata_file):
+         with open(metadata_file, 'rb') as f:
+             metadata = pickle.load(f)
+     else:
+         metadata = []
+
+     metadata.append(text)
+     with open(metadata_file, 'wb') as f:
+         pickle.dump(metadata, f)
+
+     st.write("Vector database updated and saved successfully!")
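app.py only writes the per-dataset index and metadata files; the lines below are a minimal sketch of how a separate script could query them. The query_db.py name, the example dataset, the query string, and the k=3 cutoff are illustrative assumptions, not part of this commit.

# query_db.py (illustrative, not part of this commit): search the files app.py writes
import pickle
import faiss
from sentence_transformers import SentenceTransformer

dataset = "Sales"  # must match a dataset name used in app.py
index = faiss.read_index(f"vector_db_{dataset}.index")
with open(f"metadata_{dataset}.pkl", "rb") as f:
    metadata = pickle.load(f)  # raw texts, aligned with the index rows

model = SentenceTransformer("all-MiniLM-L6-v2")
query_vec = model.encode(["quarterly revenue targets"])  # example query

distances, ids = index.search(query_vec, min(3, index.ntotal))
for dist, i in zip(distances[0], ids[0]):
    print(f"distance={dist:.3f}")
    print(metadata[i][:200])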
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ sentence-transformers
+ faiss-cpu
+ PyPDF2
+ docx2txt
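Note: app.py uses the PdfReader / reader.pages interface, which is available in PyPDF2 2.x and 3.x (the legacy PdfFileReader name is no longer usable in PyPDF2 3.x), and .docx extraction relies on the docx2txt package listed above. If you reproduce this setup, adding a version floor such as PyPDF2>=3.0 to requirements.txt is a reasonable safeguard; that pin is a suggestion, not part of the commit.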