Raghuan committed on
Commit 538c882 · verified · 1 Parent(s): 536d130

Update app.py

Files changed (1)
  1. app.py +76 -63
app.py CHANGED
@@ -1,63 +1,76 @@
- # app1.py
-
- import streamlit as st
- from sentence_transformers import SentenceTransformer
- import faiss
- import pickle
- import os
- from PyPDF2 import PdfFileReader
-
- st.title("File Upload and Vector Database Creation")
-
- dataset = st.selectbox("Select Dataset", ["Sales", "Marketing", "HR"])
- uploaded_file = st.file_uploader("Upload your file", type=["txt", "pdf", "docx"])
-
- # Function to extract text from PDF
- def extract_text_from_pdf(file):
-     reader = PdfFileReader(file)
-     text = ""
-     for page in range(reader.getNumPages()):
-         text += reader.getPage(page).extract_text()
-     return text
-
- if uploaded_file is not None:
-     if uploaded_file.type == "application/pdf":
-         text = extract_text_from_pdf(uploaded_file)
-     elif uploaded_file.type == "text/plain":
-         text = str(uploaded_file.read(), "utf-8")
-     # Additional file types can be added as needed
-
-     st.write("File uploaded successfully!")
-
-     # Load pre-trained model for embeddings
-     model = SentenceTransformer('all-MiniLM-L6-v2')
-     embeddings = model.encode([text])
-
-     # Create or load existing FAISS index
-     dimension = 384  # Example dimension size for the MiniLM model
-     index_file = f'vector_db_{dataset}.index'
-
-     if os.path.exists(index_file):
-         index = faiss.read_index(index_file)
-     else:
-         index = faiss.IndexFlatL2(dimension)
-
-     # Add embeddings to the index
-     index.add(embeddings)
-
-     # Save the index
-     faiss.write_index(index, index_file)
-
-     # Save metadata
-     metadata_file = f'metadata_{dataset}.pkl'
-     if os.path.exists(metadata_file):
-         with open(metadata_file, 'rb') as f:
-             metadata = pickle.load(f)
-     else:
-         metadata = []
-
-     metadata.append(text)
-     with open(metadata_file, 'wb') as f:
-         pickle.dump(metadata, f)
-
-     st.write("Vector database updated and saved successfully!")
+ import streamlit as st
+ import faiss
+ import os
+ from PyPDF2 import PdfFileReader
+ from sentence_transformers import SentenceTransformer
+ import pickle
+
+ st.title("File Upload and Vector Database Creation")
+
+ dataset = st.selectbox("Select Dataset", ["Sales", "Marketing", "HR"])
+ uploaded_file = st.file_uploader("Upload your file", type=["txt", "pdf", "docx"])
+
+ # Function to extract text from PDF
+ def extract_text_from_pdf(file):
+     reader = PdfFileReader(file)
+     text = ""
+     for page in range(reader.getNumPages()):
+         text += reader.getPage(page).extract_text()
+     return text
+
+ if uploaded_file is not None:
+     if uploaded_file.type == "application/pdf":
+         text = extract_text_from_pdf(uploaded_file)
+     elif uploaded_file.type == "text/plain":
+         text = str(uploaded_file.read(), "utf-8")
+
+     st.write("File uploaded successfully!")
+
+     # Load pre-trained model for embeddings
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     embeddings = model.encode([text])
+
+     # Create or load existing FAISS index
+     dimension = 384  # Example dimension size for the MiniLM model
+     index_file = f'vector_db_{dataset}.index'
+
+     if os.path.exists(index_file):
+         index = faiss.read_index(index_file)
+     else:
+         index = faiss.IndexFlatL2(dimension)
+
+     # Add embeddings to the index
+     index.add(embeddings)
+
+     # Save the index
+     faiss.write_index(index, index_file)
+
+     # Save metadata
+     metadata_file = f'metadata_{dataset}.pkl'
+     if os.path.exists(metadata_file):
+         with open(metadata_file, 'rb') as f:
+             metadata = pickle.load(f)
+     else:
+         metadata = []
+
+     metadata.append(text)
+     with open(metadata_file, 'wb') as f:
+         pickle.dump(metadata, f)
+
+     st.write("Vector database updated and saved successfully!")
+
+     # Option to download the vector database file
+     with open(index_file, 'rb') as f:
+         st.download_button(
+             label=f"Download {index_file}",
+             data=f,
+             file_name=index_file
+         )
+
+     # Option to download the metadata file
+     with open(metadata_file, 'rb') as f:
+         st.download_button(
+             label=f"Download {metadata_file}",
+             data=f,
+             file_name=metadata_file
+         )
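
A note on the PDF helper carried over in this commit: it mixes PyPDF2's legacy reader interface (PdfFileReader, getNumPages(), getPage()) with the newer extract_text() method. That combination runs on PyPDF2 2.x but fails on PyPDF2 3.x, where the legacy names were removed. A minimal sketch of the same helper against the current PdfReader API (an assumption, not part of this commit) would be:

# Sketch only: the same extraction helper on the PyPDF2 3.x / pypdf API,
# where PdfReader replaces the removed PdfFileReader and pages are iterated directly.
from PyPDF2 import PdfReader

def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ""
    return text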
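
The commit only covers ingestion and download; nothing in app.py reads the index back for retrieval. As an illustration of how the two saved artifacts fit together, the sketch below (not part of the commit; the dataset name, query text, and k are arbitrary choices) loads vector_db_<dataset>.index and metadata_<dataset>.pkl and prints the stored texts nearest to a query. It relies on IndexFlatL2 assigning sequential ids, so positions in the metadata list line up with vector ids in the index.

# Sketch only: querying the index and metadata files written by app.py.
import pickle
import faiss
from sentence_transformers import SentenceTransformer

dataset = "Sales"  # any of the options offered in the selectbox
index = faiss.read_index(f"vector_db_{dataset}.index")
with open(f"metadata_{dataset}.pkl", "rb") as f:
    metadata = pickle.load(f)  # list of raw document texts, parallel to the index ids

model = SentenceTransformer("all-MiniLM-L6-v2")  # must match the model used at ingest time
query_embedding = model.encode(["What was last quarter's revenue?"])  # hypothetical query

k = min(3, index.ntotal)  # number of neighbours to return
distances, indices = index.search(query_embedding, k)
for rank, idx in enumerate(indices[0]):
    print(f"#{rank + 1} (L2 distance {distances[0][rank]:.3f})")
    print(metadata[idx][:200])  # preview of the stored text

Because the index dimension is hard-coded to 384, retrieval only works if the same embedding model is used on both sides; model.get_sentence_embedding_dimension() could replace the literal to keep the two in sync.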