Raghuan's picture
Update app.py
538c882 verified
import hashlib
import os
import pickle

import faiss
import streamlit as st
from PyPDF2 import PdfFileReader
from sentence_transformers import SentenceTransformer
# Page header and the two input widgets whose values the rest of the script reads.
st.title("File Upload and Vector Database Creation")
dataset = st.selectbox(
    label="Select Dataset",
    options=["Sales", "Marketing", "HR"],
)
uploaded_file = st.file_uploader(
    label="Upload your file",
    type=["txt", "pdf", "docx"],
)
# Helper: pull the text out of an uploaded PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Object accepted by ``PdfFileReader`` (the caller in this file
            passes the Streamlit upload directly).

    Returns:
        A single string made of each page's ``extract_text()`` result joined
        with no separator.

    NOTE(review): ``PdfFileReader`` / ``getNumPages`` / ``getPage`` are the
    legacy camelCase PyPDF2 names -- confirm the pinned PyPDF2 version still
    provides them.
    """
    pdf = PdfFileReader(file)
    page_texts = [
        pdf.getPage(page_number).extract_text()
        for page_number in range(pdf.getNumPages())
    ]
    return "".join(page_texts)
def _read_uploaded_text(upload):
    """Return the text content of a Streamlit upload.

    Args:
        upload: The object returned by ``st.file_uploader`` (must not be None).

    Returns:
        The extracted text for PDF and plain-text uploads, or ``None`` when the
        upload's MIME type has no extractor in this script.

    Raises:
        UnicodeDecodeError: If a plain-text upload is not valid UTF-8.
    """
    if upload.type == "application/pdf":
        # Rewind first so the parser always starts at the beginning of the
        # stream, even if something has already read from it.
        upload.seek(0)
        return extract_text_from_pdf(upload)
    if upload.type == "text/plain":
        # getvalue() returns the whole buffer regardless of stream position.
        return upload.getvalue().decode("utf-8")
    return None


if uploaded_file is not None:
    text = _read_uploaded_text(uploaded_file)
    if text is None:
        # Without this guard an unhandled type left `text` unbound and the
        # script crashed with NameError at the embedding step.
        st.error(
            f"Unsupported file type: {uploaded_file.type or 'unknown'}. "
            "Please upload a .txt or .pdf file."
        )
        st.stop()
    if not text.strip():
        # Nothing meaningful to embed (e.g. a scanned PDF with no text layer).
        st.error("No text could be extracted from the uploaded file.")
        st.stop()
    st.write("File uploaded successfully!")

    index_file = f'vector_db_{dataset}.index'
    metadata_file = f'metadata_{dataset}.pkl'

    # Streamlit re-executes this script on every widget interaction, including
    # a click on the download buttons below. Remember which (dataset, content)
    # pairs were already indexed in this session so a rerun does not append the
    # same document to the index and metadata again.
    fingerprint = (dataset, hashlib.sha256(uploaded_file.getvalue()).hexdigest())
    if "indexed_uploads" not in st.session_state:
        st.session_state["indexed_uploads"] = set()
    indexed_uploads = st.session_state["indexed_uploads"]

    if fingerprint in indexed_uploads:
        st.write("This file has already been added to the vector database in this session.")
    else:
        # Load pre-trained model for embeddings
        model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = model.encode([text])

        # Create or load existing FAISS index. A new index takes its
        # dimension from the embeddings instead of a hard-coded constant.
        if os.path.exists(index_file):
            index = faiss.read_index(index_file)
        else:
            index = faiss.IndexFlatL2(embeddings.shape[1])

        # Add embeddings to the index and persist it
        index.add(embeddings)
        faiss.write_index(index, index_file)

        # Save metadata: a list of source texts, appended in the same order
        # as the vectors are added to the index.
        # NOTE: pickle.load can execute arbitrary code from the file; this is
        # only safe while the metadata file is written exclusively by this app.
        if os.path.exists(metadata_file):
            with open(metadata_file, 'rb') as f:
                metadata = pickle.load(f)
        else:
            metadata = []
        metadata.append(text)
        with open(metadata_file, 'wb') as f:
            pickle.dump(metadata, f)

        indexed_uploads.add(fingerprint)
        st.write("Vector database updated and saved successfully!")

    # Offer the vector database file and the metadata file for download
    for download_path in (index_file, metadata_file):
        with open(download_path, 'rb') as f:
            st.download_button(
                label=f"Download {download_path}",
                data=f,
                file_name=download_path
            )