# Spaces: Runtime error
# File size: 2,266 Bytes
# 538c882
import os
import pickle

import faiss
import streamlit as st
from PyPDF2 import PdfFileReader, PdfReader
from sentence_transformers import SentenceTransformer
# Page title plus the two input widgets whose values (`dataset`,
# `uploaded_file`) the rest of the script reads.
DATASET_CHOICES = ["Sales", "Marketing", "HR"]
ACCEPTED_UPLOAD_TYPES = ["txt", "pdf", "docx"]

st.title("File Upload and Vector Database Creation")
dataset = st.selectbox(label="Select Dataset", options=DATASET_CHOICES)
uploaded_file = st.file_uploader(
    label="Upload your file",
    type=ACCEPTED_UPLOAD_TYPES,
)
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: A path or binary file-like object that ``PyPDF2.PdfReader``
            can open (the script passes the Streamlit upload directly).

    Returns:
        One string holding the extracted text of all pages. Pages that
        yield no text contribute nothing.
    """
    # PdfFileReader / getNumPages / getPage are the deprecated PyPDF2 names
    # and raise DeprecationError from PyPDF2 3.0 on; PdfReader and ``.pages``
    # are their replacements.
    reader = PdfReader(file)
    # ``or ""`` keeps the join safe if a page's extraction returns nothing.
    return "".join(page.extract_text() or "" for page in reader.pages)
def _read_uploaded_text(upload):
    """Return the text of a Streamlit upload, or None if it cannot be used.

    PDFs are read with ``extract_text_from_pdf``; plain-text files are
    decoded as UTF-8. Every failure (unsupported MIME type, undecodable
    text, no text at all) is reported in the UI instead of raising, so the
    caller only has to check for None.
    """
    if upload.type == "application/pdf":
        text = extract_text_from_pdf(upload)
    elif upload.type == "text/plain":
        try:
            # getvalue() does not depend on the current read position, so
            # it still returns the full content on a script rerun.
            text = upload.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            st.error("The text file is not valid UTF-8 and could not be read.")
            return None
    else:
        # The uploader also offers "docx", but nothing here can parse it;
        # without this branch `text` would be unbound further down.
        st.error(f"Unsupported file type: {upload.type or 'unknown'}")
        return None

    if not text.strip():
        st.warning("No text could be extracted from the uploaded file.")
        return None
    return text


def _append_to_vector_db(dataset, text):
    """Embed ``text`` and append it to the dataset's FAISS index and metadata.

    Args:
        dataset: Dataset name; selects ``vector_db_<dataset>.index`` and
            ``metadata_<dataset>.pkl`` in the working directory.
        text: Document text, embedded as a single vector.

    Returns:
        ``(index_file, metadata_file)`` on success, or None when an existing
        index has a different dimension than the new embedding (an error is
        shown in the UI in that case).
    """
    # Load pre-trained model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([text])

    index_file = f'vector_db_{dataset}.index'
    metadata_file = f'metadata_{dataset}.pkl'

    # Take the dimension from the embedding itself rather than hard-coding
    # the MiniLM value, and refuse to mix dimensions in one index.
    dimension = embeddings.shape[1]
    if os.path.exists(index_file):
        index = faiss.read_index(index_file)
        if index.d != dimension:
            st.error(
                f"Existing index {index_file} has dimension {index.d}, "
                f"but the embedding has dimension {dimension}."
            )
            return None
    else:
        index = faiss.IndexFlatL2(dimension)

    # Load the metadata BEFORE writing anything, so an unreadable metadata
    # file cannot leave the index one entry ahead of it.
    # NOTE: pickle.load executes code embedded in the file; this path must
    # only ever point at files written by this app, never at user uploads.
    if os.path.exists(metadata_file):
        with open(metadata_file, 'rb') as f:
            metadata = pickle.load(f)
    else:
        metadata = []

    index.add(embeddings)
    metadata.append(text)

    faiss.write_index(index, index_file)
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)
    return index_file, metadata_file


# Main flow: ingest the uploaded file into the selected dataset's vector
# database, then offer the resulting files for download.
if uploaded_file is not None:
    # Streamlit reruns the whole script on every widget interaction,
    # including a click on a download button. Remember which uploads were
    # already ingested in this session so a rerun does not append the same
    # document to the index and metadata again.
    if "ingested_uploads" not in st.session_state:
        st.session_state["ingested_uploads"] = {}
    ingested_uploads = st.session_state["ingested_uploads"]
    upload_key = (dataset, uploaded_file.name, uploaded_file.size)

    if upload_key not in ingested_uploads:
        text = _read_uploaded_text(uploaded_file)
        if text is not None:
            st.write("File uploaded successfully!")
            db_files = _append_to_vector_db(dataset, text)
            if db_files is not None:
                ingested_uploads[upload_key] = db_files

    if upload_key in ingested_uploads:
        st.write("Vector database updated and saved successfully!")
        # Option to download the vector database file and the metadata file
        for db_path in ingested_uploads[upload_key]:
            with open(db_path, 'rb') as f:
                st.download_button(
                    label=f"Download {db_path}",
                    data=f.read(),
                    file_name=db_path
                )