# Source: Hugging Face Space (status: Sleeping)
# File size: 4,887 Bytes
# Revisions: 496bce6 3de15ee 496bce6
import streamlit as st
import os
import time
import numpy as np
import pandas as pd
def add_custom_css():
    """Inject the page's custom stylesheet into the Streamlit app.

    The stylesheet defines the ``container``, ``big-font`` and
    ``progress-bar`` CSS classes and is emitted through ``st.markdown``
    with raw HTML enabled.
    """
    stylesheet = """
<style>
.container {
text-align: center;
background-color: #f0f0f0;
padding: 20px;
}
.big-font {
font-size: 50px;
color: #4CAF50;
}
.progress-bar {
margin-top: 20px;
}
</style>
"""
    # unsafe_allow_html is required, otherwise the <style> tag is escaped.
    st.markdown(stylesheet, unsafe_allow_html=True)
# Install the third-party dependencies once per Streamlit session; the
# 'packages_installed' flag in session state prevents repeating this.
if 'packages_installed' not in st.session_state:
    import subprocess
    import sys

    st.info("Installing required packages...")
    # Invoke pip through the running interpreter so the packages land in the
    # environment that will import them, and pass an argument list instead of
    # a shell string.
    for pip_args in (
        ["install", "-U", "sentence-transformers"],
        ["install", "pinecone-client"],
    ):
        completed = subprocess.run(
            [sys.executable, "-m", "pip", *pip_args],
            check=False,
        )
        # Best effort, as before: a failed install is reported but does not
        # abort, because the package may already be present.
        if completed.returncode != 0:
            st.warning(
                f"pip {' '.join(pip_args)} exited with code {completed.returncode}"
            )
    st.session_state['packages_installed'] = True
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec, PodSpec
# --- Pinecone client, embedding model and index bootstrap -------------------
# The settings below are bound unconditionally so that every later statement
# in this section can rely on them, even when the client itself is reused from
# session state and its creation branch is skipped.
use_serverless = False
index_name = 'dataset'
# Vector dimension passed to create_index; presumably the embedding size of
# the 'intfloat/e5-small' model loaded below -- confirm before changing either.
dimensions = 384

# Configure Pinecone client
# NOTE(review): the fallback value is an API key committed to source control.
# It is kept only so existing deployments without PINECONE_API_KEY keep
# working; rotate the key and remove the default.
api_key = os.environ.get('PINECONE_API_KEY', '28b0fd5a-fdfb-422d-9a44-c0ec09a25074')
environment = os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter')

if use_serverless:
    spec = ServerlessSpec(cloud='gcp', region='asia-southeast1-gcp')
else:
    spec = PodSpec(environment=environment)

if 'pc' not in st.session_state:
    st.session_state['pc'] = Pinecone(api_key=api_key)

if 'model' not in st.session_state:
    st.session_state['model'] = SentenceTransformer('intfloat/e5-small')

if index_name not in st.session_state.pc.list_indexes().names():
    st.session_state.pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric='cosine',
        spec=spec,
    )
    # Wait until index is ready
    while not st.session_state.pc.describe_index(index_name).status['ready']:
        time.sleep(1)

if 'index' not in st.session_state:
    st.session_state['index'] = st.session_state.pc.Index(index_name)
# Function to process data and insert into Pinecone index
def process_data(data, namespace, chunk_size=1000):
    """Embed the ``Query`` column of *data* and upsert the vectors into Pinecone.

    Rows are processed in consecutive chunks. Each chunk is encoded with the
    model held in ``st.session_state.model`` (normalized embeddings) and
    upserted through ``st.session_state.index``; a Streamlit progress bar is
    advanced after every chunk.

    Args:
        data: DataFrame with a ``Query`` column (text to embed) and an ``id``
            column (used as the vector id).
        namespace: Pinecone namespace the vectors are written to.
        chunk_size: Number of rows encoded and upserted per request.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    total_rows = len(data)
    progress_bar = st.progress(0)
    for chunk_start in range(0, total_rows, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        chunk = data.iloc[chunk_start:chunk_end]
        # Generate embeddings for the current chunk in one batched call
        # instead of one model invocation per row.
        embeddings = st.session_state.model.encode(
            chunk['Query'].tolist(),
            normalize_embeddings=True,
        )
        # Build (id, values) pairs directly; writing a new column onto the
        # iloc slice would modify a copy and trigger SettingWithCopyWarning.
        vectors = [
            (row_id, embedding.tolist())
            for row_id, embedding in zip(chunk['id'], embeddings)
        ]
        # Upsert embeddings
        st.session_state.index.upsert(vectors=vectors, namespace=namespace)
        # Update progress bar (percentage of rows handled so far)
        progress_bar.progress(int((chunk_end / total_rows) * 100))
def load_and_process_data(file):
    """Read an uploaded CSV and make sure its rows are embedded in Pinecone.

    The row index is stored as a string ``id`` column, and the first 15
    characters of the file name become the Pinecone namespace. Embedding is
    done at most once per namespace within a Streamlit session, so reruns
    with the same file do not re-embed it, while a different file uploaded
    later in the same session is still processed.

    Args:
        file: Uploaded file object; it is passed to ``pd.read_csv`` and must
            expose a ``name`` attribute.

    Returns:
        A ``(data, namespace)`` tuple: the loaded DataFrame (with the added
        ``id`` column) and the namespace string.
    """
    data = pd.read_csv(file)
    data['id'] = data.index.astype(str)
    namespace = file.name[:15]  # Use first 15 characters of file name as namespace

    # Track which namespaces were embedded rather than a single boolean, so a
    # second dataset is not silently skipped.
    if 'processed_namespaces' not in st.session_state:
        st.session_state['processed_namespaces'] = set()

    if namespace not in st.session_state['processed_namespaces']:
        process_data(data, namespace)
        st.session_state['processed_namespaces'].add(namespace)
        # Kept for backward compatibility with code that reads the old flag.
        st.session_state['embeddings_done'] = True
    return data, namespace
def main():
    """Render the semantic-search page and handle one Streamlit run.

    Flow:
      1. Inject the custom CSS and page title.
      2. If a CSV is uploaded, load it and embed it via
         ``load_and_process_data``; the DataFrame and namespace are kept in
         session state.
      3. When a namespace is available, read a query, embed it, search the
         Pinecone index and show the ``Answer`` of the best match.
      4. Offer a button that deletes every vector in the current namespace.
    """
    add_custom_css()
    st.markdown("""
<div class='container'>
<h1 class='big-font'>Semantic Search Engine</h1>
</div>
""", unsafe_allow_html=True)

    # Use session state to retain information across interactions
    if 'namespace' not in st.session_state:
        st.session_state.namespace = None
    if 'df' not in st.session_state:
        st.session_state.df = None

    uploaded_file = st.file_uploader("Upload dataset (CSV format)", type=["csv"])
    if uploaded_file is not None:
        st.info("Dataset Processing Started...")
        st.session_state.df, st.session_state.namespace = load_and_process_data(uploaded_file)
        st.info("Dataset Processing Completed...")

    if st.session_state.namespace:
        query = st.text_input("Enter your query about the data (or type 'exit' to quit):")
        # Skip the search while the box is still empty (first render) and
        # when the user typed 'exit'.
        if query.strip() and query.lower() != 'exit':
            vec = st.session_state.model.encode(query)
            result = st.session_state.index.query(
                namespace=st.session_state.namespace,
                vector=vec.tolist(),
                top_k=5,
                include_values=False,
            )
            st.subheader("Query Results:")
            matches = result['matches']
            if matches:
                match_id = matches[0]['id']
                data = st.session_state.df
                answers = data.loc[data['id'] == match_id, 'Answer']
            else:
                answers = None
            # The namespace may be empty, or hold ids that are not in the
            # currently loaded DataFrame; report that instead of raising.
            if answers is None or answers.empty:
                st.write("No matching entry found.")
            else:
                st.write(answers.values[0])

        if st.button("Delete Stored Data"):
            # delete_all is the keyword declared by the Pinecone client's
            # Index.delete signature.
            st.session_state.index.delete(
                delete_all=True,
                namespace=st.session_state.namespace,
            )
            st.stop()


# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()
|