import streamlit as st import os import time import numpy as np import pandas as pd def add_custom_css(): st.markdown(""" """, unsafe_allow_html=True) if 'packages_installed' not in st.session_state: st.info("Installing required packages...") os.system("pip install -U sentence-transformers") os.system("pip install pinecone-client") st.session_state['packages_installed'] = True from sentence_transformers import SentenceTransformer from pinecone import Pinecone, ServerlessSpec, PodSpec if 'pc' not in st.session_state: use_serverless = False # Configure Pinecone client api_key = os.environ.get('PINECONE_API_KEY', '28b0fd5a-fdfb-422d-9a44-c0ec09a25074') environment = os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter') st.session_state['pc'] = Pinecone(api_key=api_key) if use_serverless: spec = ServerlessSpec(cloud='gcp', region='asia-southeast1-gcp') else: spec = PodSpec(environment=environment) if 'model' not in st.session_state: st.session_state['model'] = SentenceTransformer('intfloat/e5-small') index_name = 'dataset' if index_name not in st.session_state.pc.list_indexes().names(): dimensions = 384 st.session_state.pc.create_index( name=index_name, dimension=dimensions, metric='cosine', spec=spec ) # Wait until index is ready while not st.session_state.pc.describe_index(index_name).status['ready']: time.sleep(1) if 'index' not in st.session_state: st.session_state['index'] = st.session_state.pc.Index(index_name) # Function to process data and insert into Pinecone index def process_data(data, namespace): input_texts = data['Query'] progress_bar = st.progress(0) total_chunks = len(data) // 1000 + 1 for chunk_start in range(0, len(data), 1000): chunk_end = min(chunk_start + 1000, len(data)) chunk = data.iloc[chunk_start:chunk_end] # Generate embeddings for the current chunk chunk_embeddings = [st.session_state.model.encode(query, normalize_embeddings=True) for query in chunk['Query']] chunk['embedding'] = chunk_embeddings # Upsert embeddings st.session_state.index.upsert(vectors=zip(chunk['id'], chunk['embedding']), namespace=namespace) # Update progress bar progress = (chunk_end / len(data)) * 100 progress_bar.progress(int(progress)) def load_and_process_data(file): data = pd.read_csv(file) data['id'] = data.index.astype(str) namespace = file.name[:15] # Use first 15 characters of file name as namespace if 'embeddings_done' not in st.session_state: process_data(data, namespace) st.session_state['embeddings_done'] = True return data, namespace def main(): add_custom_css() st.markdown("""