Jayesh13 committed on
Commit
496bce6
·
verified ·
1 Parent(s): a20db0c

Upload Final_app.py

Browse files
Files changed (1) hide show
  1. Final_app.py +150 -0
Final_app.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import time
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
def add_custom_css():
    """Inject the app's custom CSS into the page.

    The rules are emitted as a raw ``<style>`` element through
    ``st.markdown`` with ``unsafe_allow_html=True``. They define the
    ``container``, ``big-font`` and ``progress-bar`` classes.
    """
    stylesheet = """
        <style>
        .container {
            text-align: center;
            background-color: #f0f0f0;
            padding: 20px;
        }
        .big-font {
            font-size: 50px;
            color: #4CAF50;
        }
        .progress-bar {
            margin-top: 20px;
        }
        </style>
    """
    st.markdown(stylesheet, unsafe_allow_html=True)
24
+
25
# ---------------------------------------------------------------------------
# Module-level setup.
#
# Streamlit re-executes this script on every user interaction, so objects that
# are expensive to build (Pinecone client, embedding model, index handle) are
# cached in ``st.session_state`` and only created when missing.
# ---------------------------------------------------------------------------

# Install the third-party dependencies once per session, before importing them.
if 'packages_installed' not in st.session_state:
    st.info("Installing required packages...")
    os.system("pip install -U sentence-transformers")
    os.system("pip install pinecone-client")
    st.session_state['packages_installed'] = True

from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec, PodSpec

# The index spec is rebuilt on every run. It used to be assigned only inside
# the "'pc' not in st.session_state" branch, so a rerun that needed to
# (re)create the index failed with NameError because ``spec`` was undefined.
use_serverless = False
environment = os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter')

if use_serverless:
    spec = ServerlessSpec(cloud='gcp', region='asia-southeast1-gcp')
else:
    spec = PodSpec(environment=environment)

if 'pc' not in st.session_state:
    # Configure Pinecone client
    # NOTE(security): the fallback below is a real credential committed to
    # source control. It is kept only so existing deployments keep working;
    # rotate the key and supply it exclusively through PINECONE_API_KEY.
    api_key = os.environ.get('PINECONE_API_KEY', '28b0fd5a-fdfb-422d-9a44-c0ec09a25074')
    st.session_state['pc'] = Pinecone(api_key=api_key)

if 'model' not in st.session_state:
    st.session_state['model'] = SentenceTransformer('intfloat/e5-small')

index_name = 'dataset'

if index_name not in st.session_state.pc.list_indexes().names():
    # Presumably the output size of the embedding model above; the index
    # dimension must match the vectors that are upserted -- TODO confirm.
    dimensions = 384
    st.session_state.pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric='cosine',
        spec=spec,
    )
    # Wait until index is ready
    while not st.session_state.pc.describe_index(index_name).status['ready']:
        time.sleep(1)

if 'index' not in st.session_state:
    st.session_state['index'] = st.session_state.pc.Index(index_name)
65
+
66
+
67
# Function to process data and insert into Pinecone index
def process_data(data, namespace, batch_size=1000):
    """Embed the ``Query`` column of *data* and upsert the vectors into Pinecone.

    Rows are handled in batches: each batch is encoded with the model stored
    in ``st.session_state.model`` and upserted through
    ``st.session_state.index``. A Streamlit progress bar is advanced after
    every batch.

    Args:
        data: DataFrame providing a ``Query`` column (text to embed) and an
            ``id`` column (used as the vector ids).
        namespace: Pinecone namespace the vectors are written to.
        batch_size: Number of rows encoded and upserted per request. The
            default of 1000 is the chunk size that was previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    progress_bar = st.progress(0)
    total_rows = len(data)

    for chunk_start in range(0, total_rows, batch_size):
        chunk_end = min(chunk_start + batch_size, total_rows)
        chunk = data.iloc[chunk_start:chunk_end]

        # Encode the whole batch in one call instead of one call per row.
        embeddings = st.session_state.model.encode(
            chunk['Query'].tolist(), normalize_embeddings=True
        )

        # Build a concrete list of (id, values) pairs. This avoids writing a
        # new column onto a slice of ``data`` and avoids handing the client a
        # one-shot ``zip`` iterator of numpy arrays.
        vectors = [
            (str(vector_id), embedding.tolist())
            for vector_id, embedding in zip(chunk['id'], embeddings)
        ]
        st.session_state.index.upsert(vectors=vectors, namespace=namespace)

        # Update progress bar
        progress_bar.progress(int((chunk_end / total_rows) * 100))
88
+
89
+
90
+
91
def load_and_process_data(file, max_rows=500):
    """Read an uploaded CSV, assign row ids, and index it once per namespace.

    Args:
        file: File-like object accepted by ``pandas.read_csv`` that also has
            a ``name`` attribute (the uploader's file object in this app).
        max_rows: Maximum number of leading rows to keep. The default of 500
            is the limit that was previously hard-coded.

    Returns:
        A ``(data, namespace)`` tuple: the truncated DataFrame with an added
        string ``id`` column, and the Pinecone namespace derived from the
        file name.
    """
    # ``.copy()`` so that adding the ``id`` column below does not write into
    # a slice of the frame returned by ``read_csv``.
    data = pd.read_csv(file).head(max_rows).copy()
    data['id'] = data.index.astype(str)
    namespace = file.name[:15]  # Use first 15 characters of file name as namespace

    # Remember WHICH namespace has been embedded rather than a bare boolean:
    # with a boolean, a second, different file uploaded in the same session
    # was never embedded and every query against it came back empty.
    if st.session_state.get('embeddings_done') != namespace:
        process_data(data, namespace)
        st.session_state['embeddings_done'] = namespace
    return data, namespace
100
+
101
def main():
    """Render the semantic-search page.

    Flow: inject CSS and the title banner, let the user upload a CSV, index
    it via ``load_and_process_data``, then answer free-text queries by
    looking up the best Pinecone match and showing the ``Answer`` value of
    the matching row. A button deletes all vectors stored in the current
    namespace.
    """
    add_custom_css()

    st.markdown("""
        <div class='container'>
            <h1 class='big-font'>Semantic Search Engine</h1>
        </div>
    """, unsafe_allow_html=True)

    # Use session state to retain information across interactions
    if 'namespace' not in st.session_state:
        st.session_state.namespace = None
    if 'df' not in st.session_state:
        st.session_state.df = None

    uploaded_file = st.file_uploader("Upload dataset (CSV format)", type=["csv"])

    if uploaded_file is not None:
        st.info("Dataset Processing Started...")
        st.session_state.df, st.session_state.namespace = load_and_process_data(uploaded_file)
        st.info("Dataset Processing Completed...")

    if st.session_state.namespace:
        query = st.text_input("Enter your query about the data (or type 'exit' to quit):")

        # Skip the empty string the text box returns before the user types
        # anything; otherwise every render embeds "" and queries the index.
        if query.strip() and query.lower() != 'exit':
            vec = st.session_state.model.encode(query)
            result = st.session_state.index.query(
                namespace=st.session_state.namespace,
                vector=vec.tolist(),
                top_k=5,
                include_values=False,
            )

            st.subheader("Query Results:")
            matches = result['matches']
            if matches:
                # Only the best match is shown.
                match_id = matches[0]['id']
                data = st.session_state.df
                answers = data.loc[data['id'] == match_id, 'Answer']
                if not answers.empty:
                    st.write(answers.values[0])
                else:
                    # The index returned an id that is not in the loaded frame.
                    st.warning("No answer found for the best match.")
            else:
                # E.g. the namespace is empty or was just deleted.
                st.warning("No matching results found.")

        if st.button("Delete Stored Data"):
            # ``delete_all`` is the keyword documented by the Pinecone
            # Python client for wiping a namespace.
            st.session_state.index.delete(delete_all=True, namespace=st.session_state.namespace)
            st.stop()


if __name__ == "__main__":
    main()