joshuadunlop committed
Commit 426b438 · Parent: a0777f9

Update app.py

Adding generate all function
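In outline, the new generate-all path fans the questions out to a pool of worker threads: each worker pulls (index, question) jobs from a shared queue, calls generate_answer, and pushes (index, answer) onto a results queue, with one None sentinel per worker to shut the pool down. A minimal, self-contained sketch of that pattern, using a hypothetical fake_generate_answer as a stand-in for the app's generate_answer (which needs the Streamlit state and OpenAI key around it):

    import threading
    from queue import Queue

    def fake_generate_answer(question):
        # Hypothetical stand-in for app.py's generate_answer(question, openAI_key).
        return f"answer to: {question}"

    def worker(jobs, results):
        # Pull (index, question) jobs until a None sentinel arrives.
        while True:
            job = jobs.get()
            if job is None:
                break
            i, question = job
            results.put((i, fake_generate_answer(question)))

    questions = ["What is X?", "What is Y?", "What is Z?"]
    num_workers = 2
    jobs, results = Queue(), Queue()

    threads = [threading.Thread(target=worker, args=(jobs, results))
               for _ in range(num_workers)]
    for t in threads:
        t.start()
    for job in enumerate(questions):
        jobs.put(job)
    for _ in range(num_workers):
        jobs.put(None)  # one sentinel per worker so every thread exits
    for t in threads:
        t.join()

    answers = dict(results.get() for _ in range(len(questions)))
    print([answers[i] for i in range(len(questions))])

Draining the results queue only after join() and keying answers by row index keeps the output ordered no matter which worker finishes first.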

Files changed (1)
  1. app.py +71 -28
app.py CHANGED
@@ -13,6 +13,8 @@ from io import StringIO
 import pandas as pd
 from io import BytesIO
 import base64
+import threading
+from queue import Queue
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
@@ -43,7 +45,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
     page_nums = []
     chunks = []
-
+
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
             chunk = words[i:i+word_length]
@@ -57,11 +59,11 @@ def text_to_chunks(texts, word_length=150, start_page=1):
     return chunks
 
 class SemanticSearch:
-
+
     def __init__(self):
         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
         self.fitted = False
-
+
     def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -69,16 +71,16 @@ class SemanticSearch:
         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
         self.nn.fit(self.embeddings)
         self.fitted = True
-
-    def __call__(self, text, return_data=True):
+
+    def call(self, text, return_data=True):
         inp_emb = self.use([text])
         neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-
+
         if return_data:
             return [self.data[i] for i in neighbors]
         else:
             return neighbors
-
+
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):
@@ -109,20 +111,20 @@ def generate_text(openAI_key,prompt, engine="text-davinci-003"):
     return message
 
 def generate_answer(question,openAI_key):
-    topn_chunks = recommender(question)
+    topn_chunks = recommender.call(question)
    prompt = ""
    prompt += 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'
-
-    prompt += "Instructions: Compose a simple reply to the query using the search results given. "\
-    "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
-    "with the same name, create separate answers for each. Only include information found in the results and "\
-    "don't add any additional information. Make sure the answer is correct and don't output false content. "\
-    "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
-    "search results which has nothing to do with the question. Only answer what is asked. The "\
-    "answer should be short and concise. \n\nQuery: {question}\nAnswer:"
-
+
+    prompt += "Instructions: Compose a simple reply to the query using the search results given. "\
+    "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
+    "with the same name, create separate answers for each. Only include information found in the results and "\
+    "don't add any additional information. Make sure the answer is correct and don't output false content. "\
+    "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
+    "search results which has nothing to do with the question. Only answer what is asked. The "\
+    "answer should be short and concise. \n\nQuery: {question}\nAnswer:"
+
    prompt += f"Query: {question}\nAnswer:"
    answer = generate_text(openAI_key, prompt,"text-davinci-003")
    return answer
@@ -143,20 +145,23 @@ paste_data = st.sidebar.button("Paste Data")
 add_row = st.sidebar.button("Add row")
 row_count = st.session_state.get("row_count", 1)
 
+num_concurrent_calls = st.sidebar.number_input("Concurrent Calls:", min_value=1, max_value=2000, value=10, step=1)
+generate_all = st.sidebar.button("Generate All")
+
 if add_row:
     row_count += 1
     st.session_state.row_count = row_count
 
 if paste_data:
     data = StringIO(data_section.strip())
-    reader = csv.reader(data, delimiter='\t', quotechar='"')  # Changed delimiter to '\t'
+    reader = csv.reader(data, delimiter='\t', quotechar='"')  # Changed delimiter to '\t'
     urls_questions = [row for row in reader]
 
     row_count = len(urls_questions)
     st.session_state.row_count = row_count
 
-    for i, url_question in enumerate(urls_questions):  # Directly iterate over urls_questions
-        if len(url_question) >= 2:
+    for i, url_question in enumerate(urls_questions):  # Directly iterate over urls_questions
+        if len(url_question) >= 2:
             st.session_state[f"url{i}"] = url_question[0]
             st.session_state[f"question{i}"] = url_question[1]
         else:
@@ -185,19 +190,18 @@ for i in range(row_count):
             glob_url = url
             download_pdf(glob_url, 'corpus.pdf')
             load_recommender('corpus.pdf')
-
+
             answer = generate_answer(question,openAI_key)
             # Store the answer in session state
             st.session_state[f'session_answer{i}'] = answer
-
     with col3:
-        answer_placeholder = st.empty()
-        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
+        answer_placeholder = st.empty()
+        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
 
 def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
     csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
+    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
+    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
     return href
 
 # Create a list of lists containing all URLs, questions, and answers
@@ -217,6 +221,45 @@ def to_csv(data):
 
 def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
     csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
+    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
+    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
     return href
+
+class WorkerThread(threading.Thread):
+    def __init__(self, jobs, results):
+        super().__init__()
+        self.jobs = jobs
+        self.results = results
+
+    def run(self):
+        while True:
+            job = self.jobs.get()
+            if job is None:
+                break
+            i, question = job
+            result = generate_answer(question, openAI_key)
+            self.results.put((i, result))
+
+if generate_all:
+    questions = [st.session_state.get(f"question{i}", "") for i in range(row_count)]
+
+    jobs = Queue()
+    results = Queue()
+
+    workers = [WorkerThread(jobs, results) for _ in range(num_concurrent_calls)]
+
+    for worker in workers:
+        worker.start()
+
+    for i, question in enumerate(questions):
+        jobs.put((i, question))
+
+    for _ in range(num_concurrent_calls):
+        jobs.put(None)
+
+    for worker in workers:
+        worker.join()
+
+    while not results.empty():
+        i, answer = results.get()
+        st.session_state[f'session_answer{i}'] = answer
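For context, get_table_download_link returns a raw HTML anchor, so the app presumably renders it with st.markdown and unsafe_allow_html=True; the rendering call is not visible in this diff, so treat that as an assumption. A short usage sketch with made-up rows:

    import pandas as pd
    import streamlit as st

    # Hypothetical frame; app.py assembles it from the URL/question/answer columns.
    df = pd.DataFrame({
        "url": ["https://example.com/a.pdf"],
        "question": ["What is X?"],
        "answer": ["..."],
    })

    # get_table_download_link (defined above) returns raw HTML, hence unsafe_allow_html.
    st.markdown(get_table_download_link(df), unsafe_allow_html=True)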