Spaces:
Sleeping
Commit · 426b438
1 Parent(s): a0777f9
Update app.py
Adding generate all function
app.py
CHANGED
@@ -13,6 +13,8 @@ from io import StringIO
 import pandas as pd
 from io import BytesIO
 import base64
+import threading
+from queue import Queue
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
@@ -43,7 +45,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
     page_nums = []
     chunks = []
-
+
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
             chunk = words[i:i+word_length]
@@ -57,11 +59,11 @@ def text_to_chunks(texts, word_length=150, start_page=1):
     return chunks
 
 class SemanticSearch:
-
+
     def __init__(self):
         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
         self.fitted = False
-
+
     def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -69,16 +71,16 @@ class SemanticSearch:
         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
         self.nn.fit(self.embeddings)
         self.fitted = True
-
-    def __call__(self, text, return_data=True):
+
+    def call(self, text, return_data=True):
         inp_emb = self.use([text])
         neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-
+
         if return_data:
             return [self.data[i] for i in neighbors]
         else:
             return neighbors
-
+
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):
@@ -109,20 +111,20 @@ def generate_text(openAI_key,prompt, engine="text-davinci-003"):
     return message
 
 def generate_answer(question,openAI_key):
-    topn_chunks = recommender(question)
+    topn_chunks = recommender.call(question)
     prompt = ""
     prompt += 'search results:\n\n'
    for c in topn_chunks:
         prompt += c + '\n\n'
-
-    prompt += "Instructions: Compose a simple reply to the query using the search results given. "
-
-
-
-
-
-
-
+
+    prompt += "Instructions: Compose a simple reply to the query using the search results given. "
+    "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
+    "with the same name, create separate answers for each. Only include information found in the results and "
+    "don't add any additional information. Make sure the answer is correct and don't output false content. "
+    "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
+    "search results which has nothing to do with the question. Only answer what is asked. The "
+    "answer should be short and concise. \n\nQuery: {question}\nAnswer:"
+
     prompt += f"Query: {question}\nAnswer:"
     answer = generate_text(openAI_key, prompt,"text-davinci-003")
     return answer
@@ -143,20 +145,23 @@ paste_data = st.sidebar.button("Paste Data")
 add_row = st.sidebar.button("Add row")
 row_count = st.session_state.get("row_count", 1)
 
+num_concurrent_calls = st.sidebar.number_input("Concurrent Calls:", min_value=1, max_value=2000, value=10, step=1)
+generate_all = st.sidebar.button("Generate All")
+
 if add_row:
     row_count += 1
     st.session_state.row_count = row_count
 
 if paste_data:
     data = StringIO(data_section.strip())
-    reader = csv.reader(data, delimiter='\t', quotechar='"')
+    reader = csv.reader(data, delimiter='\t', quotechar='"')  # Changed delimiter to '\t'
     urls_questions = [row for row in reader]
 
     row_count = len(urls_questions)
     st.session_state.row_count = row_count
 
-    for i, url_question in enumerate(urls_questions):
-        if len(url_question) >= 2:
+    for i, url_question in enumerate(urls_questions):  # Directly iterate over urls_questions
+        if len(url_question) >= 2:
             st.session_state[f"url{i}"] = url_question[0]
             st.session_state[f"question{i}"] = url_question[1]
         else:
@@ -185,19 +190,18 @@ for i in range(row_count):
         glob_url = url
         download_pdf(glob_url, 'corpus.pdf')
         load_recommender('corpus.pdf')
-
+
         answer = generate_answer(question,openAI_key)
         # Store the answer in session state
         st.session_state[f'session_answer{i}'] = answer
-
     with col3:
-        answer_placeholder = st.empty()
-        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
+        answer_placeholder = st.empty()
+        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
 
 def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
     csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
+    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
+    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
     return href
 
 # Create a list of lists containing all URLs, questions, and answers
@@ -217,6 +221,45 @@ def to_csv(data):
 
 def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
     csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
+    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
+    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
     return href
+
+class WorkerThread(threading.Thread):
+    def __init__(self, jobs, results):
+        super().__init__()
+        self.jobs = jobs
+        self.results = results
+
+    def run(self):
+        while True:
+            job = self.jobs.get()
+            if job is None:
+                break
+            i, question = job
+            result = generate_answer(question, openAI_key)
+            self.results.put((i, result))
+
+if generate_all:
+    questions = [st.session_state.get(f"question{i}", "") for i in range(row_count)]
+
+    jobs = Queue()
+    results = Queue()
+
+    workers = [WorkerThread(jobs, results) for _ in range(num_concurrent_calls)]
+
+    for worker in workers:
+        worker.start()
+
+    for i, question in enumerate(questions):
+        jobs.put((i, question))
+
+    for _ in range(num_concurrent_calls):
+        jobs.put(None)
+
+    for worker in workers:
+        worker.join()
+
+    while not results.empty():
+        i, answer = results.get()
+        st.session_state[f'session_answer{i}'] = answer
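The added "Generate All" block is a simple fan-out/fan-in pattern: a jobs queue feeds a fixed pool of worker threads, each worker answers one question at a time, and a results queue carries (index, answer) pairs back for storage. The following is a minimal, self-contained sketch of that same pattern, with a hypothetical fetch_answer standing in for generate_answer(question, openAI_key) so it runs without Streamlit or an OpenAI key; it illustrates the queue-and-sentinel mechanics rather than the app's exact code.

# Minimal sketch of the fan-out pattern added by this commit.
# fetch_answer is a stand-in (assumption) for the real OpenAI-backed call.
import threading
from queue import Queue


def fetch_answer(question):
    # Replace with generate_answer(question, openAI_key) in the real app.
    return f"answer to: {question}"


class WorkerThread(threading.Thread):
    def __init__(self, jobs, results):
        super().__init__()
        self.jobs = jobs
        self.results = results

    def run(self):
        while True:
            job = self.jobs.get()
            if job is None:          # sentinel: no more work for this worker
                break
            i, question = job
            self.results.put((i, fetch_answer(question)))


def generate_all(questions, num_workers=4):
    jobs, results = Queue(), Queue()
    workers = [WorkerThread(jobs, results) for _ in range(num_workers)]
    for w in workers:
        w.start()
    for i, q in enumerate(questions):
        jobs.put((i, q))             # enqueue real work first (FIFO)
    for _ in range(num_workers):
        jobs.put(None)               # one sentinel per worker
    for w in workers:
        w.join()
    answers = [None] * len(questions)
    while not results.empty():       # safe: all workers have finished
        i, a = results.get()
        answers[i] = a
    return answers


if __name__ == "__main__":
    print(generate_all(["What is X?", "What is Y?", "What is Z?"], num_workers=2))

The same fan-out could be written more compactly with concurrent.futures.ThreadPoolExecutor.map, which manages the pool and shutdown without explicit sentinel values; the explicit Queue version above mirrors the structure the commit introduces.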