Spaces:
Sleeping
Commit · 426b438
1 Parent(s): a0777f9
Update app.py
Adding generate all function
app.py
CHANGED
@@ -13,6 +13,8 @@ from io import StringIO
 import pandas as pd
 from io import BytesIO
 import base64
+import threading
+from queue import Queue
 
 def download_pdf(url, output_path):
     urllib.request.urlretrieve(url, output_path)
@@ -43,7 +45,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
     page_nums = []
     chunks = []
-
+
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
             chunk = words[i:i+word_length]
@@ -57,11 +59,11 @@ def text_to_chunks(texts, word_length=150, start_page=1):
     return chunks
 
 class SemanticSearch:
-
+
     def __init__(self):
         self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
         self.fitted = False
-
+
     def fit(self, data, batch=1000, n_neighbors=5):
         self.data = data
         self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -69,16 +71,16 @@ class SemanticSearch:
         self.nn = NearestNeighbors(n_neighbors=n_neighbors)
         self.nn.fit(self.embeddings)
         self.fitted = True
-
-    def __call__(self, text, return_data=True):
+
+    def call(self, text, return_data=True):
         inp_emb = self.use([text])
         neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-
+
         if return_data:
             return [self.data[i] for i in neighbors]
         else:
             return neighbors
-
+
     def get_text_embedding(self, texts, batch=1000):
         embeddings = []
         for i in range(0, len(texts), batch):
@@ -109,20 +111,20 @@ def generate_text(openAI_key,prompt, engine="text-davinci-003"):
     return message
 
 def generate_answer(question,openAI_key):
-    topn_chunks = recommender(question)
+    topn_chunks = recommender.call(question)
     prompt = ""
     prompt += 'search results:\n\n'
    for c in topn_chunks:
         prompt += c + '\n\n'
-
-    prompt += "Instructions: Compose a simple reply to the query using the search results given. "
-
-
-
-
-
-
-
+
+    prompt += "Instructions: Compose a simple reply to the query using the search results given. "
+    "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
+    "with the same name, create separate answers for each. Only include information found in the results and "
+    "don't add any additional information. Make sure the answer is correct and don't output false content. "
+    "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
+    "search results which has nothing to do with the question. Only answer what is asked. The "
+    "answer should be short and concise. \n\nQuery: {question}\nAnswer:"
+
     prompt += f"Query: {question}\nAnswer:"
     answer = generate_text(openAI_key, prompt,"text-davinci-003")
     return answer
@@ -143,20 +145,23 @@ paste_data = st.sidebar.button("Paste Data")
 add_row = st.sidebar.button("Add row")
 row_count = st.session_state.get("row_count", 1)
 
+num_concurrent_calls = st.sidebar.number_input("Concurrent Calls:", min_value=1, max_value=2000, value=10, step=1)
+generate_all = st.sidebar.button("Generate All")
+
 if add_row:
     row_count += 1
     st.session_state.row_count = row_count
 
 if paste_data:
     data = StringIO(data_section.strip())
-    reader = csv.reader(data, delimiter='\t', quotechar='"')
+    reader = csv.reader(data, delimiter='\t', quotechar='"')  # Changed delimiter to '\t'
     urls_questions = [row for row in reader]
 
     row_count = len(urls_questions)
     st.session_state.row_count = row_count
 
-    for i, url_question in enumerate(urls_questions):
-        if len(url_question) >= 2:
+    for i, url_question in enumerate(urls_questions):  # Directly iterate over urls_questions
+        if len(url_question) >= 2:
             st.session_state[f"url{i}"] = url_question[0]
             st.session_state[f"question{i}"] = url_question[1]
         else:
@@ -185,19 +190,18 @@ for i in range(row_count):
         glob_url = url
         download_pdf(glob_url, 'corpus.pdf')
         load_recommender('corpus.pdf')
-
+
         answer = generate_answer(question,openAI_key)
         # Store the answer in session state
         st.session_state[f'session_answer{i}'] = answer
-
     with col3:
-        answer_placeholder = st.empty()
-        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
+        answer_placeholder = st.empty()
+        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
 
 def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
     csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
+    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
+    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
     return href
 
 # Create a list of lists containing all URLs, questions, and answers
@@ -217,6 +221,45 @@ def to_csv(data):
 
 def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
     csv = df.to_csv(index=False)
-    b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
+    b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
+    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
     return href
+
+class WorkerThread(threading.Thread):
+    def __init__(self, jobs, results):
+        super().__init__()
+        self.jobs = jobs
+        self.results = results
+
+    def run(self):
+        while True:
+            job = self.jobs.get()
+            if job is None:
+                break
+            i, question = job
+            result = generate_answer(question, openAI_key)
+            self.results.put((i, result))
+
+if generate_all:
+    questions = [st.session_state.get(f"question{i}", "") for i in range(row_count)]
+
+    jobs = Queue()
+    results = Queue()
+
+    workers = [WorkerThread(jobs, results) for _ in range(num_concurrent_calls)]
+
+    for worker in workers:
+        worker.start()
+
+    for i, question in enumerate(questions):
+        jobs.put((i, question))
+
+    for _ in range(num_concurrent_calls):
+        jobs.put(None)
+
+    for worker in workers:
+        worker.join()
+
+    while not results.empty():
+        i, answer = results.get()
+        st.session_state[f'session_answer{i}'] = answer
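The added "Generate All" block is a simple fan-out/fan-in pattern: a jobs queue feeds a fixed pool of worker threads, each worker answers one question at a time, and a results queue carries (index, answer) pairs back for storage. The following is a minimal, self-contained sketch of that same pattern, with a hypothetical fetch_answer standing in for generate_answer(question, openAI_key) so it runs without Streamlit or an OpenAI key; it illustrates the queue-and-sentinel mechanics rather than the app's exact code.

# Minimal sketch of the fan-out pattern added by this commit.
# fetch_answer is a stand-in (assumption) for the real OpenAI-backed call.
import threading
from queue import Queue


def fetch_answer(question):
    # Replace with generate_answer(question, openAI_key) in the real app.
    return f"answer to: {question}"


class WorkerThread(threading.Thread):
    def __init__(self, jobs, results):
        super().__init__()
        self.jobs = jobs
        self.results = results

    def run(self):
        while True:
            job = self.jobs.get()
            if job is None:          # sentinel: no more work for this worker
                break
            i, question = job
            self.results.put((i, fetch_answer(question)))


def generate_all(questions, num_workers=4):
    jobs, results = Queue(), Queue()
    workers = [WorkerThread(jobs, results) for _ in range(num_workers)]
    for w in workers:
        w.start()
    for i, q in enumerate(questions):
        jobs.put((i, q))             # enqueue real work first (FIFO)
    for _ in range(num_workers):
        jobs.put(None)               # one sentinel per worker
    for w in workers:
        w.join()
    answers = [None] * len(questions)
    while not results.empty():       # safe: all workers have finished
        i, a = results.get()
        answers[i] = a
    return answers


if __name__ == "__main__":
    print(generate_all(["What is X?", "What is Y?", "What is Z?"], num_workers=2))

The same fan-out could be written more compactly with concurrent.futures.ThreadPoolExecutor.map, which manages the pool and shutdown without explicit sentinel values; the explicit Queue version above mirrors the structure the commit introduces.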