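# PDF GPT Multi-Line: a Streamlit app that downloads a PDF, splits it into page-tagged
# chunks, embeds the chunks with the Universal Sentence Encoder, retrieves the chunks
# closest to each question, and asks the OpenAI completion API to answer with page citations.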
import streamlit as st
import urllib.request
import fitz
import re
import numpy as np
import tensorflow_hub as hub
import openai
from sklearn.neighbors import NearestNeighbors
import os
import time
import csv
from io import StringIO
import pandas as pd
from io import BytesIO
import base64
import threading
from queue import Queue
import logging

logging.basicConfig(level=logging.INFO)
def download_pdf(url, output_path):
    try:
        urllib.request.urlretrieve(url, output_path)
    except urllib.error.HTTPError as e:
        if e.code == 429:
            time.sleep(1)  # Back off for 1 second before retrying the download
            download_pdf(url, output_path)
        else:
            raise
def preprocess(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace into single spaces
    return text
def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count
    if end_page is None:
        end_page = total_pages
    text_list = []
    for i in range(start_page - 1, end_page):
        text = doc.load_page(i).get_text("text")
        text = preprocess(text)
        text_list.append(text)
    doc.close()
    return text_list
def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(' ') for t in texts]
    chunks = []
    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            # If the last chunk of a page is short, prepend it to the next page instead
            if (i + word_length) > len(words) and (len(chunk) < word_length) and (
                    len(text_toks) != (idx + 1)):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx + start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks
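# Semantic retrieval: embeds text chunks with the Universal Sentence Encoder and uses
# scikit-learn NearestNeighbors to return the chunks closest to a query.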
class SemanticSearch:
    def __init__(self):
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def call(self, text, return_data=True):
        if not self.fitted:
            raise Exception("The fit method must be called before the call method.")
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors

    def get_text_embedding(self, texts, batch=1000):
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i + batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
def load_recommender(path, start_page=1):
    global recommender
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'
def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    openai.api_key = openAI_key
    completions = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    message = completions.choices[0].text
    return message
def generate_answer(question, openAI_key):
    # Check the recommender before querying it, since call() raises when it is unfitted
    if not recommender.fitted:
        st.error('The recommender is not fitted yet.')
        return
    topn_chunks = recommender.call(question)
    prompt = 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'
    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
              "Cite each reference using [Page Number] notation (every result has this number at the beginning). "\
              "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
              "with the same name, create separate answers for each. Only include information found in the results and "\
              "don't add any additional information. Make sure the answer is correct, and don't output false content. "\
              "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "\
              "search results that have nothing to do with the question. Only answer what is asked. The "\
              "answer should be short and concise.\n\n"
    prompt += f"Query: {question}\nAnswer: "
    answer = generate_text(openAI_key, prompt, "text-davinci-003")
    answer = answer.strip()
    return answer
recommender = SemanticSearch()

st.title('PDF GPT Multi-Line.')
description = """PDF GPT allows you to chat with your PDF file using the Universal Sentence Encoder and OpenAI. The returned response can cite the page number in square brackets ([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
st.markdown(description)

openAI_key = st.sidebar.text_input('API Key', value='sk-')
data_section = st.sidebar.text_area("Paste Data:")
paste_data = st.sidebar.button("Paste Data")
add_row = st.sidebar.button("Add row")
row_count = st.session_state.get("row_count", 1)
num_concurrent_calls = st.sidebar.number_input("Concurrent Calls:", min_value=1, max_value=2000, value=10, step=1)
generate_all = st.sidebar.button("Generate All")
reset = st.sidebar.button("Reset")
if reset:
    # Clear every row's URL, question and answer, then collapse back to a single row
    for i in range(row_count):
        st.session_state[f"url{i}"] = ''
        st.session_state[f"question{i}"] = ''
        st.session_state[f'session_answer{i}'] = ''
    st.session_state.row_count = 1
    st.experimental_rerun()
if add_row:
    row_count += 1
    st.session_state.row_count = row_count
if paste_data:
    data = StringIO(data_section.strip())
    reader = csv.reader(data, delimiter='\t', quotechar='"')  # Pasted rows are tab-separated
    urls_questions = [row for row in reader]
    row_count = len(urls_questions)
    st.session_state.row_count = row_count
    for i, url_question in enumerate(urls_questions):
        if len(url_question) >= 2:
            st.session_state[f"url{i}"] = url_question[0]
            st.session_state[f"question{i}"] = url_question[1]
        else:
            st.error(f"Row {i+1} does not have enough columns.")
for i in range(row_count):
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        url = st.text_input(f'PDF URL {i+1}', key=f'url{i}')
    with col2:
        question = st.text_input(f'Question {i+1}', key=f'question{i}')
    with col3:
        if f'session_answer{i}' not in st.session_state:
            st.session_state[f'session_answer{i}'] = ''
    with col4:
        if st.button(f'Submit {i+1}'):
            if openAI_key.strip() == '':
                st.error('Please enter your OpenAI key')
            elif url.strip() == '':
                st.error('URL field is empty')
            elif question.strip() == '':
                st.error('Question field is empty')
            else:
                glob_url = url
                download_pdf(glob_url, 'corpus.pdf')
                load_recommender('corpus.pdf')
                answer = generate_answer(question, openAI_key)
                st.session_state[f'session_answer{i}'] = answer
    with col3:
        answer_placeholder = st.empty()
        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
    csv_data = df.to_csv(index=False)  # Renamed so the local does not shadow the csv module
    b64 = base64.b64encode(csv_data.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">{text}</a>'
    return href


data = [[st.session_state.get(f'url{i}', ''), st.session_state.get(f'question{i}', ''), st.session_state.get(f'session_answer{i}', '')] for i in range(row_count)]
df = pd.DataFrame(data, columns=['URL', 'Question', 'Answer'])
st.markdown(get_table_download_link(df), unsafe_allow_html=True)
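# Worker threads pull (row index, question) jobs from a queue, call generate_answer,
# and push (row index, answer or error message) results back for the main thread to collect.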
class WorkerThread(threading.Thread):
    def __init__(self, jobs, results):
        super().__init__()
        self.jobs = jobs
        self.results = results

    def run(self):
        while True:
            job = self.jobs.get()
            if job is None:  # A None job is the signal to shut this worker down
                break
            try:
                i, question = job
                result = generate_answer(question, openAI_key)
                self.results.put((i, result))
                logging.info(f"Job {i} completed successfully.")
            except Exception as e:
                self.results.put((i, str(e)))
                logging.error(f"Error on job {i}: {str(e)}")
if generate_all:
    questions = [st.session_state.get(f"question{i}", "") for i in range(row_count)]
    urls = [st.session_state.get(f"url{i}", "") for i in range(row_count)]
    jobs = Queue()
    results = Queue()
    workers = [WorkerThread(jobs, results) for _ in range(num_concurrent_calls)]
    # The recommender is a shared global, so it holds the corpus of the most recently
    # loaded PDF when the workers start answering.
    for i, (url, question) in enumerate(zip(urls, questions)):
        download_pdf(url, 'corpus.pdf')
        load_recommender('corpus.pdf')
        jobs.put((i, question))
    for worker in workers:
        worker.start()
    for worker in workers:
        jobs.put(None)  # One shutdown signal per worker
    for worker in workers:
        worker.join()
    logging.info("All worker threads have finished.")
    answers = {}
    while not results.empty():
        i, answer = results.get()
        if isinstance(answer, str) and 'Error' in answer:
            st.error(f"Error on row {i}: {answer}")
        else:
            answers[i] = answer
    logging.info(f"Collected {len(answers)} answers.")
    for i, answer in answers.items():
        st.session_state[f'session_answer{i}'] = answer
    logging.info("Session state updated with answers.")
    # Rerun the app after all answers are generated
    st.experimental_rerun()