Spaces:
Sleeping
Sleeping
import streamlit as st | |
import urllib.request | |
import fitz | |
import re | |
import numpy as np | |
import tensorflow_hub as hub | |
import openai | |
from sklearn.neighbors import NearestNeighbors | |
import os | |
import time | |
import csv | |
from io import StringIO | |
import pandas as pd | |
from io import BytesIO | |
import base64 | |
import threading | |
from queue import Queue | |
def download_pdf(url, output_path):
    """Download the document at ``url`` and save it to ``output_path``.

    Only ``http`` and ``https`` URLs are accepted. ``urlretrieve`` also
    understands other schemes such as ``file:``, and in this app the URL is
    typed into a text box by the user, so an unrestricted call would allow
    local files on the server to be copied into the corpus.

    Args:
        url: Location of the PDF to fetch.
        output_path: Local path the downloaded bytes are written to.

    Raises:
        ValueError: If ``url`` does not use the http or https scheme.
        urllib.error.URLError: If the download itself fails.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    url = url.strip()
    scheme = urlsplit(url).scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError(f'Unsupported URL scheme for PDF download: {scheme!r}')
    urllib.request.urlretrieve(url, output_path)
def preprocess(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and repeated spaces are all replaced by one space.
    Leading and trailing whitespace is collapsed but not removed.

    Args:
        text: Raw text extracted from a PDF page.

    Returns:
        The text with whitespace runs normalised to single spaces.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # '\s+' already matches newlines, so no separate '\n' replacement is needed.
    return re.sub(r'\s+', ' ', text)
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the whitespace-normalised text of each page of a PDF.

    Args:
        path: Path of the PDF file to open.
        start_page: 1-based number of the first page to read. Values below 1
            are treated as 1.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` or a value past the end of the document means
            "through the last page".

    Returns:
        A list with one preprocessed string per page, in page order.
    """
    doc = fitz.open(path)
    try:
        total_pages = doc.page_count
        # Clamp the requested range to the pages that actually exist.
        if end_page is None or end_page > total_pages:
            end_page = total_pages
        first_index = max(start_page - 1, 0)
        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(first_index, end_page)
        ]
    finally:
        # Close the document even if extracting a page fails.
        doc.close()
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split page texts into labelled chunks of at most ``word_length`` words.

    Each page is split on single spaces. A trailing chunk shorter than
    ``word_length`` is not emitted for its own page; it is prepended to the
    next page's words instead. On the last page the short remainder is
    emitted as-is.

    Args:
        texts: List of page strings, one per page.
        word_length: Maximum number of space-separated tokens per chunk.
        start_page: Page number used to label the first entry of ``texts``.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    pages = [page.split(' ') for page in texts]
    final_index = len(pages) - 1
    labelled = []
    leftover = []  # short tail of the previous page, carried forward
    for page_index, tokens in enumerate(pages):
        tokens = leftover + tokens
        leftover = []
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset:offset + word_length]
            # Only the last slice of a page can be short.
            if len(piece) < word_length and page_index != final_index:
                leftover = piece
                continue
            body = ' '.join(piece).strip()
            labelled.append(f'[Page no. {page_index + start_page}] "{body}"')
    return labelled
class SemanticSearch:
    """Nearest-neighbour lookup over text chunks embedded with a TF-Hub encoder."""

    def __init__(self):
        """Load the Universal Sentence Encoder model from TF Hub."""
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the nearest-neighbour index over it.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks embedded per encoder call.
            n_neighbors: Number of neighbours returned per query; capped at
                the number of chunks.

        Raises:
            ValueError: If ``data`` is empty.
        """
        # Build everything first so a failure leaves any previous index intact.
        embeddings = self.get_text_embedding(data, batch=batch)
        index = NearestNeighbors(n_neighbors=min(n_neighbors, len(embeddings)))
        index.fit(embeddings)

        self.data = data
        self.embeddings = embeddings
        self.nn = index
        self.fitted = True

    def call(self, text, return_data=True):
        """Return the indexed chunks closest to ``text``.

        Args:
            text: Query string.
            return_data: When true, return the matching chunks; otherwise
                return their indices into the fitted data.

        Raises:
            RuntimeError: If ``fit`` has not completed successfully yet.
        """
        if not self.fitted:
            raise RuntimeError('SemanticSearch.call() was used before fit().')
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in batches and stack the results row-wise.

        Args:
            texts: Sequence of strings to embed.
            batch: Number of strings passed to the encoder per call.

        Returns:
            A 2-D array with one row per input string.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        if len(texts) == 0:
            raise ValueError('Cannot embed an empty list of texts.')
        embeddings = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)
def load_recommender(path, start_page=1):
    """Index the PDF at ``path`` in the module-level ``recommender``.

    The PDF is converted to page texts, split into labelled chunks, and the
    shared ``recommender`` is fitted on those chunks.

    Args:
        path: Path of the PDF file to index.
        start_page: 1-based page to start reading from; also used as the
            first page label in the chunks.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    page_texts = pdf_to_text(path, start_page=start_page)
    # ``recommender`` is only read here, so no ``global`` declaration is needed.
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
def generate_text(openAI_key,prompt, engine="text-davinci-003"):
    """Request a single completion for ``prompt`` from the OpenAI API.

    Args:
        openAI_key: API key; assigned to the module-wide ``openai.api_key``.
        prompt: Full prompt text to complete.
        engine: Name of the completion engine to use.

    Returns:
        The text of the first returned choice.
    """
    openai.api_key = openAI_key
    request = {
        'engine': engine,
        'prompt': prompt,
        'max_tokens': 512,
        'n': 1,
        'stop': None,
        'temperature': 0.7,
    }
    response = openai.Completion.create(**request)
    return response.choices[0].text
def generate_answer(question,openAI_key):
    """Answer ``question`` from the chunks most similar to it in the loaded corpus.

    The prompt has three parts: the retrieved chunks under a
    ``search results:`` header, the answering instructions, and the query.

    Args:
        question: The user's question.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The completion text returned by ``generate_text``.
    """
    topn_chunks = recommender.call(question)
    search_results = ''.join(chunk + '\n\n' for chunk in topn_chunks)

    # The parentheses join these adjacent literals into one string. Written as
    # separate statements, every line after the first was a no-op expression
    # and never reached the prompt.
    instructions = (
        "Instructions: Compose a simple reply to the query using the search results given. "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. "
    )

    # The query is formatted exactly once, via an f-string.
    prompt = (
        'search results:\n\n'
        + search_results
        + instructions
        + f"\n\nQuery: {question}\nAnswer:"
    )
    return generate_text(openAI_key, prompt, "text-davinci-003")
# Shared search index; load_recommender() re-fits it for each submitted PDF.
recommender = SemanticSearch()

st.title('PDF GPT')
description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. The returned response can cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
st.markdown(description)

# --- Sidebar controls -------------------------------------------------------
openAI_key = st.sidebar.text_input('API Key', value='sk-')
data_section = st.sidebar.text_area("Paste Data:")
paste_data = st.sidebar.button("Paste Data")
add_row = st.sidebar.button("Add row")
row_count = st.session_state.get("row_count", 1)
num_concurrent_calls = st.sidebar.number_input("Concurrent Calls:", min_value=1, max_value=2000, value=10, step=1)
generate_all = st.sidebar.button("Generate All")

if add_row:
    row_count += 1
    st.session_state.row_count = row_count

if paste_data:
    # Pasted data is tab-separated: column 0 is the PDF URL, column 1 the question.
    data = StringIO(data_section.strip())
    reader = csv.reader(data, delimiter='\t', quotechar='"')
    urls_questions = list(reader)
    row_count = len(urls_questions)
    st.session_state.row_count = row_count
    for i, url_question in enumerate(urls_questions):
        if len(url_question) >= 2:
            # Seed the widget state before the inputs below are created.
            st.session_state[f"url{i}"] = url_question[0]
            st.session_state[f"question{i}"] = url_question[1]
        else:
            st.error(f"Row {i+1} does not have enough columns.")

# --- One URL / question / answer / submit row per entry ----------------------
for i in range(row_count):
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        # No ``value=``: a keyed widget already takes its value from
        # st.session_state, and passing both makes Streamlit warn when the
        # state was set through the Session State API (the paste path above).
        url = st.text_input(f'PDF URL {i+1}', key=f'url{i}')
    with col2:
        question = st.text_input(f'Question {i+1}', key=f'question{i}')
    with col3:
        # Initialize session state for answer if not already done
        if f'session_answer{i}' not in st.session_state:
            st.session_state[f'session_answer{i}'] = ''
    with col4:
        if st.button(f'Submit {i+1}'):
            # The key field is pre-filled with 'sk-', so the untouched
            # placeholder counts as "no key entered".
            if openAI_key.strip() in ('', 'sk-'):
                st.error('Please enter your Open AI Key')
            elif url.strip() == '':
                st.error('URL field is empty')
            elif question.strip() == '':
                st.error('Question field is empty')
            else:
                glob_url = url
                download_pdf(glob_url, 'corpus.pdf')
                load_recommender('corpus.pdf')
                answer = generate_answer(question,openAI_key)
                # Store the answer in session state
                st.session_state[f'session_answer{i}'] = answer
    with col3:
        # Rendered after the submit handling so a fresh answer shows in this run.
        answer_placeholder = st.empty()
        answer_placeholder.text_area(f'Answer {i+1}', key=f'answer{i}', value=st.session_state[f'session_answer{i}'])
def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
    """Build an HTML download link that embeds ``df`` as a CSV data URI.

    Args:
        df: DataFrame to export; written without the index column.
        filename: File name suggested to the browser via ``download``.
        text: Visible link label.

    Returns:
        An ``<a>`` element as a string, intended for ``st.markdown`` with
        ``unsafe_allow_html=True``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # Named ``csv_text`` so the ``csv`` module is not shadowed.
    csv_text = df.to_csv(index=False)
    # Base64 keeps the CSV intact inside the href attribute.
    b64 = base64.b64encode(csv_text.encode()).decode()
    return (
        f'<a href="data:text/csv;base64,{b64}" '
        f'download="{escape(filename, quote=True)}">{escape(text)}</a>'
    )
# One [url, question, answer] record per row, read back from session state.
# Missing keys fall back to an empty string.
_ROW_KEY_PREFIXES = ('url', 'question', 'session_answer')
data = [
    [st.session_state.get(f'{prefix}{i}', '') for prefix in _ROW_KEY_PREFIXES]
    for i in range(row_count)
]
# Tabulate the records under the column names used in the exported CSV.
df = pd.DataFrame(data, columns=['URL', 'Question', 'Answer'])
# The link is raw HTML, so HTML rendering must be enabled for it to show.
st.markdown(get_table_download_link(df), unsafe_allow_html=True)
def to_csv(data):
    """Serialise an iterable of rows to CSV text.

    Args:
        data: Iterable of rows, each an iterable of cell values.

    Returns:
        The CSV document as a string, using the ``csv`` module's default
        dialect (``\\r\\n`` line endings).
    """
    # csv.writer emits str, so it needs a text buffer; a BytesIO target
    # raises TypeError on the first row.
    buffer = StringIO(newline='')
    csv.writer(buffer).writerows(data)
    return buffer.getvalue()
# NOTE(review): this redefines the function of the same name declared earlier
# in the file; consider keeping a single definition.
def get_table_download_link(df, filename="data.csv", text="Download CSV file"):
    """Build an HTML download link that embeds ``df`` as a CSV data URI.

    Args:
        df: DataFrame to export; written without the index column.
        filename: File name suggested to the browser via ``download``.
        text: Visible link label.

    Returns:
        An ``<a>`` element as a string, intended for ``st.markdown`` with
        ``unsafe_allow_html=True``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # Named ``csv_text`` so the ``csv`` module is not shadowed.
    csv_text = df.to_csv(index=False)
    # Base64 keeps the CSV intact inside the href attribute.
    b64 = base64.b64encode(csv_text.encode()).decode()
    return (
        f'<a href="data:text/csv;base64,{b64}" '
        f'download="{escape(filename, quote=True)}">{escape(text)}</a>'
    )
class WorkerThread(threading.Thread):
    """Thread that answers queued questions until it receives a ``None`` sentinel.

    Jobs are ``(row_index, question)`` tuples taken from ``jobs``. For each
    job an ``(row_index, answer)`` tuple is put on ``results``.
    """

    def __init__(self, jobs, results):
        """Store the input queue (``jobs``) and output queue (``results``)."""
        super().__init__()
        self.jobs = jobs
        self.results = results

    def run(self):
        """Process jobs until the ``None`` sentinel is dequeued."""
        # iter(callable, sentinel) keeps calling jobs.get() until it returns None.
        for i, question in iter(self.jobs.get, None):
            try:
                result = generate_answer(question, openAI_key)
            except Exception as exc:
                # A failing job must not kill the worker: that would drop this
                # row's result and leave later jobs unprocessed. The failure is
                # reported as the row's answer instead.
                result = f'Error: {exc}'
            self.results.put((i, result))
if generate_all:
    # NOTE(review): nothing here downloads or indexes a PDF per row. The
    # workers query the module-level ``recommender`` as it currently stands,
    # so answers come from whichever corpus was loaded last. Confirm intent.
    questions = [st.session_state.get(f"question{i}", "") for i in range(row_count)]
    jobs = Queue()
    results = Queue()

    # Skip blank questions, matching the validation on the per-row Submit button.
    job_count = 0
    for i, question in enumerate(questions):
        if question.strip():
            jobs.put((i, question))
            job_count += 1

    # Never start more threads than there are jobs; the sidebar allows up to 2000.
    worker_count = min(int(num_concurrent_calls), job_count)
    workers = [WorkerThread(jobs, results) for _ in range(worker_count)]

    # One ``None`` sentinel per worker so each one exits when the queue drains.
    for _ in workers:
        jobs.put(None)
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # All workers have finished; copy their answers into session state.
    while not results.empty():
        i, answer = results.get()
        st.session_state[f'session_answer{i}'] = answer