# NOTE: page-header text captured together with this file ("Spaces: Sleeping");
# it is not part of the program.
import fitz | |
import uuid | |
import re | |
import numpy as np | |
import tensorflow_hub as hub | |
import openai | |
import gradio as gr | |
import shutil | |
import os | |
from sklearn.neighbors import NearestNeighbors | |
from tempfile import NamedTemporaryFile | |
from PyPDF2 import PdfReader | |
# OpenAI API key, read from the 'OpenAPI' environment variable when the module
# is imported. A missing variable raises KeyError immediately.
openAI_key = os.environ['OpenAPI']
class SemanticSearch:
    """Nearest-neighbour search over text chunks embedded with a TF Hub encoder.

    The encoder is loaded once in ``__init__``; ``fit`` embeds a list of
    texts and indexes them, and calling the instance returns the entries
    closest to a query string.
    """

    def __init__(self):
        # Universal Sentence Encoder v4, fetched from TF Hub. The loaded
        # module is called directly on a list of strings.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the neighbour index over the embeddings.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks passed to the encoder per call.
            n_neighbors: Neighbours returned per query; capped at the
                number of embedded chunks.
        """
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        k = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=k)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed chunks (or their indices) nearest to ``text``."""
        query_vec = self.use([text])
        hits = self.nn.kneighbors(query_vec, return_distance=False)[0]
        if not return_data:
            return hits
        return [self.data[idx] for idx in hits]

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` and stack the results row-wise."""
        parts = [
            self.use(texts[lo:lo + batch])
            for lo in range(0, len(texts), batch)
        ]
        return np.vstack(parts)
def pdf_to_text(pdf_path, start_page=1):
    """Return the concatenated text of a PDF from ``start_page`` to the end.

    Args:
        pdf_path: Path (or file object) handed to ``PdfReader``.
        start_page: 1-based number of the first page to read. The default
            of 1 reads the whole document. Values below 1 are treated as 1.

    Returns:
        The extracted text of all selected pages joined into one string.
    """
    pdf = PdfReader(pdf_path)
    pages = pdf.pages
    # ``start_page`` counts from 1 while ``pages`` is indexed from 0. Using it
    # directly as an index silently dropped the first page of every file.
    first_index = max(start_page - 1, 0)
    return ''.join(
        # Fall back to '' so a page that yields no text cannot break the join.
        pages[i].extract_text() or ''
        for i in range(first_index, len(pages))
    )
def text_to_chunks(text, start_page=1, chunk_size=512):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    ``start_page`` is accepted for call compatibility but is not used.
    The final piece may be shorter than ``chunk_size``; an empty ``text``
    yields an empty list.
    """
    pieces = []
    for offset in range(0, len(text), chunk_size):
        pieces.append(text[offset:offset + chunk_size])
    return pieces
def unique_filename(basename):
    """Return ``basename`` with ``_<random hex>`` inserted before its extension."""
    stem, suffix = os.path.splitext(basename)
    token = uuid.uuid4().hex
    return "".join((stem, "_", token, suffix))
def load_recommender(paths, start_page=1):
    """Index the text of the given PDFs in the module-level ``recommender``.

    For every path the text is extracted and chunked. Chunk embeddings are
    loaded from ``<basename>_<start_page>.npy`` when that file exists and
    has one row per chunk; otherwise they are computed and saved there.
    The chunks and embeddings of all files are then combined and the
    recommender's data, embeddings and neighbour index are rebuilt together.

    Args:
        paths: Iterable of PDF file paths.
        start_page: Forwarded to ``pdf_to_text`` and ``text_to_chunks`` and
            used in the cache file name.

    Returns:
        The string ``'Corpus Loaded.'``.

    Raises:
        ValueError: If no text could be extracted from any of the files.
    """
    chunks = []
    embedding_parts = []
    for path in paths:
        text = pdf_to_text(path, start_page=start_page)
        file_chunks = text_to_chunks(text, start_page=start_page)
        if not file_chunks:
            # Nothing extractable in this file; skip it.
            continue

        embeddings_file = f"{os.path.basename(path)}_{start_page}.npy"
        file_embeddings = None
        if os.path.isfile(embeddings_file):
            cached = np.load(embeddings_file)
            # The cache is keyed by file name only, so reuse it only when it
            # lines up with the chunks just extracted.
            if len(cached) == len(file_chunks):
                file_embeddings = cached
                print("Embeddings loaded from file")
        if file_embeddings is None:
            file_embeddings = recommender.get_text_embedding(file_chunks)
            np.save(embeddings_file, file_embeddings)

        chunks.extend(file_chunks)
        embedding_parts.append(file_embeddings)

    if not chunks:
        raise ValueError('No text could be extracted from the supplied PDF files.')

    # Rebuild data, embeddings and the neighbour index as one unit. Previously
    # a cache hit replaced only the embeddings, leaving the index and the
    # chunk texts missing or stale.
    embeddings = np.vstack(embedding_parts)
    recommender.data = chunks
    recommender.embeddings = embeddings
    # 5 matches the default neighbour count of SemanticSearch.fit.
    recommender.nn = NearestNeighbors(n_neighbors=min(5, len(embeddings)))
    recommender.nn.fit(embeddings)
    recommender.fitted = True
    return 'Corpus Loaded.'
def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat completion API and return the reply text.

    Args:
        openAI_key: API key assigned to ``openai.api_key`` before the call.
        prompt: Content of the user message.
        engine: Model name passed as ``model``.

    Returns:
        The ``content`` of the first returned choice.
    """
    openai.api_key = openAI_key
    system_turn = {'role': 'system', 'content': 'You are a helpful assistant.'}
    user_turn = {'role': 'user', 'content': prompt}
    request = dict(
        model=engine,
        messages=[system_turn, user_turn],
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    response = openai.ChatCompletion.create(**request)
    return response.choices[0].message['content']
def generate_answer(question, openAI_key):
    """Answer ``question`` from the chunks the recommender returns for it.

    The prompt consists of the retrieved chunks, a fixed instruction
    paragraph, and the query itself, and is sent through ``generate_text``.

    Args:
        question: The user's question.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The model's reply text.
    """
    topn_chunks = recommender(question)
    parts = ['search results:\n\n']
    parts.extend(c + '\n\n' for c in topn_chunks)
    # The instruction text used to end with a literal "{question}" placeholder
    # (the string was not an f-string), followed by a second, real query line.
    # The query is now stated exactly once, below.
    parts.append(
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Make sure the answer is correct and don't output false content. "
        "If you do not know the answer - answer 'information not provided' "
        "Answer should be short and concise. Answer step-by-step. \n\n"
    )
    parts.append(f"Query: {question}\nAnswer:")
    prompt = ''.join(parts)
    return generate_text(openAI_key, prompt, "gpt-3.5-turbo")
def main_loop(url: str, files: list, question: str, openAI_key=None):
    """Load the given PDFs into the recommender and answer ``question``.

    Args:
        url: Optional URL of a PDF; when non-blank it is downloaded to
            ``corpus.pdf`` and added to the corpus.
        files: Optional list of uploaded file objects exposing ``.name``.
        question: The user's question. The word ``exit`` (any case,
            surrounding whitespace ignored) short-circuits the call.
        openAI_key: OpenAI API key. When omitted, the ``OpenAPI``
            environment variable is used, so callers that only supply the
            three UI inputs still work.

    Returns:
        ``('', False)`` for ``exit``; otherwise ``(answer, True)``.
    """
    question = question or ''
    # Check for 'exit' first so it never triggers a download or an embedding
    # pass over the corpus.
    if question.strip().lower() == 'exit':
        return '', False

    if openAI_key is None:
        openAI_key = os.environ.get('OpenAPI')

    paths = []
    if url and url.strip():
        # NOTE(review): download_pdf is not defined in the visible part of
        # this file - confirm it is provided elsewhere.
        download_pdf(url, 'corpus.pdf')
        paths.append('corpus.pdf')

    for file in files or []:
        old_file_name = file.name
        # Drops the 8 characters preceding the 4-character extension -
        # presumably a random suffix added to the uploaded temp file; verify
        # against the Gradio version in use.
        file_name = old_file_name[:-12] + old_file_name[-4:]
        file_name = unique_filename(file_name)  # Ensure the new file name is unique
        # Copy the upload to the new name, then delete the original once both
        # handles are closed.
        with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        os.remove(old_file_name)
        paths.append(file_name)

    load_recommender(paths)
    answer = generate_answer(question, openAI_key)
    return answer, True
def on_click(*args):
    """Run ``main_loop`` on the current component values and store the answer.

    Positional arguments are accepted and ignored. The module-level API key
    is passed explicitly because ``main_loop`` takes it as a parameter, and
    only the answer text of the returned ``(answer, flag)`` pair is stored
    in the answer box.
    """
    text, _ = main_loop(url.value, files.value, question.value, openAI_key)
    answer.value = text
# Shared search index used by load_recommender and generate_answer.
recommender = SemanticSearch()

title = 'Cognitive pdfGPT'
description = """ Why use Cognitive Ask an Expert?
This is Cognitive Chat. Here you can upload multiple PDF files and query them as a single corpus of knowledge. 🛑DO NOT USE CONFIDENTIAL INFORMATION """


def _answer_for_ui(url_text, uploaded, query):
    """Adapt ``main_loop`` to the Submit button.

    The button provides three inputs and one output, while ``main_loop``
    also takes the API key and returns an ``(answer, flag)`` pair. This
    adapter supplies the module-level key and returns only the answer text.
    """
    reply, _ = main_loop(url_text, uploaded, query, openAI_key)
    return reply


with gr.Blocks() as demo:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)
    with gr.Row():
        with gr.Group():
            files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
            url = gr.Textbox(label=' ')
            question = gr.Textbox(label='🔤 Enter your question here 🔤')
            btn = gr.Button(value='Submit')
            btn.style(full_width=False)
        with gr.Group():
            gr.Image("logo.jpg")
            answer = gr.Textbox(label='The answer to your question is :')
    # Event listeners must be registered inside the Blocks context.
    btn.click(_answer_for_ui, inputs=[url, files, question], outputs=[answer])

demo.launch(share=False, debug=True, auth=None, auth_message=None)