import os
import shutil
import uuid
import urllib.request

import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
from sklearn.neighbors import NearestNeighbors
from PyPDF2 import PdfReader

openAI_key = os.environ['OpenAPI']


class SemanticSearch:
    def __init__(self):
        # Universal Sentence Encoder: one 512-d embedding per text chunk.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        # Embed the query and return its nearest chunks (or their indices).
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i + batch)]
            embeddings.append(self.use(text_batch))
        return np.vstack(embeddings)


def download_pdf(url, output_path):
    # Minimal implementation of the helper referenced in main_loop below;
    # it was called but never defined in the original source.
    urllib.request.urlretrieve(url, output_path)


def pdf_to_text(pdf_path, start_page=1):
    # start_page is 1-based; PyPDF2's pages list is 0-indexed.
    pdf = PdfReader(pdf_path)
    text = ''
    for i in range(start_page - 1, len(pdf.pages)):
        text += pdf.pages[i].extract_text()
    return text


def text_to_chunks(text, chunk_size=512):
    # Split the text into fixed-size character chunks.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def unique_filename(basename):
    # Append a unique ID to the filename, before the extension.
    base, ext = os.path.splitext(basename)
    return base + "_" + uuid.uuid4().hex + ext


def load_recommender(paths, start_page=1):
    global recommender
    # Chunks are always re-extracted so retrieved indices can be mapped back
    # to text, even when cached embeddings are available.
    chunks = []
    for path in paths:
        texts = pdf_to_text(path, start_page=start_page)
        chunks.extend(text_to_chunks(texts))

    # Cache embeddings for the whole corpus under a name derived from its files.
    corpus_id = "_".join(os.path.basename(p) for p in paths)
    embeddings_file = f"{corpus_id}_{start_page}.npy"
    if os.path.isfile(embeddings_file):
        recommender.data = chunks
        recommender.embeddings = np.load(embeddings_file)
        # Rebuild the nearest-neighbour index over the cached embeddings.
        recommender.nn = NearestNeighbors(n_neighbors=min(5, len(recommender.embeddings)))
        recommender.nn.fit(recommender.embeddings)
        recommender.fitted = True
        print("Embeddings loaded from file")
    else:
        recommender.fit(chunks)
        np.save(embeddings_file, recommender.embeddings)
    return 'Corpus Loaded.'


def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
    openai.api_key = openAI_key
    messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
                {'role': 'user', 'content': prompt}]
    completions = openai.ChatCompletion.create(
        model=engine,
        messages=messages,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return completions.choices[0].message['content']
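
# Note: openai.ChatCompletion.create above is the pre-1.0 openai-python API,
# which this script appears to target. With openai>=1.0 this call would need
# to be ported to the client-based interface (OpenAI().chat.completions.create).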


def generate_answer(question, openAI_key):
    # Retrieve the most relevant chunks and build a grounded prompt.
    topn_chunks = recommender(question)
    prompt = 'search results:\n\n'
    for c in topn_chunks:
        prompt += c + '\n\n'
    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. " \
              "Make sure the answer is correct and don't output false content. " \
              "If you do not know the answer, reply 'information not provided'. " \
              "The answer should be short and concise. Answer step-by-step."
    prompt += f"\n\nQuery: {question}\nAnswer:"
    return generate_text(openAI_key, prompt, "gpt-3.5-turbo")


def main_loop(url: str, files: list, question: str):
    paths = []
    if url.strip() != '':
        download_pdf(url, 'corpus.pdf')
        paths.append('corpus.pdf')
    if files is not None and len(files) > 0:
        for file in files:
            old_file_name = file.name
            # Drop the random suffix Gradio appends to temp-file names,
            # keep the extension, then make the result unique.
            file_name = old_file_name[:-12] + old_file_name[-4:]
            file_name = unique_filename(file_name)
            # Copy the uploaded temp file to the new name, then delete the original.
            with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            os.remove(old_file_name)
            paths.append(file_name)
    load_recommender(paths)
    if question.strip().lower() == 'exit':
        return ''
    return generate_answer(question, openAI_key)


recommender = SemanticSearch()

title = 'Cognitive pdfGPT'
description = """Why use Cognitive Ask an Expert? This is Cognitive Chat.
Here you can upload multiple PDF files and query them as a single corpus of knowledge.

🛑 DO NOT USE CONFIDENTIAL INFORMATION"""

with gr.Blocks() as demo:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)
    with gr.Row():
        with gr.Group():
            files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
            url = gr.Textbox(label='🔗 Enter a PDF URL here (optional)')
            question = gr.Textbox(label='🔤 Enter your question here 🔤')
            btn = gr.Button(value='Submit')
            btn.style(full_width=False)
        with gr.Group():
            gr.Image("logo.jpg")
            answer = gr.Textbox(label='The answer to your question is :')
    btn.click(main_loop, inputs=[url, files, question], outputs=[answer])

demo.launch(share=False, debug=True, auth=None, auth_message=None)
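
# Usage sketch (assumptions: this file is saved as app.py; the OpenAI key is
# exposed via the 'OpenAPI' environment variable read at the top of the file):
#   export OpenAPI=sk-...
#   python app.py
# Gradio then serves a local UI where PDFs can be uploaded (or fetched from a
# URL) and questions are answered against the combined corpus.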