assignment1 / app.py
cogcorp's picture
Update app.py
195e8f3
raw
history blame
6.02 kB
import fitz
import uuid
import re
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
import shutil
import os
from sklearn.neighbors import NearestNeighbors
from tempfile import NamedTemporaryFile
from PyPDF2 import PdfReader
# OpenAI API key, read from the 'OpenAPI' environment variable when the module
# is imported; a missing variable raises KeyError at that point.
openAI_key = os.environ['OpenAPI']
class SemanticSearch:
    """Nearest-neighbour search over text chunks using sentence embeddings.

    Texts are embedded with the Universal Sentence Encoder loaded from
    TensorFlow Hub and indexed with scikit-learn's ``NearestNeighbors``.
    """

    def __init__(self):
        # The encoder is loaded once per instance and reused for every call.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        # Populated by fit(); initialised here so an unfitted instance fails
        # with a clear error instead of an AttributeError.
        self.data = None
        self.embeddings = None
        self.nn = None
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed ``data`` and build the neighbour index over it.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks passed to the encoder per call.
            n_neighbors: Neighbours returned per query; clamped to the number
                of indexed chunks.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            # Without this guard the failure surfaces inside np.vstack with a
            # message unrelated to the actual cause.
            raise ValueError("Cannot fit SemanticSearch on an empty corpus.")
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed chunks closest to ``text``.

        Args:
            text: Query string.
            return_data: When true, return the matching chunks themselves;
                otherwise return their indices into the fitted data.

        Raises:
            RuntimeError: If no index has been built yet.
        """
        if not self.fitted or self.nn is None:
            raise RuntimeError("SemanticSearch must be fitted before it is queried.")
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` and stack the results row-wise."""
        batches = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(batches)
def pdf_to_text(pdf_path, start_page=1):
    """Extract the text of a PDF from ``start_page`` through the last page.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PdfReader``.
        start_page: 1-based number of the first page to read. Values below 1
            are treated as 1.

    Returns:
        The concatenated text of the selected pages ('' when there are none).
    """
    pdf = PdfReader(pdf_path)
    # Pages are indexed from 0, so the default start_page=1 must map to index
    # 0; using start_page directly as the index skipped the first page.
    first_index = max(start_page - 1, 0)
    page_texts = []
    for index in range(first_index, len(pdf.pages)):
        # Defensive: treat a page that yields no text (None or '') as empty.
        page_texts.append(pdf.pages[index].extract_text() or '')
    return ''.join(page_texts)
def text_to_chunks(text, start_page=1, chunk_size=512):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    ``start_page`` is accepted but not used by the splitting itself.
    """
    pieces = []
    for offset in range(0, len(text), chunk_size):
        pieces.append(text[offset:offset + chunk_size])
    return pieces
def unique_filename(basename):
    """Return ``basename`` with ``_<random hex id>`` inserted before its extension."""
    stem, suffix = os.path.splitext(basename)
    token = uuid.uuid4().hex
    return "_".join((stem, token)) + suffix
def load_recommender(paths, start_page=1):
    """Index the given PDFs as one corpus in the module-level ``recommender``.

    Every PDF is converted to text chunks. Its embeddings are loaded from a
    per-file ``.npy`` cache in the working directory when a matching cache
    exists, and computed and saved otherwise. The chunks and embeddings of all
    files are then combined and the neighbour index is built once over the
    whole corpus.

    Args:
        paths: Paths of the PDF files to index. When empty, the recommender
            is left untouched.
        start_page: Forwarded to ``pdf_to_text`` and ``text_to_chunks``.

    Returns:
        The string 'Corpus Loaded.'.

    Raises:
        ValueError: If paths were given but no text could be extracted.
    """
    global recommender
    import hashlib  # local import: this block is the only user

    chunks = []
    embedding_blocks = []
    saw_path = False
    for path in paths:
        saw_path = True
        texts = pdf_to_text(path, start_page=start_page)
        file_chunks = text_to_chunks(texts, start_page=start_page)
        if not file_chunks:
            continue

        # Key the cache on the extracted text, not just the file name:
        # different documents can share a name (e.g. 'corpus.pdf') and must
        # not reuse each other's embeddings.
        digest = hashlib.sha256(
            ''.join(file_chunks).encode('utf-8', 'replace')
        ).hexdigest()[:16]
        pdf_file = os.path.basename(path)
        embeddings_file = f"{pdf_file}_{start_page}_{digest}.npy"

        file_embeddings = None
        if os.path.isfile(embeddings_file):
            cached = np.load(embeddings_file)
            # Each cached row must correspond to exactly one chunk.
            if len(cached) == len(file_chunks):
                file_embeddings = cached
                print("Embeddings loaded from file")
        if file_embeddings is None:
            file_embeddings = np.asarray(recommender.get_text_embedding(file_chunks))
            np.save(embeddings_file, file_embeddings)

        chunks.extend(file_chunks)
        embedding_blocks.append(file_embeddings)

    if not chunks:
        if saw_path:
            raise ValueError("No text could be extracted from the supplied PDFs.")
        return 'Corpus Loaded.'

    # Install data, embeddings and index together so a query never sees
    # embeddings without their chunks (or a stale index).
    recommender.data = chunks
    recommender.embeddings = np.vstack(embedding_blocks)
    # 5 mirrors the default n_neighbors of SemanticSearch.fit.
    recommender.nn = NearestNeighbors(n_neighbors=min(5, len(chunks)))
    recommender.nn.fit(recommender.embeddings)
    recommender.fitted = True
    return 'Corpus Loaded.'
def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
    """Send ``prompt`` as one user turn to the chat model and return its reply text.

    Args:
        openAI_key: API key assigned to ``openai.api_key`` before the request.
        prompt: Content of the user message.
        engine: Model name passed as ``model``.

    Returns:
        The ``content`` of the first returned choice.
    """
    openai.api_key = openAI_key
    system_turn = {'role': 'system', 'content': 'You are a helpful assistant.'}
    user_turn = {'role': 'user', 'content': prompt}
    request = dict(
        model=engine,
        messages=[system_turn, user_turn],
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    response = openai.ChatCompletion.create(**request)
    first_choice = response.choices[0]
    return first_choice.message['content']
def generate_answer(question, openAI_key):
    """Answer ``question`` from the chunks the recommender ranks closest to it.

    The prompt consists of the retrieved chunks, a fixed instruction
    paragraph, and the query itself, and is sent to ``generate_text``.

    Args:
        question: The user's query.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The model's answer text.
    """
    topn_chunks = recommender(question)
    search_results = ''.join(chunk + '\n\n' for chunk in topn_chunks)
    instructions = (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Make sure the answer is correct and don't output false content. "
        "If you do not know the answer - answer 'information not provided' "
        "Answer should be short and concise. Answer step-by-step. "
    )
    # The query is interpolated exactly once. The earlier prompt also carried
    # a literal, never-formatted "{question}" placeholder followed by a
    # duplicate "Answer:" marker.
    prompt = (
        'search results:\n\n'
        + search_results
        + instructions
        + f"\n\nQuery: {question}\nAnswer:"
    )
    return generate_text(openAI_key, prompt, "gpt-3.5-turbo")
def _download_pdf(url, output_path):
    """Download the document at ``url`` (http/https only) to ``output_path``."""
    # Local imports: the file does not import urllib at module level.
    import urllib.parse
    import urllib.request

    scheme = urllib.parse.urlparse(url).scheme.lower()
    if scheme not in ('http', 'https'):
        # The URL comes from a user-facing text box; refuse file:// and other
        # schemes rather than reading arbitrary local resources.
        raise ValueError(f"Unsupported URL scheme for PDF download: {scheme!r}")
    with urllib.request.urlopen(url) as response, open(output_path, 'wb') as out:
        shutil.copyfileobj(response, out)


def main_loop(url: str, files: list, question: str, openAI_key=None):
    """Build the corpus from a URL and/or uploaded PDFs, then answer ``question``.

    Args:
        url: Optional URL of a PDF; when non-blank it is downloaded to
            'corpus.pdf' in the working directory.
        files: Optional uploaded files; each item's ``name`` attribute is
            used as the path of the uploaded temp file.
        question: The user's query. The literal 'exit' (any case, surrounding
            whitespace ignored) skips answering.
        openAI_key: OpenAI API key. When omitted, the 'OpenAPI' environment
            variable is used, so callers that only supply the three UI inputs
            still work.

    Returns:
        ``(answer, True)``, or ``('', False)`` when the question is 'exit'.
    """
    if openAI_key is None:
        openAI_key = os.environ.get('OpenAPI')

    paths = []
    if url and url.strip():
        _download_pdf(url.strip(), 'corpus.pdf')
        paths.append('corpus.pdf')

    for file in files or []:
        old_file_name = file.name
        # Drops the 8 characters before the 4-character extension, presumably
        # the random suffix of the upload's temp-file name. TODO confirm
        # against the Gradio version in use.
        file_name = old_file_name[:-12] + old_file_name[-4:]
        file_name = unique_filename(file_name)  # Ensure the new file name is unique
        # Copy the upload to its new name, then remove the temp file.
        with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        os.remove(old_file_name)
        paths.append(file_name)

    load_recommender(paths)
    if question.strip().lower() == 'exit':
        return '', False
    answer = generate_answer(question, openAI_key)
    return answer, True
def on_click(*args):
    """Run ``main_loop`` on the current component values and store the answer.

    Positional arguments are accepted and ignored. This handler is not wired
    to any component in this file.
    """
    # main_loop returns (answer, flag); only the answer text belongs in the
    # textbox. The API key is passed explicitly because main_loop's signature
    # includes it.
    reply, _ = main_loop(url.value, files.value, question.value, openAI_key)
    answer.value = reply
# Single shared search index used by load_recommender / generate_answer.
recommender = SemanticSearch()
title = 'Cognitive pdfGPT'
description = """ Why use Cognitive Ask an Expert?
This is Cognitive Chat. Here you can upload multiple PDF files and query them as a single corpus of knowledge. 🛑DO NOT USE CONFIDENTIAL INFORMATION """


def _answer_question(url_value, uploaded_files, question_text):
    """Adapter between the Submit button and ``main_loop``.

    The button supplies three inputs and has one output, while ``main_loop``
    takes the API key as a fourth argument and returns ``(answer, flag)``.
    This supplies the key and returns only the answer text.
    """
    reply, _ = main_loop(url_value, uploaded_files, question_text, openAI_key)
    return reply


# NOTE(review): the nesting of the layout below is reconstructed; confirm it
# matches the intended arrangement of the two groups inside the row.
with gr.Blocks() as demo:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)
    with gr.Row():
        with gr.Group():
            files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
            url = gr.Textbox(label=' ')
            question = gr.Textbox(label='🔤 Enter your question here 🔤')
            btn = gr.Button(value='Submit')
            btn.style(full_width=False)
        with gr.Group():
            gr.Image("logo.jpg")
            answer = gr.Textbox(label='The answer to your question is :')
    btn.click(_answer_question, inputs=[url, files, question], outputs=[answer])
demo.launch(share=False, debug=True, auth=None, auth_message=None)