Spaces:

cogcorp
/

assignment1

Sleeping

App Files Files Community

assignment1 / app.py

cogcorp

Update app.py

195e8f3 almost 2 years ago

raw

history blame

6.02 kB

	import fitz
	import uuid
	import re
	import numpy as np
	import tensorflow_hub as hub
	import openai
	import gradio as gr
	import shutil
	import os
	from sklearn.neighbors import NearestNeighbors
	from tempfile import NamedTemporaryFile
	from PyPDF2 import PdfReader

	# OpenAI API key, read once at import time from the environment variable
	# named 'OpenAPI' (note: not 'OPENAI_API_KEY'). A missing variable raises
	# KeyError here, before any of the definitions below are created.
	openAI_key = os.environ['OpenAPI']

	class SemanticSearch:
	    """Nearest-neighbour text search over sentence-encoder embeddings.

	    ``fit`` embeds a sequence of texts and indexes the vectors; calling the
	    instance with a query returns the closest texts (or their indices).
	    """

	    def __init__(self):
	        """Load the Universal Sentence Encoder v4 module from TF Hub."""
	        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
	        # Flipped to True at the end of fit().
	        self.fitted = False

	    def fit(self, data, batch=1000, n_neighbors=5):
	        """Embed *data* and build the neighbour index over the embeddings.

	        Args:
	            data: Sequence of texts; kept on the instance as ``self.data``.
	            batch: Number of texts embedded per encoder call.
	            n_neighbors: Upper bound on results returned per query.
	        """
	        self.data = data
	        self.embeddings = self.get_text_embedding(data, batch=batch)
	        # Never request more neighbours than there are embedded texts.
	        neighbour_count = min(n_neighbors, len(self.embeddings))
	        self.nn = NearestNeighbors(n_neighbors=neighbour_count)
	        self.nn.fit(self.embeddings)
	        self.fitted = True

	    def __call__(self, text, return_data=True):
	        """Return the fitted texts nearest to *text*.

	        With ``return_data=False`` the neighbour indices are returned
	        instead of the texts themselves.
	        """
	        query_embedding = self.use([text])
	        nearest = self.nn.kneighbors(query_embedding, return_distance=False)[0]
	        if not return_data:
	            return nearest
	        return [self.data[index] for index in nearest]

	    def get_text_embedding(self, texts, batch=1000):
	        """Embed *texts* in slices of *batch* items and stack the results."""
	        per_batch = [
	            self.use(texts[start:start + batch])
	            for start in range(0, len(texts), batch)
	        ]
	        return np.vstack(per_batch)

	def pdf_to_text(pdf_path, start_page=1):
	    """Return the concatenated text of a PDF from *start_page* to the end.

	    Args:
	        pdf_path: Path (or file object) handed to ``PdfReader``.
	        start_page: 1-based number of the first page to read; values
	            below 1 are treated as 1.

	    Returns:
	        The extracted text of all selected pages joined together, or an
	        empty string when there are no pages at or after *start_page*.
	    """
	    pdf = PdfReader(pdf_path)
	    # ``pdf.pages`` is indexed from 0 while start_page counts from 1; using
	    # start_page directly as the index silently dropped the first page (and
	    # returned nothing at all for a single-page document).
	    first_index = max(start_page - 1, 0)
	    page_texts = []
	    for i in range(first_index, len(pdf.pages)):
	        # Fall back to '' so a page yielding no text cannot break the join.
	        page_texts.append(pdf.pages[i].extract_text() or '')
	    return ''.join(page_texts)

	def text_to_chunks(text, start_page=1, chunk_size=512):
	    """Cut *text* into consecutive pieces of at most *chunk_size* characters.

	    The split is purely positional, so the last piece may be shorter than
	    the others. *start_page* is accepted but not used by the body.
	    """
	    offsets = range(0, len(text), chunk_size)
	    return [text[start:start + chunk_size] for start in offsets]



	def unique_filename(basename):
	    """Return *basename* with ``_<uuid4 hex>`` inserted before its extension."""
	    stem, extension = os.path.splitext(basename)
	    token = uuid.uuid4().hex
	    return f"{stem}_{token}{extension}"


	def load_recommender(paths, start_page=1):
	    """Fit the module-level ``recommender`` on the text of the given PDFs.

	    Every PDF in *paths* is converted to text, cut into chunks, and the
	    chunks of all files are indexed together as one corpus.

	    Args:
	        paths: Iterable of PDF file paths.
	        start_page: Forwarded to ``pdf_to_text`` and ``text_to_chunks``.

	    Returns:
	        The status string ``'Corpus Loaded.'``.

	    Raises:
	        ValueError: If no text chunks could be produced (no paths, or PDFs
	            without extractable text).
	    """
	    chunks = []
	    for path in paths:
	        texts = pdf_to_text(path, start_page=start_page)
	        chunks.extend(text_to_chunks(texts, start_page=start_page))

	    # Embeddings are deliberately not cached on disk by file name. A cache
	    # hit would restore only the embedding matrix, without the chunk texts
	    # or the fitted neighbour index that a query needs, and a fixed name
	    # such as 'corpus.pdf' would return vectors for a different document.
	    if not chunks:
	        # Fail with a clear message instead of an opaque stacking error
	        # from fitting on an empty corpus.
	        raise ValueError('No text could be extracted from the provided PDF files.')

	    recommender.fit(chunks)
	    return 'Corpus Loaded.'


	def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
	    """Ask the chat-completion API to answer *prompt* and return the reply text.

	    Args:
	        openAI_key: API key; assigned to ``openai.api_key`` before the call.
	        prompt: Content of the single user message.
	        engine: Model name passed as ``model``.

	    Returns:
	        The ``content`` of the first returned choice's message.
	    """
	    openai.api_key = openAI_key

	    system_turn = {'role': 'system', 'content': 'You are a helpful assistant.'}
	    user_turn = {'role': 'user', 'content': prompt}

	    response = openai.ChatCompletion.create(
	        model=engine,
	        messages=[system_turn, user_turn],
	        max_tokens=512,
	        n=1,
	        stop=None,
	        temperature=0.7,
	    )
	    first_choice = response.choices[0]
	    return first_choice.message['content']


	def generate_answer(question, openAI_key):
	    """Answer *question* from the chunks the recommender finds most relevant.

	    The prompt consists of the retrieved chunks, a fixed instruction
	    paragraph, and the query itself; it is sent through ``generate_text``.

	    Args:
	        question: The user's query.
	        openAI_key: API key forwarded to ``generate_text``.

	    Returns:
	        The model's reply as returned by ``generate_text``.
	    """
	    topn_chunks = recommender(question)

	    prompt = 'search results:\n\n'
	    prompt += ''.join(chunk + '\n\n' for chunk in topn_chunks)

	    # The instruction paragraph used to end with a literal, never-formatted
	    # "Query: {question}\nAnswer: " placeholder, which was then followed by
	    # the real query line. The placeholder is dropped so the prompt
	    # contains exactly one Query/Answer pair.
	    prompt += (
	        "Instructions: Compose a comprehensive reply to the query using the search results given. "
	        "Make sure the answer is correct and don't output false content. "
	        "If you do not know the answer - answer 'information not provided' "
	        "Answer should be short and concise. Answer step-by-step. \n\n"
	    )
	    prompt += f"Query: {question}\nAnswer:"

	    return generate_text(openAI_key, prompt, "gpt-3.5-turbo")


	def _download_pdf(url, output_path):
	    """Download the document at an http(s) *url* to *output_path*."""
	    # Local imports: the file's import block does not include urllib.
	    import urllib.parse
	    import urllib.request

	    # The URL comes straight from the UI; refuse schemes such as file://
	    # that would read from the local machine instead of the network.
	    if urllib.parse.urlparse(url).scheme not in ('http', 'https'):
	        raise ValueError('Only http and https URLs are supported.')
	    urllib.request.urlretrieve(url, output_path)


	def main_loop(url: str, files: list, question: str, openAI_key=None):
	    """Build the corpus from a URL and/or uploaded PDFs and answer *question*.

	    Args:
	        url: Optional address of a PDF; blank or None means "no URL".
	        files: Optional list of uploaded file objects exposing ``.name``.
	        question: The user's query; the literal ``exit`` (any case) ends
	            the session without asking the model.
	        openAI_key: API key. When omitted, the ``OpenAPI`` environment
	            variable is used, so the function can be called with only
	            the three UI inputs.

	    Returns:
	        A ``(text, ok)`` tuple: the answer and ``True``, or a message
	        (possibly empty) and ``False`` when no answer was produced.

	    Raises:
	        ValueError: If *url* is given but is not an http(s) URL.
	    """
	    paths = []

	    if url and url.strip():
	        _download_pdf(url.strip(), 'corpus.pdf')
	        paths.append('corpus.pdf')

	    for file in files or []:
	        old_file_name = file.name
	        # Drops the 8 characters that precede the 4-character extension --
	        # presumably a random suffix added by the upload widget; TODO
	        # confirm against the Gradio version in use.
	        file_name = old_file_name[:-12] + old_file_name[-4:]
	        file_name = unique_filename(file_name)  # Ensure the new file name is unique

	        # Copy the content of the old file to the new file and delete the old file
	        with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
	            shutil.copyfileobj(src, dst)
	        os.remove(old_file_name)

	        paths.append(file_name)

	    if not paths:
	        # Nothing to index: report it instead of failing while fitting an
	        # empty corpus.
	        return 'Please provide a URL or upload at least one PDF.', False

	    load_recommender(paths)

	    if question.strip().lower() == 'exit':
	        return '', False

	    if openAI_key is None:
	        openAI_key = os.environ.get('OpenAPI')

	    answer = generate_answer(question, openAI_key)
	    return answer, True


	def on_click(*args):
	    """Run ``main_loop`` on the current widget values and show the answer.

	    The positional *args* are accepted and ignored; inputs are read from
	    the module-level ``url``, ``files`` and ``question`` components.
	    """
	    # main_loop needs the API key as its fourth argument and returns an
	    # (answer, ok) pair; only the answer text belongs in the textbox.
	    reply, _ok = main_loop(url.value, files.value, question.value, openAI_key)
	    answer.value = reply


	# Single shared search index; (re)fitted by load_recommender on each request.
	recommender = SemanticSearch()

	title = 'Cognitive pdfGPT'
	description = """ Why use Cognitive Ask an Expert?
	This is Cognitive Chat. Here you can upload multiple PDF files and query them as a single corpus of knowledge. 🛑DO NOT USE CONFIDENTIAL INFORMATION """


	def _answer_for_ui(url_value, uploaded_files, question_text):
	    """Adapt ``main_loop`` to the three-input, one-output button handler.

	    The UI has no key field, so the module-level ``openAI_key`` is passed
	    in, and only the answer text of the ``(answer, ok)`` result is
	    returned so the textbox does not display a tuple.
	    """
	    reply, _ok = main_loop(url_value, uploaded_files, question_text, openAI_key)
	    return reply


	with gr.Blocks() as demo:
	    gr.Markdown(f'<center><h1>{title}</h1></center>')
	    gr.Markdown(description)

	    with gr.Row():

	        # Left column: inputs.
	        with gr.Group():
	            files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
	            url = gr.Textbox(label=' ')
	            question = gr.Textbox(label='🔤 Enter your question here 🔤')
	            btn = gr.Button(value='Submit')
	            btn.style(full_width=False)

	        # Right column: logo and answer.
	        with gr.Group():
	            gr.Image("logo.jpg")
	            answer = gr.Textbox(label='The answer to your question is :')

	    btn.click(_answer_for_ui, inputs=[url, files, question], outputs=[answer])


	demo.launch(share=False, debug=True, auth=None, auth_message=None)