import os
import re
import zipfile
import urllib.request

import fitz  # PyMuPDF
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
from sklearn.neighbors import NearestNeighbors

# The OpenAI key is read from the environment variable the Space defines.
openai.api_key = os.getenv('OpenAPI')

def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)

def extract_zip(file):
    # Extract only the PDF members of the uploaded archive into a local 'pdfs' folder.
    with zipfile.ZipFile(file, 'r') as zip_ref:
        for member in zip_ref.namelist():
            filename = os.path.basename(member)
            if filename.endswith('.pdf'):
                zip_ref.extract(member, 'pdfs')

def preprocess(text):
    # Collapse newlines and runs of whitespace into single spaces.
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text

def pdf_to_text(path, start_page=1, end_page=None):
    # Extract and normalize the text of each page in [start_page, end_page].
    doc = fitz.open(path)
    total_pages = doc.page_count
    if end_page is None:
        end_page = total_pages
    text_list = []
    for i in range(start_page - 1, end_page):
        text = doc.load_page(i).get_text("text")
        text = preprocess(text)
        text_list.append(text)
    doc.close()
    return text_list

def text_to_chunks(texts, word_length=150, start_page=1):
    # Split page texts into chunks of ~word_length words, each tagged with its
    # page number. A short trailing chunk is folded into the next page's words
    # so that chunks stay close to full length.
    text_toks = [t.split(' ') for t in texts]
    chunks = []
    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            if (i + word_length) > len(words) and (len(chunk) < word_length) and (
                    len(text_toks) != (idx + 1)):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx + start_page}] "{chunk}"'
            chunks.append(chunk)
    return chunks

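
# A minimal sketch (defined but never called by the app) of what the chunker
# produces; the sample pages and the _demo_ name are hypothetical.
def _demo_chunking():
    sample_pages = ['alpha beta gamma delta epsilon zeta', 'eta theta iota']
    # With word_length=4, the two leftover words of page 1 are folded into
    # page 2's words, yielding:
    #   ['[Page no. 1] "alpha beta gamma delta"',
    #    '[Page no. 2] "epsilon zeta eta theta"',
    #    '[Page no. 2] "iota"']
    return text_to_chunks(sample_pages, word_length=4, start_page=1)
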
class SemanticSearch:

    def __init__(self):
        # Universal Sentence Encoder: maps strings to 512-d embeddings.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=15):
        # Embed the corpus and index it for nearest-neighbor lookup.
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        # Embed the query and return its nearest chunks (or their indices).
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        # Embed in batches to bound memory use on large corpora.
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i + batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        return np.vstack(embeddings)

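
# Hypothetical usage sketch for SemanticSearch on its own (not part of the
# app's flow): fit on a list of chunk strings, then call the instance with a
# query to get the nearest chunks by embedding distance.
def _demo_semantic_search(chunks):
    searcher = SemanticSearch()
    searcher.fit(chunks, n_neighbors=3)
    return searcher('What method does the paper propose?')
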
recommender = SemanticSearch()


def load_recommender(paths, start_page=1):
    global recommender
    chunks = []
    for path in paths:
        if path.endswith('.pdf'):
            texts = pdf_to_text(path, start_page=start_page)
            chunks += text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'

def generate_text(prompt, engine='gpt-3.5-turbo', max_tokens=2048, temperature=0.8):
    # Send the assembled prompt to the chat model and return the reply text.
    response = openai.ChatCompletion.create(
        model=engine,
        messages=[{"role": "system", "content": "You are a research assistant"},
                  {"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        n=1,
        temperature=temperature
    )
    return response.choices[0].message['content']

def generate_answer(question):
    # Retrieve the most relevant chunks and present them to the model as
    # search results (context), not as prior assistant turns.
    topn_chunks = recommender(question)
    prompt = 'Answer the question using the search results below. '
    prompt += 'Cite the [Page no.] of any result you use.\n\n'
    prompt += 'Search results:\n'
    for c in topn_chunks:
        prompt += c + '\n'
    prompt += '\nQuestion: ' + question
    return generate_text(prompt)

def question_answer(urls, file, question):
    if urls.strip() == '' and file is None:
        return '[ERROR]: Both URLs and PDFs are empty. Provide at least one.'
    paths = []
    if urls.strip() != '':
        urls = urls.split(',')  # split the URLs string into a list of URLs
        for i, url in enumerate(urls):
            # Give each download a distinct name so URLs do not overwrite each other.
            path = f'corpus_{i}.pdf'
            download_pdf(url.strip(), path)
            paths.append(path)
    if file is not None:
        extract_zip(file.name)  # extract the PDFs from the zip file
        for pdf_file in os.listdir('pdfs'):
            paths.append(os.path.join('pdfs', pdf_file))
    load_recommender(paths)
    if question.strip() == '':
        return '[ERROR]: Question field is empty'
    return generate_answer(question)

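
# End-to-end sketch of the pipeline (hypothetical URL; defined but never
# executed by the app):
def _demo_question_answer():
    return question_answer('https://example.com/paper.pdf', None,
                           'What is the main contribution?')
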
title = 'Cognitive AI Agent - Ask the Expert'
description = """This cognitive agent lets you chat with your PDF files as a single corpus of knowledge. Add your relevant PDFs to a zip file and upload. 🛑PROOF OF CONCEPT🛑"""

iface = gr.Interface(
    fn=question_answer,
    inputs=[
        gr.inputs.Textbox(label="Enter PDF URLs here, separated by commas"),
        gr.inputs.File(label="Upload a zip file containing PDF files"),
        gr.inputs.Textbox(label="Enter your question here"),
    ],
    outputs=gr.outputs.Textbox(label="Generated Answer"),
    title=title,
    description=description
)

iface.launch()