Spaces:
Sleeping
Sleeping
File size: 5,255 Bytes
7cac9bf 47125e3 48f21f7 47125e3 d9b620b 39f223a 7cac9bf 48f21f7 15e8c54 48f21f7 7cac9bf 3aae288 7cac9bf 50e9009 7cac9bf 97e95ab 1ea4820 a7df561 39f223a 47125e3 48f21f7 47125e3 48f21f7 15e8c54 47125e3 48f21f7 b81cbce 13c2cf2 f62362e d9e936b a4b14b2 d9e936b a4b14b2 13c2cf2 511c006 37c10f7 47125e3 ba952d1 47125e3 d9e936b 47125e3 d9e936b 47125e3 d9e936b 48f21f7 7cac9bf 48f21f7 47125e3 48f21f7 47125e3 48f21f7 47125e3 7cac9bf ec3fe15 ba952d1 47125e3 03bef98 47125e3 ba952d1 48f21f7 38f70ae ba952d1 38f70ae ba952d1 75ce576 67673ff a7df561 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import urllib.request
import fitz
import re
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
import os
import zipfile
from sklearn.neighbors import NearestNeighbors
# Read the OpenAI API key from the environment at import time.
# NOTE(review): the env var name 'OpenAPI' is unusual — confirm it matches the
# deployment config (the conventional name is OPENAI_API_KEY).
openai.api_key = os.getenv('OpenAPI')
def download_pdf(url, output_path):
    """Download the file at `url` and write its bytes to `output_path`.

    Uses urllib.request.urlopen instead of urlretrieve, which the stdlib
    documents as a legacy interface that may be deprecated.

    :param url: URL of the PDF to fetch (any scheme urlopen supports).
    :param output_path: local filesystem path to write the downloaded bytes to.
    """
    with urllib.request.urlopen(url) as response, open(output_path, 'wb') as out_file:
        out_file.write(response.read())
def extract_zip(file):
    """Extract every PDF inside `file` (a zip archive) into the flat 'pdfs' dir.

    Fix: the previous `zip_ref.extract(member, 'pdfs')` preserved subdirectory
    paths inside the archive, so nested PDFs landed in 'pdfs/<subdir>/...' and
    were missed by the flat os.listdir('pdfs') scan in question_answer. Each
    PDF is now written directly under 'pdfs/' using only its base filename.

    :param file: path to a zip archive.
    """
    os.makedirs('pdfs', exist_ok=True)
    with zipfile.ZipFile(file, 'r') as zip_ref:
        for member in zip_ref.namelist():
            filename = os.path.basename(member)  # '' for directory entries -> skipped
            if filename.endswith('.pdf'):
                with zip_ref.open(member) as src, \
                        open(os.path.join('pdfs', filename), 'wb') as dst:
                    dst.write(src.read())
def preprocess(text):
    """Collapse every run of whitespace (including newlines) into one space.

    Fix: the pattern is now a raw string — the bare '\\s+' literal is an
    invalid escape sequence and raises a SyntaxWarning on modern Python.
    The separate replace('\\n', ' ') step was redundant because \\s already
    matches newlines; behavior is unchanged (no stripping is performed).

    :param text: raw page text.
    :return: text with whitespace runs normalized to single spaces.
    """
    return re.sub(r'\s+', ' ', text)
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract and clean the text of each page of the PDF at `path`.

    Pages are 1-indexed. `end_page=None` means read through the last page.

    :param path: filesystem path to a PDF.
    :param start_page: first page to extract (1-based, inclusive).
    :param end_page: last page to extract (inclusive), or None for the end.
    :return: list with one preprocessed string per extracted page.
    """
    doc = fitz.open(path)
    if end_page is None:
        end_page = doc.page_count
    pages = []
    for page_idx in range(start_page - 1, end_page):
        raw = doc.load_page(page_idx).get_text("text")
        pages.append(preprocess(raw))
    doc.close()
    return pages
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into ~`word_length`-word chunks tagged with page numbers.

    A short trailing piece at the end of a page is carried forward and merged
    into the start of the next page's tokens, so chunks may span page breaks;
    the chunk is then labeled with the page it started on.

    :param texts: list of page strings (as returned by pdf_to_text).
    :param word_length: target number of words per chunk.
    :param start_page: page number of the first entry in `texts`.
    :return: list of strings like '[Page no. N] "chunk words"'.
    """
    pages = [page.split(' ') for page in texts]
    out = []
    for page_idx, tokens in enumerate(pages):
        pos = 0
        while pos < len(tokens):
            piece = tokens[pos:pos + word_length]
            pos += word_length
            # Carry a short final piece into the next page instead of emitting it.
            if pos > len(tokens) and len(piece) < word_length and page_idx + 1 < len(pages):
                pages[page_idx + 1] = piece + pages[page_idx + 1]
                continue
            body = ' '.join(piece).strip()
            out.append(f'[Page no. {page_idx + start_page}] "{body}"')
    return out
class SemanticSearch:
    """Embed text chunks with the Universal Sentence Encoder and answer
    nearest-neighbor queries over them."""

    def __init__(self):
        # Loads USE v4 from TF Hub (network download on first run).
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=15):
        """Embed `data` and build the k-NN index over the embeddings."""
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        # Never request more neighbors than there are embedded points.
        k = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=k)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the stored chunks nearest to `text` (or their indices
        when `return_data` is False)."""
        query_emb = self.use([text])
        idxs = self.nn.kneighbors(query_emb, return_distance=False)[0]
        if not return_data:
            return idxs
        return [self.data[j] for j in idxs]

    def get_text_embedding(self, texts, batch=1000):
        """Embed `texts` in batches of `batch` and stack into one 2-D array."""
        parts = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(parts)
# Module-level singleton shared by load_recommender / generate_answer;
# constructing it triggers the TF Hub model load once, at import time.
recommender = SemanticSearch()
def load_recommender(paths, start_page=1):
    """Build the shared semantic-search index from the given PDF paths.

    Non-PDF paths are ignored. Re-fits the module-level `recommender`.

    :param paths: iterable of local file paths.
    :param start_page: first page to extract from each PDF.
    :return: status string 'Corpus Loaded.'
    """
    global recommender
    all_chunks = []
    for pdf_path in paths:
        if not pdf_path.endswith('.pdf'):
            continue
        page_texts = pdf_to_text(pdf_path, start_page=start_page)
        all_chunks.extend(text_to_chunks(page_texts, start_page=start_page))
    recommender.fit(all_chunks)
    return 'Corpus Loaded.'
def generate_text(messages, engine='gpt-3.5-turbo', max_tokens=2048, temperature=0.8):
    """Send a prompt to the OpenAI chat API and return the reply text.

    Fix: the body referenced an undefined name `prompt`, raising NameError on
    every call; it now uses the `messages` parameter. (Despite its name,
    `messages` is a single prompt string — see generate_answer.)

    :param messages: the user prompt string.
    :param engine: chat model name passed as `model`.
    :param max_tokens: completion token cap.
    :param temperature: sampling temperature.
    :return: content of the first choice's message.
    """
    response = openai.ChatCompletion.create(
        model=engine,
        messages=[
            {"role": "system", "content": "You are a research assistant"},
            {"role": "user", "content": messages},  # was: undefined `prompt`
        ],
        max_tokens=max_tokens,
        n=1,
        temperature=temperature,
    )
    return response.choices[0].message['content']
def generate_answer(question):
    """Retrieve the chunks most relevant to `question` and ask the LLM.

    Builds a plain-text prompt: a fixed system line, the user question, and
    one 'Assistant:' line per retrieved chunk, then delegates to generate_text.

    :param question: the user's question string.
    :return: the model's answer text.
    """
    relevant_chunks = recommender(question)
    lines = ["You are a helpful assistant.\n", "User: " + question + "\n"]
    for chunk in relevant_chunks:
        lines.append("Assistant: " + chunk + "\n")
    return generate_text(''.join(lines))
def question_answer(urls, file, question):
    """Gradio handler: load PDFs from URLs and/or a zip file, then answer `question`.

    Fixes:
    - Each URL is now downloaded to its own file; previously every URL was
      written to the same 'corpus.pdf', so earlier downloads were overwritten
      and `paths` held duplicates of the last PDF only.
    - The empty-question check runs before any downloading/extraction, so no
      network or disk work happens for a query that will be rejected anyway.
    - The 'pdfs' directory listing is filtered to .pdf entries.

    :param urls: comma-separated PDF URLs (may be empty).
    :param file: uploaded zip file object with a .name attribute, or None.
    :param question: the question to answer.
    :return: the generated answer, or an '[ERROR]: ...' string.
    """
    if urls.strip() == '' and file is None:
        return '[ERROR]: Both URLs and PDFs are empty. Provide at least one.'
    if question.strip() == '':
        return '[ERROR]: Question field is empty'
    paths = []
    if urls.strip() != '':
        for i, url in enumerate(urls.split(',')):  # split the URLs string into a list
            local_path = f'corpus{i}.pdf'  # unique name per URL (was a single shared file)
            download_pdf(url.strip(), local_path)
            paths.append(local_path)
    if file is not None:
        extract_zip(file.name)  # extract the PDFs from the zip file into 'pdfs/'
        for pdf_file in os.listdir('pdfs'):
            if pdf_file.endswith('.pdf'):
                paths.append(os.path.join('pdfs', pdf_file))
    load_recommender(paths)
    return generate_answer(question)
# UI strings for the Gradio page.
title = 'Cognitive AI Agent - Asks the Expert'
description = """ This cognitive agent allows you to chat with your PDF files as a single corpus of knowledge. Add your relevant PDFs to a zip file and upload. 🛑PROOF OF CONCEPT🛑 """
# Wire question_answer into a simple Gradio UI: URL textbox + zip upload + question box.
# NOTE(review): gr.inputs / gr.outputs is the pre-3.x Gradio namespace — confirm the
# pinned gradio version still exposes it (newer releases use gr.Textbox / gr.File directly).
iface = gr.Interface(
    fn=question_answer,
    inputs=[
        gr.inputs.Textbox(label="Enter PDF URLs here, separated by commas"),
        gr.inputs.File(label="Upload a zip file containing PDF files"),
        gr.inputs.Textbox(label="Enter your question here"),
    ],
    outputs=gr.outputs.Textbox(label="Generated Answer"),
    title=title,
    description=description
)
# Blocks serving the app; launch() starts the local web server.
iface.launch()
|