import urllib.request
import fitz  # PyMuPDF
import re
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
import os
import zipfile
from sklearn.neighbors import NearestNeighbors

openai.api_key = os.getenv('OpenAPI')  # API key read from the 'OpenAPI' environment variable

def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)

def extract_zip(file):
    # Extract only the PDF members, flattening any folder structure inside the
    # archive so that os.listdir('pdfs') later sees the files directly.
    os.makedirs('pdfs', exist_ok=True)
    with zipfile.ZipFile(file, 'r') as zip_ref:
        for member in zip_ref.namelist():
            filename = os.path.basename(member)
            if filename.endswith('.pdf'):
                with zip_ref.open(member) as source, \
                        open(os.path.join('pdfs', filename), 'wb') as target:
                    target.write(source.read())

def preprocess(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text
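
# A quick sanity check of preprocess (made-up input): newlines flatten and runs
# of whitespace collapse to single spaces.
#
#     preprocess('line one\n  line   two')  # -> 'line one line two'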

def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count

    if end_page is None:
        end_page = total_pages

    text_list = []

    for i in range(start_page-1, end_page):
        text = doc.load_page(i).get_text("text")
        text = preprocess(text)
        text_list.append(text)

    doc.close()
    return text_list
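
# Hypothetical usage of pdf_to_text: pages are 1-indexed and end_page is
# inclusive, so for a file named 'paper.pdf' this returns three page strings:
#
#     pdf_to_text('paper.pdf', start_page=2, end_page=4)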

def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(' ') for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            # A trailing fragment shorter than word_length is carried over to
            # the next page's tokens instead of being emitted as a tiny chunk.
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                len(text_toks) != (idx+1)):
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks
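
# Illustrative chunk format (hypothetical two-page input, word_length=3); the
# short trailing chunk of page 1 is merged into page 2:
#
#     text_to_chunks(['alpha beta gamma delta', 'epsilon zeta'], word_length=3)
#     # -> ['[Page no. 1] "alpha beta gamma"', '[Page no. 2] "delta epsilon zeta"']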

class SemanticSearch:

    def __init__(self):
        # Universal Sentence Encoder (v4) maps each text to a 512-dim embedding.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=15):
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors

    def get_text_embedding(self, texts, batch=1000):
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i+batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
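
# A minimal usage sketch of SemanticSearch (the sentences are made up):
#
#     searcher = SemanticSearch()
#     searcher.fit(['the cat sat on the mat', 'dogs bark loudly', 'stocks fell'])
#     searcher('pets and animals')  # -> all fitted chunks, nearest first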

recommender = SemanticSearch()

def load_recommender(paths, start_page=1):
    global recommender
    chunks = []
    for path in paths:
        if path.endswith('.pdf'):
            texts = pdf_to_text(path, start_page=start_page)
            chunks += text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'
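
# Fitting over multiple PDFs (hypothetical paths); note that the page-number
# labels restart at start_page for each file, since text_to_chunks runs per path:
#
#     load_recommender(['pdfs/a.pdf', 'pdfs/b.pdf'])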

def generate_text(prompt, engine='gpt-3.5-turbo', max_tokens=2048, temperature=0.8):
    response = openai.ChatCompletion.create(
        model=engine,
        messages=[{"role": "system", "content": "You are a research assistant"},
                  {"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        n=1,
        temperature=temperature
    )
    return response.choices[0].message['content']
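
# Example call (requires a valid key in the 'OpenAPI' environment variable):
#
#     generate_text('Summarize the main finding of the corpus in one sentence.')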


def generate_answer(question):
    topn_chunks = recommender(question)

    # Present the retrieved chunks as labelled context rather than as fake
    # dialogue turns, then ask the question.
    prompt = "Answer the question using only the search results below. "
    prompt += "Cite the page numbers given in square brackets.\n\nSearch results:\n"
    for c in topn_chunks:
        prompt += c + "\n"
    prompt += "\nQuestion: " + question

    answer = generate_text(prompt)
    return answer



def question_answer(urls, file, question):
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    if urls.strip() == '' and file is None:
        return '[ERROR]: Both URLs and PDFs are empty. Provide at least one.'

    paths = []
    if urls.strip() != '':
        url_list = urls.split(',')  # split the URLs string into a list of URLs
        for i, url in enumerate(url_list):
            # A unique name per download, so later URLs do not overwrite earlier ones.
            output_path = f'corpus_{i}.pdf'
            download_pdf(url.strip(), output_path)
            paths.append(output_path)

    if file is not None:
        extract_zip(file.name)  # extract the PDFs from the zip file
        for pdf_file in os.listdir('pdfs'):
            paths.append(os.path.join('pdfs', pdf_file))

    load_recommender(paths)

    return generate_answer(question)
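
# End-to-end sketch with a hypothetical URL (no zip upload, so file=None):
#
#     question_answer('https://example.com/paper.pdf', None, 'What is the main finding?')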

title = 'Cognitive AI Agent - Asks the Expert'
description = """This cognitive agent lets you chat with your PDF files as a single corpus of knowledge. Add the relevant PDFs to a zip file and upload it. 🛑PROOF OF CONCEPT🛑"""

iface = gr.Interface(
    fn=question_answer,
    inputs=[
        gr.Textbox(label="Enter PDF URLs here, separated by commas"),
        gr.File(label="Upload a zip file containing PDF files"),
        gr.Textbox(label="Enter your question here"),
    ],
    outputs=gr.Textbox(label="Generated Answer"),
    title=title,
    description=description
)
iface.launch()
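
# launch() also accepts options such as share=True (temporary public link) or
# server_name='0.0.0.0' (bind all interfaces) if remote access is needed.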