psyne committed on
Commit
b83dea1
·
1 Parent(s): dd98a5a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -0
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.request
2
+ import fitz
3
+ import re
4
+ import numpy as np
5
+ import tensorflow_hub as hub
6
+ import openai
7
+ import gradio as gr
8
+ import os
9
+ from sklearn.neighbors import NearestNeighbors
10
+
11
+
12
def download_pdf(url, output_path):
    """Download the PDF at *url* and save it to *output_path* on local disk."""
    urllib.request.urlretrieve(url, output_path)
14
+
15
+
16
def preprocess(text):
    """Flatten extracted PDF text onto one line.

    Newlines become spaces and any run of whitespace collapses to a single
    space, so downstream word-splitting sees clean, space-separated tokens.
    """
    text = text.replace('\n', ' ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and an error in future Python versions).
    text = re.sub(r'\s+', ' ', text)
    return text
20
+
21
+
22
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract preprocessed text from a PDF, one string per page.

    Args:
        path: Filesystem path of the PDF file.
        start_page: First page to extract (1-based, inclusive).
        end_page: Last page to extract (1-based, inclusive); defaults to the
            document's final page.

    Returns:
        List of cleaned page texts, in page order.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count
        # Pages are 1-based for callers but 0-based in PyMuPDF.
        return [
            preprocess(doc.load_page(i).get_text("text"))
            for i in range(start_page - 1, end_page)
        ]
    finally:
        # Always release the document handle — the original leaked it when
        # page loading or text extraction raised.
        doc.close()
38
+
39
+
40
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into ~word_length-word chunks tagged with page numbers.

    A short trailing fragment of a page is carried forward and merged into the
    next page's words instead of being emitted on its own (except on the last
    page, where it is emitted as-is). Each chunk is formatted as
    ``[page] "words..."`` so answers can cite their source page.
    """
    tokens_per_page = [page.split(' ') for page in texts]
    n_pages = len(tokens_per_page)
    chunks = []

    for page_idx, tokens in enumerate(tokens_per_page):
        pos = 0
        while pos < len(tokens):
            piece = tokens[pos:pos + word_length]
            is_short_tail = (pos + word_length > len(tokens)
                             and len(piece) < word_length)
            if is_short_tail and page_idx + 1 < n_pages:
                # Prepend the fragment to the next page's tokens; the rest of
                # this page is handled there.
                tokens_per_page[page_idx + 1] = piece + tokens_per_page[page_idx + 1]
                break
            body = ' '.join(piece).strip()
            chunks.append(f'[{page_idx + start_page}] "{body}"')
            pos += word_length

    return chunks
56
+
57
+
58
class SemanticSearch:
    """Embeds text chunks with the Universal Sentence Encoder and answers
    nearest-neighbour queries over them."""

    def __init__(self):
        # Heavy download on first use; TF-Hub caches the model locally afterwards.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed *data* (a list of strings) and build the neighbour index."""
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        # Never request more neighbours than there are points.
        self.nn = NearestNeighbors(n_neighbors=min(n_neighbors, len(self.embeddings)))
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the stored chunks (or their indices) closest to *text*."""
        query_emb = self.use([text])
        hits = self.nn.kneighbors(query_emb, return_distance=False)[0]
        return [self.data[i] for i in hits] if return_data else hits

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in batches and stack into one (n_texts, dim) array."""
        parts = [
            self.use(texts[start:start + batch])
            for start in range(0, len(texts), batch)
        ]
        return np.vstack(parts)
92
+
93
+
94
# Module-level singleton; load_recommender() re-fits it for each new document.
# NOTE: instantiation loads the USE model from TF-Hub at import time.
recommender = SemanticSearch()
95
+
96
def load_recommender(path, start_page=1):
    """Index the PDF at *path* into the global recommender.

    Extracts page texts, chunks them, and fits the semantic-search index so
    subsequent queries run against this document.
    """
    global recommender
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
102
+
103
+
104
def generate_text(prompt, engine="mlsgpt3"):
    """Send *prompt* to the completion endpoint and return the generated text.

    Uses the (Azure) deployment named by *engine*; credentials/endpoint are
    configured on the openai module by the caller.
    """
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response.choices[0].text
115
+
116
+
117
def generate_answer(question):
    """Answer *question* from the indexed document.

    Retrieves the top matching chunks, builds a cite-your-sources prompt, and
    returns the completion text.
    """
    topn_chunks = recommender(question)

    parts = ['search results:\n\n']
    for chunk in topn_chunks:
        parts.append(chunk + '\n\n')

    # BUG FIX: the original instruction block ended with a literal, never
    # substituted "Query: {question}\nAnswer: " (it was not an f-string), which
    # leaked the raw placeholder into the prompt and duplicated the
    # Query/Answer scaffold appended below. The stray scaffold is removed.
    parts.append(
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Found Nothing'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise.\n\n"
    )
    parts.append(f"Query: {question}\nAnswer:")

    prompt = ''.join(parts)
    return generate_text(prompt)
136
+
137
+
138
def question_answer(url, file, question, api_key):
    """Gradio callback: validate inputs, index the PDF, and answer *question*.

    Exactly one of *url* (PDF link) or *file* (uploaded PDF) must be provided.
    Returns the answer string, or an '[ERROR]: ...' message on bad input.
    """
    # Configure the Azure OpenAI client for this request.
    openai.api_key = api_key
    openai.api_type = "azure"
    openai.api_base = "https://jaytest.openai.azure.com/"
    openai.api_version = "2022-12-01"

    has_url = url.strip() != ''
    # `is None` / `is not None` — identity checks, not `== None`.
    if not has_url and file is None:
        return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
    if has_url and file is not None:
        return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'
    # Validate the question BEFORE downloading/indexing — the original did this
    # last and wasted the whole indexing pass on an empty question.
    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    if has_url:
        download_pdf(url, 'corpus.pdf')
        load_recommender('corpus.pdf')
    else:
        # Strip the 12-char suffix Gradio appends to temp-file names, keeping
        # the '.pdf' extension — presumably to get a stable, readable name;
        # TODO(review): confirm against the pinned gradio version's naming.
        old_file_name = file.name
        file_name = old_file_name[:-12] + old_file_name[-4:]
        os.rename(old_file_name, file_name)
        load_recommender(file_name)

    return generate_answer(question)
166
+
167
+
168
title = 'AzurePDFGPT'
# User-facing blurb shown under the page title. Fixed the original's
# "PDFs to in order to" typo.
description = "A test platform for indexing PDFs in order to 'chat' with them. It is hardcoded to the Jaytest and MLSLGPT engine"

with gr.Blocks() as demo:

    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(description)

    with gr.Row():

        # Left column: all inputs.
        with gr.Group():
            url = gr.Textbox(label='URL')
            gr.Markdown("<center><h6>or<h6></center>")
            file = gr.File(label='PDF', file_types=['.pdf'])
            question = gr.Textbox(label='question')
            api_key = gr.Textbox(label='OpenAI API Key')
            btn = gr.Button(value='Submit')
            # NOTE(review): Button.style() is deprecated and removed in
            # Gradio 4.x — confirm the pinned gradio version still supports it.
            btn.style(full_width=True)

        # Right column: the answer output.
        with gr.Group():
            answer = gr.Textbox(label='answer')

    btn.click(question_answer, inputs=[url, file, question, api_key], outputs=[answer])

demo.launch()