cogcorp committed on
Commit
7cac9bf
·
1 Parent(s): 6d06c94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +172 -116
app.py CHANGED
@@ -1,25 +1,65 @@
 
1
  import fitz
2
- import uuid
3
  import re
4
  import numpy as np
5
  import tensorflow_hub as hub
6
  import openai
7
  import gradio as gr
8
- import shutil
9
  import os
10
  from sklearn.neighbors import NearestNeighbors
11
- from tempfile import NamedTemporaryFile
12
- from PyPDF2 import PdfReader
13
 
14
- openAI_key = os.environ['OpenAPI']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  class SemanticSearch:
17
-
18
  def __init__(self):
19
  self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
20
  self.fitted = False
21
-
22
-
23
  def fit(self, data, batch=1000, n_neighbors=5):
24
  self.data = data
25
  self.embeddings = self.get_text_embedding(data, batch=batch)
@@ -27,87 +67,128 @@ class SemanticSearch:
27
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
28
  self.nn.fit(self.embeddings)
29
  self.fitted = True
30
-
31
-
32
  def __call__(self, text, return_data=True):
33
  inp_emb = self.use([text])
34
  neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
35
-
36
  if return_data:
37
  return [self.data[i] for i in neighbors]
38
  else:
39
  return neighbors
40
-
41
-
42
  def get_text_embedding(self, texts, batch=1000):
43
  embeddings = []
44
  for i in range(0, len(texts), batch):
45
- text_batch = texts[i:(i+batch)]
46
- print(f"Processing batch {i//batch + 1} of {len(texts)//batch + 1}")
47
- print(f"Text batch: {text_batch}")
48
- emb_batch = self.use(text_batch)
49
- print(f"Embedding batch: {emb_batch}")
50
- embeddings.append(emb_batch)
51
- embeddings = np.vstack(embeddings)
52
- print(f"Final embeddings: {embeddings}")
53
- return embeddings
 
 
 
 
54
 
55
 
56
- def pdf_to_text(pdf_path, start_page=1):
57
- pdf = PdfReader(pdf_path)
58
- text = ''
59
- for i in range(start_page, len(pdf.pages)):
60
- text += pdf.pages[i].extract_text()
61
  return text
62
 
63
- def text_to_chunks(text, start_page=1, chunk_size=512):
64
- chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  return chunks
66
 
67
 
 
68
 
69
- def unique_filename(basename):
70
- # Append a unique ID to the end of the filename, before the extension
71
- base, ext = os.path.splitext(basename)
72
- return base + "_" + uuid.uuid4().hex + ext
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
 
75
- def load_recommender(paths, start_page=1):
76
  global recommender
77
- chunks = []
78
- for path in paths:
79
- pdf_file = os.path.basename(path)
80
- embeddings_file = f"{pdf_file}_{start_page}.npy"
81
-
82
- if os.path.isfile(embeddings_file):
83
- embeddings = np.load(embeddings_file)
84
- recommender.embeddings = embeddings
85
- recommender.fitted = True
86
- print("Embeddings loaded from file")
87
- continue
88
-
89
- texts = pdf_to_text(path, start_page=start_page)
90
- chunks.extend(text_to_chunks(texts, start_page=start_page))
91
-
92
  recommender.fit(chunks)
93
- np.save(embeddings_file, recommender.embeddings)
94
  return 'Corpus Loaded.'
95
 
96
 
97
- def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
98
  openai.api_key = openAI_key
99
- messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
100
- {'role': 'user', 'content': prompt}]
101
-
102
- completions = openai.ChatCompletion.create(
103
- model=engine,
104
- messages=messages,
105
  max_tokens=512,
106
  n=1,
107
  stop=None,
108
  temperature=0.7,
109
  )
110
- message = completions.choices[0].message['content']
111
  return message
112
 
113
 
@@ -117,80 +198,55 @@ def generate_answer(question, openAI_key):
117
  prompt += 'search results:\n\n'
118
  for c in topn_chunks:
119
  prompt += c + '\n\n'
120
-
121
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
122
- "Make sure the answer is correct and don't output false content. "\
123
- "If you do not know the answer - answer 'information not provided' "\
124
- "Answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
125
-
 
 
 
 
126
  prompt += f"Query: {question}\nAnswer:"
127
- answer = generate_text(openAI_key, prompt, "gpt-3.5-turbo")
128
  return answer
129
 
130
 
131
- def main_loop(url: str, files: list, question:
132
- str, openAI_key):
133
- paths = []
 
 
 
 
 
134
 
135
  if url.strip() != '':
136
  glob_url = url
137
  download_pdf(glob_url, 'corpus.pdf')
138
- paths.append('corpus.pdf')
139
- if files is not None and len(files) > 0:
140
- for file in files:
141
- old_file_name = file.name
142
- file_name = old_file_name[:-12] + old_file_name[-4:]
143
- file_name = unique_filename(file_name) # Ensure the new file name is unique
144
 
145
- # Copy the content of the old file to the new file and delete the old file
146
- with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
147
- shutil.copyfileobj(src, dst)
148
- os.remove(old_file_name)
 
 
149
 
150
- paths.append(file_name)
 
151
 
152
- load_recommender(paths)
153
-
154
- if question.strip().lower() == 'exit':
155
- return '', False
156
-
157
- answer = generate_answer(question, openAI_key)
158
- return answer, True # Assuming the function returns an answer in all other cases
159
-
160
-
161
- def on_click(*args):
162
- answer.value = main_loop(url.value, files.value, question.value)
163
 
164
 
165
  recommender = SemanticSearch()
166
 
167
- title = 'Cognitive pdfGPT'
168
- description = """ Why use Cognitive Ask an Expert?
169
- This is Cognitive Chat. Here you can upload multiple PDF files and query them as a single corpus of knowledge. 🛑DO NOT USE CONFIDENTIAL INFORMATION """
170
-
171
-
172
- with gr.Blocks() as demo:
173
- gr.Markdown(f'<center><h1>{title}</h1></center>')
174
- gr.Markdown(description)
175
-
176
- with gr.Row():
177
-
178
- with gr.Group():
179
- files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
180
- url = gr.Textbox(label=' ')
181
- question = gr.Textbox(label='🔤 Enter your question here 🔤')
182
- btn = gr.Button(value='Submit')
183
- btn.style(full_width=False)
184
-
185
- with gr.Group():
186
- gr.Image("logo.jpg")
187
- answer = gr.Textbox(label='The answer to your question is :')
188
-
189
- btn.click(main_loop, inputs=[url, files, question, openAI_key], outputs=[answer])
190
-
191
-
192
-
193
 
194
- demo.launch(share=False, debug=True, auth=None, auth_message=None)
 
 
195
 
196
 
 
1
+ import urllib.request
2
  import fitz
 
3
  import re
4
  import numpy as np
5
  import tensorflow_hub as hub
6
  import openai
7
  import gradio as gr
 
8
  import os
9
  from sklearn.neighbors import NearestNeighbors
 
 
10
 
11
def download_pdf(url, output_path):
    """Fetch the PDF at *url* and save it to *output_path* on disk."""
    urllib.request.urlretrieve(url=url, filename=output_path)
13
+
14
+
15
def preprocess(text):
    """Normalise extracted PDF text: collapse newlines and runs of
    whitespace into single spaces.

    Args:
        text: Raw page text as returned by the PDF extractor.

    Returns:
        The text as a single line with single-space separation.
    """
    text = text.replace('\n', ' ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on recent Pythons).
    text = re.sub(r'\s+', ' ', text)
    return text
19
+
20
+
21
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract and preprocess the text of each page of a PDF.

    Args:
        path: Filesystem path to the PDF.
        start_page: 1-based first page to extract (default: first page).
        end_page: 1-based last page inclusive; None means the final page.

    Returns:
        A list with one preprocessed string per extracted page.
    """
    doc = fitz.open(path)
    last_page = doc.page_count if end_page is None else end_page

    pages = [
        preprocess(doc.load_page(page_idx).get_text("text"))
        for page_idx in range(start_page - 1, last_page)
    ]

    doc.close()
    return pages
37
+
38
+
39
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into chunks of ~word_length words, each tagged
    with its page number.

    Args:
        texts: List of page strings (one entry per page).
        word_length: Target number of words per chunk.
        start_page: Page number assigned to texts[0] in the chunk tags.

    Returns:
        List of strings of the form '[Page no. N] "chunk text"'.
    """
    # NOTE: removed the unused `page_nums` local from the original.
    text_toks = [t.split(' ') for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            # A short trailing chunk is carried over to the next page's
            # words instead of being emitted undersized (unless this is
            # the last page).
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                len(text_toks) != (idx+1)):
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks
55
+
56
 
57
class SemanticSearch:
    """Nearest-neighbour semantic search over text chunks, using the
    Universal Sentence Encoder (TF-Hub) for embeddings and sklearn's
    NearestNeighbors as the index."""

    def __init__(self):
        # Loads (and on first use downloads) the USE v4 model from TF-Hub.
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed *data* (list of text chunks) and build the k-NN index."""
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        # Don't ask for more neighbours than there are chunks.
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the chunks (or their indices) nearest to *text*."""
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in batches and stack into one (n, dim) array."""
        embeddings = []
        for i in range(0, len(texts), batch):
            # Fixed: this slice expression was truncated in the source
            # ("texts[i:(i+batch"); completed per the duplicate listing below.
            text_batch = texts[i:(i + batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
84
+ import urllib.request
85
+ import fitz
86
+ import re
87
+ import numpy as np
88
+ import tensorflow_hub as hub
89
+ import openai
90
+ import gradio as gr
91
+ import os
92
+ from sklearn.neighbors import NearestNeighbors
93
+
94
def download_pdf(url, output_path):
    """Download *url* (a PDF) and write it to *output_path*."""
    urllib.request.urlretrieve(url, output_path)
96
 
97
 
98
def preprocess(text):
    """Flatten extracted page text to one line with single spaces.

    Args:
        text: Raw text from a PDF page.

    Returns:
        The whitespace-normalised string.
    """
    text = text.replace('\n', ' ')
    # Use a raw string for the regex; a bare '\s' is an invalid escape
    # sequence in modern Python.
    text = re.sub(r'\s+', ' ', text)
    return text
102
 
103
+
104
def pdf_to_text(path, start_page=1, end_page=None):
    """Return a list of preprocessed page texts from the PDF at *path*.

    Pages are 1-based; *end_page* is inclusive and defaults to the last page.
    """
    doc = fitz.open(path)

    stop = end_page
    if stop is None:
        stop = doc.page_count

    collected = []
    for page_no in range(start_page - 1, stop):
        raw = doc.load_page(page_no).get_text("text")
        collected.append(preprocess(raw))

    doc.close()
    return collected
120
+
121
+
122
def text_to_chunks(texts, word_length=150, start_page=1):
    """Chunk per-page texts into ~word_length-word pieces tagged with the
    page number they came from.

    Args:
        texts: One string per page.
        word_length: Words per chunk.
        start_page: Page number of texts[0].

    Returns:
        List of '[Page no. N] "..."' strings.
    """
    # Removed the dead `page_nums` list the original declared but never used.
    text_toks = [t.split(' ') for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i+word_length]
            # Carry a short final chunk into the next page rather than
            # emitting it undersized (last page excepted).
            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
                len(text_toks) != (idx+1)):
                text_toks[idx+1] = chunk + text_toks[idx+1]
                continue
            chunk = ' '.join(chunk).strip()
            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks
138
 
139
 
140
class SemanticSearch:
    """Semantic k-NN search over text chunks: Universal Sentence Encoder
    embeddings indexed with sklearn NearestNeighbors."""

    def __init__(self):
        # Load the USE v4 encoder from TF-Hub (cached after first download).
        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, batch=1000, n_neighbors=5):
        """Embed *data* and build the neighbour index over it."""
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        # Cap k at the number of available chunks.
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Look up the stored chunks nearest to the query *text*.

        Returns the chunk strings when return_data is True, else the
        neighbour indices.
        """
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in fixed-size batches and vstack the results."""
        embeddings = []
        for i in range(0, len(texts), batch):
            # Fixed: the slice was truncated in the source ("texts[i:(i+batch").
            text_batch = texts[i:(i + batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
171
 
172
 
173
def load_recommender(path, start_page=1):
    """Index the PDF at *path* into the module-global recommender.

    Extracts the text, chunks it, and fits the global SemanticSearch
    instance on the chunks.
    """
    global recommender
    page_texts = pdf_to_text(path, start_page=start_page)
    recommender.fit(text_to_chunks(page_texts, start_page=start_page))
    return 'Corpus Loaded.'
179
 
180
 
181
def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    """Send *prompt* to the OpenAI completion endpoint and return the
    generated text.

    Args:
        openAI_key: The caller's OpenAI API key.
        prompt: Full prompt string.
        engine: Completion engine name.

    Returns:
        The text of the first completion choice.
    """
    openai.api_key = openAI_key
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response.choices[0].text
193
 
194
 
 
198
  prompt += 'search results:\n\n'
199
  for c in topn_chunks:
200
  prompt += c + '\n\n'
201
+
202
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
203
+ "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
204
+ "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
205
+ "with the same name, create separate answers for each. Only include information found in the results and "\
206
+ "don't add any additional information. Make sure the answer is correct and don't output false content. "\
207
+ "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
208
+ "search results which has nothing to do with the question. Only answer what is asked. The "\
209
+ "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
210
+
211
  prompt += f"Query: {question}\nAnswer:"
212
+ answer = generate_text(openAI_key, prompt, "text-davinci-003")
213
  return answer
214
 
215
 
216
def question_answer(url, file, question, openAI_key):
    """Validate inputs, index the chosen PDF (URL or upload), and answer
    *question* against it. Returns either the answer or an '[ERROR]: ...'
    message string."""
    # Guard clauses: key must be present, and exactly one PDF source given.
    if openAI_key.strip() == '':
        return '[ERROR]: Please enter your Open AI Key. Get your key here: https://platform.openai.com/account/api-keys'

    have_url = url.strip() != ''
    have_file = file is not None
    if not have_url and not have_file:
        return '[ERROR]: Both URL and PDF are empty. Provide at least one.'
    if have_url and have_file:
        return '[ERROR]: Both URL and PDF are provided. Please provide only one (either URL or PDF).'

    if have_url:
        download_pdf(url, 'corpus.pdf')
        load_recommender('corpus.pdf')
    else:
        # Presumably strips Gradio's 12-char temp-name suffix while keeping
        # the 4-char extension — TODO confirm against the upload naming.
        uploaded_name = file.name
        cleaned_name = uploaded_name[:-12] + uploaded_name[-4:]
        os.rename(uploaded_name, cleaned_name)
        load_recommender(cleaned_name)

    if question.strip() == '':
        return '[ERROR]: Question field is empty'

    return generate_answer(question, openAI_key)
 
 
 
 
 
 
 
 
 
 
241
 
242
 
243
recommender = SemanticSearch()

title = 'PDF GPT'
description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""

# Build and launch the UI. The original wiring misused gr.Interface as a
# context manager, had a truncated `inputs=` line, and referenced undefined
# names (url, file, question, openAI_key, answer); the components are
# defined inline here instead, matching question_answer's parameter order.
iface = gr.Interface(
    fn=question_answer,
    inputs=[
        gr.Textbox(label='Enter PDF URL here'),
        gr.File(label='Upload your PDF here', file_types=['.pdf']),
        gr.Textbox(label='Enter your question here'),
        gr.Textbox(label='Enter your OpenAI API key here'),
    ],
    outputs=gr.Textbox(label='The answer to your question is :'),
    title=title,
    description=description,
)

iface.launch()
251
 
252