cogcorp committed on
Commit
ec3fe15
·
1 Parent(s): ec77847

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -145
app.py CHANGED
@@ -11,57 +11,6 @@ from tempfile import NamedTemporaryFile
11
 
12
  openAI_key = os.environ['OpenAPI']
13
 
14
-
15
-
16
-
17
-
18
- def download_pdf(url, output_path):
19
- urllib.request.urlretrieve(url, output_path)
20
-
21
-
22
- def preprocess(text):
23
- text = text.replace('\n', ' ')
24
- text = re.sub('\s+', ' ', text)
25
- return text
26
-
27
-
28
- def pdf_to_text(path, start_page=1, end_page=None):
29
- doc = fitz.open(path)
30
- total_pages = doc.page_count
31
-
32
- if end_page is None:
33
- end_page = total_pages
34
-
35
- text_list = []
36
-
37
- for i in range(start_page-1, end_page):
38
- text = doc.load_page(i).get_text("text")
39
- text = preprocess(text)
40
- text_list.append(text)
41
-
42
- doc.close()
43
- return text_list
44
-
45
-
46
- def text_to_chunks(texts, word_length=150, start_page=1):
47
- text_toks = [t.split(' ') for t in texts]
48
- page_nums = []
49
- chunks = []
50
-
51
- for idx, words in enumerate(text_toks):
52
- for i in range(0, len(words), word_length):
53
- chunk = words[i:i+word_length]
54
- if (i+word_length) > len(words) and (len(chunk) < word_length) and (
55
- len(text_toks) != (idx+1)):
56
- text_toks[idx+1] = chunk + text_toks[idx+1]
57
- continue
58
- chunk = ' '.join(chunk).strip()
59
- chunk = f'[{idx+start_page}]' + ' ' + '"' + chunk + '"'
60
- chunks.append(chunk)
61
- return chunks
62
-
63
-
64
-
65
  class SemanticSearch:
66
 
67
  def __init__(self):
@@ -98,57 +47,29 @@ class SemanticSearch:
98
  return embeddings
99
 
100
 
101
-
102
- #def load_recommender(path, start_page=1):
103
- # global recommender
104
- # texts = pdf_to_text(path, start_page=start_page)
105
- # chunks = text_to_chunks(texts, start_page=start_page)
106
- # recommender.fit(chunks)
107
- # return 'Corpus Loaded.'
108
-
109
- # The modified function generates embeddings based on PDF file name and page number and checks if the embeddings file exists before loading or generating it.
110
-
111
- def load_recommender(path, start_page=1):
112
  global recommender
113
- pdf_file = os.path.basename(path)
114
- embeddings_file = f"{pdf_file}_{start_page}.npy"
115
-
116
- if os.path.isfile(embeddings_file):
117
- embeddings = np.load(embeddings_file)
118
- recommender.embeddings = embeddings
119
- recommender.fitted = True
120
- return "Embeddings loaded from file"
 
 
 
 
 
 
121
 
122
- texts = pdf_to_text(path, start_page=start_page)
123
- chunks = text_to_chunks(texts, start_page=start_page)
124
  recommender.fit(chunks)
125
  np.save(embeddings_file, recommender.embeddings)
126
  return 'Corpus Loaded.'
127
 
128
 
129
-
130
- def generate_text(openAI_key,prompt, engine="text-davinci-003"):
131
- openai.api_key = openAI_key
132
- completions = openai.Completion.create(
133
- engine=engine,
134
- prompt=prompt,
135
- max_tokens=512,
136
- n=1,
137
- stop=None,
138
- temperature=0.7,
139
- )
140
- message = completions.choices[0].text
141
- return message
142
-
143
- def process_file(file):
144
- temp_file = NamedTemporaryFile(delete=False, suffix='.pdf')
145
- file.save(temp_file.name)
146
- temp_file.close()
147
- return temp_file.name
148
-
149
-
150
-
151
- def generate_text2(openAI_key, prompt, engine="text-davinci-003"):
152
  openai.api_key = openAI_key
153
  messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
154
  {'role': 'user', 'content': prompt}]
@@ -164,7 +85,8 @@ def generate_text2(openAI_key, prompt, engine="text-davinci-003"):
164
  message = completions.choices[0].message['content']
165
  return message
166
 
167
- def generate_answer(question,openAI_key):
 
168
  topn_chunks = recommender(question)
169
  prompt = ""
170
  prompt += 'search results:\n\n'
@@ -173,93 +95,74 @@ def generate_answer(question,openAI_key):
173
 
174
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
175
  "Make sure the answer is correct and don't output false content. "\
176
- "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
177
 
178
  prompt += f"Query: {question}\nAnswer:"
179
- answer = generate_text(openAI_key, prompt,"text-davinci-003")
180
  return answer
181
 
182
- def unique_filename(file_name):
183
- counter = 1
184
- new_file_name = file_name
185
- while os.path.isfile(new_file_name):
186
- name, ext = os.path.splitext(file_name)
187
- new_file_name = f"{name}_{counter}{ext}"
188
- counter += 1
189
- return new_file_name
190
 
191
-
192
- def question_answer(url, file, question, openAI_key):
193
- #openapi key here
194
-
195
- if url.strip() == '' and file == None:
196
- return '[ERROR]: Both URL and PDF is empty. Provide at least one.', False
197
-
198
- if url.strip() != '' and file != None:
199
- return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).', False
200
 
201
  if url.strip() != '':
202
  glob_url = url
203
  download_pdf(glob_url, 'corpus.pdf')
204
- load_recommender('corpus.pdf')
205
- else:
206
- old_file_name = file.name
207
- file_name = old_file_name[:-12] + old_file_name[-4:]
208
- file_name = unique_filename(file_name) # Ensure the new file name is unique
 
209
 
210
- # Copy the content of the old file to the new file and delete the old file
211
- with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
212
- shutil.copyfileobj(src, dst)
213
- os.remove(old_file_name)
214
 
215
- load_recommender(file_name)
 
 
216
 
217
  if question.strip().lower() == 'exit':
218
  return '', False
219
 
220
  answer = generate_answer(question, openAI_key)
221
  return answer, True # Assuming the function returns an answer in all other cases
222
-
223
-
224
- def main_loop(url: str, file: str, question: str):
225
- answer, cont = question_answer(url, file, question, openAI_key)
226
- return answer, cont
227
 
228
 
229
  def on_click(*args):
230
- answer.value = main_loop(url.value, file.value, question.value)
231
 
232
 
233
  recommender = SemanticSearch()
234
 
235
  title = 'Cognitive pdfGPT'
236
- description = """ Why use Cognitive pdfGPT?
237
- The issue is OpenAI has a 4K token constraint, preventing it from processing an entire PDF file as input. Additionally, ChatGPT cannot (as of yet) directly talk to external data. The solution is Cognitive pdfGPT, which allows you to chat with your PDF file using GPT functionalities. The application converts the document into smaller files and generates embeddings using a powerful Deep Averaging Network Encoder. A semantic search is performed on your data, and the top relevant results are used to generate a response. 🛑DO NOT USE CONFIDENTIAL INFORMATION """
238
-
239
-
240
-
241
 
242
 
243
  with gr.Blocks() as demo:
244
-
245
- gr.Markdown(f'<center><h1>{title}</h1></center>')
246
- gr.Markdown(description)
247
 
248
- with gr.Row():
249
 
250
  with gr.Group():
251
- file=gr.File(label='➡️ Upload your PDF ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
252
- url=gr.Textbox(label=' ')
253
- question=gr.Textbox(label='🔤 Enter your question here 🔤')
254
- btn=gr.Button(value='Submit')
255
  btn.style(full_width=False)
256
 
257
  with gr.Group():
258
  gr.Image("logo.jpg")
259
  answer = gr.Textbox(label='The answer to your question is :')
260
 
261
- btn.click(main_loop, inputs=[url, file, question], outputs=[answer])
262
 
263
 
264
 
265
  demo.launch()
 
 
11
 
12
  openAI_key = os.environ['OpenAPI']
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  class SemanticSearch:
15
 
16
  def __init__(self):
 
47
  return embeddings
48
 
49
 
50
+ def load_recommender(paths, start_page=1):
 
 
 
 
 
 
 
 
 
 
51
  global recommender
52
+ chunks = []
53
+ for path in paths:
54
+ pdf_file = os.path.basename(path)
55
+ embeddings_file = f"{pdf_file}_{start_page}.npy"
56
+
57
+ if os.path.isfile(embeddings_file):
58
+ embeddings = np.load(embeddings_file)
59
+ recommender.embeddings = embeddings
60
+ recommender.fitted = True
61
+ print("Embeddings loaded from file")
62
+ continue
63
+
64
+ texts = pdf_to_text(path, start_page=start_page)
65
+ chunks.extend(text_to_chunks(texts, start_page=start_page))
66
 
 
 
67
  recommender.fit(chunks)
68
  np.save(embeddings_file, recommender.embeddings)
69
  return 'Corpus Loaded.'
70
 
71
 
72
+ def generate_text(openAI_key, prompt, engine="gpt-3.5-turbo"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  openai.api_key = openAI_key
74
  messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
75
  {'role': 'user', 'content': prompt}]
 
85
  message = completions.choices[0].message['content']
86
  return message
87
 
88
+
89
+ def generate_answer(question, openAI_key):
90
  topn_chunks = recommender(question)
91
  prompt = ""
92
  prompt += 'search results:\n\n'
 
95
 
96
  prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
97
  "Make sure the answer is correct and don't output false content. "\
98
+ "Answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
99
 
100
  prompt += f"Query: {question}\nAnswer:"
101
+ answer = generate_text(openAI_key, prompt, "gpt-3.5-turbo")
102
  return answer
103
 
 
 
 
 
 
 
 
 
104
 
105
+ def main_loop(url: str, files: list, question:
106
+ str, openAI_key):
107
+ paths = []
 
 
 
 
 
 
108
 
109
  if url.strip() != '':
110
  glob_url = url
111
  download_pdf(glob_url, 'corpus.pdf')
112
+ paths.append('corpus.pdf')
113
+ if files is not None and len(files) > 0:
114
+ for file in files:
115
+ old_file_name = file.name
116
+ file_name = old_file_name[:-12] + old_file_name[-4:]
117
+ file_name = unique_filename(file_name) # Ensure the new file name is unique
118
 
119
+ # Copy the content of the old file to the new file and delete the old file
120
+ with open(old_file_name, 'rb') as src, open(file_name, 'wb') as dst:
121
+ shutil.copyfileobj(src, dst)
122
+ os.remove(old_file_name)
123
 
124
+ paths.append(file_name)
125
+
126
+ load_recommender(paths)
127
 
128
  if question.strip().lower() == 'exit':
129
  return '', False
130
 
131
  answer = generate_answer(question, openAI_key)
132
  return answer, True # Assuming the function returns an answer in all other cases
 
 
 
 
 
133
 
134
 
135
  def on_click(*args):
136
+ answer.value = main_loop(url.value, files.value, question.value)
137
 
138
 
139
  recommender = SemanticSearch()
140
 
141
  title = 'Cognitive pdfGPT'
142
+ description = """ Why use Cognitive Ask an Expert?
143
+ This is Cognitive Chat. Here you can upload multiple PDF files and query them as a single corpus of knowledge. 🛑DO NOT USE CONFIDENTIAL INFORMATION """
 
 
 
144
 
145
 
146
  with gr.Blocks() as demo:
147
+ gr.Markdown(f'<center><h1>{title}</h1></center>')
148
+ gr.Markdown(description)
 
149
 
150
+ with gr.Row():
151
 
152
  with gr.Group():
153
+ files = gr.Files(label='➡️ Upload your PDFs ⬅️ NO CONFIDENTIAL FILES ', file_types=['.pdf'])
154
+ url = gr.Textbox(label=' ')
155
+ question = gr.Textbox(label='🔤 Enter your question here 🔤')
156
+ btn = gr.Button(value='Submit')
157
  btn.style(full_width=False)
158
 
159
  with gr.Group():
160
  gr.Image("logo.jpg")
161
  answer = gr.Textbox(label='The answer to your question is :')
162
 
163
+ btn.click(main_loop, inputs=[url, files, question], outputs=[answer])
164
 
165
 
166
 
167
  demo.launch()
168
+