zjXu11 commited on
Commit
ca1d599
·
verified ·
1 Parent(s): b1ea0d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +545 -544
app.py CHANGED
@@ -1,545 +1,546 @@
1
- import numpy as np
2
- import os
3
- import re
4
- from io import BytesIO
5
- import datetime
6
- import time
7
- import openai, tenacity
8
- import argparse
9
- import configparser
10
- import json
11
- import fitz
12
- import PyPDF2
13
- import gradio
14
- import sys
15
- from pathlib import Path
16
- utils_dir = Path(__file__).parent / 'utils'
17
- sys.path.append(str(utils_dir))
18
- from openai_utils import *
19
- import base64
20
- from pdf2image import convert_from_bytes
21
- import requests
22
- PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
23
- PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
24
-
25
-
26
def insert_sentence(text, sentence, interval):
    """Insert `sentence` after every `interval` words on each line of `text`.

    Line breaks are preserved; the word counter restarts on every line.
    """
    processed_lines = []
    for line in text.split('\n'):
        tokens = []
        # enumerate from 1 so the interval check is a simple modulo test
        for position, word in enumerate(line.split(), start=1):
            tokens.append(word)
            if position % interval == 0:
                tokens.append(sentence)
        processed_lines.append(' '.join(tokens))
    return '\n'.join(processed_lines)
47
-
48
def search_paper(query, max_retries=30):
    """Search Semantic Scholar for `query` (top 3 CS papers).

    Returns the parsed JSON response (dict with a "data" list).
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors such as HTTP 400;
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # simple pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
59
-
60
def split_text_into_chunks(text, chunk_size=300):
    """Split whitespace-tokenised `text` into strings of at most
    `chunk_size` words each; returns [] for empty/whitespace-only text."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
64
-
65
def download_pdf(paper):
    """Download a paper's open-access PDF and return its text as word chunks.

    `paper` is a Semantic Scholar record with an "openAccessPdf" entry.
    Best-effort: returns [] on any download or parse failure.
    """
    pdf_url = paper["openAccessPdf"]["url"]
    try:
        # A timeout prevents one unreachable host from hanging the pipeline.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extracted_text = extract_chapter(file_object)
        return split_text_into_chunks(extracted_text)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt / SystemExit.
        return []
78
-
79
-
80
def recommendation(s2_id, limit=500, max_retries=30):
    """Fetch up to `limit` recommended papers for Semantic Scholar id `s2_id`.

    Returns the parsed JSON response (dict with "recommendedPapers").
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors (bad id, HTTP 4xx);
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
92
-
93
-
94
def extract_chapter(file_object):
    """Return the concatenated text of every page of a PDF file object.

    The original carried `extraction_started`/`page_number_start`
    bookkeeping whose break condition (`page_number_start + 1 < page_number`
    with `page_number_start` reassigned every iteration) could never be
    true, so it always read the whole document.  That dead code is removed;
    behavior is unchanged.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    extracted_text = ""
    for page in pdf_reader.pages:
        extracted_text += page.extract_text()
    return extracted_text
111
-
112
-
113
-
114
class Reviewer:
    """Reviews a scientific paper PDF with an LLM.

    Produces a numbered list of limitations for one aspect (Methodology,
    Experimental Design, Result Analysis, or Literature Review), optionally
    grounded in related literature retrieved from Semantic Scholar (RAG).
    """

    def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
        self.api = api                # user's API key for the review model
        self.api_base = api_base      # OpenAI-compatible base URL for the review model
        self.aspect = aspect          # aspect of the paper to critique
        self.paper_pdf = paper_pdf    # raw bytes of the uploaded PDF
        self.model_name = model_name  # engine used for the main review call
        self.enable_rag = enable_rag  # retrieve related literature first?
        # self.max_token_num = 50000
        # self.encoding = tiktoken.get_encoding("gpt2")

    def review_by_chatgpt(self, paper_list):
        """Run the full pipeline.

        Returns (review_text, retrieved_paper_titles); the latter is ""
        when RAG is disabled or retrieval found nothing usable.
        """
        text, title, abstract = self.extract_from_paper(self.paper_pdf)
        content = f"Paper to review: \nTitle: {title}\n" + text

        retrieved_papers = ""
        if self.enable_rag:
            papers = self.retrieve_papers(title, abstract)
            if papers is not None:
                retrieval_content = ""
                for cnt, paper in enumerate(papers, start=1):
                    retrieval_content += f"Relevant Paper {str(cnt)}:\n"
                    retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
                    retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
                text = retrieval_content + content
            else:
                text = content
        else:
            text = content
        chat_review_text = self.chat_review(text=text)

        return chat_review_text, retrieved_papers

    def query_gen(self, abstract):
        """Generate a 5-word search query from the abstract using the app's
        private credentials (not the user's key)."""
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        messages = [
            {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."},
            {"role": "user", "content": abstract},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
            )
        )
        return responses[0]

    def rerank(self, paper_list, title, abstract):
        """Ask the LLM to rank candidate papers by relevance to the
        reference paper; returns the top 5 entries of `paper_list`."""
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        rec_content = ""
        rec_paper_cnt = 1
        for rec_paper in paper_list:
            rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
            rec_paper_cnt += 1

        rec_content += f"Reference Paper: {title}\n"
        rec_content += f"Abstract: {abstract}\n"

        messages = [
            {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."},
            {"role": "user", "content": rec_content},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )
        response_data = json.loads(responses[0])
        # Ranking entries are 1-based indices into paper_list.
        rec_papers = []
        for rec_num in response_data["ranking"][:5]:
            rec_papers.append(paper_list[int(rec_num) - 1])

        return rec_papers

    def extract_related_content(self, papers, aspect):
        """For each retrieved paper, download its PDF, keep the chunks the
        LLM judges relevant to `aspect`, and condense them to one text.

        Falls back to the paper's abstract when no chunk is kept (e.g. the
        PDF could not be downloaded).  Returns [{"title", "content"}, ...].
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        messages = []
        chunk_index_map = []   # maps each request back to (paper_idx, chunk_idx)
        paper_chunk_list = []
        for paper_idx, paper in enumerate(papers):
            paper_chunks = download_pdf(paper)
            paper_chunk_list.append(paper_chunks)

            SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."

            for chunk_idx, paper_chunk in enumerate(paper_chunks):
                messages.append([
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": paper_chunk},
                ])
                chunk_index_map.append((paper_idx, chunk_idx))

        # One yes/no relevance call per chunk, batched.
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=100,
            )
        )

        paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]

        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
            if response.strip().lower().startswith("yes"):
                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"

        for idx, paper_data in enumerate(paper_data_list):
            if not paper_data["content"].strip():
                paper_data["content"] = papers[idx]["abstract"]

        # Aspect-specific condensation prompt.
        if aspect == "Methodology":
            SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
        Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        elif aspect == "Result Analysis":
            SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
        Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        elif aspect == "Experimental Design":
            SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
        Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        elif aspect == "Literature Review":
            SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
        Remove sentences that are irrelevant to the literature review, and keep details about the related works.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        messages = []
        for paper_data in paper_data_list:
            messages.append([
                {"role": "system", "content": SYSTEM_INPUT},
                {"role": "user", "content": paper_data["content"]},
            ])

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )

        results = []
        for paper_data, response in zip(paper_data_list, responses):
            response = json.loads(response)
            results.append({"title": paper_data["title"], "content": response["revised_text"]})
        return results

    def chat_review(self, text):
        """Ask the review model for limitations of the paper.

        Returns a numbered list as a string, or "Error: ..." on failure.
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        if self.enable_rag:
            messages = [
                {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"},
                {"role": "user", "content": text},
            ]
        else:
            messages = [
                {"role": "system", "content": f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"},
                {"role": "user", "content": text},
            ]
        try:
            responses = asyncio.run(
                generate_from_openai_chat_completion(
                    client,
                    messages=[messages],
                    engine_name=self.model_name,
                    max_tokens=1000,
                    requests_per_minute=20,
                )
            )
            try:
                # Happy path: the model returned valid JSON.
                limitations = json.loads(responses[0])["limitations"]
                result = ""
                limit_cnt = 1
                for limitation in limitations:
                    result += f"{str(limit_cnt)}. {limitation}\n"
                    limit_cnt += 1
            except:
                # Fallback: have a JSON-mode model re-extract the list from
                # the free-form output.
                SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
                messages = [
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": responses[0]},
                ]
                responses = asyncio.run(
                    generate_from_openai_chat_completion(
                        client,
                        messages=[messages],
                        engine_name="gpt-4o-mini",
                        max_tokens=1000,
                        requests_per_minute=20,
                        response_format={"type": "json_object"},
                    )
                )
                limitations = json.loads(responses[0])["limitations"]
                result = ""
                limit_cnt = 1
                for limitation in limitations:
                    result += f"{str(limit_cnt)}. {limitation}\n"
                    limit_cnt += 1
        except Exception as e:
            result = "Error: " + str(e)
        print("********" * 10)
        print(result)
        print("********" * 10)
        return result

    def retrieve_papers(self, title, abstract):
        """Retrieve related open-access papers for (title, abstract).

        Strategy: exact-title match on Semantic Scholar -> take up to 20
        recommendations from that paper; otherwise search with an
        LLM-generated query and take up to 5 recommendations per hit.
        Candidates are reranked and condensed per aspect.  Returns a list
        of {"title", "content"} dicts, or None when nothing usable is found.
        """
        query = title
        search_results = search_paper(query)
        # Guard with .get: the API returns a dict, so the original
        # `search_results != []` check was vacuous and an empty "data"
        # list would crash on [0].
        if search_results.get("data") and search_results["data"][0]["title"].lower() == title.lower():
            # BUG FIX: the original read `search_results[0]`, which raises
            # KeyError(0) on the dict returned by the API; the matched
            # paper lives at search_results["data"][0].
            search_result = search_results["data"][0]
            retrieval = recommendation(search_result["paperId"])
            recommended_paper_list = []
            for recommended_paper in retrieval["recommendedPapers"]:
                if recommended_paper["abstract"] is None:
                    continue
                if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"] != None:
                    recommended_paper_list.append(recommended_paper)

                if len(recommended_paper_list) >= 20:
                    break

        else:
            query = self.query_gen(abstract)
            search_results = search_paper(query)
            recommended_paper_list = []
            if search_results["data"] == []:
                return None
            for search_result in search_results["data"]:
                retrieval = recommendation(search_result["paperId"])
                recommended_papers = []
                for recommended_paper in retrieval["recommendedPapers"]:
                    if recommended_paper["abstract"] is None:
                        continue
                    if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"] != None:
                        recommended_papers.append(recommended_paper)

                    if len(recommended_papers) >= 5:
                        break
                recommended_paper_list.extend(recommended_papers)

        if recommended_paper_list == []:
            return None
        final_papers = self.rerank(recommended_paper_list, title, abstract)
        retrieved_papers = self.extract_related_content(final_papers, self.aspect)

        return retrieved_papers

    def extract_from_paper(self, pdf_path):
        """Extract (full_text, title, abstract) from raw PDF bytes.

        Title/abstract come from a vision call on a PNG render of page 1
        (using the app's private credentials); the body text from PyPDF2.
        Note: despite the name, `pdf_path` is the PDF *content* as bytes
        (gradio File with type="binary").
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        file_object = BytesIO(pdf_path)
        pdf_reader = PyPDF2.PdfReader(file_object)

        # Render the first page to PNG for the vision model.
        doc = fitz.open(stream=pdf_path, filetype="pdf")
        page = doc.load_page(0)
        pix = page.get_pixmap()
        image_bytes = pix.tobytes("png")

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages = [
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."},
            {"role": "user", "content": USER_INPUT},
        ]
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]

        # The start/break bookkeeping in the original could never trigger,
        # so every page was read; dead code removed, behavior unchanged.
        extracted_text = ""
        for page in pdf_reader.pages:
            extracted_text += page.extract_text()
        return extracted_text, title, abstract
487
-
488
def main(api, api_base, paper_pdf, aspect, model_name, enable_rag):
    """Gradio callback: review `paper_pdf` and report timing or errors.

    Returns (retrieved_literature, review_comments, status_text).
    """
    start_time = time.time()
    retrieved_content = ''
    if not api or not paper_pdf:
        # Both output boxes show the same guidance message.
        message = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        comments = message
        output2 = message
    else:
        try:
            reviewer = Reviewer(api, api_base, paper_pdf, aspect, model_name, enable_rag)
            comments, retrieved_content = reviewer.review_by_chatgpt(paper_list=paper_pdf)
            elapsed = time.time() - start_time
            output2 = "Processing Time:" + str(round(elapsed, 2)) + "seconds"
        except Exception as e:
            comments = "Error: " + str(e)
            output2 = "Error: " + str(e)
    return retrieved_content, comments, output2
507
-
508
-
509
-
510
-
511
-
512
########################################################################################################
# Gradio UI wiring: widget order below must match main()'s positional parameters.

title = "LimitGen"


# HTML blurb shown above the interface.
description = '''<div align='left'>
<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

inp = [gradio.Textbox(label="Input your API-key",
                      value="",
                      type='password'),
       gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
                      value="https://api.openai.com/v1"),
       # type="binary" means main() receives the PDF as raw bytes, not a path.
       gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)",type="binary"),
       gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
                    value="Methodology",
                    label="Select the aspect"),
       gradio.Textbox(label="Input the model name",
                      value="gpt-4o-mini"),
       gradio.Checkbox(label="Enable RAG", value=False)
       ]

chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs = [gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
                                     title=title,
                                     description=description)

# Start server
chat_reviewer_gui .launch(quiet=True, show_api=False)
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ from io import BytesIO
5
+ import datetime
6
+ import time
7
+ import openai, tenacity
8
+ import argparse
9
+ import configparser
10
+ import json
11
+ import fitz
12
+ import PyPDF2
13
+ import gradio
14
+ import sys
15
+ from pathlib import Path
16
+ utils_dir = Path(__file__).parent / 'utils'
17
+ sys.path.append(str(utils_dir))
18
+ from openai_utils import *
19
+ import base64
20
+ from pdf2image import convert_from_bytes
21
+ import requests
22
+ PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
23
+ PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
24
+
25
+
26
def insert_sentence(text, sentence, interval):
    """Insert `sentence` after every `interval` words on each line of `text`.

    Line breaks are preserved; the word counter restarts on every line.
    """
    processed_lines = []
    for line in text.split('\n'):
        tokens = []
        # enumerate from 1 so the interval check is a simple modulo test
        for position, word in enumerate(line.split(), start=1):
            tokens.append(word)
            if position % interval == 0:
                tokens.append(sentence)
        processed_lines.append(' '.join(tokens))
    return '\n'.join(processed_lines)
47
+
48
def search_paper(query, max_retries=30):
    """Search Semantic Scholar for `query` (top 3 CS papers).

    Returns the parsed JSON response (dict with a "data" list).
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors such as HTTP 400;
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # simple pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
59
+
60
def split_text_into_chunks(text, chunk_size=300):
    """Split whitespace-tokenised `text` into strings of at most
    `chunk_size` words each; returns [] for empty/whitespace-only text."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
64
+
65
def download_pdf(paper):
    """Download a paper's open-access PDF and return its text as word chunks.

    `paper` is a Semantic Scholar record with an "openAccessPdf" entry.
    Best-effort: returns [] on any download or parse failure.
    """
    pdf_url = paper["openAccessPdf"]["url"]
    try:
        # A timeout prevents one unreachable host from hanging the pipeline.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extracted_text = extract_chapter(file_object)
        return split_text_into_chunks(extracted_text)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt / SystemExit.
        return []
78
+
79
+
80
def recommendation(s2_id, limit=500, max_retries=30):
    """Fetch up to `limit` recommended papers for Semantic Scholar id `s2_id`.

    Returns the parsed JSON response (dict with "recommendedPapers").
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors (bad id, HTTP 4xx);
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
92
+
93
+
94
def extract_chapter(file_object):
    """Return the concatenated text of every page of a PDF file object.

    The original carried `extraction_started`/`page_number_start`
    bookkeeping whose break condition (`page_number_start + 1 < page_number`
    with `page_number_start` reassigned every iteration) could never be
    true, so it always read the whole document.  That dead code is removed;
    behavior is unchanged.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    extracted_text = ""
    for page in pdf_reader.pages:
        extracted_text += page.extract_text()
    return extracted_text
111
+
112
+
113
+
114
+ class Reviewer:
115
+
116
+ def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
117
+ self.api = api
118
+ self.api_base = api_base
119
+ self.aspect = aspect
120
+ self.paper_pdf = paper_pdf
121
+ self.model_name = model_name
122
+ self.enable_rag = enable_rag
123
+ # self.max_token_num = 50000
124
+ # self.encoding = tiktoken.get_encoding("gpt2")
125
+
126
+
127
+ def review_by_chatgpt(self, paper_list):
128
+ text, title, abstract = self.extract_from_paper(self.paper_pdf)
129
+ content = f"Paper to review: \nTitle: {title}\n" + text
130
+
131
+ if self.enable_rag:
132
+ papers = self.retrieve_papers(title, abstract)
133
+ if papers != None:
134
+ retrieval_content = ""
135
+ retrieved_papers = ""
136
+ cnt = 1
137
+ for paper in papers:
138
+ retrieval_content += f"Relevant Paper {str(cnt)}:\n"
139
+ retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
140
+ retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
141
+ cnt += 1
142
+ text = retrieval_content + content
143
+ chat_review_text = self.chat_review(text=text)
144
+ else:
145
+ text = content
146
+ chat_review_text = self.chat_review(text=text)
147
+ retrieved_papers = ""
148
+ else:
149
+ text = content
150
+ chat_review_text = self.chat_review(text=text)
151
+ retrieved_papers = ""
152
+
153
+ return chat_review_text, retrieved_papers
154
+
155
+ def query_gen(self, abstract):
156
+ os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
157
+ os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
158
+ client = AsyncOpenAI()
159
+
160
+ messages=[
161
+ {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."} ,
162
+ {"role": "user", "content": abstract},
163
+ ]
164
+
165
+ responses = asyncio.run(
166
+ generate_from_openai_chat_completion(
167
+ client,
168
+ messages=[messages],
169
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
170
+ max_tokens=1000, # 32
171
+ requests_per_minute = 20,
172
+ # response_format={"type":"json_object"},
173
+ )
174
+ )
175
+ return responses[0]
176
+
177
+
178
+ def rerank(self, paper_list, title, abstract):
179
+ os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
180
+ os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
181
+ client = AsyncOpenAI()
182
+
183
+ rec_content = ""
184
+ rec_paper_cnt = 1
185
+
186
+ for rec_paper in paper_list:
187
+ rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
188
+ rec_paper_cnt += 1
189
+
190
+ rec_content += f"Reference Paper: {title}\n"
191
+ rec_content += f"Abstract: {abstract}\n"
192
+
193
+ messages=[
194
+ {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."} ,
195
+ {"role": "user", "content": rec_content},
196
+ ]
197
+
198
+ responses = asyncio.run(
199
+ generate_from_openai_chat_completion(
200
+ client,
201
+ messages=[messages],
202
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
203
+ max_tokens=1000, # 32
204
+ requests_per_minute = 20,
205
+ response_format={"type":"json_object"},
206
+ )
207
+ )
208
+ response_data = json.loads(responses[0])
209
+ rec_papers = []
210
+ for rec_num in response_data["ranking"][:5]:
211
+ num = int(rec_num)
212
+ rec_papers.append(paper_list[num-1])
213
+
214
+ return rec_papers
215
+
216
+ def extract_related_content(self, papers, aspect):
217
+ os.environ["OPENAI_BASE_URL"] = self.api_base
218
+ os.environ["OPENAI_API_KEY"] = self.api
219
+ client = AsyncOpenAI()
220
+
221
+ messages = []
222
+ chunk_index_map = []
223
+ paper_data_list = []
224
+ paper_chunk_list = []
225
+ for paper_idx, paper in enumerate(papers):
226
+ paper_chunks = download_pdf(paper)
227
+ paper_chunk_list.append(paper_chunks)
228
+
229
+ SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."
230
+
231
+ for chunk_idx, paper_chunk in enumerate(paper_chunks):
232
+ message = [
233
+ {"role": "system", "content": SYSTEM_INPUT},
234
+ {"role": "user", "content": paper_chunk},
235
+ ]
236
+ messages.append(message)
237
+ chunk_index_map.append((paper_idx, chunk_idx)) # 标记每个 chunk 归属哪个 paper
238
+
239
+
240
+ responses = asyncio.run(
241
+ generate_from_openai_chat_completion(
242
+ client,
243
+ messages=messages,
244
+ engine_name="gpt-4o-mini",
245
+ max_tokens=1000,
246
+ requests_per_minute=100,
247
+ )
248
+ )
249
+
250
+ paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]
251
+
252
+ for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
253
+ if response.strip().lower().startswith("yes"):
254
+ paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"
255
+
256
+ for idx, paper_data in enumerate(paper_data_list):
257
+ if not paper_data["content"].strip():
258
+ paper_data["content"] = papers[idx]["abstract"]
259
+
260
+
261
+ if aspect == "Methodology":
262
+ SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
263
+ Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
264
+ Organize the result in JSON format as follows:
265
+ {
266
+ "revised_text": str, not dict, not a summary
267
+ }
268
+ """
269
+ elif aspect == "Result Analysis":
270
+ SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
271
+ Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
272
+ Organize the result in JSON format as follows:
273
+ {
274
+ "revised_text": str, not dict, not a summary
275
+ }
276
+ """
277
+ elif aspect == "Experimental Design":
278
+ SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
279
+ Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
280
+ Organize the result in JSON format as follows:
281
+ {
282
+ "revised_text": str, not dict, not a summary
283
+ }
284
+ """
285
+ elif aspect == "Literature Review":
286
+ SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
287
+ Remove sentences that are irrelevant to the literature review, and keep details about the related works.
288
+ Organize the result in JSON format as follows:
289
+ {
290
+ "revised_text": str, not dict, not a summary
291
+ }
292
+ """
293
+ messages = []
294
+ for paper_data in paper_data_list:
295
+ message=[
296
+ {"role": "system", "content": SYSTEM_INPUT} ,
297
+ {"role": "user", "content": paper_data["content"]},
298
+ ]
299
+ messages.append(message)
300
+
301
+ responses = asyncio.run(
302
+ generate_from_openai_chat_completion(
303
+ client,
304
+ messages=messages,
305
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
306
+ max_tokens=1000, # 32
307
+ requests_per_minute = 20,
308
+ response_format={"type":"json_object"},
309
+ )
310
+ )
311
+
312
+ results = []
313
+ for paper_data, response in zip(paper_data_list, responses):
314
+ print(response)
315
+ response = json.loads(response)
316
+ results.append({"title": paper_data["title"], "content": response["revised_text"]})
317
+ return results
318
+
319
+
320
+
321
+ def chat_review(self, text):
322
+ os.environ["OPENAI_BASE_URL"] = self.api_base
323
+ os.environ["OPENAI_API_KEY"] = self.api
324
+ client = AsyncOpenAI()
325
+
326
+ if self.enable_rag:
327
+ messages=[
328
+ {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
329
+ {"role": "user", "content": text},
330
+ ]
331
+ else:
332
+ messages=[
333
+ {"role": "system", "content": f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
334
+ {"role": "user", "content": text},
335
+ ]
336
+ try:
337
+ responses = asyncio.run(
338
+ generate_from_openai_chat_completion(
339
+ client,
340
+ messages=[messages],
341
+ engine_name=self.model_name, # gpt-3.5-turbo
342
+ max_tokens=1000, # 32
343
+ requests_per_minute = 20,
344
+ # response_format={"type":"json_object"},
345
+ )
346
+ )
347
+ try:
348
+ limitations = json.loads(responses[0])["limitations"]
349
+ result = ""
350
+ limit_cnt = 1
351
+ for limitation in limitations:
352
+ result += f"{str(limit_cnt)}. {limitation}\n"
353
+ limit_cnt += 1
354
+ except:
355
+ SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
356
+ messages=[
357
+ {"role": "system", "content": SYSTEM_INPUT},
358
+ {"role": "user", "content": responses[0]},
359
+ ]
360
+ responses = asyncio.run(
361
+ generate_from_openai_chat_completion(
362
+ client,
363
+ messages=[messages],
364
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
365
+ max_tokens=1000, # 32
366
+ requests_per_minute = 20,
367
+ response_format={"type":"json_object"},
368
+ )
369
+ )
370
+ limitations = json.loads(responses[0])["limitations"]
371
+ result = ""
372
+ limit_cnt = 1
373
+ for limitation in limitations:
374
+ result += f"{str(limit_cnt)}. {limitation}\n"
375
+ limit_cnt += 1
376
+ # for choice in response.choices:
377
+ # result += choice.message.content
378
+ # result = insert_sentence(result, '**Generated by ChatGPT, no copying allowed!**', 50)
379
+ except Exception as e:
380
+ result = "Error: "+ str(e)
381
+ # usage = 'xxxxx'
382
+ print("********"*10)
383
+ print(result)
384
+ print("********"*10)
385
+ return result
386
+
387
+
388
+ def retrieve_papers(self, title, abstract):
389
+ query = title
390
+ search_results = search_paper(query)
391
+ if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
392
+ search_result = search_results[0]
393
+ retrieval = recommendation(search_result["paperId"])
394
+ recommended_paper_list = []
395
+ for recommended_paper in retrieval["recommendedPapers"]:
396
+ if recommended_paper["abstract"] is None:
397
+ continue
398
+ if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
399
+ recommended_paper_list.append(recommended_paper)
400
+
401
+ if len(recommended_paper_list) >= 20:
402
+ break
403
+
404
+ else:
405
+ query = self.query_gen(abstract)
406
+ search_results = search_paper(query)
407
+ recommended_paper_list = []
408
+ if search_results["data"] == []:
409
+ return None
410
+ for search_result in search_results["data"]:
411
+ retrieval = recommendation(search_result["paperId"])
412
+ recommended_papers = []
413
+ for recommended_paper in retrieval["recommendedPapers"]:
414
+ if recommended_paper["abstract"] is None:
415
+ continue
416
+ if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
417
+ recommended_papers.append(recommended_paper)
418
+
419
+ if len(recommended_papers) >= 5:
420
+ break
421
+ recommended_paper_list.extend(recommended_papers)
422
+
423
+ if recommended_paper_list == []:
424
+ return None
425
+ final_papers = self.rerank(recommended_paper_list, title, abstract)
426
+ retrieved_papers = self.extract_related_content(final_papers, self.aspect)
427
+
428
+ return retrieved_papers
429
+
430
+
431
+
432
+
433
    def extract_from_paper(self, pdf_path):
        """Extract the full text, title, and abstract from an uploaded PDF.

        ``pdf_path`` is the raw PDF byte string from the Gradio file widget
        (type="binary"), not a filesystem path — it is fed to BytesIO and
        fitz directly.

        Returns:
            tuple[str, str, str]: (extracted_text, title, abstract).
        """
        # Title/abstract extraction always uses the app's private endpoint,
        # independent of the user-supplied key.
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        # with open(pdf_path, 'rb') as f: # TODO
        # pdf_bytes = f.read()
        # file_object = BytesIO(pdf_bytes)

        file_object = BytesIO(pdf_path) # TODO
        pdf_reader = PyPDF2.PdfReader(file_object)

        # Render page 1 to PNG so a vision model can read the header.
        doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO
        page = doc.load_page(0)
        pix = page.get_pixmap()
        image_bytes = pix.tobytes("png")

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        # Multimodal prompt: text preamble + first-page image as a data URL.
        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages=[
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."} ,
            {"role": "user", "content": USER_INPUT},
        ]
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",  # fixed model for metadata extraction
                max_tokens=1000,
                requests_per_minute = 20,
                response_format={"type":"json_object"},
            )
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]



        # Collect the paper body text page by page with PyPDF2.
        num_pages = len(pdf_reader.pages)
        extraction_started = False
        extracted_text = ""
        for page_number in range(num_pages):
            page = pdf_reader.pages[page_number]
            page_text = page.extract_text()

            # NOTE(review): extraction_started and page_number_start are
            # reset on EVERY iteration, so the break below can never fire and
            # the loop concatenates text from all pages. This looks like
            # leftover logic for "start at a marker page, stop shortly after"
            # — confirm the intended page range before changing it.
            extraction_started = True
            page_number_start = page_number
            if extraction_started:
                extracted_text += page_text
                if page_number_start + 1 < page_number:
                    break
        return extracted_text, title, abstract
488
+
489
def main(api, api_base, paper_pdf, aspect, model_name, enable_rag):
    """Gradio entry point: run a paper review and report timing or errors.

    Returns a 3-tuple for the UI widgets:
    (retrieved literature, review comments, resource statistics).
    """
    started = time.time()

    # Guard clause: without an API key and a PDF there is nothing to review.
    if not api or not paper_pdf:
        missing_msg = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        return '', missing_msg, missing_msg

    try:
        reviewer = Reviewer(api, api_base, paper_pdf, aspect, model_name, enable_rag)
        review_comments, literature = reviewer.review_by_chatgpt(paper_list=paper_pdf)
        elapsed = round(time.time() - started, 2)
        stats = "Processing Time:" + str(elapsed) + "seconds"
        return literature, review_comments, stats
    except Exception as e:
        # Report the failure in both output boxes rather than crashing.
        failure = "Error: " + str(e)
        return '', failure, failure
508
+
509
+
510
+
511
+
512
+
513
+ ########################################################################################################
514
+
515
title = "LimitGen"


description = '''<div align='left'>
<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

# Input widgets, in the positional order expected by main().
api_key_box = gradio.Textbox(label="Input your API-key", value="", type='password')
api_base_box = gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.", value="https://api.openai.com/v1")
pdf_upload = gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)", type="binary")
aspect_radio = gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"], value="Methodology", label="Select the aspect")
model_box = gradio.Textbox(label="Input the model name", value="gpt-4o-mini")
rag_checkbox = gradio.Checkbox(label="Enable RAG", value=False)

inp = [api_key_box, api_base_box, pdf_upload, aspect_radio, model_box, rag_checkbox]

# Output widgets: retrieved literature, review text, and timing stats.
outputs = [
    gradio.Textbox(lines=6, label="Retrieved Literature"),
    gradio.Textbox(lines=15, label="Output"),
    gradio.Textbox(lines=2, label="Resource Statistics"),
]

chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs=outputs,
                                     title=title,
                                     description=description)

# Start server
chat_reviewer_gui.launch(quiet=True, show_api=False)