File size: 22,505 Bytes
9d9cd7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
import numpy as np
import os
import re
from io import BytesIO
import datetime
import time
import openai, tenacity
import argparse
import configparser
import json
import fitz
import PyPDF2
import gradio
import sys
from pathlib import Path
utils_dir = Path(__file__).parent / 'utils'
sys.path.append(str(utils_dir))
from openai_utils import *
import base64
from pdf2image import convert_from_bytes
import requests
# Credentials for the app's own helper LLM calls (title/abstract extraction,
# search-query generation and reranking in Reviewer), read once at import time.
# os.getenv returns None when a variable is unset; the Reviewer methods assign
# these values into os.environ, which raises TypeError for None.
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')


def insert_sentence(text, sentence, interval):
    """Insert *sentence* after every *interval*-th word of each line of *text*.

    Lines are split on '\\n' and processed independently, so the word counter
    restarts on every line. Words are split on any whitespace and re-joined
    with single spaces, which collapses runs of whitespace inside a line.

    Args:
        text: Input text, possibly spanning several lines.
        sentence: String appended as an extra "word" at each insertion point.
        interval: Number of words between insertions.

    Returns:
        The rewritten text with the original number of lines.
    """

    def _decorate(raw_line):
        # Rebuild one line, dropping in the sentence at every interval-th word.
        pieces = []
        for position, token in enumerate(raw_line.split(), start=1):
            pieces.append(token)
            if position % interval == 0:
                pieces.append(sentence)
        return ' '.join(pieces)

    return '\n'.join(_decorate(raw_line) for raw_line in text.split('\n'))
    
def search_paper(query):
    """Search Semantic Scholar for papers matching *query*.

    The query is percent-encoded before it is placed in the URL. Paper titles
    and generated queries can contain characters such as '&', '#', '+' or '='
    that would otherwise be read as URL syntax and truncate or corrupt the
    search term.

    Non-200 responses are retried once per second with no upper bound, as
    before.

    Args:
        query: Free-text search string (typically a paper title).

    Returns:
        The decoded JSON body of the first response with status 200.
    """
    from urllib.parse import quote

    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    # safe="" so that '/', '&', '#', ... inside the query are all escaped.
    encoded_query = quote(str(query), safe="")
    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={encoded_query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"

    response = requests.get(url)
    while response.status_code != 200:
        time.sleep(1)
        response = requests.get(url)

    return response.json()

def split_text_into_chunks(text, chunk_size=300):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are whitespace-delimited; each chunk is re-joined with single
    spaces. The final chunk may be shorter than *chunk_size*.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings (empty when *text* contains no words).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

def download_pdf(paper):
    """Download a paper's open-access PDF and return its text as word chunks.

    This is a best-effort helper: any failure while downloading, parsing or
    chunking the PDF yields an empty list so the caller can fall back to
    other content.

    Args:
        paper: Mapping with an ``openAccessPdf`` entry that holds a ``url``.

    Returns:
        A list of text chunks from ``split_text_into_chunks``, or ``[]`` when
        the PDF could not be fetched or read.

    Raises:
        KeyError / TypeError: If ``paper`` has no usable
            ``openAccessPdf["url"]``; this lookup happens before the guarded
            section.
    """
    pdf_url = paper["openAccessPdf"]["url"]
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extract_text = extract_chapter(file_object)
        return split_text_into_chunks(extract_text)
    except Exception:
        # Deliberately broad (network, HTTP and PDF-parsing errors), but no
        # longer a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        return []
    

def recommendation(s2_id, limit=500):
    """Fetch Semantic Scholar recommendations for the paper *s2_id*.

    The request is repeated once per second until the service answers with
    HTTP 200; there is no retry limit.

    Args:
        s2_id: Paper identifier placed in the request path.
        limit: Value of the ``limit`` query parameter.

    Returns:
        The decoded JSON body of the successful response.
    """
    endpoint = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    request_url = endpoint + f"{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"

    reply = requests.get(request_url)
    while True:
        if reply.status_code == 200:
            return reply.json()
        # Any non-200 answer: wait a second, then try again.
        time.sleep(1)
        reply = requests.get(request_url)


def extract_chapter(file_object):
    """Return the concatenated text of every page in a PDF.

    The previous implementation carried an ``extraction_started`` flag that
    was always true and a ``break`` that could never fire (it compared a page
    number with itself plus one), so it already extracted all pages. That
    dead code is removed and the text is joined in one pass.

    Args:
        file_object: Binary file-like object containing the PDF.

    Returns:
        The text of all pages, concatenated in page order with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    return "".join(page.extract_text() for page in pdf_reader.pages)
        


class Reviewer:
    """Generate limitations of one paper for a chosen aspect, optionally with RAG.

    With retrieval enabled, related open-access papers are found through
    Semantic Scholar, reranked by an LLM, reduced to the passages relevant to
    the aspect, and prepended to the paper text before the review prompt.

    Attributes:
        api: API key used for the user-facing review and content-extraction calls.
        api_base: Base URL used together with ``api``.
        paper_pdf: Raw PDF bytes of the paper under review.
        aspect: Review aspect; one of "Methodology", "Experimental Design",
            "Result Analysis" or "Literature Review".
        model_name: Model used for the main review call.
        enable_rag: Whether related papers are retrieved and added as context.
    """

    def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
        self.api = api
        self.api_base = api_base
        self.aspect = aspect
        self.paper_pdf = paper_pdf
        self.model_name = model_name
        self.enable_rag = enable_rag

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _make_client(api_key, api_base):
        """Export the given credentials to the environment and build a client."""
        # NOTE(review): os.environ is process-wide, so concurrent requests that
        # use different credentials can overwrite each other here.
        os.environ["OPENAI_BASE_URL"] = api_base
        os.environ["OPENAI_API_KEY"] = api_key
        return AsyncOpenAI()

    @staticmethod
    def _complete(client, conversations, engine_name, requests_per_minute=20, **extra):
        """Run a batch of chat conversations and return one response per conversation."""
        return asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=conversations,
                engine_name=engine_name,
                max_tokens=1000,
                requests_per_minute=requests_per_minute,
                **extra,
            )
        )

    @staticmethod
    def _format_limitations(limitations):
        """Render limitations as a numbered list, one '<n>. <text>' line each."""
        if isinstance(limitations, str):
            # A lone string would otherwise be enumerated character by character.
            limitations = [limitations]
        return "".join(
            f"{index}. {limitation}\n"
            for index, limitation in enumerate(limitations, start=1)
        )

    @staticmethod
    def _open_access_papers(retrieval, limit):
        """Return up to *limit* recommended papers that have an abstract and an open-access PDF."""
        selected = []
        for paper in retrieval["recommendedPapers"]:
            if len(selected) >= limit:
                break
            if paper["abstract"] is None:
                continue
            if paper["isOpenAccess"] and paper["openAccessPdf"] is not None:
                selected.append(paper)
        return selected

    @staticmethod
    def _search_hits(search_results):
        """Return the 'data' list of a search response, or [] when it is absent or empty."""
        if isinstance(search_results, dict):
            return search_results.get("data") or []
        return []

    @staticmethod
    def _aspect_prompt(aspect):
        """Return the condensation system prompt for *aspect*; ValueError if unknown."""
        if aspect == "Methodology":
            return """Concatenate all the content from the methodology sections of a paper.

Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.

Organize the result in JSON format as follows:

{

    "revised_text": str, not dict, not a summary

}

"""
        if aspect == "Result Analysis":
            return """Concatenate all the content from the result analysis sections of a paper.

Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.

Organize the result in JSON format as follows:

{

    "revised_text": str, not dict, not a summary

}

"""
        if aspect == "Experimental Design":
            return """Concatenate all the content from the experimental design sections of a paper.

Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.

Organize the result in JSON format as follows:

{

    "revised_text": str, not dict, not a summary

}

"""
        if aspect == "Literature Review":
            return """Concatenate all the content from the literature review sections of a paper.

Remove sentences that are irrelevant to the literature review, and keep details about the related works.

Organize the result in JSON format as follows:

{

    "revised_text": str, not dict, not a summary

}

"""
        raise ValueError(f"Unsupported aspect: {aspect!r}")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def review_by_chatgpt(self, paper_list):
        """Review ``self.paper_pdf`` and return the limitations and the retrieved titles.

        Args:
            paper_list: Unused; kept for backward compatibility with callers.

        Returns:
            A tuple ``(chat_review_text, retrieved_papers)``. The second item
            is a numbered list of retrieved paper titles, or ``""`` when
            retrieval is disabled or found nothing.
        """
        text, title, abstract = self.extract_from_paper(self.paper_pdf)
        content = f"Paper to review: \nTitle: {title}\n" + text

        retrieval_content = ""
        retrieved_papers = ""
        if self.enable_rag:
            papers = self.retrieve_papers(title, abstract)
            if papers is not None:
                context_parts = []
                title_lines = []
                for cnt, paper in enumerate(papers, start=1):
                    context_parts.append(f"Relevant Paper {str(cnt)}:\n")
                    context_parts.append(f"Title: {paper['title']}\n{paper['content']}\n\n")
                    title_lines.append(f"{str(cnt)}. {paper['title']}\n")
                retrieval_content = "".join(context_parts)
                retrieved_papers = "".join(title_lines)

        # Retrieved context (if any) goes before the paper under review.
        chat_review_text = self.chat_review(text=retrieval_content + content)
        return chat_review_text, retrieved_papers

    def query_gen(self, abstract):
        """Ask the helper model for a five-word search query summarising *abstract*."""
        client = self._make_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

        messages = [
            {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."},
            {"role": "user", "content": abstract},
        ]

        responses = self._complete(client, [messages], "gpt-4o-mini")
        return responses[0]

    def rerank(self, paper_list, title, abstract):
        """Return up to five entries of *paper_list* judged most relevant by the helper model.

        Args:
            paper_list: Candidate papers, each with ``title`` and ``abstract``.
            title: Title of the reference paper.
            abstract: Abstract of the reference paper.

        Returns:
            The selected papers in the order given by the model. Indices
            outside ``1..len(paper_list)`` are ignored rather than crashing
            or silently wrapping around to the end of the list.
        """
        client = self._make_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

        rec_content = "".join(
            f"Paper {position}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
            for position, rec_paper in enumerate(paper_list, start=1)
        )
        rec_content += f"Reference Paper: {title}\n"
        rec_content += f"Abstract: {abstract}\n"

        messages = [
            {"role": "system", "content": f"Given the abstracts of {len(paper_list)} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."},
            {"role": "user", "content": rec_content},
        ]

        responses = self._complete(
            client,
            [messages],
            "gpt-4o-mini",
            response_format={"type": "json_object"},
        )
        response_data = json.loads(responses[0])

        rec_papers = []
        for rec_num in response_data["ranking"][:5]:
            num = int(rec_num)
            # The model numbers papers from 1; skip anything out of range.
            if 1 <= num <= len(paper_list):
                rec_papers.append(paper_list[num - 1])
        return rec_papers

    def extract_related_content(self, papers, aspect):
        """Reduce each retrieved paper to the text relevant to *aspect*.

        Each paper's PDF is split into chunks, every chunk is classified as
        related to the aspect or not, and the related chunks are condensed by
        a second model call. A paper with no related chunks (or no
        downloadable PDF) falls back to its abstract.

        Args:
            papers: Papers with ``title``, ``abstract`` and ``openAccessPdf``.
            aspect: One of the four supported review aspects.

        Returns:
            A list of ``{"title": ..., "content": ...}`` dicts, one per paper.

        Raises:
            ValueError: If *aspect* is not a supported aspect. Previously an
                unknown aspect silently reused the yes/no classification
                prompt for the condensation step.
        """
        condense_prompt = self._aspect_prompt(aspect)
        client = self._make_client(self.api, self.api_base)

        classify_prompt = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."

        messages = []
        chunk_index_map = []
        paper_chunk_list = []
        for paper_idx, paper in enumerate(papers):
            paper_chunks = download_pdf(paper)
            paper_chunk_list.append(paper_chunks)

            for chunk_idx, paper_chunk in enumerate(paper_chunks):
                messages.append([
                    {"role": "system", "content": classify_prompt},
                    {"role": "user", "content": paper_chunk},
                ])
                # Record which paper each chunk belongs to.
                chunk_index_map.append((paper_idx, chunk_idx))

        # Skip the call when nothing could be downloaded; there is nothing to classify.
        responses = []
        if messages:
            responses = self._complete(
                client,
                messages,
                "gpt-4o-mini",
                requests_per_minute=100,
            )

        paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]

        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
            if response.strip().lower().startswith("yes"):
                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"

        for idx, paper_data in enumerate(paper_data_list):
            if not paper_data["content"].strip():
                paper_data["content"] = papers[idx]["abstract"]

        if not paper_data_list:
            return []

        messages = [
            [
                {"role": "system", "content": condense_prompt},
                {"role": "user", "content": paper_data["content"]},
            ]
            for paper_data in paper_data_list
        ]

        responses = self._complete(
            client,
            messages,
            "gpt-4o-mini",
            response_format={"type": "json_object"},
        )

        results = []
        for paper_data, response in zip(paper_data_list, responses):
            revised = json.loads(response)
            results.append({"title": paper_data["title"], "content": revised["revised_text"]})
        return results

    def chat_review(self, text):
        """Ask ``self.model_name`` for the paper's limitations and format them.

        If the model's answer is not the expected JSON, a second call to the
        helper model extracts the list from the raw answer.

        Args:
            text: Full prompt body (retrieved context, if any, plus the paper).

        Returns:
            A numbered list of limitations, or ``"Error: <message>"`` when a
            model call or the final parsing fails.
        """
        client = self._make_client(self.api, self.api_base)

        # Both prompts previously ended without the closing brace of the JSON
        # template; it is closed here so the requested format is well-formed.
        if self.enable_rag:
            system_prompt = f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}"
        else:
            system_prompt = f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}"
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]

        try:
            responses = self._complete(client, [messages], self.model_name)
            try:
                result = self._format_limitations(json.loads(responses[0])["limitations"])
            except Exception:
                # The answer was not the expected JSON; have the helper model
                # pull the list out of the raw text.
                SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
                messages = [
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": responses[0]},
                ]
                responses = self._complete(
                    client,
                    [messages],
                    "gpt-4o-mini",
                    response_format={"type": "json_object"},
                )
                result = self._format_limitations(json.loads(responses[0])["limitations"])
        except Exception as e:
            result = "Error: " + str(e)

        print("********" * 10)
        print(result)
        print("********" * 10)
        return result

    def retrieve_papers(self, title, abstract):
        """Find, rerank and condense papers related to the one under review.

        The title is searched first. If the top hit has the same title
        (case-insensitive), its recommendations are used (up to 20
        open-access papers). Otherwise a short query is generated from the
        abstract and up to 5 open-access recommendations are collected per
        search hit.

        Args:
            title: Title of the paper under review.
            abstract: Abstract of the paper under review.

        Returns:
            The output of ``extract_related_content`` for the reranked papers,
            or ``None`` when no candidate paper was found.
        """
        hits = self._search_hits(search_paper(title))

        if hits and (hits[0].get("title") or "").lower() == title.lower():
            # Exact title match. The old code indexed the response dict with
            # [0] (a KeyError); the hit lives under the "data" list.
            retrieval = recommendation(hits[0]["paperId"])
            recommended_paper_list = self._open_access_papers(retrieval, 20)
        else:
            query = self.query_gen(abstract)
            hits = self._search_hits(search_paper(query))
            if not hits:
                return None
            recommended_paper_list = []
            for search_result in hits:
                retrieval = recommendation(search_result["paperId"])
                recommended_paper_list.extend(self._open_access_papers(retrieval, 5))

        if not recommended_paper_list:
            return None
        final_papers = self.rerank(recommended_paper_list, title, abstract)
        return self.extract_related_content(final_papers, self.aspect)

    def extract_from_paper(self, pdf_path):
        """Extract the full text, title and abstract of a PDF.

        The title and abstract come from the helper model, which is shown an
        image of the first page; the text comes from PyPDF2 over all pages.

        Args:
            pdf_path: Despite the name, the raw PDF content as bytes (it is
                wrapped in ``BytesIO`` and passed to fitz as a stream).

        Returns:
            A tuple ``(extracted_text, title, abstract)``.
        """
        client = self._make_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_path))

        # Render page 1 to PNG, then release the fitz document.
        doc = fitz.open(stream=pdf_path, filetype="pdf")
        try:
            image_bytes = doc.load_page(0).get_pixmap().tobytes("png")
        finally:
            doc.close()
        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages = [
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."},
            {"role": "user", "content": USER_INPUT},
        ]
        responses = self._complete(
            client,
            [messages],
            "gpt-4o-mini",
            response_format={"type": "json_object"},
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]

        # All pages are concatenated; the earlier flag/break logic never
        # stopped early, so this is the same result without the dead code.
        extracted_text = "".join(page.extract_text() for page in pdf_reader.pages)
        return extracted_text, title, abstract

def main(api,api_base, paper_pdf, aspect, model_name, enable_rag):
    """Gradio callback: review the uploaded paper and report the outcome.

    Args:
        api: API key entered by the user.
        api_base: Base URL entered by the user.
        paper_pdf: Uploaded PDF content.
        aspect: Selected review aspect.
        model_name: Model name entered by the user.
        enable_rag: Whether retrieval is enabled.

    Returns:
        A tuple ``(retrieved_content, comments, output2)``: the retrieved
        literature, the generated review (or an error/notice message), and a
        status line with the processing time (or the same error/notice).
    """
    started_at = time.time()

    # Guard clause: nothing can be done without a key and a PDF.
    if not api or not paper_pdf:
        notice = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        return '', notice, notice

    retrieved_content = ''
    try:
        reviewer = Reviewer(api, api_base, paper_pdf, aspect, model_name, enable_rag)
        comments, retrieved_content = reviewer.review_by_chatgpt(paper_list=paper_pdf)
        elapsed = time.time() - started_at
        status = "Processing Time:" + str(round(elapsed, 2)) + "seconds"
    except Exception as err:
        # Surface the failure in both text boxes instead of crashing the UI.
        failure = "Error: " + str(err)
        comments = failure
        status = failure
    return retrieved_content, comments, status





########################################################################################################    

# Heading shown at the top of the Gradio page.
title = "LimitGen"


# HTML blurb shown under the heading. The <strong> element is now closed
# explicitly; it was previously left open until the enclosing </div>.
description = '''<div align='left'>

<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.</strong>

</div>

'''

# Input widgets, in the positional order expected by main():
# api, api_base, paper_pdf, aspect, model_name, enable_rag.
inp = [
    gradio.Textbox(label="Input your API-key",
                   value="",
                   type='password'),
    gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
                   value="https://api.openai.com/v1"),
    # type="binary" hands the uploaded file to main() as raw bytes.
    gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)", type="binary"),
    gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
                 value="Methodology",
                 label="Select the aspect"),
    gradio.Textbox(label="Input the model name",
                   value="gpt-4o-mini"),
    gradio.Checkbox(label="Enable RAG", value=False),
]

# Output widgets match main()'s return tuple:
# (retrieved_content, comments, output2).
chat_reviewer_gui = gradio.Interface(
    fn=main,
    inputs=inp,
    outputs=[
        gradio.Textbox(lines=6, label="Retrieved Literature"),
        gradio.Textbox(lines=15, label="Output"),
        gradio.Textbox(lines=2, label="Resource Statistics"),
    ],
    title=title,
    description=description,
)

# Start server
chat_reviewer_gui.launch(quiet=True, show_api=False)