zjXu11 commited on
Commit
ca1d599
·
verified ·
1 Parent(s): b1ea0d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +545 -544
app.py CHANGED
@@ -1,545 +1,546 @@
1
- import numpy as np
2
- import os
3
- import re
4
- from io import BytesIO
5
- import datetime
6
- import time
7
- import openai, tenacity
8
- import argparse
9
- import configparser
10
- import json
11
- import fitz
12
- import PyPDF2
13
- import gradio
14
- import sys
15
- from pathlib import Path
16
- utils_dir = Path(__file__).parent / 'utils'
17
- sys.path.append(str(utils_dir))
18
- from openai_utils import *
19
- import base64
20
- from pdf2image import convert_from_bytes
21
- import requests
22
- PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
23
- PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
24
-
25
-
26
def insert_sentence(text, sentence, interval):
    """Insert `sentence` after every `interval` words on each line of `text`.

    Line breaks are preserved; the word counter restarts on every line.
    """
    processed_lines = []
    for line in text.split('\n'):
        tokens = []
        # enumerate from 1 so the interval check is a simple modulo test
        for position, word in enumerate(line.split(), start=1):
            tokens.append(word)
            if position % interval == 0:
                tokens.append(sentence)
        processed_lines.append(' '.join(tokens))
    return '\n'.join(processed_lines)
47
-
48
def search_paper(query, max_retries=30):
    """Search Semantic Scholar for `query` (top 3 CS papers).

    Returns the parsed JSON response (dict with a "data" list).
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors such as HTTP 400;
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # simple pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
59
-
60
def split_text_into_chunks(text, chunk_size=300):
    """Split whitespace-tokenised `text` into strings of at most
    `chunk_size` words each; returns [] for empty/whitespace-only text."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
64
-
65
def download_pdf(paper):
    """Download a paper's open-access PDF and return its text as word chunks.

    `paper` is a Semantic Scholar record with an "openAccessPdf" entry.
    Best-effort: returns [] on any download or parse failure.
    """
    pdf_url = paper["openAccessPdf"]["url"]
    try:
        # A timeout prevents one unreachable host from hanging the pipeline.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extracted_text = extract_chapter(file_object)
        return split_text_into_chunks(extracted_text)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt / SystemExit.
        return []
78
-
79
-
80
def recommendation(s2_id, limit=500, max_retries=30):
    """Fetch up to `limit` recommended papers for Semantic Scholar id `s2_id`.

    Returns the parsed JSON response (dict with "recommendedPapers").
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors (bad id, HTTP 4xx);
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
92
-
93
-
94
def extract_chapter(file_object):
    """Return the concatenated text of every page of a PDF file object.

    The original carried `extraction_started`/`page_number_start`
    bookkeeping whose break condition (`page_number_start + 1 < page_number`
    with `page_number_start` reassigned every iteration) could never be
    true, so it always read the whole document.  That dead code is removed;
    behavior is unchanged.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    extracted_text = ""
    for page in pdf_reader.pages:
        extracted_text += page.extract_text()
    return extracted_text
111
-
112
-
113
-
114
class Reviewer:
    """Reviews a scientific paper PDF with an LLM.

    Produces a numbered list of limitations for one aspect (Methodology,
    Experimental Design, Result Analysis, or Literature Review), optionally
    grounded in related literature retrieved from Semantic Scholar (RAG).
    """

    def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
        self.api = api                # user's API key for the review model
        self.api_base = api_base      # OpenAI-compatible base URL for the review model
        self.aspect = aspect          # aspect of the paper to critique
        self.paper_pdf = paper_pdf    # raw bytes of the uploaded PDF
        self.model_name = model_name  # engine used for the main review call
        self.enable_rag = enable_rag  # retrieve related literature first?
        # self.max_token_num = 50000
        # self.encoding = tiktoken.get_encoding("gpt2")

    def review_by_chatgpt(self, paper_list):
        """Run the full pipeline.

        Returns (review_text, retrieved_paper_titles); the latter is ""
        when RAG is disabled or retrieval found nothing usable.
        """
        text, title, abstract = self.extract_from_paper(self.paper_pdf)
        content = f"Paper to review: \nTitle: {title}\n" + text

        retrieved_papers = ""
        if self.enable_rag:
            papers = self.retrieve_papers(title, abstract)
            if papers is not None:
                retrieval_content = ""
                for cnt, paper in enumerate(papers, start=1):
                    retrieval_content += f"Relevant Paper {str(cnt)}:\n"
                    retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
                    retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
                text = retrieval_content + content
            else:
                text = content
        else:
            text = content
        chat_review_text = self.chat_review(text=text)

        return chat_review_text, retrieved_papers

    def query_gen(self, abstract):
        """Generate a 5-word search query from the abstract using the app's
        private credentials (not the user's key)."""
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        messages = [
            {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."},
            {"role": "user", "content": abstract},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
            )
        )
        return responses[0]

    def rerank(self, paper_list, title, abstract):
        """Ask the LLM to rank candidate papers by relevance to the
        reference paper; returns the top 5 entries of `paper_list`."""
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        rec_content = ""
        rec_paper_cnt = 1
        for rec_paper in paper_list:
            rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
            rec_paper_cnt += 1

        rec_content += f"Reference Paper: {title}\n"
        rec_content += f"Abstract: {abstract}\n"

        messages = [
            {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."},
            {"role": "user", "content": rec_content},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )
        response_data = json.loads(responses[0])
        # Ranking entries are 1-based indices into paper_list.
        rec_papers = []
        for rec_num in response_data["ranking"][:5]:
            rec_papers.append(paper_list[int(rec_num) - 1])

        return rec_papers

    def extract_related_content(self, papers, aspect):
        """For each retrieved paper, download its PDF, keep the chunks the
        LLM judges relevant to `aspect`, and condense them to one text.

        Falls back to the paper's abstract when no chunk is kept (e.g. the
        PDF could not be downloaded).  Returns [{"title", "content"}, ...].
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        messages = []
        chunk_index_map = []   # maps each request back to (paper_idx, chunk_idx)
        paper_chunk_list = []
        for paper_idx, paper in enumerate(papers):
            paper_chunks = download_pdf(paper)
            paper_chunk_list.append(paper_chunks)

            SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."

            for chunk_idx, paper_chunk in enumerate(paper_chunks):
                messages.append([
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": paper_chunk},
                ])
                chunk_index_map.append((paper_idx, chunk_idx))

        # One yes/no relevance call per chunk, batched.
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=100,
            )
        )

        paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]

        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
            if response.strip().lower().startswith("yes"):
                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"

        for idx, paper_data in enumerate(paper_data_list):
            if not paper_data["content"].strip():
                paper_data["content"] = papers[idx]["abstract"]

        # Aspect-specific condensation prompt.
        if aspect == "Methodology":
            SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
        Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        elif aspect == "Result Analysis":
            SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
        Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        elif aspect == "Experimental Design":
            SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
        Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        elif aspect == "Literature Review":
            SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
        Remove sentences that are irrelevant to the literature review, and keep details about the related works.
        Organize the result in JSON format as follows:
        {
            "revised_text": str, not dict, not a summary
        }
        """
        messages = []
        for paper_data in paper_data_list:
            messages.append([
                {"role": "system", "content": SYSTEM_INPUT},
                {"role": "user", "content": paper_data["content"]},
            ])

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )

        results = []
        for paper_data, response in zip(paper_data_list, responses):
            response = json.loads(response)
            results.append({"title": paper_data["title"], "content": response["revised_text"]})
        return results

    def chat_review(self, text):
        """Ask the review model for limitations of the paper.

        Returns a numbered list as a string, or "Error: ..." on failure.
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        if self.enable_rag:
            messages = [
                {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"},
                {"role": "user", "content": text},
            ]
        else:
            messages = [
                {"role": "system", "content": f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"},
                {"role": "user", "content": text},
            ]
        try:
            responses = asyncio.run(
                generate_from_openai_chat_completion(
                    client,
                    messages=[messages],
                    engine_name=self.model_name,
                    max_tokens=1000,
                    requests_per_minute=20,
                )
            )
            try:
                # Happy path: the model returned valid JSON.
                limitations = json.loads(responses[0])["limitations"]
                result = ""
                limit_cnt = 1
                for limitation in limitations:
                    result += f"{str(limit_cnt)}. {limitation}\n"
                    limit_cnt += 1
            except:
                # Fallback: have a JSON-mode model re-extract the list from
                # the free-form output.
                SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
                messages = [
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": responses[0]},
                ]
                responses = asyncio.run(
                    generate_from_openai_chat_completion(
                        client,
                        messages=[messages],
                        engine_name="gpt-4o-mini",
                        max_tokens=1000,
                        requests_per_minute=20,
                        response_format={"type": "json_object"},
                    )
                )
                limitations = json.loads(responses[0])["limitations"]
                result = ""
                limit_cnt = 1
                for limitation in limitations:
                    result += f"{str(limit_cnt)}. {limitation}\n"
                    limit_cnt += 1
        except Exception as e:
            result = "Error: " + str(e)
        print("********" * 10)
        print(result)
        print("********" * 10)
        return result

    def retrieve_papers(self, title, abstract):
        """Retrieve related open-access papers for (title, abstract).

        Strategy: exact-title match on Semantic Scholar -> take up to 20
        recommendations from that paper; otherwise search with an
        LLM-generated query and take up to 5 recommendations per hit.
        Candidates are reranked and condensed per aspect.  Returns a list
        of {"title", "content"} dicts, or None when nothing usable is found.
        """
        query = title
        search_results = search_paper(query)
        # Guard with .get: the API returns a dict, so the original
        # `search_results != []` check was vacuous and an empty "data"
        # list would crash on [0].
        if search_results.get("data") and search_results["data"][0]["title"].lower() == title.lower():
            # BUG FIX: the original read `search_results[0]`, which raises
            # KeyError(0) on the dict returned by the API; the matched
            # paper lives at search_results["data"][0].
            search_result = search_results["data"][0]
            retrieval = recommendation(search_result["paperId"])
            recommended_paper_list = []
            for recommended_paper in retrieval["recommendedPapers"]:
                if recommended_paper["abstract"] is None:
                    continue
                if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"] != None:
                    recommended_paper_list.append(recommended_paper)

                if len(recommended_paper_list) >= 20:
                    break

        else:
            query = self.query_gen(abstract)
            search_results = search_paper(query)
            recommended_paper_list = []
            if search_results["data"] == []:
                return None
            for search_result in search_results["data"]:
                retrieval = recommendation(search_result["paperId"])
                recommended_papers = []
                for recommended_paper in retrieval["recommendedPapers"]:
                    if recommended_paper["abstract"] is None:
                        continue
                    if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"] != None:
                        recommended_papers.append(recommended_paper)

                    if len(recommended_papers) >= 5:
                        break
                recommended_paper_list.extend(recommended_papers)

        if recommended_paper_list == []:
            return None
        final_papers = self.rerank(recommended_paper_list, title, abstract)
        retrieved_papers = self.extract_related_content(final_papers, self.aspect)

        return retrieved_papers

    def extract_from_paper(self, pdf_path):
        """Extract (full_text, title, abstract) from raw PDF bytes.

        Title/abstract come from a vision call on a PNG render of page 1
        (using the app's private credentials); the body text from PyPDF2.
        Note: despite the name, `pdf_path` is the PDF *content* as bytes
        (gradio File with type="binary").
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        file_object = BytesIO(pdf_path)
        pdf_reader = PyPDF2.PdfReader(file_object)

        # Render the first page to PNG for the vision model.
        doc = fitz.open(stream=pdf_path, filetype="pdf")
        page = doc.load_page(0)
        pix = page.get_pixmap()
        image_bytes = pix.tobytes("png")

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages = [
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."},
            {"role": "user", "content": USER_INPUT},
        ]
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]

        # The start/break bookkeeping in the original could never trigger,
        # so every page was read; dead code removed, behavior unchanged.
        extracted_text = ""
        for page in pdf_reader.pages:
            extracted_text += page.extract_text()
        return extracted_text, title, abstract
487
-
488
def main(api, api_base, paper_pdf, aspect, model_name, enable_rag):
    """Gradio callback: review `paper_pdf` and report timing or errors.

    Returns (retrieved_literature, review_comments, status_text).
    """
    start_time = time.time()
    retrieved_content = ''
    if not api or not paper_pdf:
        # Both output boxes show the same guidance message.
        message = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        comments = message
        output2 = message
    else:
        try:
            reviewer = Reviewer(api, api_base, paper_pdf, aspect, model_name, enable_rag)
            comments, retrieved_content = reviewer.review_by_chatgpt(paper_list=paper_pdf)
            elapsed = time.time() - start_time
            output2 = "Processing Time:" + str(round(elapsed, 2)) + "seconds"
        except Exception as e:
            comments = "Error: " + str(e)
            output2 = "Error: " + str(e)
    return retrieved_content, comments, output2
507
-
508
-
509
-
510
-
511
-
512
########################################################################################################
# Gradio UI wiring: widget order below must match main()'s positional parameters.

title = "LimitGen"


# HTML blurb shown above the interface.
description = '''<div align='left'>
<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

inp = [gradio.Textbox(label="Input your API-key",
                      value="",
                      type='password'),
       gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
                      value="https://api.openai.com/v1"),
       # type="binary" means main() receives the PDF as raw bytes, not a path.
       gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)",type="binary"),
       gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
                    value="Methodology",
                    label="Select the aspect"),
       gradio.Textbox(label="Input the model name",
                      value="gpt-4o-mini"),
       gradio.Checkbox(label="Enable RAG", value=False)
       ]

chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs = [gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
                                     title=title,
                                     description=description)

# Start server
chat_reviewer_gui .launch(quiet=True, show_api=False)
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ from io import BytesIO
5
+ import datetime
6
+ import time
7
+ import openai, tenacity
8
+ import argparse
9
+ import configparser
10
+ import json
11
+ import fitz
12
+ import PyPDF2
13
+ import gradio
14
+ import sys
15
+ from pathlib import Path
16
+ utils_dir = Path(__file__).parent / 'utils'
17
+ sys.path.append(str(utils_dir))
18
+ from openai_utils import *
19
+ import base64
20
+ from pdf2image import convert_from_bytes
21
+ import requests
22
+ PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
23
+ PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
24
+
25
+
26
def insert_sentence(text, sentence, interval):
    """Insert `sentence` after every `interval` words on each line of `text`.

    Line breaks are preserved; the word counter restarts on every line.
    """
    processed_lines = []
    for line in text.split('\n'):
        tokens = []
        # enumerate from 1 so the interval check is a simple modulo test
        for position, word in enumerate(line.split(), start=1):
            tokens.append(word)
            if position % interval == 0:
                tokens.append(sentence)
        processed_lines.append(' '.join(tokens))
    return '\n'.join(processed_lines)
47
+
48
def search_paper(query, max_retries=30):
    """Search Semantic Scholar for `query` (top 3 CS papers).

    Returns the parsed JSON response (dict with a "data" list).
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors such as HTTP 400;
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # simple pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
59
+
60
def split_text_into_chunks(text, chunk_size=300):
    """Split whitespace-tokenised `text` into strings of at most
    `chunk_size` words each; returns [] for empty/whitespace-only text."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
64
+
65
def download_pdf(paper):
    """Download a paper's open-access PDF and return its text as word chunks.

    `paper` is a Semantic Scholar record with an "openAccessPdf" entry.
    Best-effort: returns [] on any download or parse failure.
    """
    pdf_url = paper["openAccessPdf"]["url"]
    try:
        # A timeout prevents one unreachable host from hanging the pipeline.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extracted_text = extract_chapter(file_object)
        return split_text_into_chunks(extracted_text)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt / SystemExit.
        return []
78
+
79
+
80
def recommendation(s2_id, limit=500, max_retries=30):
    """Fetch up to `limit` recommended papers for Semantic Scholar id `s2_id`.

    Returns the parsed JSON response (dict with "recommendedPapers").
    Retries non-200 responses with a 1-second pause.  The original looped
    forever, which hangs the app on permanent errors (bad id, HTTP 4xx);
    after `max_retries` attempts the HTTP error is raised instead.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"

    response = requests.get(url, timeout=30)
    for _ in range(max_retries):
        if response.status_code == 200:
            break
        time.sleep(1)  # pacing for rate limits / transient failures
        response = requests.get(url, timeout=30)
    response.raise_for_status()

    return response.json()
92
+
93
+
94
def extract_chapter(file_object):
    """Return the concatenated text of every page of a PDF file object.

    The original carried `extraction_started`/`page_number_start`
    bookkeeping whose break condition (`page_number_start + 1 < page_number`
    with `page_number_start` reassigned every iteration) could never be
    true, so it always read the whole document.  That dead code is removed;
    behavior is unchanged.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    extracted_text = ""
    for page in pdf_reader.pages:
        extracted_text += page.extract_text()
    return extracted_text
111
+
112
+
113
+
114
+ class Reviewer:
115
+
116
+ def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
117
+ self.api = api
118
+ self.api_base = api_base
119
+ self.aspect = aspect
120
+ self.paper_pdf = paper_pdf
121
+ self.model_name = model_name
122
+ self.enable_rag = enable_rag
123
+ # self.max_token_num = 50000
124
+ # self.encoding = tiktoken.get_encoding("gpt2")
125
+
126
+
127
+ def review_by_chatgpt(self, paper_list):
128
+ text, title, abstract = self.extract_from_paper(self.paper_pdf)
129
+ content = f"Paper to review: \nTitle: {title}\n" + text
130
+
131
+ if self.enable_rag:
132
+ papers = self.retrieve_papers(title, abstract)
133
+ if papers != None:
134
+ retrieval_content = ""
135
+ retrieved_papers = ""
136
+ cnt = 1
137
+ for paper in papers:
138
+ retrieval_content += f"Relevant Paper {str(cnt)}:\n"
139
+ retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
140
+ retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
141
+ cnt += 1
142
+ text = retrieval_content + content
143
+ chat_review_text = self.chat_review(text=text)
144
+ else:
145
+ text = content
146
+ chat_review_text = self.chat_review(text=text)
147
+ retrieved_papers = ""
148
+ else:
149
+ text = content
150
+ chat_review_text = self.chat_review(text=text)
151
+ retrieved_papers = ""
152
+
153
+ return chat_review_text, retrieved_papers
154
+
155
+ def query_gen(self, abstract):
156
+ os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
157
+ os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
158
+ client = AsyncOpenAI()
159
+
160
+ messages=[
161
+ {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."} ,
162
+ {"role": "user", "content": abstract},
163
+ ]
164
+
165
+ responses = asyncio.run(
166
+ generate_from_openai_chat_completion(
167
+ client,
168
+ messages=[messages],
169
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
170
+ max_tokens=1000, # 32
171
+ requests_per_minute = 20,
172
+ # response_format={"type":"json_object"},
173
+ )
174
+ )
175
+ return responses[0]
176
+
177
+
178
+ def rerank(self, paper_list, title, abstract):
179
+ os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
180
+ os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
181
+ client = AsyncOpenAI()
182
+
183
+ rec_content = ""
184
+ rec_paper_cnt = 1
185
+
186
+ for rec_paper in paper_list:
187
+ rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
188
+ rec_paper_cnt += 1
189
+
190
+ rec_content += f"Reference Paper: {title}\n"
191
+ rec_content += f"Abstract: {abstract}\n"
192
+
193
+ messages=[
194
+ {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."} ,
195
+ {"role": "user", "content": rec_content},
196
+ ]
197
+
198
+ responses = asyncio.run(
199
+ generate_from_openai_chat_completion(
200
+ client,
201
+ messages=[messages],
202
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
203
+ max_tokens=1000, # 32
204
+ requests_per_minute = 20,
205
+ response_format={"type":"json_object"},
206
+ )
207
+ )
208
+ response_data = json.loads(responses[0])
209
+ rec_papers = []
210
+ for rec_num in response_data["ranking"][:5]:
211
+ num = int(rec_num)
212
+ rec_papers.append(paper_list[num-1])
213
+
214
+ return rec_papers
215
+
216
+ def extract_related_content(self, papers, aspect):
217
+ os.environ["OPENAI_BASE_URL"] = self.api_base
218
+ os.environ["OPENAI_API_KEY"] = self.api
219
+ client = AsyncOpenAI()
220
+
221
+ messages = []
222
+ chunk_index_map = []
223
+ paper_data_list = []
224
+ paper_chunk_list = []
225
+ for paper_idx, paper in enumerate(papers):
226
+ paper_chunks = download_pdf(paper)
227
+ paper_chunk_list.append(paper_chunks)
228
+
229
+ SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."
230
+
231
+ for chunk_idx, paper_chunk in enumerate(paper_chunks):
232
+ message = [
233
+ {"role": "system", "content": SYSTEM_INPUT},
234
+ {"role": "user", "content": paper_chunk},
235
+ ]
236
+ messages.append(message)
237
+ chunk_index_map.append((paper_idx, chunk_idx)) # 标记每个 chunk 归属哪个 paper
238
+
239
+
240
+ responses = asyncio.run(
241
+ generate_from_openai_chat_completion(
242
+ client,
243
+ messages=messages,
244
+ engine_name="gpt-4o-mini",
245
+ max_tokens=1000,
246
+ requests_per_minute=100,
247
+ )
248
+ )
249
+
250
+ paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]
251
+
252
+ for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
253
+ if response.strip().lower().startswith("yes"):
254
+ paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"
255
+
256
+ for idx, paper_data in enumerate(paper_data_list):
257
+ if not paper_data["content"].strip():
258
+ paper_data["content"] = papers[idx]["abstract"]
259
+
260
+
261
+ if aspect == "Methodology":
262
+ SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
263
+ Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
264
+ Organize the result in JSON format as follows:
265
+ {
266
+ "revised_text": str, not dict, not a summary
267
+ }
268
+ """
269
+ elif aspect == "Result Analysis":
270
+ SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
271
+ Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
272
+ Organize the result in JSON format as follows:
273
+ {
274
+ "revised_text": str, not dict, not a summary
275
+ }
276
+ """
277
+ elif aspect == "Experimental Design":
278
+ SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
279
+ Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
280
+ Organize the result in JSON format as follows:
281
+ {
282
+ "revised_text": str, not dict, not a summary
283
+ }
284
+ """
285
+ elif aspect == "Literature Review":
286
+ SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
287
+ Remove sentences that are irrelevant to the literature review, and keep details about the related works.
288
+ Organize the result in JSON format as follows:
289
+ {
290
+ "revised_text": str, not dict, not a summary
291
+ }
292
+ """
293
+ messages = []
294
+ for paper_data in paper_data_list:
295
+ message=[
296
+ {"role": "system", "content": SYSTEM_INPUT} ,
297
+ {"role": "user", "content": paper_data["content"]},
298
+ ]
299
+ messages.append(message)
300
+
301
+ responses = asyncio.run(
302
+ generate_from_openai_chat_completion(
303
+ client,
304
+ messages=messages,
305
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
306
+ max_tokens=1000, # 32
307
+ requests_per_minute = 20,
308
+ response_format={"type":"json_object"},
309
+ )
310
+ )
311
+
312
+ results = []
313
+ for paper_data, response in zip(paper_data_list, responses):
314
+ print(response)
315
+ response = json.loads(response)
316
+ results.append({"title": paper_data["title"], "content": response["revised_text"]})
317
+ return results
318
+
319
+
320
+
321
+ def chat_review(self, text):
322
+ os.environ["OPENAI_BASE_URL"] = self.api_base
323
+ os.environ["OPENAI_API_KEY"] = self.api
324
+ client = AsyncOpenAI()
325
+
326
+ if self.enable_rag:
327
+ messages=[
328
+ {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
329
+ {"role": "user", "content": text},
330
+ ]
331
+ else:
332
+ messages=[
333
+ {"role": "system", "content": f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
334
+ {"role": "user", "content": text},
335
+ ]
336
+ try:
337
+ responses = asyncio.run(
338
+ generate_from_openai_chat_completion(
339
+ client,
340
+ messages=[messages],
341
+ engine_name=self.model_name, # gpt-3.5-turbo
342
+ max_tokens=1000, # 32
343
+ requests_per_minute = 20,
344
+ # response_format={"type":"json_object"},
345
+ )
346
+ )
347
+ try:
348
+ limitations = json.loads(responses[0])["limitations"]
349
+ result = ""
350
+ limit_cnt = 1
351
+ for limitation in limitations:
352
+ result += f"{str(limit_cnt)}. {limitation}\n"
353
+ limit_cnt += 1
354
+ except:
355
+ SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
356
+ messages=[
357
+ {"role": "system", "content": SYSTEM_INPUT},
358
+ {"role": "user", "content": responses[0]},
359
+ ]
360
+ responses = asyncio.run(
361
+ generate_from_openai_chat_completion(
362
+ client,
363
+ messages=[messages],
364
+ engine_name="gpt-4o-mini", # gpt-3.5-turbo
365
+ max_tokens=1000, # 32
366
+ requests_per_minute = 20,
367
+ response_format={"type":"json_object"},
368
+ )
369
+ )
370
+ limitations = json.loads(responses[0])["limitations"]
371
+ result = ""
372
+ limit_cnt = 1
373
+ for limitation in limitations:
374
+ result += f"{str(limit_cnt)}. {limitation}\n"
375
+ limit_cnt += 1
376
+ # for choice in response.choices:
377
+ # result += choice.message.content
378
+ # result = insert_sentence(result, '**Generated by ChatGPT, no copying allowed!**', 50)
379
+ except Exception as e:
380
+ result = "Error: "+ str(e)
381
+ # usage = 'xxxxx'
382
+ print("********"*10)
383
+ print(result)
384
+ print("********"*10)
385
+ return result
386
+
387
+
388
+ def retrieve_papers(self, title, abstract):
389
+ query = title
390
+ search_results = search_paper(query)
391
+ if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
392
+ search_result = search_results[0]
393
+ retrieval = recommendation(search_result["paperId"])
394
+ recommended_paper_list = []
395
+ for recommended_paper in retrieval["recommendedPapers"]:
396
+ if recommended_paper["abstract"] is None:
397
+ continue
398
+ if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
399
+ recommended_paper_list.append(recommended_paper)
400
+
401
+ if len(recommended_paper_list) >= 20:
402
+ break
403
+
404
+ else:
405
+ query = self.query_gen(abstract)
406
+ search_results = search_paper(query)
407
+ recommended_paper_list = []
408
+ if search_results["data"] == []:
409
+ return None
410
+ for search_result in search_results["data"]:
411
+ retrieval = recommendation(search_result["paperId"])
412
+ recommended_papers = []
413
+ for recommended_paper in retrieval["recommendedPapers"]:
414
+ if recommended_paper["abstract"] is None:
415
+ continue
416
+ if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
417
+ recommended_papers.append(recommended_paper)
418
+
419
+ if len(recommended_papers) >= 5:
420
+ break
421
+ recommended_paper_list.extend(recommended_papers)
422
+
423
+ if recommended_paper_list == []:
424
+ return None
425
+ final_papers = self.rerank(recommended_paper_list, title, abstract)
426
+ retrieved_papers = self.extract_related_content(final_papers, self.aspect)
427
+
428
+ return retrieved_papers
429
+
430
+
431
+
432
+
433
    def extract_from_paper(self, pdf_path):
        """Extract the full text, title, and abstract from an uploaded PDF.

        ``pdf_path`` is the raw PDF byte string from the Gradio file widget
        (type="binary"), not a filesystem path — it is fed to BytesIO and
        fitz directly.

        Returns:
            tuple[str, str, str]: (extracted_text, title, abstract).
        """
        # Title/abstract extraction always uses the app's private endpoint,
        # independent of the user-supplied key.
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        # with open(pdf_path, 'rb') as f: # TODO
        # pdf_bytes = f.read()
        # file_object = BytesIO(pdf_bytes)

        file_object = BytesIO(pdf_path) # TODO
        pdf_reader = PyPDF2.PdfReader(file_object)

        # Render page 1 to PNG so a vision model can read the header.
        doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO
        page = doc.load_page(0)
        pix = page.get_pixmap()
        image_bytes = pix.tobytes("png")

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        # Multimodal prompt: text preamble + first-page image as a data URL.
        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages=[
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."} ,
            {"role": "user", "content": USER_INPUT},
        ]
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",  # fixed model for metadata extraction
                max_tokens=1000,
                requests_per_minute = 20,
                response_format={"type":"json_object"},
            )
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]



        # Collect the paper body text page by page with PyPDF2.
        num_pages = len(pdf_reader.pages)
        extraction_started = False
        extracted_text = ""
        for page_number in range(num_pages):
            page = pdf_reader.pages[page_number]
            page_text = page.extract_text()

            # NOTE(review): extraction_started and page_number_start are
            # reset on EVERY iteration, so the break below can never fire and
            # the loop concatenates text from all pages. This looks like
            # leftover logic for "start at a marker page, stop shortly after"
            # — confirm the intended page range before changing it.
            extraction_started = True
            page_number_start = page_number
            if extraction_started:
                extracted_text += page_text
                if page_number_start + 1 < page_number:
                    break
        return extracted_text, title, abstract
488
+
489
def main(api, api_base, paper_pdf, aspect, model_name, enable_rag):
    """Gradio entry point: run a paper review and report timing or errors.

    Returns a 3-tuple for the UI widgets:
    (retrieved literature, review comments, resource statistics).
    """
    started = time.time()

    # Guard clause: without an API key and a PDF there is nothing to review.
    if not api or not paper_pdf:
        missing_msg = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        return '', missing_msg, missing_msg

    try:
        reviewer = Reviewer(api, api_base, paper_pdf, aspect, model_name, enable_rag)
        review_comments, literature = reviewer.review_by_chatgpt(paper_list=paper_pdf)
        elapsed = round(time.time() - started, 2)
        stats = "Processing Time:" + str(elapsed) + "seconds"
        return literature, review_comments, stats
    except Exception as e:
        # Report the failure in both output boxes rather than crashing.
        failure = "Error: " + str(e)
        return '', failure, failure
508
+
509
+
510
+
511
+
512
+
513
+ ########################################################################################################
514
+
515
title = "LimitGen"


description = '''<div align='left'>
<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

# Input widgets, in the positional order expected by main().
api_key_box = gradio.Textbox(label="Input your API-key", value="", type='password')
api_base_box = gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.", value="https://api.openai.com/v1")
pdf_upload = gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)", type="binary")
aspect_radio = gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"], value="Methodology", label="Select the aspect")
model_box = gradio.Textbox(label="Input the model name", value="gpt-4o-mini")
rag_checkbox = gradio.Checkbox(label="Enable RAG", value=False)

inp = [api_key_box, api_base_box, pdf_upload, aspect_radio, model_box, rag_checkbox]

# Output widgets: retrieved literature, review text, and timing stats.
outputs = [
    gradio.Textbox(lines=6, label="Retrieved Literature"),
    gradio.Textbox(lines=15, label="Output"),
    gradio.Textbox(lines=2, label="Resource Statistics"),
]

chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs=outputs,
                                     title=title,
                                     description=description)

# Start server
chat_reviewer_gui.launch(quiet=True, show_api=False)