LimitGen

Running

App Files Files Community

zjXu11 committed on Mar 12

Commit

9d9cd7e

verified ·

1 Parent(s): c1f6997

Upload 3 files

Browse files

Files changed (3) hide show

README.md +12 -14
app.py +545 -0
requirements.txt +11 -0

README.md CHANGED Viewed

@@ -1,14 +1,12 @@
----
-title: LimitGen
-emoji: 🏢
-colorFrom: gray
-colorTo: blue
-sdk: gradio
-sdk_version: 5.20.1
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Demo for LimitGen
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: LimitGen Demo
+emoji: 💬
+colorFrom: yellow
+colorTo: purple
+sdk: gradio
+sdk_version: 5.6.0
+app_file: app.py
+pinned: false
+short_description: demo
+---
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

app.py ADDED Viewed

	@@ -0,0 +1,545 @@

+import numpy as np
+import os
+import re
+from io import BytesIO
+import datetime
+import time
+import openai, tenacity
+import argparse
+import configparser
+import json
+import fitz
+import PyPDF2
+import gradio
+import sys
+from pathlib import Path
+utils_dir = Path(__file__).parent / 'utils'
+sys.path.append(str(utils_dir))
+from openai_utils import *
+import base64
+from pdf2image import convert_from_bytes
+import requests
+# Server-side credentials read from the environment; each is None when its variable is unset.
+# Reviewer.query_gen, Reviewer.rerank and Reviewer.extract_from_paper copy these into
+# OPENAI_API_KEY / OPENAI_BASE_URL before creating their client.
+PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
+PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
+def insert_sentence(text, sentence, interval):
+    lines = text.split('\n')
+    new_lines = []
+    for line in lines:
+        words = line.split()
+        separator = ' '
+        new_words = []
+        count = 0
+        for word in words:
+            new_words.append(word)
+            count += 1
+            if count % interval == 0:
+                new_words.append(sentence)
+        new_lines.append(separator.join(new_words))
+    return '\n'.join(new_lines)
+def search_paper(query):
+    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
+    url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"
+    response = requests.get(url)
+    while response.status_code != 200:
+        time.sleep(1)
+        # print(response)
+        response = requests.get(url)
+    return response.json()
+def split_text_into_chunks(text, chunk_size=300):
+    words = text.split()
+    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    return chunks
+def download_pdf(paper):
+    pdf_url = paper["openAccessPdf"]["url"]
+    try:
+        response = requests.get(pdf_url)
+        response.raise_for_status()
+        file_object = BytesIO(response.content)
+        extract_text = extract_chapter(file_object)
+        chunks = split_text_into_chunks(extract_text)
+        return chunks
+    except:
+        return []
+def recommendation(s2_id, limit=500):
+    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
+    url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"
+    # print(url)
+    response = requests.get(url)
+    while response.status_code != 200:
+        time.sleep(1)
+        # print(response)
+        response = requests.get(url)
+    return response.json()
+def extract_chapter(file_object):
+    pdf_reader = PyPDF2.PdfReader(file_object)
+    num_pages = len(pdf_reader.pages)
+    extraction_started = False
+    extracted_text = ""
+    for page_number in range(num_pages):
+        page = pdf_reader.pages[page_number]
+        page_text = page.extract_text()
+        extraction_started = True
+        page_number_start = page_number
+        if extraction_started:
+            extracted_text += page_text
+            if page_number_start + 1 < page_number:
+                break
+    return extracted_text
+class Reviewer:
+    def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
+        self.api = api
+        self.api_base = api_base
+        self.aspect = aspect
+        self.paper_pdf = paper_pdf
+        self.model_name = model_name
+        self.enable_rag = enable_rag
+        # self.max_token_num = 50000
+        # self.encoding = tiktoken.get_encoding("gpt2")
+    def review_by_chatgpt(self, paper_list):
+        text, title, abstract = self.extract_from_paper(self.paper_pdf)
+        content = f"Paper to review: \nTitle: {title}\n" + text
+        if self.enable_rag:
+            papers = self.retrieve_papers(title, abstract)
+            if papers != None:
+                retrieval_content = ""
+                retrieved_papers = ""
+                cnt = 1
+                for paper in papers:
+                    retrieval_content += f"Relevant Paper {str(cnt)}:\n"
+                    retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
+                    retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
+                    cnt += 1
+                text = retrieval_content + content
+                chat_review_text = self.chat_review(text=text)
+            else:
+                text = content
+                chat_review_text = self.chat_review(text=text)
+                retrieved_papers = ""
+        else:
+            text = content
+            chat_review_text = self.chat_review(text=text)
+            retrieved_papers = ""
+        return chat_review_text, retrieved_papers
+    def query_gen(self, abstract):
+        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
+        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
+        client = AsyncOpenAI()
+        messages=[
+                {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."} ,
+                {"role": "user", "content": abstract},
+            ]
+        responses = asyncio.run(
+            generate_from_openai_chat_completion(
+                client,
+                messages=[messages],
+                engine_name="gpt-4o-mini", # gpt-3.5-turbo
+                max_tokens=1000, # 32
+                requests_per_minute = 20,
+                # response_format={"type":"json_object"},
+            )
+        )
+        return responses[0]
+    def rerank(self, paper_list, title, abstract):
+        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
+        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
+        client = AsyncOpenAI()
+        rec_content = ""
+        rec_paper_cnt = 1
+        for rec_paper in paper_list:
+            rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
+            rec_paper_cnt += 1
+        rec_content += f"Reference Paper: {title}\n"
+        rec_content += f"Abstract: {abstract}\n"
+        messages=[
+                {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."} ,
+                {"role": "user", "content": rec_content},
+            ]
+        responses = asyncio.run(
+            generate_from_openai_chat_completion(
+                client,
+                messages=[messages],
+                engine_name="gpt-4o-mini", # gpt-3.5-turbo
+                max_tokens=1000, # 32
+                requests_per_minute = 20,
+                response_format={"type":"json_object"},
+            )
+        )
+        response_data = json.loads(responses[0])
+        rec_papers = []
+        for rec_num in response_data["ranking"][:5]:
+            num = int(rec_num)
+            rec_papers.append(paper_list[num-1])
+        return rec_papers
+    def extract_related_content(self, papers, aspect):
+        """Build aspect-focused content for each retrieved paper.
+
+        Steps, as implemented below:
+          1. Download each paper's PDF and split it into chunks (``download_pdf``).
+          2. Ask gpt-4o-mini, per chunk, whether the chunk relates to ``aspect``
+             and keep the chunks answered with "yes".
+          3. Fall back to the paper's abstract when no chunk was kept.
+          4. Ask gpt-4o-mini to condense the kept text into ``revised_text``.
+
+        Uses the user's credentials (``self.api`` / ``self.api_base``).
+
+        Args:
+            papers: Paper records; ``title`` and ``abstract`` are read here and
+                each record is passed to ``download_pdf``.
+            aspect: Aspect name; the condensing prompt is chosen from
+                "Methodology", "Result Analysis", "Experimental Design" and
+                "Literature Review".
+
+        Returns:
+            A list of ``{"title": ..., "content": ...}`` dicts, one per paper.
+        """
+        os.environ["OPENAI_BASE_URL"] = self.api_base
+        os.environ["OPENAI_API_KEY"] = self.api
+        client = AsyncOpenAI()
+        messages = []
+        chunk_index_map = []
+        paper_data_list = []
+        paper_chunk_list = []
+        # Stage 1: one yes/no classification request per chunk of every paper.
+        for paper_idx, paper in enumerate(papers):
+            paper_chunks = download_pdf(paper)
+            paper_chunk_list.append(paper_chunks)
+            SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."
+            for chunk_idx, paper_chunk in enumerate(paper_chunks):
+                message = [
+                    {"role": "system", "content": SYSTEM_INPUT},
+                    {"role": "user", "content": paper_chunk},
+                ]
+                messages.append(message)
+                chunk_index_map.append((paper_idx, chunk_idx))  # record which paper each chunk belongs to
+        responses = asyncio.run(
+            generate_from_openai_chat_completion(
+                client,
+                messages=messages,
+                engine_name="gpt-4o-mini",
+                max_tokens=1000,
+                requests_per_minute=100,
+            )
+        )
+        # Stage 2: regroup the "yes" chunks per paper.
+        # Assumes ``responses`` is a list of strings in request order — TODO confirm against openai_utils.
+        paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]
+        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
+            if response.strip().lower().startswith("yes"):
+                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"
+        # Papers with no kept chunk (including failed downloads) fall back to their abstract.
+        for idx, paper_data in enumerate(paper_data_list):
+            if not paper_data["content"].strip():
+                paper_data["content"] = papers[idx]["abstract"]
+        # Stage 3: pick the condensing prompt for the requested aspect.
+        # NOTE(review): there is no ``else`` branch. For any other aspect value SYSTEM_INPUT keeps
+        # the yes/no prompt assigned in the loop above, or is unbound when ``papers`` is empty.
+        if aspect == "Methodology":
+            SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
+Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
+Organize the result in JSON format as follows:
+{
+    "revised_text": str, not dict, not a summary
+}
+"""
+        elif aspect == "Result Analysis":
+            SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
+Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
+Organize the result in JSON format as follows:
+{
+    "revised_text": str, not dict, not a summary
+}
+"""
+        elif aspect == "Experimental Design":
+            SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
+Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
+Organize the result in JSON format as follows:
+{
+    "revised_text": str, not dict, not a summary
+}
+"""
+        elif aspect == "Literature Review":
+            SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
+Remove sentences that are irrelevant to the literature review, and keep details about the related works.
+Organize the result in JSON format as follows:
+{
+    "revised_text": str, not dict, not a summary
+}
+"""
+        # One condensing request per paper, in the same order as ``paper_data_list``.
+        messages = []
+        for paper_data in paper_data_list:
+            message=[
+                {"role": "system", "content": SYSTEM_INPUT} ,
+                {"role": "user", "content": paper_data["content"]},
+            ]
+            messages.append(message)
+        responses = asyncio.run(
+            generate_from_openai_chat_completion(
+                client,
+                messages=messages,
+                engine_name="gpt-4o-mini", # gpt-3.5-turbo
+                max_tokens=1000, # 32
+                requests_per_minute = 20,
+                response_format={"type":"json_object"},
+            )
+        )
+        # Each response is parsed as JSON and its "revised_text" becomes the paper's content;
+        # a malformed response raises here (json.loads / KeyError) and is not handled locally.
+        results = []
+        for paper_data, response in zip(paper_data_list, responses):
+            response = json.loads(response)
+            results.append({"title": paper_data["title"], "content": response["revised_text"]})
+        return results
+    def chat_review(self, text):
+        os.environ["OPENAI_BASE_URL"] = self.api_base
+        os.environ["OPENAI_API_KEY"] = self.api
+        client = AsyncOpenAI()
+        if self.enable_rag:
+            messages=[
+                    {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
+                    {"role": "user", "content": text},
+                ]
+        else:
+            messages=[
+                    {"role": "system", "content": f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} ,
+                    {"role": "user", "content": text},
+                ]
+        try:
+            responses = asyncio.run(
+                generate_from_openai_chat_completion(
+                    client,
+                    messages=[messages],
+                    engine_name=self.model_name, # gpt-3.5-turbo
+                    max_tokens=1000, # 32
+                    requests_per_minute = 20,
+                    # response_format={"type":"json_object"},
+                )
+            )
+            try:
+                limitations = json.loads(responses[0])["limitations"]
+                result = ""
+                limit_cnt = 1
+                for limitation in limitations:
+                    result += f"{str(limit_cnt)}. {limitation}\n"
+                    limit_cnt += 1
+            except:
+                SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}"
+                messages=[
+                    {"role": "system", "content": SYSTEM_INPUT},
+                    {"role": "user", "content": responses[0]},
+                ]
+                responses = asyncio.run(
+                    generate_from_openai_chat_completion(
+                        client,
+                        messages=[messages],
+                        engine_name="gpt-4o-mini", # gpt-3.5-turbo
+                        max_tokens=1000, # 32
+                        requests_per_minute = 20,
+                        response_format={"type":"json_object"},
+                    )
+                )
+                limitations = json.loads(responses[0])["limitations"]
+                result = ""
+                limit_cnt = 1
+                for limitation in limitations:
+                    result += f"{str(limit_cnt)}. {limitation}\n"
+                    limit_cnt += 1
+            # for choice in response.choices:
+            #     result += choice.message.content
+            # result = insert_sentence(result, '**Generated by ChatGPT, no copying allowed!**', 50)
+        except Exception as e:
+            result = "Error: "+ str(e)
+            # usage  = 'xxxxx'
+        print("********"*10)
+        print(result)
+        print("********"*10)
+        return result
+    def retrieve_papers(self, title, abstract):
+        query = title
+        search_results = search_paper(query)
+        if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
+            search_result = search_results[0]
+            retrieval = recommendation(search_result["paperId"])
+            recommended_paper_list = []
+            for recommended_paper in retrieval["recommendedPapers"]:
+                if recommended_paper["abstract"] is None:
+                    continue
+                if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
+                    recommended_paper_list.append(recommended_paper)
+                if len(recommended_paper_list) >= 20:
+                    break
+        else:
+            query = self.query_gen(abstract)
+            search_results = search_paper(query)
+            recommended_paper_list = []
+            if search_results["data"] == []:
+                return None
+            for search_result in search_results["data"]:
+                retrieval = recommendation(search_result["paperId"])
+                recommended_papers = []
+                for recommended_paper in retrieval["recommendedPapers"]:
+                    if recommended_paper["abstract"] is None:
+                        continue
+                    if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None:
+                        recommended_papers.append(recommended_paper)
+                    if len(recommended_papers) >= 5:
+                        break
+                recommended_paper_list.extend(recommended_papers)
+        if recommended_paper_list == []:
+            return None
+        final_papers = self.rerank(recommended_paper_list, title, abstract)
+        retrieved_papers = self.extract_related_content(final_papers, self.aspect)
+        return retrieved_papers
+    def extract_from_paper(self, pdf_path):
+        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
+        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
+        client = AsyncOpenAI()
+        # with open(pdf_path, 'rb') as f: # TODO
+        #     pdf_bytes = f.read()
+        #     file_object = BytesIO(pdf_bytes)
+        file_object = BytesIO(pdf_path) # TODO
+        pdf_reader = PyPDF2.PdfReader(file_object)
+        doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO
+        page = doc.load_page(0)
+        pix = page.get_pixmap()
+        image_bytes = pix.tobytes("png")
+        image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
+        messages=[
+                {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."} ,
+                {"role": "user", "content": USER_INPUT},
+            ]
+        responses = asyncio.run(
+            generate_from_openai_chat_completion(
+                client,
+                messages=[messages],
+                engine_name="gpt-4o-mini", # gpt-3.5-turbo
+                max_tokens=1000, # 32
+                requests_per_minute = 20,
+                response_format={"type":"json_object"},
+            )
+        )
+        response = json.loads(responses[0])
+        title = response["title"]
+        abstract = response["abstract"]
+        num_pages = len(pdf_reader.pages)
+        extraction_started = False
+        extracted_text = ""
+        for page_number in range(num_pages):
+            page = pdf_reader.pages[page_number]
+            page_text = page.extract_text()
+            extraction_started = True
+            page_number_start = page_number
+            if extraction_started:
+                extracted_text += page_text
+                if page_number_start + 1 < page_number:
+                    break
+        return extracted_text, title, abstract
+def main(api,api_base, paper_pdf, aspect, model_name, enable_rag):
+    start_time = time.time()
+    # print("key: ", PRIVATE_API_KEY, "\nbase: ", PRIVATE_API_BASE)
+    comments = ''
+    output2 = ''
+    retrieved_content = ''
+    if not api or not paper_pdf:
+        comments =  "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
+        output2 =  "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
+    else:
+        try:
+            reviewer1 = Reviewer(api,api_base, paper_pdf, aspect, model_name, enable_rag)
+            comments, retrieved_content = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
+            time_used = time.time() - start_time
+            output2 ="Processing Time："+ str(round(time_used, 2)) +"seconds"
+        except Exception as e:
+            comments = "Error: "+ str(e)
+            output2 = "Error: "+ str(e)
+    return retrieved_content, comments, output2
+########################################################################################################
+# Page title and HTML description shown by the Gradio interface.
+title = "LimitGen"
+# NOTE(review): the <strong> tag opened below is never closed before </div>.
+description = '''<div align='left'>
+<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
+</div>
+'''
+# Input components, in the same order as main()'s parameters:
+# api, api_base, paper_pdf, aspect, model_name, enable_rag.
+inp = [gradio.Textbox(label="Input your API-key",
+                          value="",
+                          type='password'),
+       gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
+                          value="https://api.openai.com/v1"),
+       # type="binary": main() presumably receives the upload as raw bytes, which
+       # Reviewer.extract_from_paper wraps in BytesIO — confirm for the installed gradio version.
+       gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)",type="binary"),
+       gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
+                        value="Methodology",
+                        label="Select the aspect"),
+       gradio.Textbox(label="Input the model name",
+                          value="gpt-4o-mini"),
+       gradio.Checkbox(label="Enable RAG", value=False)
+]
+# Outputs map, in order, to main()'s return tuple: (retrieved_content, comments, output2).
+chat_reviewer_gui = gradio.Interface(fn=main,
+                                 inputs=inp,
+                                 outputs = [gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
+                                 title=title,
+                                 description=description)
+# Start server
+# NOTE(review): this runs at import time; requirements.txt pins gradio==3.20.1 while the
+# README front matter declares sdk_version 5.6.0 — confirm which version actually serves the app.
+chat_reviewer_gui .launch(quiet=True, show_api=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+PyMuPDF==1.21.1
+tenacity==8.2.2
+pybase64==1.2.3
+Pillow==9.4.0
+openai==1.33.0
+markdown
+gradio==3.20.1
+PyPDF2
+aiolimiter
+pdf2image
+httpx==0.27.2