|
import numpy as np |
|
import os |
|
import re |
|
from io import BytesIO |
|
import datetime |
|
import time |
|
import openai, tenacity |
|
import argparse |
|
import configparser |
|
import json |
|
import fitz |
|
import PyPDF2 |
|
import gradio |
|
import sys |
|
from pathlib import Path |
|
utils_dir = Path(__file__).parent / 'utils' |
|
sys.path.append(str(utils_dir)) |
|
from openai_utils import * |
|
import base64 |
|
from pdf2image import convert_from_bytes |
|
import requests |
|
# Credentials for the app-owned OpenAI-compatible endpoint, read once at
# import time. Used for internal helper calls (query generation, reranking,
# title/abstract extraction); the user-supplied key is used for the review
# itself. Both are None if unset — NOTE(review): confirm deployment sets them.
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')

PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
|
|
|
|
|
def insert_sentence(text, sentence, interval):
    """Insert `sentence` after every `interval`-th word on each line of `text`.

    Words are split on whitespace and rejoined with single spaces, so runs
    of spaces collapse; the original line breaks are preserved.
    """
    processed_lines = []
    for line in text.split('\n'):
        tokens = []
        for position, word in enumerate(line.split(), start=1):
            tokens.append(word)
            # After every `interval`-th word, inject the sentence.
            if position % interval == 0:
                tokens.append(sentence)
        processed_lines.append(' '.join(tokens))
    return '\n'.join(processed_lines)
|
|
|
def search_paper(query, max_retries=10):
    """Search Semantic Scholar for `query` (top 3 Computer Science papers).

    Returns the parsed JSON response (a dict with a "data" list of papers,
    each carrying url/title/abstract).

    Fixes over the original: the retry loop is bounded (the original spun
    forever on persistent failures, hammering the API once per second), a
    request timeout is set, and the user-supplied query is passed through
    `params` so requests URL-encodes it safely.

    Raises RuntimeError after `max_retries` failed attempts.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    params = {
        "query": query,
        "limit": 3,
        "fields": "url,title,abstract",
        "fieldsOfStudy": "Computer Science",
    }
    response = None
    for _ in range(max_retries):
        response = requests.get(SEMANTIC_SCHOLAR_API_URL + "search", params=params, timeout=30)
        if response.status_code == 200:
            return response.json()
        time.sleep(1)  # simple pacing before retrying a rate-limited/failed call
    raise RuntimeError(
        f"Semantic Scholar search failed after {max_retries} attempts "
        f"(last status {response.status_code})"
    )
|
|
|
def split_text_into_chunks(text, chunk_size=300):
    """Split `text` into whitespace-tokenized chunks of `chunk_size` words.

    Each chunk is rejoined with single spaces; the final chunk may be
    shorter than `chunk_size`. Empty input yields an empty list.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
    return chunks
|
|
|
def download_pdf(paper):
    """Download a paper's open-access PDF and return its text as word chunks.

    `paper` is a Semantic Scholar record expected to carry an
    `openAccessPdf.url` field. Returns a list of ~300-word chunks, or []
    when the record is malformed or the PDF cannot be fetched/parsed
    (callers fall back to the abstract in that case).

    Fixes over the original: the bare `except:` (which also swallowed
    KeyboardInterrupt/SystemExit) is narrowed to Exception, a download
    timeout is set, and the `openAccessPdf` access is covered by the
    handler so a missing field degrades gracefully instead of raising.
    """
    try:
        pdf_url = paper["openAccessPdf"]["url"]
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extract_text = extract_chapter(file_object)
        return split_text_into_chunks(extract_text)
    except Exception:
        # Best-effort: network errors, malformed records and unparsable
        # PDFs all mean "no chunks" rather than aborting the pipeline.
        return []
|
|
|
|
|
def recommendation(s2_id, limit=500, max_retries=10):
    """Fetch up to `limit` recommended papers for Semantic Scholar id `s2_id`.

    Returns the parsed JSON response (a dict with a "recommendedPapers"
    list carrying url/title/abstract/publicationDate/open-access fields).

    Fixes over the original: the retry loop is bounded (it previously
    spun forever on persistent non-200 responses) and a request timeout
    is set. Raises RuntimeError after `max_retries` failed attempts.
    """
    SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
    url = (
        f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}"
        f"&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"
    )

    response = None
    for _ in range(max_retries):
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            return response.json()
        time.sleep(1)  # simple pacing before retrying
    raise RuntimeError(
        f"Semantic Scholar recommendation request failed after {max_retries} "
        f"attempts (last status {response.status_code})"
    )
|
|
|
|
|
def extract_chapter(file_object):
    """Return the concatenated text of every page in a PDF file object.

    The original version carried `extraction_started` / `page_number_start`
    bookkeeping whose early-break condition (`page_number_start + 1 <
    page_number`, with both assigned in the same iteration) could never be
    true, so the effective behavior was always "extract every page". That
    dead code is removed; behavior is unchanged.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    extracted_text = ""
    for page in pdf_reader.pages:
        # PyPDF2's extract_text yields a (possibly empty) string per page.
        extracted_text += page.extract_text()
    return extracted_text
|
|
|
|
|
|
|
class Reviewer: |
|
|
|
def __init__(self, api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag): |
|
self.api = api |
|
self.api_base = api_base |
|
self.aspect = aspect |
|
self.paper_pdf = paper_pdf |
|
self.model_name = model_name |
|
self.limit_num = int(limit_num) |
|
self.enable_rag = enable_rag |
|
|
|
|
|
|
|
|
|
def review_by_chatgpt(self, paper_list): |
|
text, title, abstract = self.extract_from_paper(self.paper_pdf) |
|
content = f"Paper to review: \nTitle: {title}\n" + text |
|
|
|
if self.enable_rag: |
|
papers = self.retrieve_papers(title, abstract) |
|
if papers != None: |
|
retrieval_content = "" |
|
retrieved_papers = "" |
|
cnt = 1 |
|
for paper in papers: |
|
retrieval_content += f"Relevant Paper {str(cnt)}:\n" |
|
retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n" |
|
retrieved_papers += f"{str(cnt)}. {paper['title']}\n" |
|
cnt += 1 |
|
text = retrieval_content + content |
|
chat_review_text = self.chat_review(text=text) |
|
else: |
|
text = content |
|
chat_review_text = self.chat_review(text=text) |
|
retrieved_papers = "" |
|
else: |
|
text = content |
|
chat_review_text = self.chat_review(text=text) |
|
retrieved_papers = "" |
|
|
|
return chat_review_text, retrieved_papers |
|
|
|
    def query_gen(self, abstract):
        """Generate a 5-word search query (a TLDR) from a paper abstract.

        Uses the app's private OpenAI-compatible endpoint (module-level
        PRIVATE_API_BASE/KEY), not the user-supplied key. `AsyncOpenAI`,
        `asyncio` and `generate_from_openai_chat_completion` come from the
        star import of `openai_utils` — NOTE(review): confirm that helper's
        batch signature. Returns the model's raw text response.
        """
        # Point the OpenAI client at the private endpoint before constructing it.
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        messages=[
            {"role": "system", "content": f"Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."} ,
            {"role": "user", "content": abstract},
        ]

        # The helper consumes a batch of conversations; wrap the single
        # conversation in a list and unwrap the single response below.
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute = 20,

            )
        )
        return responses[0]
|
|
|
|
|
def rerank(self, paper_list, title, abstract): |
|
os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE |
|
os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY |
|
client = AsyncOpenAI() |
|
|
|
rec_content = "" |
|
rec_paper_cnt = 1 |
|
|
|
for rec_paper in paper_list: |
|
rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n" |
|
rec_paper_cnt += 1 |
|
|
|
rec_content += f"Reference Paper: {title}\n" |
|
rec_content += f"Abstract: {abstract}\n" |
|
|
|
messages=[ |
|
{"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."} , |
|
{"role": "user", "content": rec_content}, |
|
] |
|
|
|
responses = asyncio.run( |
|
generate_from_openai_chat_completion( |
|
client, |
|
messages=[messages], |
|
engine_name="gpt-4o-mini", |
|
max_tokens=1000, |
|
requests_per_minute = 20, |
|
response_format={"type":"json_object"}, |
|
) |
|
) |
|
response_data = json.loads(responses[0]) |
|
rec_papers = [] |
|
for rec_num in response_data["ranking"][:5]: |
|
num = int(rec_num) |
|
rec_papers.append(paper_list[num-1]) |
|
|
|
return rec_papers |
|
|
|
def extract_related_content(self, papers, aspect): |
|
os.environ["OPENAI_BASE_URL"] = self.api_base |
|
os.environ["OPENAI_API_KEY"] = self.api |
|
client = AsyncOpenAI() |
|
|
|
messages = [] |
|
chunk_index_map = [] |
|
paper_data_list = [] |
|
paper_chunk_list = [] |
|
for paper_idx, paper in enumerate(papers): |
|
paper_chunks = download_pdf(paper) |
|
paper_chunk_list.append(paper_chunks) |
|
|
|
SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'." |
|
|
|
for chunk_idx, paper_chunk in enumerate(paper_chunks): |
|
message = [ |
|
{"role": "system", "content": SYSTEM_INPUT}, |
|
{"role": "user", "content": paper_chunk}, |
|
] |
|
messages.append(message) |
|
chunk_index_map.append((paper_idx, chunk_idx)) |
|
|
|
|
|
responses = asyncio.run( |
|
generate_from_openai_chat_completion( |
|
client, |
|
messages=messages, |
|
engine_name="gpt-4o-mini", |
|
max_tokens=1000, |
|
requests_per_minute=100, |
|
) |
|
) |
|
|
|
paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers] |
|
|
|
for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses): |
|
if response.strip().lower().startswith("yes"): |
|
paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n" |
|
|
|
for idx, paper_data in enumerate(paper_data_list): |
|
if not paper_data["content"].strip(): |
|
paper_data["content"] = papers[idx]["abstract"] |
|
|
|
|
|
if aspect == "Methodology": |
|
SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper. |
|
Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations. |
|
Organize the result in JSON format as follows: |
|
{ |
|
"revised_text": str, not dict, not a summary |
|
} |
|
""" |
|
elif aspect == "Result Analysis": |
|
SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper. |
|
Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results. |
|
Organize the result in JSON format as follows: |
|
{ |
|
"revised_text": str, not dict, not a summary |
|
} |
|
""" |
|
elif aspect == "Experimental Design": |
|
SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper. |
|
Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies. |
|
Organize the result in JSON format as follows: |
|
{ |
|
"revised_text": str, not dict, not a summary |
|
} |
|
""" |
|
elif aspect == "Literature Review": |
|
SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper. |
|
Remove sentences that are irrelevant to the literature review, and keep details about the related works. |
|
Organize the result in JSON format as follows: |
|
{ |
|
"revised_text": str, not dict, not a summary |
|
} |
|
""" |
|
messages = [] |
|
for paper_data in paper_data_list: |
|
message=[ |
|
{"role": "system", "content": SYSTEM_INPUT} , |
|
{"role": "user", "content": paper_data["content"]}, |
|
] |
|
messages.append(message) |
|
|
|
responses = asyncio.run( |
|
generate_from_openai_chat_completion( |
|
client, |
|
messages=messages, |
|
engine_name="gpt-4o-mini", |
|
max_tokens=5000, |
|
requests_per_minute = 20, |
|
response_format={"type":"json_object"}, |
|
) |
|
) |
|
|
|
results = [] |
|
for paper_data, response in zip(paper_data_list, responses): |
|
|
|
response = json.loads(response) |
|
results.append({"title": paper_data["title"], "content": response["revised_text"]}) |
|
return results |
|
|
|
|
|
|
|
def chat_review(self, text): |
|
os.environ["OPENAI_BASE_URL"] = self.api_base |
|
os.environ["OPENAI_API_KEY"] = self.api |
|
client = AsyncOpenAI() |
|
|
|
if self.enable_rag: |
|
messages=[ |
|
{"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate {str(self.limit_num)} major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} , |
|
{"role": "user", "content": text}, |
|
] |
|
else: |
|
messages=[ |
|
{"role": "system", "content": f"Read the following scientific paper and generate {str(self.limit_num)} major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"} , |
|
{"role": "user", "content": text}, |
|
] |
|
try: |
|
responses = asyncio.run( |
|
generate_from_openai_chat_completion( |
|
client, |
|
messages=[messages], |
|
engine_name=self.model_name, |
|
max_tokens=1000, |
|
requests_per_minute = 20, |
|
|
|
) |
|
) |
|
try: |
|
limitations = json.loads(responses[0])["limitations"][:self.limit_num] |
|
result = "" |
|
limit_cnt = 1 |
|
for limitation in limitations: |
|
result += f"{str(limit_cnt)}. {limitation}\n" |
|
limit_cnt += 1 |
|
except: |
|
SYSTEM_INPUT = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": {{}}}}" |
|
messages=[ |
|
{"role": "system", "content": SYSTEM_INPUT}, |
|
{"role": "user", "content": responses[0]}, |
|
] |
|
responses = asyncio.run( |
|
generate_from_openai_chat_completion( |
|
client, |
|
messages=[messages], |
|
engine_name="gpt-4o-mini", |
|
max_tokens=1000, |
|
requests_per_minute = 20, |
|
response_format={"type":"json_object"}, |
|
) |
|
) |
|
limitations = json.loads(responses[0])["limitations"][:self.limit_num] |
|
result = "" |
|
limit_cnt = 1 |
|
for limitation in limitations: |
|
result += f"{str(limit_cnt)}. {limitation}\n" |
|
limit_cnt += 1 |
|
|
|
|
|
|
|
except Exception as e: |
|
result = "Error: "+ str(e) |
|
|
|
print("********"*10) |
|
print(result) |
|
print("********"*10) |
|
return result |
|
|
|
|
|
def retrieve_papers(self, title, abstract): |
|
query = title |
|
search_results = search_paper(query) |
|
if search_results != [] and search_results["data"][0]["title"].lower() == title.lower(): |
|
search_result = search_results[0] |
|
retrieval = recommendation(search_result["paperId"]) |
|
recommended_paper_list = [] |
|
for recommended_paper in retrieval["recommendedPapers"]: |
|
if recommended_paper["abstract"] is None: |
|
continue |
|
if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None: |
|
recommended_paper_list.append(recommended_paper) |
|
|
|
if len(recommended_paper_list) >= 20: |
|
break |
|
|
|
else: |
|
query = self.query_gen(abstract) |
|
search_results = search_paper(query) |
|
recommended_paper_list = [] |
|
if search_results["data"] == []: |
|
return None |
|
for search_result in search_results["data"]: |
|
retrieval = recommendation(search_result["paperId"]) |
|
recommended_papers = [] |
|
for recommended_paper in retrieval["recommendedPapers"]: |
|
if recommended_paper["abstract"] is None: |
|
continue |
|
if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"]!= None: |
|
recommended_papers.append(recommended_paper) |
|
|
|
if len(recommended_papers) >= 5: |
|
break |
|
recommended_paper_list.extend(recommended_papers) |
|
|
|
if recommended_paper_list == []: |
|
return None |
|
final_papers = self.rerank(recommended_paper_list, title, abstract) |
|
retrieved_papers = self.extract_related_content(final_papers, self.aspect) |
|
|
|
return retrieved_papers |
|
|
|
|
|
|
|
|
|
def extract_from_paper(self, pdf_path): |
|
os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE |
|
os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY |
|
client = AsyncOpenAI() |
|
|
|
|
|
|
|
|
|
|
|
file_object = BytesIO(pdf_path) |
|
pdf_reader = PyPDF2.PdfReader(file_object) |
|
|
|
doc = fitz.open(stream=pdf_path, filetype="pdf") |
|
page = doc.load_page(0) |
|
pix = page.get_pixmap() |
|
image_bytes = pix.tobytes("png") |
|
|
|
image_base64 = base64.b64encode(image_bytes).decode('utf-8') |
|
|
|
USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}] |
|
messages=[ |
|
{"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."} , |
|
{"role": "user", "content": USER_INPUT}, |
|
] |
|
responses = asyncio.run( |
|
generate_from_openai_chat_completion( |
|
client, |
|
messages=[messages], |
|
engine_name="gpt-4o-mini", |
|
max_tokens=1000, |
|
requests_per_minute = 20, |
|
response_format={"type":"json_object"}, |
|
) |
|
) |
|
|
|
response = json.loads(responses[0]) |
|
title = response["title"] |
|
abstract = response["abstract"] |
|
|
|
|
|
|
|
num_pages = len(pdf_reader.pages) |
|
extraction_started = False |
|
extracted_text = "" |
|
for page_number in range(num_pages): |
|
page = pdf_reader.pages[page_number] |
|
page_text = page.extract_text() |
|
|
|
extraction_started = True |
|
page_number_start = page_number |
|
if extraction_started: |
|
extracted_text += page_text |
|
if page_number_start + 1 < page_number: |
|
break |
|
return extracted_text, title, abstract |
|
|
|
def main(api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
    """Gradio callback: validate inputs, run the reviewer, report timing.

    Returns (retrieved_literature, review_comments, status_message) in the
    order the three output textboxes expect.

    Fix over the original: the missing-credentials branch set the error
    messages but did not return, so with a valid `limit_num` the code fell
    through into the review attempt (with a missing key/PDF) and clobbered
    the helpful message. Guard clauses now return early.
    """
    start_time = time.time()
    retrieved_content = ''

    if not api or not paper_pdf or not api_base:
        comments = "It looks like there's a missing API key/base URL or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        output2 = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        return retrieved_content, comments, output2

    if not limit_num.isdigit() or int(limit_num) <= 0:
        message = "The input number is not a positive integer."
        return retrieved_content, message, message

    try:
        reviewer1 = Reviewer(api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag)
        comments, retrieved_content = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
        time_used = time.time() - start_time
        output2 = "Processing Time:" + str(round(time_used, 2)) + "seconds"
    except Exception as e:
        comments = "Error: " + str(e)
        output2 = "Error: " + str(e)
    return retrieved_content, comments, output2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio front-end. Module-level: building the interface and launching the
# demo happen as a side effect of importing/running this file.
# ---------------------------------------------------------------------------

# Window/tab title of the demo.
title = "LimitGen"


# HTML blurb shown above the inputs.
# NOTE(review): the <strong> tag is never closed — most browsers tolerate
# this, but confirm the intended rendering.
description = '''<div align='left'>
<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

# Input widgets, in the exact positional order main() expects:
# (api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag).
inp = [gradio.Textbox(label="Enter your API-key",
                      value="",
                      type='password'),
       gradio.Textbox(label="Enter the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
                      value="https://api.openai.com/v1"),
       # type="binary" hands main() the raw PDF bytes (not a temp-file path).
       gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)",type="binary"),
       gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
                    value="Methodology",
                    label="Select the aspect"),
       gradio.Dropdown(["gpt-4o-mini","gpt-4o","Qwen/Qwen2.5-7B-Instruct-Turbo", "meta-llama/Meta-Llama-3-70B-Instruct-Turbo", "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"],
                       label="Select the model name",
                       value="gpt-4o-mini"),
       # Kept as free text; main() validates it with isdigit().
       gradio.Textbox(label="Enter the number of limitations to generate.",
                      value="3"),
       gradio.Checkbox(label="Enable RAG", value=False),
       ]

# The three outputs map to main()'s (retrieved_content, comments, output2).
chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs = [gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
                                     title=title,
                                     description=description)


# Launch the web app (blocking call).
chat_reviewer_gui .launch(quiet=True, show_api=False)