|
import argparse
import asyncio
import base64
import configparser
import datetime
import json
import os
import re
import sys
import time
from io import BytesIO
from pathlib import Path

import fitz
import gradio
import numpy as np
import openai
import PyPDF2
import requests
import tenacity
from pdf2image import convert_from_bytes

# Make the local ``utils`` package importable before the star import below.
utils_dir = Path(__file__).parent / 'utils'
sys.path.append(str(utils_dir))

from openai_utils import *
|
|
# Credentials for the project's private OpenAI-compatible endpoint.  Used by
# the auxiliary LLM calls (query generation, reranking, title/abstract
# extraction); the user-supplied key/base from the UI is used for the review
# model itself.
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
|
|
|
|
|
|
def insert_sentence(text, sentence, interval):
    """Insert *sentence* after every *interval* words on each line of *text*.

    Words are split on whitespace and re-joined with single spaces, so
    intra-line spacing is normalized; line breaks are preserved.
    """
    out_lines = []
    for raw_line in text.split('\n'):
        tokens = []
        for position, word in enumerate(raw_line.split(), start=1):
            tokens.append(word)
            if position % interval == 0:
                tokens.append(sentence)
        out_lines.append(' '.join(tokens))
    return '\n'.join(out_lines)
|
|
|
|
def search_paper(query):
    """Search Semantic Scholar for papers matching *query*.

    Returns the parsed JSON response (top-3 Computer Science hits with
    url/title/abstract fields).

    Fixes over the original: the query is passed via ``params=`` so it is
    URL-encoded (the old f-string embedded raw spaces and user text in the
    URL), and retries are bounded instead of looping forever on a
    persistently failing API.

    Raises:
        RuntimeError: if the API still fails after ``max_retries`` attempts.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        "query": query,
        "limit": 3,
        "fields": "url,title,abstract",
        "fieldsOfStudy": "Computer Science",
    }
    max_retries = 30
    for _ in range(max_retries):
        response = requests.get(url, params=params)
        if response.status_code == 200:
            return response.json()
        time.sleep(1)  # brief back-off; Semantic Scholar rate-limits aggressively
    raise RuntimeError(
        f"Semantic Scholar search failed after {max_retries} attempts "
        f"(last status {response.status_code})"
    )
|
|
|
|
def split_text_into_chunks(text, chunk_size=300):
    """Split *text* into whitespace-normalized chunks of ``chunk_size`` words."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + chunk_size]))
        start += chunk_size
    return chunks
|
|
|
|
def download_pdf(paper):
    """Download *paper*'s open-access PDF and return its text in word chunks.

    Best-effort: any download or parsing failure yields ``[]`` so callers can
    skip the paper rather than crash.  A missing ``openAccessPdf`` key still
    raises (as in the original, where the lookup sat outside the ``try``).
    """
    pdf_url = paper["openAccessPdf"]["url"]
    try:
        # Timeout added so one dead host cannot stall the whole pipeline.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        file_object = BytesIO(response.content)
        extracted_text = extract_chapter(file_object)
        return split_text_into_chunks(extracted_text)
    except Exception:
        # Narrowed from a bare ``except:`` — still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        return []
|
|
|
|
|
|
def recommendation(s2_id, limit=500):
    """Fetch up to *limit* recommended papers for Semantic Scholar id *s2_id*.

    Returns the parsed JSON response (fields: url, title, abstract,
    publicationDate, isOpenAccess, openAccessPdf).  Retries are bounded
    instead of looping forever on a persistently failing API (matches the
    fix in ``search_paper``).

    Raises:
        RuntimeError: if the API still fails after ``max_retries`` attempts.
    """
    url = (
        "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
        f"{s2_id}"
    )
    params = {
        "limit": limit,
        "fields": "url,title,abstract,publicationDate,isOpenAccess,openAccessPdf",
    }
    max_retries = 30
    for _ in range(max_retries):
        response = requests.get(url, params=params)
        if response.status_code == 200:
            return response.json()
        time.sleep(1)  # brief back-off before retrying
    raise RuntimeError(
        f"Semantic Scholar recommendation failed after {max_retries} attempts "
        f"(last status {response.status_code})"
    )
|
|
|
|
|
|
def extract_chapter(file_object):
    """Return the concatenated text of every page of the PDF in *file_object*.

    NOTE(review): the original loop carried ``extraction_started`` /
    ``page_number_start`` bookkeeping with an early ``break`` whose condition
    (``page_number_start + 1 < page_number``, with ``page_number_start``
    reassigned to ``page_number`` on every iteration) was provably always
    false — so every page was always extracted.  The dead machinery has been
    removed; behavior is unchanged.
    """
    pdf_reader = PyPDF2.PdfReader(file_object)
    return "".join(page.extract_text() for page in pdf_reader.pages)
|
|
|
|
|
|
|
|
class Reviewer:
    """LLM-based reviewer that generates limitation lists for a paper PDF.

    Pipeline: extract text/title/abstract from the PDF, optionally retrieve
    related open-access papers from Semantic Scholar (RAG), then prompt an
    OpenAI-compatible model for the paper's limitations in one review aspect.
    """

    def __init__(self, api, api_base, paper_pdf, aspect, model_name, enable_rag):
        # api / api_base: key and base URL for the user's review model.
        self.api = api
        self.api_base = api_base
        # aspect: "Methodology", "Experimental Design", "Result Analysis",
        # or "Literature Review".
        self.aspect = aspect
        # paper_pdf: raw bytes of the uploaded PDF.
        self.paper_pdf = paper_pdf
        self.model_name = model_name
        # enable_rag: when True, augment the prompt with retrieved papers.
        self.enable_rag = enable_rag

    def review_by_chatgpt(self, paper_list):
        """Run the full review pipeline on ``self.paper_pdf``.

        Returns ``(chat_review_text, retrieved_papers)`` where
        ``retrieved_papers`` is a numbered list of related-paper titles
        ("" when RAG is disabled or retrieval finds nothing).
        """
        text, title, abstract = self.extract_from_paper(self.paper_pdf)
        content = f"Paper to review: \nTitle: {title}\n" + text

        if self.enable_rag:
            papers = self.retrieve_papers(title, abstract)
            if papers is not None:
                retrieval_content = ""
                retrieved_papers = ""
                cnt = 1
                for paper in papers:
                    retrieval_content += f"Relevant Paper {str(cnt)}:\n"
                    retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
                    retrieved_papers += f"{str(cnt)}. {paper['title']}\n"
                    cnt += 1
                # Related-paper context goes in front of the paper itself.
                text = retrieval_content + content
                chat_review_text = self.chat_review(text=text)
            else:
                text = content
                chat_review_text = self.chat_review(text=text)
                retrieved_papers = ""
        else:
            text = content
            chat_review_text = self.chat_review(text=text)
            retrieved_papers = ""

        return chat_review_text, retrieved_papers

    def query_gen(self, abstract):
        """Generate a 5-word TLDR of *abstract* with the private LLM.

        Used as a fallback search query when the exact-title lookup fails.
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        messages = [
            {"role": "system", "content": "Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."},
            {"role": "user", "content": abstract},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
            )
        )
        return responses[0]

    def rerank(self, paper_list, title, abstract):
        """Rank candidate papers by relevance to (*title*, *abstract*).

        Asks the private LLM for a JSON ranking and returns the top 5 papers
        in the model's order.
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        rec_content = ""
        rec_paper_cnt = 1
        for rec_paper in paper_list:
            rec_content += f"Paper {rec_paper_cnt}: {rec_paper['title']}\n{rec_paper['abstract']}\n\n"
            rec_paper_cnt += 1

        rec_content += f"Reference Paper: {title}\n"
        rec_content += f"Abstract: {abstract}\n"

        messages = [
            {"role": "system", "content": f"Given the abstracts of {rec_paper_cnt-1} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."},
            {"role": "user", "content": rec_content},
        ]

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )
        response_data = json.loads(responses[0])

        rec_papers = []
        # The model's indices are 1-based positions in ``paper_list``.
        for rec_num in response_data["ranking"][:5]:
            rec_papers.append(paper_list[int(rec_num) - 1])
        return rec_papers

    def extract_related_content(self, papers, aspect):
        """Distill each related paper down to the sections relevant to *aspect*.

        Two LLM passes: (1) a yes/no relevance filter over 300-word chunks of
        each downloaded PDF; (2) a per-paper JSON consolidation into a single
        "revised_text".  Papers with no relevant chunks fall back to their
        abstract.  Returns a list of ``{"title", "content"}`` dicts.
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        messages = []
        chunk_index_map = []   # (paper_idx, chunk_idx) per request, to map responses back
        paper_chunk_list = []
        for paper_idx, paper in enumerate(papers):
            paper_chunks = download_pdf(paper)
            paper_chunk_list.append(paper_chunks)

            SYSTEM_INPUT = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."

            for chunk_idx, paper_chunk in enumerate(paper_chunks):
                message = [
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": paper_chunk},
                ]
                messages.append(message)
                chunk_index_map.append((paper_idx, chunk_idx))

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=100,
            )
        )

        paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]

        # Keep only the chunks the filter judged relevant.
        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
            if response.strip().lower().startswith("yes"):
                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"

        # Fallback: use the abstract when nothing was judged relevant
        # (including when the PDF download failed and produced no chunks).
        for idx, paper_data in enumerate(paper_data_list):
            if not paper_data["content"].strip():
                paper_data["content"] = papers[idx]["abstract"]

        # Aspect-specific consolidation prompt.
        # NOTE(review): there is no ``else`` branch — for an unknown aspect
        # the yes/no filter prompt above would be reused; the UI only offers
        # these four aspects, so this path should be unreachable.
        if aspect == "Methodology":
            SYSTEM_INPUT = """Concatenate all the content from the methodology sections of a paper.
Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.
Organize the result in JSON format as follows:
{
"revised_text": str, not dict, not a summary
}
"""
        elif aspect == "Result Analysis":
            SYSTEM_INPUT = """Concatenate all the content from the result analysis sections of a paper.
Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.
Organize the result in JSON format as follows:
{
"revised_text": str, not dict, not a summary
}
"""
        elif aspect == "Experimental Design":
            SYSTEM_INPUT = """Concatenate all the content from the experimental design sections of a paper.
Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.
Organize the result in JSON format as follows:
{
"revised_text": str, not dict, not a summary
}
"""
        elif aspect == "Literature Review":
            SYSTEM_INPUT = """Concatenate all the content from the literature review sections of a paper.
Remove sentences that are irrelevant to the literature review, and keep details about the related works.
Organize the result in JSON format as follows:
{
"revised_text": str, not dict, not a summary
}
"""
        messages = []
        for paper_data in paper_data_list:
            message = [
                {"role": "system", "content": SYSTEM_INPUT},
                {"role": "user", "content": paper_data["content"]},
            ]
            messages.append(message)

        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=messages,
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )

        results = []
        for paper_data, response in zip(paper_data_list, responses):
            response = json.loads(response)
            results.append({"title": paper_data["title"], "content": response["revised_text"]})
        return results

    def chat_review(self, text):
        """Ask the review model for the paper's limitations.

        Returns a numbered list of limitations as one string, or an
        "Error: ..." string when the API call itself fails.
        """
        os.environ["OPENAI_BASE_URL"] = self.api_base
        os.environ["OPENAI_API_KEY"] = self.api
        client = AsyncOpenAI()

        if self.enable_rag:
            messages = [
                {"role": "system", "content": f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate the major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself and return only the list of limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"},
                {"role": "user", "content": text},
            ]
        else:
            messages = [
                {"role": "system", "content": f"Read the following scientific paper and generate major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself and return only the limitations. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>"},
                {"role": "user", "content": text},
            ]
        try:
            responses = asyncio.run(
                generate_from_openai_chat_completion(
                    client,
                    messages=[messages],
                    engine_name=self.model_name,
                    max_tokens=1000,
                    requests_per_minute=20,
                )
            )
            try:
                # Happy path: the model returned the requested JSON.
                limitations = json.loads(responses[0])["limitations"]
                result = ""
                limit_cnt = 1
                for limitation in limitations:
                    result += f"{str(limit_cnt)}. {limitation}\n"
                    limit_cnt += 1
            except (json.JSONDecodeError, KeyError, TypeError):
                # Fallback (narrowed from a bare ``except:``): have a second
                # model re-extract the list from the malformed response.
                # ("inthe" typo in the original prompt fixed.)
                SYSTEM_INPUT = "Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {\"limitations\": <a list of limitations>}. If there is no valid response in the output, return {\"limitations\": {}}"
                messages = [
                    {"role": "system", "content": SYSTEM_INPUT},
                    {"role": "user", "content": responses[0]},
                ]
                responses = asyncio.run(
                    generate_from_openai_chat_completion(
                        client,
                        messages=[messages],
                        engine_name="gpt-4o-mini",
                        max_tokens=1000,
                        requests_per_minute=20,
                        response_format={"type": "json_object"},
                    )
                )
                limitations = json.loads(responses[0])["limitations"]
                result = ""
                limit_cnt = 1
                for limitation in limitations:
                    result += f"{str(limit_cnt)}. {limitation}\n"
                    limit_cnt += 1
        except Exception as e:
            # Surface API/parse failures to the UI rather than crashing.
            result = "Error: " + str(e)

        print("********" * 10)
        print(result)
        print("********" * 10)
        return result

    def retrieve_papers(self, title, abstract):
        """Retrieve related open-access papers for RAG.

        First tries an exact title match on Semantic Scholar and pulls up to
        20 recommendations for it; otherwise searches with an LLM-generated
        5-word query and pulls up to 5 recommendations per hit.  Candidates
        are reranked and distilled per aspect.  Returns a list of
        ``{"title", "content"}`` dicts, or ``None`` when nothing usable is
        found.
        """
        query = title
        search_results = search_paper(query)
        data = search_results.get("data")
        if data and data[0]["title"].lower() == title.lower():
            # BUG FIX: original read ``search_results[0]`` — a KeyError on
            # the dict returned by the API; the hit lives under "data".
            search_result = data[0]
            retrieval = recommendation(search_result["paperId"])
            recommended_paper_list = []
            for recommended_paper in retrieval["recommendedPapers"]:
                if recommended_paper["abstract"] is None:
                    continue
                if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"] is not None:
                    recommended_paper_list.append(recommended_paper)

                if len(recommended_paper_list) >= 20:
                    break

        else:
            # No exact title hit: search with a generated TLDR query instead.
            query = self.query_gen(abstract)
            search_results = search_paper(query)
            recommended_paper_list = []
            if not search_results.get("data"):
                return None
            for search_result in search_results["data"]:
                retrieval = recommendation(search_result["paperId"])
                recommended_papers = []
                for recommended_paper in retrieval["recommendedPapers"]:
                    if recommended_paper["abstract"] is None:
                        continue
                    if recommended_paper["isOpenAccess"] and recommended_paper["openAccessPdf"] is not None:
                        recommended_papers.append(recommended_paper)

                    if len(recommended_papers) >= 5:
                        break
                recommended_paper_list.extend(recommended_papers)

        if recommended_paper_list == []:
            return None
        final_papers = self.rerank(recommended_paper_list, title, abstract)
        retrieved_papers = self.extract_related_content(final_papers, self.aspect)

        return retrieved_papers

    def extract_from_paper(self, pdf_path):
        """Extract ``(full_text, title, abstract)`` from raw PDF bytes.

        ``pdf_path`` is raw PDF *bytes* despite the name (it is fed to
        ``BytesIO`` and ``fitz.open(stream=...)``).  Title and abstract are
        read by a vision-capable LLM from a PNG render of page 1; the body
        text comes from PyPDF2.
        """
        os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
        os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
        client = AsyncOpenAI()

        file_object = BytesIO(pdf_path)
        pdf_reader = PyPDF2.PdfReader(file_object)

        # Render the first page to PNG and ship it as a base64 data URL.
        doc = fitz.open(stream=pdf_path, filetype="pdf")
        page = doc.load_page(0)
        pix = page.get_pixmap()
        image_bytes = pix.tobytes("png")

        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        USER_INPUT = [{"type": "text", "text": "The first page of the paper: "}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}]
        messages = [
            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."},
            {"role": "user", "content": USER_INPUT},
        ]
        responses = asyncio.run(
            generate_from_openai_chat_completion(
                client,
                messages=[messages],
                engine_name="gpt-4o-mini",
                max_tokens=1000,
                requests_per_minute=20,
                response_format={"type": "json_object"},
            )
        )

        response = json.loads(responses[0])
        title = response["title"]
        abstract = response["abstract"]

        # Concatenate every page's text.  The original loop's early-break
        # bookkeeping could never trigger (its condition was always false),
        # so this is behavior-identical — and consistent with
        # ``extract_chapter``.
        extracted_text = "".join(page.extract_text() for page in pdf_reader.pages)
        return extracted_text, title, abstract
|
|
|
|
def main(api, api_base, paper_pdf, aspect, model_name, enable_rag):
    """Gradio callback: review the uploaded paper.

    Returns ``(retrieved_content, comments, output2)`` for the three output
    boxes: retrieved-literature list, the review text, and a status line.
    Fix: the timing message previously rendered as "Processing Time:1.2seconds"
    (missing spaces).
    """
    start_time = time.time()

    comments = ''
    output2 = ''
    retrieved_content = ''
    if not api or not paper_pdf:
        comments = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
        output2 = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
    else:
        try:
            reviewer1 = Reviewer(api, api_base, paper_pdf, aspect, model_name, enable_rag)
            comments, retrieved_content = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
            time_used = time.time() - start_time
            output2 = "Processing Time: " + str(round(time_used, 2)) + " seconds"
        except Exception as e:
            # Show the failure in both output boxes instead of crashing the UI.
            comments = "Error: " + str(e)
            output2 = "Error: " + str(e)
    return retrieved_content, comments, output2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio front-end
# ---------------------------------------------------------------------------

# Demo title shown in the page header.
title = "LimitGen"

# HTML blurb rendered under the title.
description = '''<div align='left'>
<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
</div>
'''

# Input widgets, in the positional order expected by ``main``:
# (api, api_base, paper_pdf, aspect, model_name, enable_rag).
inp = [gradio.Textbox(label="Input your API-key",
                      value="",
                      type='password'),
       gradio.Textbox(label="Input the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
                      value="https://api.openai.com/v1"),
       # ``type="binary"`` hands the PDF to ``main`` as raw bytes.
       gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)", type="binary"),
       gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
                    value="Methodology",
                    label="Select the aspect"),
       gradio.Textbox(label="Input the model name",
                      value="gpt-4o-mini"),
       gradio.Checkbox(label="Enable RAG", value=False)
       ]

# Three output boxes matching ``main``'s return tuple:
# retrieved literature, the review itself, and timing/status info.
chat_reviewer_gui = gradio.Interface(fn=main,
                                     inputs=inp,
                                     outputs=[gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
                                     title=title,
                                     description=description)

# Launch the demo (quiet mode, REST API endpoints hidden).
chat_reviewer_gui.launch(quiet=True, show_api=False)