LimitGen

Running

App Files Files Community

LimitGen / app.py

zjXu11

Update app.py

64278dc verified 3 months ago

raw

history blame

24.1 kB

	import numpy as np
	import os
	import re
	from io import BytesIO
	import datetime
	import time
	import openai, tenacity
	import argparse
	import configparser
	import json
	import fitz
	import PyPDF2
	import gradio
	import sys
	from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk, OCRResponse
	from pathlib import Path
	utils_dir = Path(__file__).parent / 'utils'
	sys.path.append(str(utils_dir))
	from openai_utils import *
	import base64
	from pdf2image import convert_from_bytes
	import requests
	import bibtexparser
	from pybtex.database import parse_string
	from pybtex.plugin import find_plugin

	# Credentials are read from the process environment once, at import time.
	# Each value is None when the corresponding variable is not set.
	PRIVATE_API_KEY = os.environ.get('PRIVATE_API_KEY')
	PRIVATE_API_BASE = os.environ.get('PRIVATE_API_BASE')
	MISTRAL_API = os.environ.get('MISTRAL_API')

	def insert_sentence(text, sentence, interval):
	    """Insert *sentence* after every *interval*-th word of each line of *text*.

	    Lines are split on newlines and words on arbitrary whitespace, so runs of
	    whitespace inside a line collapse to single spaces in the result. The
	    word counter restarts at the beginning of every line.
	    """
	    rebuilt_lines = []
	    for raw_line in text.split('\n'):
	        pieces = []
	        for position, token in enumerate(raw_line.split(), start=1):
	            pieces.append(token)
	            if position % interval == 0:
	                pieces.append(sentence)
	        rebuilt_lines.append(' '.join(pieces))
	    return '\n'.join(rebuilt_lines)


	def format_bibtex(paper, style='apa'):
	    """Render the BibTeX citation stored on *paper* as plain text.

	    The BibTeX source is read from ``paper["citationStyles"]["bibtex"]`` and
	    formatted with the pybtex formatting plugin named by *style*. Returns the
	    rendered entries joined by newlines, or a fixed message when the BibTeX
	    string contains no entries.
	    """
	    parsed = parse_string(paper["citationStyles"]["bibtex"], 'bibtex')
	    # The formatter is resolved before the emptiness check, so an unknown
	    # style name raises even when there is nothing to format.
	    style_formatter = find_plugin('pybtex.style.formatting', style)()
	    bib_entries = list(parsed.entries.values())
	    if len(bib_entries) == 0:
	        return "No valid entries found."
	    rendered = [
	        formatted.text.render_as('text')
	        for formatted in style_formatter.format_entries(bib_entries)
	    ]
	    return '\n'.join(rendered)

	def search_paper(query, max_retries=30, retry_delay=1, timeout=30):
	    """Search Semantic Scholar for Computer Science papers matching *query*.

	    Args:
	        query: Free-text search string (for example a paper title).
	        max_retries: How many times to retry after a retryable failure.
	        retry_delay: Seconds to sleep between attempts.
	        timeout: Per-request timeout in seconds.

	    Returns:
	        The decoded JSON body of the search response.

	    Raises:
	        requests.RequestException: on connection problems, timeouts, or an
	            error status that is still present after the retries.
	        RuntimeError: if the last response is a non-200 status that
	            ``raise_for_status`` does not treat as an error.
	    """
	    url = "https://api.semanticscholar.org/graph/v1/paper/search"
	    # Passing the values through ``params`` lets requests URL-encode them;
	    # a raw title containing '&', '#' or '?' would otherwise corrupt the query.
	    params = {
	        "query": query,
	        "limit": 3,
	        "fields": "url,title,abstract",
	        "fieldsOfStudy": "Computer Science",
	    }

	    response = None
	    for attempt in range(max_retries + 1):
	        if attempt:
	            time.sleep(retry_delay)
	        response = requests.get(url, params=params, timeout=timeout)
	        if response.status_code == 200:
	            return response.json()
	        # Only rate limiting (429) and server errors are worth retrying;
	        # any other status will not change on a retry, so stop instead of
	        # looping forever.
	        if response.status_code != 429 and response.status_code < 500:
	            break

	    response.raise_for_status()
	    raise RuntimeError(
	        f"Semantic Scholar search returned unexpected status {response.status_code}"
	    )

	def get_combined_markdown(pdf_response: OCRResponse) -> str:
	    """Return the markdown of all OCR pages as one string.

	    Pages are taken in the order given by ``pdf_response.pages`` and joined
	    with a blank line between consecutive pages.
	    """
	    return "\n\n".join(page.markdown for page in pdf_response.pages)

	def split_text_into_chunks(pdf_response: OCRResponse) -> list[str]:
	    """Split an OCR response into chunks, one per page.

	    Returns a list holding the markdown of each page in
	    ``pdf_response.pages``, in page order. The return annotation previously
	    said ``str`` although a list has always been returned.
	    """
	    return [page.markdown for page in pdf_response.pages]

	def download_pdf(paper, timeout=60):
	    """Download a paper's open-access PDF and return its OCR'd page chunks.

	    Args:
	        paper: Paper record; the PDF location is read from
	            ``paper["openAccessPdf"]["url"]``.
	        timeout: Timeout in seconds for the PDF download request.

	    Returns:
	        The list produced by ``extract_chapter`` for the downloaded file, or
	        an empty list when the record has no usable URL or when the download
	        or OCR step fails. Failures are deliberately swallowed so the caller
	        can carry on without this paper's full text.
	    """
	    pdf_info = paper.get("openAccessPdf") if isinstance(paper, dict) else None
	    pdf_url = pdf_info.get("url") if isinstance(pdf_info, dict) else None
	    if not pdf_url:
	        # Missing, None or empty URL: nothing to download.
	        return []

	    try:
	        response = requests.get(pdf_url, timeout=timeout)
	        response.raise_for_status()
	        return extract_chapter(BytesIO(response.content))
	    except Exception:
	        # Best effort by design. ``Exception`` rather than a bare ``except``
	        # so KeyboardInterrupt and SystemExit are not swallowed.
	        return []


	def recommendation(s2_id, limit=500, max_retries=30, retry_delay=1, timeout=30):
	    """Fetch Semantic Scholar recommendations for the paper *s2_id*.

	    Args:
	        s2_id: Semantic Scholar paper id to get recommendations for.
	        limit: Maximum number of recommended papers to request.
	        max_retries: How many times to retry after a retryable failure.
	        retry_delay: Seconds to sleep between attempts.
	        timeout: Per-request timeout in seconds.

	    Returns:
	        The decoded JSON body of the recommendations response.

	    Raises:
	        requests.RequestException: on connection problems, timeouts, or an
	            error status that is still present after the retries.
	        RuntimeError: if the last response is a non-200 status that
	            ``raise_for_status`` does not treat as an error.
	    """
	    url = f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{s2_id}"
	    params = {
	        "limit": limit,
	        "fields": "url,title,abstract,publicationDate,isOpenAccess,openAccessPdf,citationStyles",
	    }

	    response = None
	    for attempt in range(max_retries + 1):
	        if attempt:
	            time.sleep(retry_delay)
	        response = requests.get(url, params=params, timeout=timeout)
	        if response.status_code == 200:
	            return response.json()
	        # Only rate limiting (429) and server errors are worth retrying;
	        # e.g. a 404 for an unknown id would otherwise loop forever.
	        if response.status_code != 429 and response.status_code < 500:
	            break

	    response.raise_for_status()
	    raise RuntimeError(
	        f"Semantic Scholar recommendations returned unexpected status {response.status_code}"
	    )


	def extract_chapter(file_object):
	    """Run Mistral OCR on a PDF file object and return its per-page chunks.

	    The bytes read from *file_object* are uploaded under the name
	    ``retrieve.pdf``, a signed URL is requested for the upload, and the OCR
	    result is split into chunks with ``split_text_into_chunks``.
	    """
	    ocr_client = Mistral(api_key=MISTRAL_API)
	    pdf_payload = {
	        "file_name": "retrieve.pdf",
	        "content": file_object.read(),
	    }
	    upload = ocr_client.files.upload(file=pdf_payload, purpose="ocr")

	    url_info = ocr_client.files.get_signed_url(file_id=upload.id, expiry=1)
	    ocr_result = ocr_client.ocr.process(
	        document=DocumentURLChunk(document_url=url_info.url),
	        model="mistral-ocr-latest",
	        include_image_base64=True,
	    )
	    return split_text_into_chunks(ocr_result)



	class Reviewer:
	    """Generate limitations of an uploaded paper with an LLM.

	    The review model and endpoint are supplied by the user (``api``,
	    ``api_base``, ``model_name``). Internal helper calls (title/abstract
	    extraction, query generation, reranking, content filtering, output
	    repair) use the private endpoint configured through ``PRIVATE_API_KEY``
	    and ``PRIVATE_API_BASE``. When ``enable_rag`` is true, related papers are
	    retrieved from Semantic Scholar and prepended to the review prompt.
	    """

	    # Model used for every internal helper call.
	    _HELPER_MODEL = "gpt-4o-mini"

	    # aspect -> (section name used in the prompt, aspect-specific filtering rule)
	    _CONDENSE_RULES = {
	        "Methodology": (
	            "methodology",
	            "Remove sentences that are irrelevant to the proposed methodology or models, and keep details about key components and innovations.",
	        ),
	        "Result Analysis": (
	            "result analysis",
	            "Remove sentences that are irrelevant to the result analysis of the experiments, and keep details about the metrics, case study and how the paper presents the results.",
	        ),
	        "Experimental Design": (
	            "experimental design",
	            "Remove sentences that are irrelevant to the experiment setup, and keep details about the datasets, baselines, and main experimental, ablation studies.",
	        ),
	        "Literature Review": (
	            "literature review",
	            "Remove sentences that are irrelevant to the literature review, and keep details about the related works.",
	        ),
	    }

	    def __init__(self, api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
	        """Store the review settings.

	        Args:
	            api: API key for the review model.
	            api_base: Base URL of the review model's endpoint.
	            paper_pdf: Raw bytes of the PDF to review.
	            aspect: Aspect of the paper to review (e.g. "Methodology").
	            model_name: Name of the review model.
	            limit_num: Number of limitations to generate; converted with ``int``.
	            enable_rag: Whether to retrieve related papers first.
	        """
	        self.api = api
	        self.api_base = api_base
	        self.aspect = aspect
	        self.paper_pdf = paper_pdf
	        self.model_name = model_name
	        self.limit_num = int(limit_num)
	        self.enable_rag = enable_rag

	    # ------------------------------------------------------------------ helpers

	    @staticmethod
	    def _open_client(api_key, api_base):
	        """Export the endpoint settings and return a new async client for them."""
	        os.environ["OPENAI_BASE_URL"] = api_base
	        os.environ["OPENAI_API_KEY"] = api_key
	        # A fresh client is built after every switch: a client created earlier
	        # presumably keeps the settings that were active at its construction
	        # (OpenAI SDK behaviour; AsyncOpenAI is provided by openai_utils).
	        return AsyncOpenAI()

	    @staticmethod
	    def _chat(client, conversations, engine_name, max_tokens, requests_per_minute, **options):
	        """Run a batch of chat completions and return the list of responses."""
	        # Imported locally so this class does not depend on a star import
	        # happening to expose ``asyncio``.
	        import asyncio

	        return asyncio.run(
	            generate_from_openai_chat_completion(
	                client,
	                messages=conversations,
	                engine_name=engine_name,
	                max_tokens=max_tokens,
	                requests_per_minute=requests_per_minute,
	                **options,
	            )
	        )

	    @staticmethod
	    def _format_limitations(limitations, limit_num, separator):
	        """Number the first *limit_num* limitations, ending each with *separator*.

	        Raises:
	            ValueError: if *limitations* is not a list (e.g. the model
	                returned a string or an object instead).
	        """
	        if not isinstance(limitations, list):
	            raise ValueError("model response does not contain a list of limitations")
	        return "".join(
	            f"{number}. {limitation}{separator}"
	            for number, limitation in enumerate(limitations[:limit_num], start=1)
	        )

	    @staticmethod
	    def _pick_ranked(paper_list, ranking, top_k=5):
	        """Map 1-based ranking numbers to papers, ignoring unusable entries.

	        Entries that are not integers, fall outside ``1..len(paper_list)`` or
	        repeat an earlier entry are skipped. If nothing usable remains, the
	        first *top_k* papers are returned in their original order.
	        """
	        chosen = []
	        seen = set()
	        if isinstance(ranking, list):
	            for entry in ranking:
	                try:
	                    position = int(entry)
	                except (TypeError, ValueError):
	                    continue
	                # 0 would silently select the last paper via negative indexing.
	                if position < 1 or position > len(paper_list) or position in seen:
	                    continue
	                seen.add(position)
	                chosen.append(paper_list[position - 1])
	                if len(chosen) >= top_k:
	                    break
	        return chosen if chosen else list(paper_list[:top_k])

	    @staticmethod
	    def _open_access_papers(retrieval, cap):
	        """Return up to *cap* recommended papers that have an abstract and an open-access PDF."""
	        picked = []
	        for candidate in (retrieval.get("recommendedPapers") or []):
	            if candidate.get("abstract") is None:
	                continue
	            if candidate.get("isOpenAccess") and candidate.get("openAccessPdf") is not None:
	                picked.append(candidate)
	            if len(picked) >= cap:
	                break
	        return picked

	    # --------------------------------------------------------------- public API

	    def review_by_chatgpt(self, paper_list):
	        """Review ``self.paper_pdf`` and return ``(review_text, retrieved_papers)``.

	        ``paper_list`` is accepted for interface compatibility and is not used.
	        ``retrieved_papers`` is a numbered citation list of the related papers
	        that were put into the prompt, or an empty string when retrieval is
	        disabled or found nothing.
	        """
	        text, title, abstract = self.extract_from_paper(self.paper_pdf)
	        content = f"Paper to review: \nTitle: {title}\n" + text

	        papers = self.retrieve_papers(title, abstract) if self.enable_rag else None
	        if not papers:
	            return self.chat_review(text=content), ""

	        context_parts = []
	        citation_parts = []
	        for number, paper in enumerate(papers, start=1):
	            context_parts.append(f"Relevant Paper {number}:\n")
	            context_parts.append(f"Title: {paper['title']}\n{paper['content']}\n\n")
	            try:
	                formatted_citation = format_bibtex(paper, 'unsrt')
	            except Exception:
	                # A malformed BibTeX record should not abort the whole review;
	                # fall back to the bare title.
	                formatted_citation = str(paper['title'])
	            citation_parts.append(f"{number}. {formatted_citation} ({paper['url']})\n\n")

	        review_text = self.chat_review(text="".join(context_parts) + content)
	        return review_text, "".join(citation_parts)

	    def query_gen(self, abstract):
	        """Ask the helper model for a five-word search query summarising *abstract*."""
	        client = self._open_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

	        messages = [
	            {"role": "system", "content": "Generate a TLDR in 5 words of the following text. Do not use any proposed model names or dataset names from the text. Output only the 5 words without punctuation."},
	            {"role": "user", "content": abstract},
	        ]

	        responses = self._chat(
	            client,
	            [messages],
	            engine_name=self._HELPER_MODEL,
	            max_tokens=1000,
	            requests_per_minute=20,
	        )
	        return responses[0]

	    def rerank(self, paper_list, title, abstract):
	        """Return up to five papers from *paper_list* ranked by relevance to the reference paper.

	        The helper model is asked for a JSON ranking of 1-based paper numbers.
	        Unusable numbers are ignored, and if the answer cannot be parsed the
	        first five candidates are returned unchanged.
	        """
	        client = self._open_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

	        listing = "".join(
	            f"Paper {number}: {candidate['title']}\n{candidate['abstract']}\n\n"
	            for number, candidate in enumerate(paper_list, start=1)
	        )
	        listing += f"Reference Paper: {title}\n"
	        listing += f"Abstract: {abstract}\n"

	        messages = [
	            {"role": "system", "content": f"Given the abstracts of {len(paper_list)} papers and the abstract of a reference paper, rank the papers in order of relevance to the reference paper. Output the top 5 as a list of integers in JSON format: {{'ranking': [1, 10, 4, 2, 8]}}."},
	            {"role": "user", "content": listing},
	        ]

	        responses = self._chat(
	            client,
	            [messages],
	            engine_name=self._HELPER_MODEL,
	            max_tokens=1000,
	            requests_per_minute=20,
	            response_format={"type": "json_object"},
	        )
	        try:
	            ranking = json.loads(responses[0])["ranking"]
	        except (TypeError, ValueError, KeyError, IndexError):
	            ranking = None
	        return self._pick_ranked(paper_list, ranking)

	    def extract_related_content(self, papers, aspect):
	        """Extract the parts of each retrieved paper that relate to *aspect*.

	        Each paper's PDF is downloaded and split into chunks; the helper model
	        labels every chunk as related ('yes') or not, and the related chunks
	        are then condensed into one text per paper. A paper with no related
	        chunk falls back to its abstract.

	        Returns:
	            A list of dicts with keys ``title``, ``content``,
	            ``citationStyles`` and ``url``, one per input paper.
	        """
	        client = self._open_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

	        filter_prompt = f"Read the following section from a scientific paper. If the section is related to the paper's {aspect}, output 'yes'; otherwise, output 'no'."

	        messages = []
	        chunk_index_map = []  # records which paper (and which chunk of it) each message belongs to
	        paper_chunk_list = []
	        for paper_idx, paper in enumerate(papers):
	            paper_chunks = download_pdf(paper)
	            paper_chunk_list.append(paper_chunks)
	            for chunk_idx, paper_chunk in enumerate(paper_chunks):
	                messages.append([
	                    {"role": "system", "content": filter_prompt},
	                    {"role": "user", "content": paper_chunk},
	                ])
	                chunk_index_map.append((paper_idx, chunk_idx))

	        # Nothing to classify when no PDF could be downloaded.
	        responses = []
	        if messages:
	            responses = self._chat(
	                client,
	                messages,
	                engine_name=self._HELPER_MODEL,
	                max_tokens=1000,
	                requests_per_minute=100,
	            )

	        paper_data_list = [
	            {"title": paper["title"], "content": "", "citationStyles": paper["citationStyles"], "url": paper["url"]}
	            for paper in papers
	        ]

	        for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
	            if (response or "").strip().lower().startswith("yes"):
	                paper_data_list[paper_idx]["content"] += paper_chunk_list[paper_idx][chunk_idx] + "\n"

	        for idx, paper_data in enumerate(paper_data_list):
	            if not paper_data["content"].strip():
	                paper_data["content"] = papers[idx]["abstract"]

	        rule = self._CONDENSE_RULES.get(aspect)
	        if rule is None or not paper_data_list:
	            # No condensing prompt exists for this aspect (or there is nothing
	            # to condense): return the filtered content as it is.
	            return paper_data_list

	        section_name, keep_rule = rule
	        condense_prompt = (
	            f"Concatenate all the content from the {section_name} sections of a paper.\n"
	            f"{keep_rule}\n"
	            "Organize the result in JSON format as follows:\n"
	            "{\n"
	            "\"revised_text\": str, not dict, not a summary\n"
	            "}\n"
	        )

	        messages = [
	            [
	                {"role": "system", "content": condense_prompt},
	                {"role": "user", "content": paper_data["content"]},
	            ]
	            for paper_data in paper_data_list
	        ]

	        responses = self._chat(
	            client,
	            messages,
	            engine_name=self._HELPER_MODEL,
	            max_tokens=5000,
	            requests_per_minute=20,
	            response_format={"type": "json_object"},
	        )

	        results = []
	        for paper_data, response in zip(paper_data_list, responses):
	            try:
	                revised_text = json.loads(response)["revised_text"]
	            except (TypeError, ValueError, KeyError):
	                revised_text = None
	            if not isinstance(revised_text, str) or not revised_text.strip():
	                # Unparseable or empty answer: keep the uncondensed content
	                # instead of failing the whole review.
	                revised_text = paper_data["content"]
	            results.append({
	                "title": paper_data["title"],
	                "content": revised_text,
	                "citationStyles": paper_data["citationStyles"],
	                "url": paper_data["url"],
	            })
	        return results

	    def chat_review(self, text):
	        """Ask the review model for limitations and return them as a numbered list.

	        The review model's answer is parsed as JSON. If that fails, the helper
	        model on the private endpoint is asked to extract the limitations from
	        the raw answer. Any failure is reported as a string starting with
	        ``"Error: "`` instead of being raised.
	        """
	        client = self._open_client(self.api, self.api_base)

	        # The JSON example in both prompts is now closed with '}' (it was
	        # left open before).
	        if self.enable_rag:
	            system_prompt = f"Read the following content from several papers to gain knowledge in the relevant field. Using this knowledge, review a new scientific paper in this field. Based on existing research, identify the limitations of the 'Paper to Review'. Generate {str(self.limit_num)} major limitations related to its {self.aspect} in this paper. Do not include any limitation explicitly mentioned in the paper itself. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}"
	        else:
	            system_prompt = f"Read the following scientific paper and generate {str(self.limit_num)} major limitations in this paper about its {self.aspect}. Do not include any limitation explicitly mentioned in the paper itself. Return only the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}"
	        messages = [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": text},
	        ]

	        try:
	            responses = self._chat(
	                client,
	                [messages],
	                engine_name=self.model_name,
	                max_tokens=1000,
	                requests_per_minute=20,
	            )
	            try:
	                limitations = json.loads(responses[0])["limitations"]
	                result = self._format_limitations(limitations, self.limit_num, "\n")
	            except Exception:
	                # The review model did not return clean JSON: let the helper
	                # model extract the list from the raw output. The "no valid
	                # response" example is an empty list so it can be sliced.
	                repair_prompt = f"Below is an output from an LLM about several limitations of a scientific paper. Please extract the list of limitations and DO NOT make any modification to the original limitations. Return the limitations in the following JSON format: {{\"limitations\": <a list of limitations>}}. If there is no valid response inthe output, return {{\"limitations\": []}}"
	                repair_messages = [
	                    {"role": "system", "content": repair_prompt},
	                    {"role": "user", "content": responses[0]},
	                ]
	                # Build a new client after switching to the private endpoint;
	                # reusing ``client`` would send this call to the user's endpoint.
	                repair_client = self._open_client(PRIVATE_API_KEY, PRIVATE_API_BASE)
	                responses = self._chat(
	                    repair_client,
	                    [repair_messages],
	                    engine_name=self._HELPER_MODEL,
	                    max_tokens=1000,
	                    requests_per_minute=20,
	                    response_format={"type": "json_object"},
	                )
	                limitations = json.loads(responses[0])["limitations"]
	                result = self._format_limitations(limitations, self.limit_num, "\n\n")
	        except Exception as e:
	            result = "Error: " + str(e)

	        print("*******" * 10)
	        print(result)
	        print("*******" * 10)
	        return result

	    def retrieve_papers(self, title, abstract):
	        """Retrieve related open-access papers and their aspect-relevant content.

	        If the top Semantic Scholar hit for *title* matches the title exactly
	        (case-insensitively), up to 20 of its recommendations are used.
	        Otherwise a query is generated from *abstract* and up to 5
	        recommendations are collected per search hit. The candidates are
	        reranked and passed to ``extract_related_content``.

	        Returns:
	            The list produced by ``extract_related_content``, or ``None`` when
	            no candidate paper was found.
	        """
	        # ``.get("data") or []`` also covers a response without results, which
	        # previously raised IndexError on ``["data"][0]``.
	        hits = search_paper(title).get("data") or []

	        if hits and (hits[0].get("title") or "").lower() == title.lower():
	            retrieval = recommendation(hits[0]["paperId"])
	            recommended_paper_list = self._open_access_papers(retrieval, 20)
	        else:
	            query = self.query_gen(abstract)
	            hits = search_paper(query).get("data") or []
	            if not hits:
	                return None
	            recommended_paper_list = []
	            for hit in hits:
	                retrieval = recommendation(hit["paperId"])
	                recommended_paper_list.extend(self._open_access_papers(retrieval, 5))

	        if not recommended_paper_list:
	            return None
	        final_papers = self.rerank(recommended_paper_list, title, abstract)
	        return self.extract_related_content(final_papers, self.aspect)

	    def extract_from_paper(self, pdf_path):
	        """Extract the full text, title and abstract from the PDF bytes *pdf_path*.

	        The title and abstract are read by the helper model from an image of
	        the first page; the full text comes from Mistral OCR of the whole file.

	        Returns:
	            ``(extracted_text, title, abstract)``.
	        """
	        client = self._open_client(PRIVATE_API_KEY, PRIVATE_API_BASE)

	        file_object = BytesIO(pdf_path)

	        # Render page 1 to PNG; close the document even if rendering fails.
	        doc = fitz.open(stream=pdf_path, filetype="pdf")
	        try:
	            image_bytes = doc.load_page(0).get_pixmap().tobytes("png")
	        finally:
	            doc.close()

	        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

	        user_input = [
	            {"type": "text", "text": "The first page of the paper: "},
	            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
	        ]
	        messages = [
	            {"role": "system", "content": "Given the first-page image of a scientific paper in PDF format, extract and return the title and abstract in the following JSON format: {\"title\": \"<extracted title>\", \"abstract\": \"<extracted abstract>\"}."},
	            {"role": "user", "content": user_input},
	        ]
	        responses = self._chat(
	            client,
	            [messages],
	            engine_name=self._HELPER_MODEL,
	            max_tokens=1000,
	            requests_per_minute=20,
	            response_format={"type": "json_object"},
	        )

	        response = json.loads(responses[0])
	        title = response["title"]
	        abstract = response["abstract"]

	        ocr_client = Mistral(api_key=MISTRAL_API)
	        file_object.seek(0)
	        uploaded_file = ocr_client.files.upload(
	            file={
	                "file_name": "upload.pdf",
	                "content": file_object.read(),
	            },
	            purpose="ocr",
	        )

	        signed_url = ocr_client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
	        pdf_response = ocr_client.ocr.process(
	            document=DocumentURLChunk(document_url=signed_url.url),
	            model="mistral-ocr-latest",
	            include_image_base64=True,
	        )
	        extracted_text = get_combined_markdown(pdf_response)

	        return extracted_text, title, abstract

	def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
	    """Gradio callback: validate the inputs, run the review, report the result.

	    Args:
	        api: API key for the review model.
	        api_base: Base URL of the review model's endpoint.
	        paper_pdf: Raw bytes of the uploaded PDF.
	        aspect: Aspect of the paper to review.
	        model_name: Name of the review model.
	        limit_num: Number of limitations to generate, as entered by the user.
	        enable_rag: Whether to retrieve related papers first.

	    Returns:
	        ``(retrieved_content, comments, output2)``: the retrieved literature
	        list, the generated limitations (or an error/validation message), and
	        a status line with the processing time (or the same message).
	    """
	    start_time = time.time()
	    retrieved_content = ''

	    # Return right away on missing inputs. Previously this check was followed
	    # by an independent if/else, so the review still ran and its error
	    # replaced this message.
	    if not api or not paper_pdf or not api_base:
	        comments = "It looks like there's a missing API key/base URL or PDF input. Make sure you've provided the necessary information or uploaded the required file."
	        output2 = "It looks like there's a missing API key or PDF input. Make sure you've provided the necessary information or uploaded the required file."
	        return retrieved_content, comments, output2

	    # ``isdecimal`` only accepts characters that ``int`` can convert, unlike
	    # ``isdigit`` (which accepts e.g. superscript digits and then crashes).
	    limit_text = str(limit_num).strip()
	    if not limit_text.isdecimal() or int(limit_text) <= 0:
	        comments = "The input number is not a positive integer."
	        output2 = "The input number is not a positive integer."
	        return retrieved_content, comments, output2

	    try:
	        reviewer1 = Reviewer(api,api_base, paper_pdf, aspect, model_name, limit_text, enable_rag)
	        comments, retrieved_content = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
	        time_used = time.time() - start_time
	        output2 ="Processing Time："+ str(round(time_used, 2)) +"seconds"
	    except Exception as e:
	        # Top-level boundary of the UI callback: show the failure to the user
	        # instead of raising into Gradio.
	        comments = "Error: "+ str(e)
	        output2 = "Error: "+ str(e)
	    return retrieved_content, comments, output2


	########################################################################################################

	# Title passed to the Gradio interface below.
	title = "LimitGen"


	# HTML description passed to the Gradio interface below.
	# NOTE(review): the <strong> tag is opened but never closed in this markup.
	description = '''<div align='left'>
	<strong>We present a demo for our paper: Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers. Upload the PDF of the paper you want to review, and the demo will automatically generate its identified limitations.
	</div>
	'''

	# Input widgets, listed in the same order as the parameters of main():
	# api, api_base, paper_pdf, aspect, model_name, limit_num, enable_rag.
	inp = [gradio.Textbox(label="Enter your API-key",
	value="",
	type='password'),
	gradio.Textbox(label="Enter the base URL (ending with /v1). Skip this step if using the original OpenAI API.",
	value="https://api.openai.com/v1"),

	# type="binary" is requested so the upload is passed on as file content rather than a path.
	gradio.File(label="Upload the PDF file of your paper (Make sure the PDF is fully uploaded before clicking Submit)",type="binary"),
	gradio.Radio(choices=["Methodology", "Experimental Design", "Result Analysis", "Literature Review"],
	value="Methodology",
	label="Select the aspect"),
	gradio.Dropdown(["gpt-4o-mini","gpt-4o","Qwen/Qwen2.5-7B-Instruct-Turbo", "meta-llama/Meta-Llama-3-70B-Instruct-Turbo", "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"],
	label="Select the model name",
	value="gpt-4o-mini"),
	# The count is entered as text; main() validates it before use.
	gradio.Textbox(label="Enter the number of limitations to generate.",
	value="3"),
	gradio.Checkbox(label="Enable RAG", value=False),
	]

	# The three outputs correspond, in order, to main()'s return values:
	# retrieved_content, comments, output2.
	chat_reviewer_gui = gradio.Interface(fn=main,
	inputs=inp,
	outputs = [gradio.Textbox(lines=6, label="Retrieved Literature"), gradio.Textbox(lines=15, label="Output"), gradio.Textbox(lines=2, label="Resource Statistics")],
	title=title,
	description=description)

	# Start the server. This runs at module level, i.e. whenever the file is executed or imported.
	chat_reviewer_gui .launch(quiet=True, show_api=False)