# openreviewer / app.py
# (Hugging Face Space file-viewer header removed; kept as a comment so the
#  module parses as valid Python.)
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from marker.convert import convert_single_pdf
from marker.output import markdown_exists, save_markdown, get_markdown_filepath
from marker.pdf.utils import find_filetype
from marker.pdf.extract_text import get_length_of_text
from marker.models import load_all_models
from marker.settings import settings
from marker.logger import configure_logging
import traceback
configure_logging()
MAX_PAGES = 20
MIN_LENGTH=200
settings.EXTRACT_IMAGES = False
settings.DEBUG = False
model_refs = load_all_models()
metadata = {}
model_name = "maxidl/arena-test"
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
title = "# Placeholder Title"
steps = """Placeholder Description"""
# steps = """1. Converts uploaded pdf file to markdown. You can edit the intermediate markdown output.\n2. Generates a review for the paper"""
@spaces.GPU(duration=60)
def convert_file(filepath):
full_text, images, out_metadata = convert_single_pdf(
filepath, model_refs, metadata=metadata, max_pages=MAX_PAGES
)
return full_text.strip()
def process_file(file):
print(file.name)
filepath = file.name
try:
if MIN_LENGTH:
filetype = find_filetype(filepath)
if filetype == "other":
raise ValueError()
length = get_length_of_text(filepath)
if length < MIN_LENGTH:
raise ValueError()
paper_text = convert_file(filepath)
if not len(paper_text) > MIN_LENGTH:
raise ValueError()
except Exception as e:
print(traceback.format_exc())
print(f"Error converting {filepath}: {e}")
return "Error processing pdf"
return paper_text
@spaces.GPU(duration=60)
def generate(paper_text):
messages = [
{"role": "system", "content": "You are a pirate."},
{"role": "user", "content": paper_text}
]
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors='pt'
).to(model.device)
generated_ids = model.generate(
input_ids=input_ids,
max_new_tokens=256
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
return response
# return "Success"
with gr.Blocks() as demo:
title = gr.Markdown(title)
steps = gr.Markdown(steps)
instr = gr.Markdown("## Upload your paper as a pdf file")
file_input = gr.File(file_types=[".pdf"], file_count="single")
markdown_field = gr.Textbox(label="Markdown", max_lines=20, autoscroll=False)
# generate_button = gr.Button("Generate Review", interactive=not markdown_field)
generate_button = gr.Button("Generate Review")
file_input.upload(process_file, file_input, markdown_field)
# markdown_field.change(lambda text: gr.update(interactive=True) if len(text) > 1000 else gr.update(interactive=False), markdown_field, generate_button)
review_field = gr.Markdown(label="Review")
# generate_button.click(fn=lambda: gr.update(interactive=False), inputs=None, outputs=generate_button).then(generate, markdown_field, review_field).then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=generate_button)
generate_button.click(fn=lambda: gr.update(interactive=False), inputs=None, outputs=generate_button).then(generate, markdown_field, review_field).then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=generate_button)
demo.title = "Paper Review Generator"
if __name__ == "__main__":
demo.launch()