import traceback

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from marker.convert import convert_single_pdf
from marker.output import markdown_exists, save_markdown, get_markdown_filepath
from marker.pdf.utils import find_filetype
from marker.pdf.extract_text import get_length_of_text
from marker.models import load_all_models
from marker.settings import settings
from marker.logger import configure_logging

configure_logging()

MAX_PAGES = 20
MIN_LENGTH = 200

settings.EXTRACT_IMAGES = False
settings.DEBUG = False
model_refs = load_all_models()
metadata = {}
model_name = "maxidl/arena-test"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
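# Note: torch_dtype=torch.bfloat16 assumes an Ampere-or-newer GPU; torch.float16
# is a common fallback on older cards. device_map="auto" requires the
# `accelerate` package to be installed so the weights can be placed automatically.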
title = "# Placeholder Title"
steps = """Placeholder Description"""
# steps = """1. Converts uploaded pdf file to markdown. You can edit the intermediate markdown output.\n2. Generates a review for the paper"""
def convert_file(filepath):
    # Convert the uploaded PDF to markdown with marker, capped at MAX_PAGES pages.
    full_text, images, out_metadata = convert_single_pdf(
        filepath, model_refs, metadata=metadata, max_pages=MAX_PAGES
    )
    return full_text.strip()
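# convert_single_pdf also returns extracted images and document metadata; with
# settings.EXTRACT_IMAGES = False above, the images dict stays empty, so both
# extras are discarded here.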
def process_file(file):
    print(file.name)
    filepath = file.name
    try:
        if MIN_LENGTH:
            # Reject non-PDF uploads and documents that are too short to review.
            filetype = find_filetype(filepath)
            if filetype == "other":
                raise ValueError("unsupported file type")
            length = get_length_of_text(filepath)
            if length < MIN_LENGTH:
                raise ValueError("extracted text is too short")
        paper_text = convert_file(filepath)
        if len(paper_text) <= MIN_LENGTH:
            raise ValueError("converted markdown is too short")
    except Exception as e:
        print(traceback.format_exc())
        print(f"Error converting {filepath}: {e}")
        return "Error processing pdf"
    return paper_text
@spaces.GPU  # run on a ZeroGPU-allocated GPU (matches the `spaces` import above)
def generate(paper_text):
    messages = [
        {"role": "system", "content": "You are a pirate."},
        {"role": "user", "content": paper_text},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    generated_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
    )
    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids = [
        output[len(prompt):] for prompt, output in zip(input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
with gr.Blocks() as demo:
    title = gr.Markdown(title)
    steps = gr.Markdown(steps)
    instr = gr.Markdown("## Upload your paper as a pdf file")
    file_input = gr.File(file_types=[".pdf"], file_count="single")
    markdown_field = gr.Textbox(label="Markdown", max_lines=20, autoscroll=False)
    # generate_button = gr.Button("Generate Review", interactive=not markdown_field)
    generate_button = gr.Button("Generate Review")
    file_input.upload(process_file, file_input, markdown_field)
    # markdown_field.change(lambda text: gr.update(interactive=True) if len(text) > 1000 else gr.update(interactive=False), markdown_field, generate_button)
    review_field = gr.Markdown(label="Review")
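    # Disable the button while a review is being generated, then re-enable it.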
    generate_button.click(
        fn=lambda: gr.update(interactive=False), inputs=None, outputs=generate_button
    ).then(
        generate, markdown_field, review_field
    ).then(
        fn=lambda: gr.update(interactive=True), inputs=None, outputs=generate_button
    )

demo.title = "Paper Review Generator"

if __name__ == "__main__":
    demo.launch()