import re

import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = 'ssocean/NAIP'
device = 'cuda:0'

# Lazily initialized inside predict(): on ZeroGPU the quantized model must be
# (re)loaded within the GPU-decorated function rather than at import time.
model = None
tokenizer = None


@spaces.GPU(duration=60, enable_queue=True)
def predict(title, abstract):
    title = title.replace("\n", " ").strip().replace('’', "'")
    abstract = abstract.replace("\n", " ").strip().replace('’', "'")
    global model, tokenizer
    if model is None:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=1,
            load_in_8bit=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model.eval()
    text = f'''Given a certain paper, Title: {title}\n Abstract: {abstract}. \n Predict its normalized academic impact (between 0 and 1):'''
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probability = torch.sigmoid(outputs.logits).item()
    # Reason for the +0.05 offset: predicted values in this web demo are generally about 0.05
    # lower than in a local deployment (we believe due to differences in the software/hardware
    # environment), so the demo applies this compensation. Do not use it in a local deployment.
    if probability + 0.05 >= 1.0:
        return 1.0
    return round(probability + 0.05, 4)


examples = [
    [
        "SARDet-100K: Towards Open-Source Benchmark and ToolKit for Large-Scale SAR Object Detection",
        '''Synthetic Aperture Radar (SAR) object detection has gained significant attention recently due to its irreplaceable all-weather imaging capabilities. However, this research field suffers from both limited public datasets (mostly comprising <2K images with only mono-category objects) and inaccessible source code. To tackle these challenges, we establish a new benchmark dataset and an open-source method for large-scale SAR object detection. Our dataset, SARDet-100K, is a result of intense surveying, collecting, and standardizing 10 existing SAR detection datasets, providing a large-scale and diverse dataset for research purposes. To the best of our knowledge, SARDet-100K is the first COCO-level large-scale multi-class SAR object detection dataset ever created. With this high-quality dataset, we conducted comprehensive experiments and uncovered a crucial challenge in SAR object detection: the substantial disparities between the pretraining on RGB datasets and finetuning on SAR datasets in terms of both data domain and model structure. To bridge these gaps, we propose a novel Multi-Stage with Filter Augmentation (MSFA) pretraining framework that tackles the problems from the perspective of data input, domain transition, and model migration. The proposed MSFA method significantly enhances the performance of SAR object detection models while demonstrating exceptional generalizability and flexibility across diverse models. This work aims to pave the way for further advancements in SAR object detection. The dataset and code is available at this https URL.'''
    ],
    [
        "OminiControl: Minimal and Universal Control for Diffusion Transformer",
        '''In this paper, we introduce OminiControl, a highly versatile and parameter-efficient framework that integrates image conditions into pre-trained Diffusion Transformer (DiT) models. At its core, OminiControl leverages a parameter reuse mechanism, enabling the DiT to encode image conditions using itself as a powerful backbone and process them with its flexible multi-modal attention processors. Unlike existing methods, which rely heavily on additional encoder modules with complex architectures, OminiControl (1) effectively and efficiently incorporates injected image conditions with only ~0.1% additional parameters, and (2) addresses a wide range of image conditioning tasks in a unified manner, including subject-driven generation and spatially-aligned conditions such as edges, depth, and more. Remarkably, these capabilities are achieved by training on images generated by the DiT itself, which is particularly beneficial for subject-driven generation. Extensive evaluations demonstrate that OminiControl outperforms existing UNet-based and DiT-adapted models in both subject-driven and spatially-aligned conditional generation. Additionally, we release our training dataset, Subjects200K, a diverse collection of over 200,000 identity-consistent images, along with an efficient data synthesis pipeline to advance research in subject-consistent generation.'''
    ],
    [
        "Enhanced ZSSR for Super-resolution Reconstruction of the Historical Tibetan Document Images",
        "Due to the poor preservation and imaging conditions, the image quality of historical Tibetan document images is relatively unsatisfactory. In this paper, we adopt super-resolution technology to reconstruct high quality images of historical Tibetan document. To address the problem of low quantity and poor quality of historical Tibetan document images, we propose the EZSSR network based on the Zero-Shot Super-resolution Network (ZSSR), which borrows the idea of feature pyramid in Deep Laplacian Pyramid Networks (LapSRN) to extract different levels of features while alleviating the ringing artifacts. EZSSR neither requires paired training datasets nor preprocessing stage. The computational complexity of EZSSR is low, and thus, EZSSR can also reconstruct image within the acceptable time frame. Experimental results show that EZSSR reconstructs images with better visual effects and higher PSNR and SSIM values."
    ]
]


def validate_input(title, abstract):
    title = title.replace("\n", " ").strip().replace('’', "'")
    abstract = abstract.replace("\n", " ").strip().replace('’', "'")
    non_latin_pattern = re.compile(r'[^\u0000-\u007F]')
    non_latin_in_title = non_latin_pattern.findall(title)
    non_latin_in_abstract = non_latin_pattern.findall(abstract)

    if len(title.strip().split(' ')) < 3:
        return False, "The title must be at least 3 words long."
    if len(abstract.strip().split(' ')) < 50:
        return False, "The abstract must be at least 50 words long."
    if len((title + abstract).split(' ')) > 1024:
        return True, "Warning: the input length is approaching the tokenization limit (1024) and may be truncated without further warning!"
    if non_latin_in_title:
        return False, f"The title contains invalid characters: {', '.join(non_latin_in_title)}. Only English letters and special symbols are allowed."
    if non_latin_in_abstract:
        return False, f"The abstract contains invalid characters: {', '.join(non_latin_in_abstract)}. Only English letters and special symbols are allowed."
    return True, "Inputs are valid! Good to go!"


def update_button_status(title, abstract):
    valid, message = validate_input(title, abstract)
    if not valid:
        return gr.update(value="Error: " + message), gr.update(interactive=False)
    return gr.update(value=message), gr.update(interactive=True)


with gr.Blocks() as iface:
    gr.Markdown("""
# 📈 Predict the Academic Impact of a Newly Published Paper!
### Estimate a paper's future academic impact from its title and abstract with an LLM.
###### [Full Paper](https://arxiv.org/abs/2408.03934)
###### Please be advised: local inference of the proposed method is instant, but ZeroGPU reinitializes the quantized model on each "Predict", causing a slight delay (typically no more than 30 seconds).
""")
    with gr.Row():
        with gr.Column():
            title_input = gr.Textbox(
                lines=2,
                placeholder='''Enter Paper Title Here... (Title will be processed with 'title.replace("\\n", " ").strip()')''',
                label="Paper Title"
            )
            abstract_input = gr.Textbox(
                lines=5,
                placeholder='''Enter Paper Abstract Here... (Abstract will be processed with 'abstract.replace("\\n", " ").strip()')''',
                label="Paper Abstract"
            )
            validation_status = gr.Textbox(label="Validation Status", interactive=False)
            submit_button = gr.Button("Predict Impact", interactive=False)
        with gr.Column():
            output = gr.Label(label="Predicted Impact")

    gr.Markdown("""
## Ethical Warnings and Important Notes
- This demo is intended as a tool **for research and educational purposes only**.
- Please refrain from deliberately embellishing the title and abstract to boost scores, and **avoid making false claims**.
- Our **training data only includes** samples from the fields of **cs.CV, cs.CL (NLP), and cs.AI**. Predictions for papers outside these areas should not be relied on.
- The **predicted value** is a probability generated by the model and **does NOT reflect paper quality or novelty**.
- To identify potentially impactful papers, this study uses the sigmoid+MSE approach to optimize NDCG (rather than sigmoid+BCE), so predicted values are generally concentrated **between 0.1 and 0.9**.
- Empirically, a predicted influence score greater than **0.65** is considered to indicate an impactful paper.
- The **author takes NO responsibility** for the prediction results.
""")

    title_input.change(
        update_button_status,
        inputs=[title_input, abstract_input],
        outputs=[validation_status, submit_button]
    )
    abstract_input.change(
        update_button_status,
        inputs=[title_input, abstract_input],
        outputs=[validation_status, submit_button]
    )
    submit_button.click(
        predict,
        inputs=[title_input, abstract_input],
        outputs=output
    )
    gr.Examples(
        examples=examples,
        inputs=[title_input, abstract_input],
        outputs=[validation_status, output],
        cache_examples=False
    )

iface.launch()
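

# ---------------------------------------------------------------------------
# Illustrative local-inference sketch (not called by the Space above).
# A minimal sketch of how the same checkpoint could be queried in a local
# deployment: the model is cached after the first call, and the +0.05
# web-demo compensation discussed in predict() is NOT applied. The function
# name `predict_local` is hypothetical; the loading arguments simply mirror
# those used above and may need adjusting to your environment (e.g.
# bitsandbytes must be installed for 8-bit loading).
def predict_local(title: str, abstract: str) -> float:
    global model, tokenizer
    if model is None:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path, num_labels=1, load_in_8bit=True
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model.eval()
    text = f'''Given a certain paper, Title: {title}\n Abstract: {abstract}. \n Predict its normalized academic impact (between 0 and 1):'''
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Raw sigmoid score; no web-demo offset for local use.
    return round(torch.sigmoid(outputs.logits).item(), 4)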