import json
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Dict, List, Optional

from rich.console import Console

try:
    from .ai import complete
except ImportError:
    from ai import complete

# Route all module-level logging through rich's console logger.
print = Console().log
paper_types: Dict[str, str] = {
    "CV": "computer vision, any paper that deals with image, video, point cloud or 3D model data",
    "NLP": "natural language processing",
    "LLM": "large language model, if a paper has this label, also include the NLP label",
    "RO": "robotics",
    "ML": "machine learning, use this label only when the paper is not specific to any of the above categories",
}

# Canonical order in which a paper's labels are sorted for display.
type_sort_order = ["ML", "CV", "NLP", "LLM", "RO"]
system_prompt = """
You are a professional AI researcher. You are helping users to organize their papers.
""".strip()

user_prompt = "\n\n".join([
    """
You are given a list of papers. You need to classify each paper into one of the following categories:
""".strip(),
    "\n".join(f"- {label}: {description}" for label, description in paper_types.items()),
    """
You will be given several papers at a time. For each paper, you need to classify it into one of the categories above.
You should output in the following format with a code block:
""".strip(),
    """
```json
[
    {
        "id": "2402.01032",
        "category": ["RO"]
    },
    {
        "id": "2402.03254",
        "category": ["ML"]
    },
    {
        "id": "2403.00043",
        "category": ["LLM", "NLP"]
    }
]
```
Do not add any additional information in the output. The order of the papers in the output should match the order of the papers in the input.
""".strip(),
    """
The following are the papers you need to classify:
""",
])
def build_paper(id: str, title: str, abstract: Optional[str] = None) -> str:
    """Format one paper as "id: title", appending the abstract after a blank line when available."""
    if abstract is None:
        return f"{id}: {title}"
    return f"{id}: {title}\n\n{abstract}"
def get_classify_prompt(papers: List[Dict[str, str]]) -> str:
    """Assemble the full user prompt: instructions followed by one block per paper."""
    blocks = []
    for paper in papers:
        blocks.append(build_paper(paper["id"], paper["title"], paper.get("abstract")))
    return user_prompt + "\n\n" + "\n\n".join(blocks)
def parse_response(response: str) -> List[Dict[str, List[str]]] | None:
    print(response)
    # The reply must be a single fenced code block.
    response = response.strip()
    if not response.startswith("```") or not response.endswith("```"):
        return None
    # Drop the opening and closing fence lines.
    response = "\n".join(response.split("\n")[1:-1])
    try:
        data = json.loads(response)
    except json.JSONDecodeError:
        print(response)
        return None
    # Validate the schema: every entry needs a string id and a list of known categories.
    for paper in data:
        if "id" not in paper or "category" not in paper:
            return None
        if not isinstance(paper["id"], str) or not isinstance(paper["category"], list):
            return None
        for category in paper["category"]:
            if category not in paper_types:
                return None
    # Every LLM paper also carries the NLP label.
    for paper in data:
        if "LLM" in paper["category"] and "NLP" not in paper["category"]:
            paper["category"].append("NLP")
    # Sort the categories into the canonical display order.
    for paper in data:
        paper["category"].sort(key=lambda x: type_sort_order.index(x))
    return data
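# For example, a model reply of
#   ```json
#   [{"id": "2402.01032", "category": ["LLM"]}]
#   ```
# parses to [{"id": "2402.01032", "category": ["NLP", "LLM"]}]: the NLP co-label
# is added and the labels are sorted by type_sort_order.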
def get_classification(papers: List[Dict[str, str]]) -> List[Dict[str, List[str]]] | None:
    prompt = get_classify_prompt(papers)
    # Retry up to three times; give up (returning None) if every attempt raises.
    for _ in range(3):
        try:
            print("Request sent")
            response = complete([{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}])
            if response is not None:
                return parse_response(response)
        except Exception:
            print(traceback.format_exc())
    return None
class TokenBucket:
    """Simple non-blocking rate limiter: 200 request tokens, refilled in whole-minute steps."""

    def __init__(self):
        self.tokens = 200
        self.last_update = time.time()
        self.lock = Lock()

    def acquire(self, n: int) -> bool:
        with self.lock:
            current_time = time.time()
            elapsed = current_time - self.last_update
            if elapsed >= 60:
                # Refill 200 tokens per full minute elapsed, capped at the bucket size.
                added_tokens = int(elapsed // 60) * 200
                self.tokens = min(self.tokens + added_tokens, 200)
                self.last_update += 60 * (elapsed // 60)
            if self.tokens >= n:
                self.tokens -= n
                return True
            return False


token_bucket = TokenBucket()
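# acquire() never blocks: classify_papers requests one token per batch of 10
# papers up front and returns None when the per-minute budget is exhausted.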
class PaperCache:
    """Thread-safe in-memory cache of classification results, keyed by paper id."""

    def __init__(self):
        self.cache = {}
        self.lock = Lock()

    def get(self, paper):
        key = paper["id"]
        with self.lock:
            data = self.cache.get(key)
            if data is not None:
                print(f"Cache hit for {paper['id']}")
                return data
            print(f"Cache miss for {paper['id']}")
            return None

    def set(self, paper, result):
        print(f"Setting cache for {paper['id']}")
        key = paper["id"]
        with self.lock:
            self.cache[key] = result


paper_cache = PaperCache()
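# The cache only lives for the current process: within one run, papers already
# classified (e.g. titles repeated across the two demo calls below) are not re-sent.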
def classify_papers(papers: List[Dict[str, str]]) -> Optional[List[Dict[str, List[str]]]]:
    print(f"Classifying {len(papers)} papers")
    cached_results = []
    uncached_papers = []
    # Attach a 1-based index to every paper. The demo papers below carry no
    # arXiv id, so fall back to the title as a stand-in id (an assumption to
    # keep the demo runnable); real callers should supply a proper "id".
    for index, paper in enumerate(papers, start=1):
        paper["index"] = index
        paper.setdefault("id", paper["title"])
    for paper in papers:
        cached_result = paper_cache.get(paper)
        if cached_result is not None:
            cached_results.append(cached_result)
        else:
            uncached_papers.append(paper)
    # Report the hit/miss split before spending any rate-limit tokens.
    print(f"Cache hit: {len(cached_results)}, Cache miss: {len(uncached_papers)}")
    if not uncached_papers:
        return cached_results
    # One request per batch of 10 papers; one rate-limit token per request.
    batches = [uncached_papers[i:i + 10] for i in range(0, len(uncached_papers), 10)]
    num_batches = len(batches)
    if not token_bucket.acquire(num_batches):
        return None
    try:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(get_classification, batch) for batch in batches]
            results = []
            for future in as_completed(futures):
                batch_result = future.result()
                if batch_result is None:
                    # One failed batch invalidates the whole run.
                    for f in futures:
                        f.cancel()
                    return None
                results.extend(batch_result)
            print(results)
            results.sort(key=lambda x: x["id"])
            for result in results:
                paper_cache.set(result, result)
            return cached_results + results
    except Exception:
        print(traceback.format_exc())
        return None
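# Return contract: a list of {"id", "category"} dicts covering both cached and
# freshly classified papers, or None when the rate limit blocks the run or any
# batch fails after its retries.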
if __name__ == "__main__":
    # Use rich's plain print for demo output instead of the module-level logger.
    from rich import print

    start = time.time()
    print(classify_papers([
        {
            "title": "OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models",
            "abstract": "End-to-end human animation, such as audio-driven talking human generation, has undergone notable advancements in the recent few years. However, existing methods still struggle to scale up as large general video generation models, limiting their potential in real applications. In this paper, we propose OmniHuman, a Diffusion Transformer-based framework that scales up data by mixing motion-related conditions into the training phase. To this end, we introduce two training principles for these mixed conditions, along with the corresponding model architecture and inference strategy. These designs enable OmniHuman to fully leverage data-driven motion generation, ultimately achieving highly realistic human video generation. More importantly, OmniHuman supports various portrait contents (face close-up, portrait, half-body, full-body), supports both talking and singing, handles human-object interactions and challenging body poses, and accommodates different image styles. Compared to existing end-to-end audio-driven methods, OmniHuman not only produces more realistic videos, but also offers greater flexibility in inputs. It also supports multiple driving modalities (audio-driven, video-driven and combined driving signals). Video samples are provided on the ttfamily project page (https://omnihuman-lab.github.io)"
        },
        {
            "title": "SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model",
            "abstract": "While large language models have facilitated breakthroughs in many applications of artificial intelligence, their inherent largeness makes them computationally expensive and challenging to deploy in resource-constrained settings. In this paper, we document the development of SmolLM2, a state-of-the-art \"small\" (1.7 billion parameter) language model (LM). To attain strong performance, we overtrain SmolLM2 on ~11 trillion tokens of data using a multi-stage training process that mixes web text with specialized math, code, and instruction-following data. We additionally introduce new specialized datasets (FineMath, Stack-Edu, and SmolTalk) at stages where we found existing datasets to be problematically small or low-quality. To inform our design decisions, we perform both small-scale ablations as well as a manual refinement process that updates the dataset mixing rates at each stage based on the performance at the previous stage. Ultimately, we demonstrate that SmolLM2 outperforms other recent small LMs including Qwen2.5-1.5B and Llama3.2-1B. To facilitate future research on LM development as well as applications of small LMs, we release both SmolLM2 as well as all of the datasets we prepared in the course of this project."
        },
        {
            "title": "Generating Multi-Image Synthetic Data for Text-to-Image Customization",
            "abstract": "Customization of text-to-image models enables users to insert custom concepts and generate the concepts in unseen settings. Existing methods either rely on costly test-time optimization or train encoders on single-image training datasets without multi-image supervision, leading to worse image quality. We propose a simple approach that addresses both limitations. We first leverage existing text-to-image models and 3D datasets to create a high-quality Synthetic Customization Dataset (SynCD) consisting of multiple images of the same object in different lighting, backgrounds, and poses. We then propose a new encoder architecture based on shared attention mechanisms that better incorporate fine-grained visual details from input images. Finally, we propose a new inference technique that mitigates overexposure issues during inference by normalizing the text and image guidance vectors. Through extensive experiments, we show that our model, trained on the synthetic dataset with the proposed encoder and inference algorithm, outperforms existing tuning-free methods on standard customization benchmarks.",
        },
        {'title': 's1: Simple test-time scaling'},
        {'title': 'Reward-Guided Speculative Decoding for Efficient LLM Reasoning'},
        {'title': 'MatAnyone: Stable Video Matting with Consistent Memory Propagation'},
        {'title': 'Self-supervised Quantized Representation for Seamlessly Integrating Knowledge Graphs with Large Language Models'},
        {'title': 'Scalable-Softmax Is Superior for Attention'},
        {'title': 'PixelWorld: Towards Perceiving Everything as Pixels'},
        {'title': 'DINO-WM: World Models on Pre-trained Visual Features enable Zero-shot Planning'},
        {'title': 'Constitutional Classifiers: Defending against Universal Jailbreaks across Thousands of Hours of Red Teaming'},
        {'title': 'SAeUron: Interpretable Concept Unlearning in Diffusion Models with Sparse Autoencoders'},
        {'title': 'Zero-Shot Novel View and Depth Synthesis with Multi-View Geometric Diffusion'},
        {'title': 'The Surprising Agreement Between Convex Optimization Theory and Learning-Rate Scheduling for Large Model Training'},
        {'title': 'Fast Encoder-Based 3D from Casual Videos via Point Track Processing'},
        {'title': 'Unraveling the Capabilities of Language Models in News Summarization'},
        {'title': 'Trading Inference-Time Compute for Adversarial Robustness'},
        {'title': 'INT: Instance-Specific Negative Mining for Task-Generic Promptable Segmentation'},
        {'title': 'ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'},
        {'title': 'VideoJAM: Joint Appearance-Motion Representations for Enhanced Motion Generation in Video Models'},
        {'title': 'Inverse Bridge Matching Distillation'},
        {'title': 'ACECODER: Acing Coder RL via Automated Test-Case Synthesis'},
        {'title': 'Satori: Reinforcement Learning with Chain-of-Action-Thought Enhances LLM Reasoning via Autoregressive Search'},
        {'title': 'QLASS: Boosting Language Agent Inference via Q-Guided Stepwise Search'},
        {'title': 'Concept Steerers: Leveraging K-Sparse Autoencoders for Controllable Generations'},
        {'title': 'Can LLMs Maintain Fundamental Abilities under KV Cache Compression?'},
        {'title': 'Rethinking Mixture-of-Agents: Is Mixing Different Large Language Models Beneficial?'},
        {'title': 'COCONut-PanCap: Joint Panoptic Segmentation and Grounded Captions for Fine-Grained Understanding and Generation'},
        {'title': 'Text-to-CAD Generation Through Infusing Visual Feedback in Large Language Models'},
        {'title': 'Generating Multi-Image Synthetic Data for Text-to-Image Customization'},
        {'title': 'Sample, Scrutinize and Scale: Effective Inference-Time Search by Scaling Verification'},
        {'title': 'Federated Sketching LoRA: On-Device Collaborative Fine-Tuning of Large Language Models'},
        {'title': 'Activation Approximations Can Incur Safety Vulnerabilities Even in Aligned LLMs: Comprehensive Analysis and Defense'},
        {'title': 'SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model'},
        {'title': 'Demystifying Long Chain-of-Thought Reasoning in LLMs'},
        {'title': 'LIMO: Less is More for Reasoning'},
        {'title': 'TwinMarket: A Scalable Behavioral and Social Simulation for Financial Markets'},
        {'title': 'LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer'},
        {'title': 'Boosting Multimodal Reasoning with MCTS-Automated Structured Thinking'},
        {'title': 'Token Assorted: Mixing Latent and Text Tokens for Improved Language Model Reasoning'},
        {'title': 'Large Language Model Guided Self-Debugging Code Generation'},
        {'title': 'Jailbreaking with Universal Multi-Prompts'},
        {'title': 'A Probabilistic Inference Approach to Inference-Time Scaling of LLMs using Particle-Based Monte Carlo Methods'},
        {'title': 'On Teacher Hacking in Language Model Distillation'},
        {'title': 'Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation'},
        {'title': 'Activation-Informed Merging of Large Language Models'},
        {'title': 'HackerRank-ASTRA: Evaluating Correctness & Consistency of Large Language Models on cross-domain multi-file project problems'},
        {'title': 'Analyze Feature Flow to Enhance Interpretation and Steering in Language Models'},
        {'title': 'Gold-medalist Performance in Solving Olympiad Geometry with AlphaGeometry2'},
        {'title': 'DynVFX: Augmenting Real Videos with Dynamic Content'},
        {'title': 'UltraIF: Advancing Instruction Following from the Wild'},
        {'title': 'Great Models Think Alike and this Undermines AI Oversight'},
        {'title': 'ConceptAttention: Diffusion Transformers Learn Highly Interpretable Features'},
        {'title': 'Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment'},
        {'title': 'Weak-to-Strong Diffusion with Reflection'},
        {'title': 'MAGA: MAssive Genre-Audience Reformulation to Pretraining Corpus Expansion'},
        {'title': 'MotionLab: Unified Human Motion Generation and Editing via the Motion-Condition-Motion Paradigm'},
        {'title': 'BOLT: Bootstrap Long Chain-of-Thought in Language Models without Distillation'},
        {'title': 'ScoreFlow: Mastering LLM Agent Workflows via Score-based Preference Optimization'},
        {'title': 'Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis'},
        {'title': 'MotionCanvas: Cinematic Shot Design with Controllable Image-to-Video Generation'},
        {'title': 'PILAF: Optimal Human Preference Sampling for Reward Modeling'},
        {'title': 'Beyond Prompt Content: Enhancing LLM Performance via Content-Format Integrated Prompt Optimization'},
        {'title': 'ChartCitor: Multi-Agent Framework for Fine-Grained Chart Visual Attribution'},
        {'title': 'Towards Physical Understanding in Video Generation: A 3D Point Regularization Approach'},
        {'title': 'PlotGen: Multi-Agent LLM-based Scientific Data Visualization via Multimodal Feedback'},
        {'title': 'Enhancing Code Generation for Low-Resource Languages: No Silver Bullet'},
        {'title': 'Learning Real-World Action-Video Dynamics with Heterogeneous Masked Autoregression'},
        {'title': 'Speak Easy: Eliciting Harmful Jailbreaks from LLMs with Simple Interactions'},
        {'title': 'OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models'},
        {'title': 'The Differences Between Direct Alignment Algorithms are a Blur'},
        {'title': 'Process Reinforcement through Implicit Rewards'},
        {'title': 'Preference Leakage: A Contamination Problem in LLM-as-a-judge'},
        {'title': 'AlignVLM: Bridging Vision and Language Latent Spaces for Multimodal Understanding'},
        {'title': 'SafeRAG: Benchmarking Security in Retrieval-Augmented Generation of Large Language Model'},
        {'title': 'SliderSpace: Decomposing the Visual Capabilities of Diffusion Models'},
    ]))
    print(classify_papers([
        {'title': 'Federated Sketching LoRA: On-Device Collaborative Fine-Tuning of Large Language Models'},
        {'title': 'Activation Approximations Can Incur Safety Vulnerabilities Even in Aligned LLMs: Comprehensive Analysis and Defense'},
        {'title': 'SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model'},
        {'title': 'Demystifying Long Chain-of-Thought Reasoning in LLMs'},
        {'title': 'LIMO: Less is More for Reasoning'},
        {'title': 'TwinMarket: A Scalable Behavioral and Social Simulation for Financial Markets'},
        {'title': 'LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer'},
        {'title': 'Boosting Multimodal Reasoning with MCTS-Automated Structured Thinking'},
        {'title': 'Token Assorted: Mixing Latent and Text Tokens for Improved Language Model Reasoning'},
        {'title': 'Large Language Model Guided Self-Debugging Code Generation'},
        {'title': 'Jailbreaking with Universal Multi-Prompts'},
        {'title': 'A Probabilistic Inference Approach to Inference-Time Scaling of LLMs using Particle-Based Monte Carlo Methods'},
        {'title': 'On Teacher Hacking in Language Model Distillation'},
        {'title': 'Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation'},
        {'title': 'Activation-Informed Merging of Large Language Models'},
        {'title': 'HackerRank-ASTRA: Evaluating Correctness & Consistency of Large Language Models on cross-domain multi-file project problems'},
        {'title': 'Analyze Feature Flow to Enhance Interpretation and Steering in Language Models'},
        {'title': 'Gold-medalist Performance in Solving Olympiad Geometry with AlphaGeometry2'},
        {'title': 'DynVFX: Augmenting Real Videos with Dynamic Content'},
        {'title': 'UltraIF: Advancing Instruction Following from the Wild'},
        {'title': 'Great Models Think Alike and this Undermines AI Oversight'},
        {'title': 'ConceptAttention: Diffusion Transformers Learn Highly Interpretable Features'},
        {'title': 'Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment'},
        {'title': 'Weak-to-Strong Diffusion with Reflection'},
        {'title': 'MAGA: MAssive Genre-Audience Reformulation to Pretraining Corpus Expansion'},
        {'title': 'MotionLab: Unified Human Motion Generation and Editing via the Motion-Condition-Motion Paradigm'},
        {'title': 'BOLT: Bootstrap Long Chain-of-Thought in Language Models without Distillation'},
        {'title': 'ScoreFlow: Mastering LLM Agent Workflows via Score-based Preference Optimization'},
        {'title': 'Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis'},
        {'title': 'MotionCanvas: Cinematic Shot Design with Controllable Image-to-Video Generation'},
        {'title': 'PILAF: Optimal Human Preference Sampling for Reward Modeling'},
        {'title': 'Beyond Prompt Content: Enhancing LLM Performance via Content-Format Integrated Prompt Optimization'},
        {'title': 'ChartCitor: Multi-Agent Framework for Fine-Grained Chart Visual Attribution'},
        {'title': 'Towards Physical Understanding in Video Generation: A 3D Point Regularization Approach'},
        {'title': 'PlotGen: Multi-Agent LLM-based Scientific Data Visualization via Multimodal Feedback'},
        {'title': 'Enhancing Code Generation for Low-Resource Languages: No Silver Bullet'},
        {'title': 'Learning Real-World Action-Video Dynamics with Heterogeneous Masked Autoregression'},
        {'title': 'Speak Easy: Eliciting Harmful Jailbreaks from LLMs with Simple Interactions'},
        {'title': 'OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models'},
        {'title': 'The Differences Between Direct Alignment Algorithms are a Blur'},
        {'title': 'Process Reinforcement through Implicit Rewards'},
        {'title': 'Preference Leakage: A Contamination Problem in LLM-as-a-judge'},
        {'title': 'AlignVLM: Bridging Vision and Language Latent Spaces for Multimodal Understanding'},
        {'title': 'SafeRAG: Benchmarking Security in Retrieval-Augmented Generation of Large Language Model'},
        {'title': 'SliderSpace: Decomposing the Visual Capabilities of Diffusion Models'},
        {'title': 'MM-IQ: Benchmarking Human-Like Abstraction and Reasoning in Multimodal Models'},
        {'title': 'DeepRAG: Thinking to Retrieval Step by Step for Large Language Models'},
        {'title': 'Scaling Embedding Layers in Language Models'},
        {'title': 'MakeAnything: Harnessing Diffusion Transformers for Multi-Domain Procedural Sequence Generation'},
        {'title': 'AIN: The Arabic INclusive Large Multimodal Model'},
        {'title': 'FastKV: KV Cache Compression for Fast Long-Context Processing with Token-Selective Propagation'},
        {'title': 'ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning'},
        {'title': 'The Jumping Reasoning Curve? Tracking the Evolution of Reasoning Performance in GPT-[n] and o-[n] Models on Multimodal Puzzles'},
        {'title': 'RandLoRA: Full-rank parameter-efficient fine-tuning of large models'},
        {'title': 'Almost Surely Safe Alignment of Large Language Models at Inference-Time'},
        {'title': 'Improving Transformer World Models for Data-Efficient RL'},
        {'title': 'PhD Knowledge Not Required: A Reasoning Challenge for Large Language Models'},
        {'title': 'Improved Training Technique for Latent Consistency Models'},
        {'title': 'LongDPO: Unlock Better Long-form Generation Abilities for LLMs via Critique-augmented Stepwise Information'},
        {'title': 'Learning to Generate Unit Tests for Automated Debugging'},
        {'title': 'Lifelong Sequential Knowledge Editing without Model Degradation'},
        {'title': 'A Study on the Performance of U-Net Modifications in Retroperitoneal Tumor Segmentation'},
        {'title': 'Language Models Prefer What They Know: Relative Confidence Estimation via Confidence Preferences'},
        {'title': 'Current Pathology Foundation Models are unrobust to Medical Center Differences'}
    ]))
    print(f"Time taken: {time.time() - start}")