import json import os import tqdm from typing import List, Dict, Any import nltk import re from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction from dataclasses import dataclass from abc import ABC, abstractmethod from transformers import pipeline from rouge_score import rouge_scorer from codebleu import calc_codebleu import math import numpy as np import jieba import torch from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel class LLMModel: def __init__(self, model_name: str): self.model_name = model_name self.is_time_series = False self.timesfm_model = None # timesfm时序模型 if "timesfm" in model_name.lower(): import timesfm self.is_time_series = True self.tfm = timesfm.TimesFm( hparams=timesfm.TimesFmHparams( backend="gpu", per_core_batch_size=32, ), checkpoint=timesfm.TimesFmCheckpoint( huggingface_repo_id=model_name), ) elif "qwen" in model_name.lower() or "gemma" in model_name.lower() or "internlm" in model_name.lower() or "vicuna" in model_name.lower() or "gpt" in model_name.lower(): self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto") self.copied_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto") self.model = self.model.eval() elif "chatglm" in model_name.lower(): self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto") self.model = self.model.eval() else: self.pipeline = pipeline("text-generation", model=model_name, device_map="auto", trust_remote_code=True) def generate(self, prompt: str, max_new_tokens=256) -> str: if self.is_time_series: raise NotImplementedError("This model is a time-series model. Please call generate_for_timeseries() instead of generate().") if "vicuna" in self.model_name.lower() or "gpt" in self.model_name.lower(): inputs = self.tokenizer(prompt, return_tensors="pt") generate_ids = self.model.generate(inputs.input_ids.cuda(), max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] return output elif "llama" in self.model_name.lower(): self.messages = [ {"role": "system", "content": "You are a helpful and useful AI assistant."}, {"role": "user", "content":prompt } ] prompt = self.pipeline.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) terminators = [ self.pipeline.tokenizer.eos_token_id, self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>") ] output = self.pipeline(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1, pad_token_id = self.pipeline.tokenizer.eos_token_id, return_full_text=False, eos_token_id=terminators) return output[0]["generated_text"] elif "qwen" in self.model_name.lower(): self.messages = [ {"role": "system", "content": "You are a helpful and useful AI assistant."}, {"role": "user", "content": prompt} ] prompt = self.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) model_inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda") generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] return response elif "gemma" in self.model_name.lower(): self.messages = [ {"role": "user", "content": prompt} ] prompt = self.tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) model_inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda") generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id) generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] return response elif "chatglm" in self.model_name.lower() or "internlm" in self.model_name.lower(): response, _ = self.model.chat(self.tokenizer, prompt, history=[]) return response def generate_for_timeseries( self, series_data: List[float], horizon: int = 1, freq: int = 0 ) -> List[float]: if self.is_time_series and self.tfm is not None: forecast_input = [series_data] frequency_input = [freq] point_forecast, _ = self.tfm.forecast( forecast_input, freq=frequency_input ) forecast_result = point_forecast[0] if horizon < len(forecast_result): forecast_result = forecast_result[:horizon] return forecast_result.tolist() else: prompt = ( "You are a time-series forecasting assistant.\n" f"The historical data points are: {series_data}.\n" f"Please predict the next {horizon} future data point(s) directly without other words based on the historical trend.\n\n" "Format your answer as a list of floats, e.g. `[3.1415, 2.7182]`.\n" "Answer:" ) raw_response = self.generate(prompt, max_new_tokens=64) import re pattern = r"\[([\d\.\,\s\-eE]+)\]" match = re.search(pattern, raw_response) if not match: print("Warning: LLM output not in expected format, fallback to 0.0") return [0.0] * horizon numbers_str = match.group(1) raw_nums = re.split(r"[\s,]+", numbers_str.strip()) parsed_vals = [] for val in raw_nums: try: parsed_vals.append(float(val)) except ValueError: continue # 如果预测数量不够 horizon,就做填充或截断 if len(parsed_vals) < horizon: # 填充 while len(parsed_vals) < horizon: parsed_vals.append(parsed_vals[-1] if parsed_vals else 0.0) elif len(parsed_vals) > horizon: parsed_vals = parsed_vals[:horizon] return parsed_vals @dataclass class Instance: input: Dict[str, Any] output: Dict[str, Any] id: str class BaseTask(ABC): def __init__(self, task_data: Dict[str, Any], model: LLMModel): self.task_data = task_data self.model = model self.data = self._parse_data(task_data) @abstractmethod def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: pass @abstractmethod def run_inference(self): pass class MultipleChoiceQA(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input=d["input"], output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): question = inst.input["question"] options = inst.input["options"] options_chars = [chr(65 + i) for i in range(len(options))] prompt = f"Question: {question}\nOptions:\n" for i, opt in enumerate(options): prompt += options_chars[i] + ". " + opt + "\n" if self.task_data["task"] == "Causal Reasoning": prompt += f"{question}\nPlease substitute yourself into the above scenario and select the most likely cause and effect outcome. " prompt += r'Please answer the question and output it strictly in the following format: "The final answer is $\boxed{your choice}$" at the end of the sentence.' response = self.model.generate(prompt, max_new_tokens=256) pred = None if "answer" not in response: pred = "A" else: pattern = "answer" response = re.split(pattern, response, flags=re.IGNORECASE)[-1] for opt in options_chars: if opt in response: pred = opt break if pred is None: pred = "A" self.predictions.append(pred) class OpenQA(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input=d["input"], output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): prompt = "" question = inst.input["question"] if "context" in inst.input.keys(): context = inst.input["context"] prompt += f"Given the context: {context}\n" if self.task_data["task"] == "Temporal Reasoning": prompt += f"{question}\nAccroding to the provided context, how long does it take for the event? Please give a direct answer without other words" elif self.task_data["task"] == "Medical Question Answering": prompt += f"Please answer the question in a short pargraph: {question}" elif self.task_data["task"] == "Multilingual Question Answering": prompt += f"Please directly answer the question using the language in the question: {question}" elif self.task_data["task"] == "Table Question Answering": table = inst.input["table"] prompt += f"Please read the content of the table below carefully and then directly answer the question without other words:\n{table}\n\nQuestion: {question}\nAnswer:" else: prompt += f"Please directly answer the question in a short sentence: {question}" if self.task_data["task"] == "Document-Level Causal": prompt += f"\nIf the context does not contain an answer to the question, simply output \"None of the above\"." response = self.model.generate(prompt, max_new_tokens=256) pred = response.strip() self.predictions.append(pred) class SummarizationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: if "document_list" in d: instance = Instance( input={"document_list": d["document_list"]}, output={}, id=d["id"] ) elif d.get("input") and "highlights" in d.get("output", {}): instance = Instance( input={"document": d["document"]}, output={}, id=d["id"] ) else: instance = Instance( input={"document": d["document"]}, output={}, id=d["id"] ) instances.append(instance) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): if "document_list" in inst.input: doc_list = inst.input["document_list"] combined_docs = "\n".join(doc_list) prompt = ( "You are a multi-document summarization assistant.\n" "Please read the following documents, and then summarize them in a concise paragraph:\n\n" f"{combined_docs}\n\n" "Summary:" ) else: doc = inst.input["document"] prompt = ( "Please summarize the following document in a short sentence\n" f"{doc}\n" "Summary:" ) pred = self.model.generate(prompt, max_new_tokens=256) if "Summary:" in pred: pred = pred.split("Summary:")[-1].strip() else: pred = pred.strip() self.predictions.append(pred) class TranslationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input={ "source_lang": d["in"], "target_lang": d["out"], "text": d["input"] }, output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): source_lang = inst.input["source_lang"] target_lang = inst.input["target_lang"] text = inst.input["text"] prompt = (f"Please directly Translate the following text from {source_lang} to {target_lang}.\n" f"Text: {text}\n" f"Translation:") pred = self.model.generate(prompt, max_new_tokens=256) if "Translation:" in pred: pred = pred.split("Translation:")[-1].strip() else: pred = pred.strip() self.predictions.append(pred) class StoryGenerationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instances.append( Instance( input=d["input"], output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): prompt_text = inst.input["prompt"] prompt = f"Please write a story based on the following prompt:\n{prompt_text}\nStory:" pred = self.model.generate(prompt, max_new_tokens=512) if "Story:" in pred: pred = pred.split("Story:")[-1].strip() self.predictions.append(pred) class DialogueGenerationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: dialog_list = d.get("dialog", []) if not dialog_list: continue instances.append( Instance( input={"dialog": dialog_list}, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): dialog_context = inst.input["dialog"] prompt = "Below is a multi-turn conversation. Please continue the dialogue for the last turn.\n\n" for turn_idx, turn in enumerate(dialog_context): prompt += f"Turn {turn_idx + 1}: {turn}\n" prompt += "\nNow please respond in one short answer:\n" pred = self.model.generate(prompt, max_new_tokens=128).strip() self.predictions.append(pred) class CodeGenerationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instance_id = d["id"] language = d["language"] goal = d["goal"] context = d.get("context", []) instances.append( Instance( input={ "language": language, "goal": goal, "context": context }, output={}, id=instance_id ) ) return instances def run_inference(self): self.predictions = [] self.languages = [] for inst in tqdm.tqdm(self.data): language = inst.input["language"] goal = inst.input["goal"] context = inst.input["context"] prompt = f"You are an AI developer. Your goal is: {goal}\n" prompt += f"Please write {language} code that solves the described task.\n\n" for c_item in context: c_type = c_item["type"] c_content = c_item["content"] if c_type == "description": prompt += f"Description:\n{c_content}\n\n" elif c_type == "example": prompt += "Examples:\n" for ex in c_content: prompt += f"- Input: {ex['input']}, Expected Output: {ex['output']}\n" prompt += "\n" else: prompt += f"{c_type.capitalize()}:\n{c_content}\n\n" prompt += ( "Now, please output ONLY the final code solution (without additional explanations, comments or text)." "\nCode:\n" ) pred_code = self.model.generate(prompt, max_new_tokens=256).strip() if "Code:" in pred_code: pred_code = pred_code.split("Code:", 1)[-1].strip() self.predictions.append(pred_code) self.languages.append(language) class CodeRepairTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instance_id = d["id"] input_part = d["input"] prompt = input_part["prompt"] source_code = input_part["sourceCode"] instances.append( Instance( input={ "prompt": prompt, "sourceCode": source_code }, output={}, id=instance_id ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): prompt = inst.input["prompt"] source_code = inst.input["sourceCode"] final_prompt = ( f"{prompt}\n" f"{source_code}\n\n" "Now, please output ONLY the final code solution (without additional explanations, comments or text)." "Refined Code:" ) pred_code = self.model.generate(final_prompt, max_new_tokens=256).strip() if "Refined Code:" in pred_code: pred_code = pred_code.split("Refined Code:", 1)[-1].strip() self.predictions.append(pred_code) class CodeDefectDetectionTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instances.append( Instance( input={"func": d["func"]}, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): code_snippet = inst.input["func"] prompt = ( "You are a code reviewer. Below is a piece of code or function:\n" f"{code_snippet}\n\n" "Please review carefully and determine if it contains a grammatical or logical defect. " "For example, the code below has defect:\n" "static void show_packets(AVFormatContext *format_ctx)\n\n{\n\n AVPacket packet;\n\n\n\n av_init_packet(&packet);\n\n probe_array_header(\"packets\", 0);\n\n while (!av_read_frame(format_ctx, &packet))\n\n show_packet(format_ctx, &packet);\n\n probe_array_footer(\"packets\", 0);\n\n}\n" "For another example, the code below has no defect:\n" "static void visitor_output_setup_internal(TestOutputVisitorData *output_data,\n\n bool is_human)\n\n{\n\n output_data->human = is_human;\n\n output_data->sov = string_output_visitor_new(is_human);\n\n g_assert(output_data->sov);\n\n output_data->ov = string_output_get_visitor(output_data->sov);\n\n g_assert(output_data->ov);\n\n}\n" "Output only 'No defect' if it does NOT contain a grammatical or logical defect, " "or ouput only 'Defect' if it DOES contain a defect.\n" "Answer:" ) response = self.model.generate(prompt, max_new_tokens=16).strip() if "no defect" in response.lower(): pred = "0" else: pred = "1" self.predictions.append(pred) class TextToSQLTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instances.append( Instance( input={ "context": d["input"]["context"], "question": d["input"]["question"], }, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): schema_context = inst.input["context"] question = inst.input["question"] prompt = ( "Below is a database schema:\n" f"{schema_context}\n" "Given the schema, please write a valid SQL query that answers the following question without other words.\n" f"Question: {question}\n" "SQL:" ) response = self.model.generate(prompt, max_new_tokens=256) if "SQL:" in response: pred_sql = response.split("SQL:", 1)[-1].strip() else: pred_sql = response.strip() self.predictions.append(pred_sql) class CodeExplanationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: code_snippet = d["code"] instance_id = d["id"] instances.append( Instance( input={"code": code_snippet}, output={}, id=instance_id ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): code_snippet = inst.input["code"] prompt = ( "You are a code explainer. " "Please read the following code snippet and provide a concise, clear explanation in natural language:. For example:\n" "Code:\nboolean equalsResidueRing ( Object obj ) { if ( !( obj instanceof ResidueRing ) ) { return false ; } ResidueRing < C > otherRing = null ; try { otherRing = ( ResidueRing < C > ) obj ; } catch ( ClassCastException e ) { return false ; } if ( otherRing == null ) { return false ; } if ( ! ring . equals ( otherRing . ring ) ) { return false ; } return modul . equals ( otherRing . modul ) ; }" "Explanation: compares this ResidueRing with another object.\n\n" "Now please explain the code below without other words:\n" f"{code_snippet}\n" "Explanation:" ) pred_explanation = self.model.generate(prompt, max_new_tokens=256).strip() self.predictions.append(pred_explanation) class MathematicalProofGenerationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: statement = d["statement"] instances.append( Instance( input={ "statement": statement }, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): statement = inst.input["statement"] prompt = ( "You are a mathematical assistant. " "Please provide a clear, step-by-step proof for the following statement:\n" f"Statement: {statement}\n\n" "Ensure you include the final conclusion as well. Proof:" ) pred_proof = self.model.generate(prompt, max_new_tokens=512).strip() self.predictions.append(pred_proof) class MathematicalWordProblemSolvingTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: problem_text = d["problem"]["text"] constraints = d["problem"].get("constraints", []) instances.append( Instance( input={ "problem_text": problem_text, "constraints": constraints }, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions_steps = [] self.predictions_final = [] for inst in tqdm.tqdm(self.data): problem_text = inst.input["problem_text"] constraints = inst.input["constraints"] constraints_str = "" if constraints: constraints_str = "\nConstraints:\n" + "\n".join(constraints) prompt = ( "You are a math problem solver. Please solve the following word problem step by step. " "Finally, provide the final numeric or short answer in a separate line labeled as 'Final Answer:'.\n\n" f"Problem:\n{problem_text}{constraints_str}\n\n" "Solution (step-by-step) + Final Answer:\n" ) response = self.model.generate(prompt, max_new_tokens=512).strip() steps_part, final_part = response, "" if "Final Answer:" in response: parts = response.split("Final Answer:", 1) steps_part = parts[0].strip() final_part = parts[1].strip() self.predictions_steps.append(steps_part) self.predictions_final.append(final_part) class ParaphraseGenerationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instances.append( Instance( input={"originalSentence": d["input"]["originalSentence"]}, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): original_sentence = inst.input["originalSentence"] prompt = ( "Please rewrite the following sentence in a different way but keep the same meaning:\n" f"{original_sentence}\n" "Paraphrase:" ) pred = self.model.generate(prompt, max_new_tokens=128) if "Paraphrase:" in pred: pred = pred.split("Paraphrase:")[-1].strip() self.predictions.append(pred.strip()) class GrammarCorrectionTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [ Instance( input=d["input"], output={}, id=d["id"] ) for d in task_data["data"] ] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): error_type = inst.input["Error Type"] ungrammatical_sentence = inst.input["Ungrammatical Statement"] prompt = ( f"You are a grammar correction assistant.\n" f"There is a sentence with the following error type: {error_type}.\n" f"Please rewrite the sentence in correct standard English without any other word.\n\n" f"Ungrammatical Sentence: {ungrammatical_sentence}\n\n" f"Rewritten Sentence:" ) corrected = self.model.generate(prompt, max_new_tokens=128).strip() if "Rewritten Sentence:" in corrected: corrected = corrected.split("Rewritten Sentence:")[-1].strip() self.predictions.append(corrected) class TextStyleTransferTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instances.append( Instance( input={ "text": d["input"]["text"], "style": d["input"]["style"] }, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): text = inst.input["text"] style = inst.input["style"] prompt = ( "You are a style transfer assistant.\n" "Below is a piece of text and a target style.\n" f"Text: {text}\n" f"Style: {style}\n\n" "Please rewrite the above text to match the target style more accurately, " "while keeping the original meaning intact.\n" "Answer:" ) pred = self.model.generate(prompt, max_new_tokens=256).strip() if "Answer:" in pred: pred = pred.split("Answer:")[-1].strip() self.predictions.append(pred) class TableToTextGenerationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: instance_id = d["id"] table_data = d["input"]["table"] instances.append( Instance( input={"table": table_data}, output={}, id=instance_id ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): table_data = inst.input["table"] prompt = "Below is a table. Please generate a coherent description that summarizes the table's content.\n\n" for table_idx, table_item in enumerate(table_data): header = table_item["header"] rows = table_item["rows"] prompt += f"Table {table_idx+1}:\nHeader: {header}\nRows:\n" for r_idx, row in enumerate(rows): prompt += f"{r_idx+1}. {row}\n" prompt += "\n" prompt += "Now write a concise text describing the above table:\n" pred_text = self.model.generate(prompt, max_new_tokens=512) pred_text = pred_text.strip() self.predictions.append(pred_text) class TimeSeriesForecastingTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: instances = [] for d in task_data["data"]: time_series = d["input"]["data"] instances.append( Instance( input={"time_series": time_series}, output={}, id=d["id"] ) ) return instances def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): series_data = inst.input["time_series"] pred_values = self.model.generate_for_timeseries(series_data, horizon=1, freq=0) predicted = pred_values[0] if pred_values else 0.0 self.predictions.append(predicted) class ClassificationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input=d["input"], output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): if 'stance_detection' in self.task_data['task']: tweets = inst.input["tweets"] target = inst.input["target"] prompt = inst.input["prompt"].replace("<<>>", target).replace("<<>>", tweets) elif 'aspect_sentiment_classification' in self.task_data['task']: raw_text = inst.input["raw_text"] target = inst.input["target"] prompt = inst.input["prompt"].replace("<<>>", raw_text).replace("<<>>", target) + 'Please direct return the category name without any other words.' elif 'target_oriented_opinion_words_extraction' in self.task_data['task']: raw_text = inst.input["raw_text"] aspect = inst.input["aspect"] prompt = inst.input["prompt"].replace("<<>>", raw_text).replace("<<>>", aspect) + 'Please direct return the opinion word without any other words.' else: raw_text = inst.input["raw_text"] prompt = inst.input["prompt"].replace("<<>>", raw_text) + 'Please return the desired result directly, without any other explanation.' response = self.model.generate(prompt, max_new_tokens=64) self.predictions.append(response.lower()) class MultiLabelClassificationTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input=d["input"], output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): raw_text = inst.input["raw_text"] prompt = inst.input["prompt"].replace("<<>>", raw_text) prompt = prompt + " Please return the desired result directly, without any other explanation." + " Split the result by commas instead of \\n." response = self.model.generate(prompt, max_new_tokens=64) self.predictions.append('

'.join(response.lower().split(', '))) class ChoiceTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input=d["input"], output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): raw_text = inst.input["raw_text"] prompt = inst.input["prompt"].replace("<<>>", raw_text) + 'Please return the desired result directly, without any other explanation.' response = self.model.generate(prompt, max_new_tokens=64) if len(response.strip()) > 1: if "A" in response.strip(): response = "A" elif "B" in response.strip(): response = "B" elif "C" in response.strip(): response = "C" elif "D" in response.strip(): response = "D" self.predictions.append(response.lower()) class NERTask(BaseTask): def _parse_data(self, task_data: Dict[str, Any]) -> List[Instance]: return [Instance(input=d["input"], output={}, id=d["id"]) for d in task_data["data"]] def run_inference(self): self.predictions = [] for inst in tqdm.tqdm(self.data): text = inst.input["raw_text"] prompt = inst.input["prompt"].replace("<<>>", text) response = self.model.generate(prompt, max_new_tokens=128) self.predictions.append('

'.join(response.lower().split(', '))) def save_predictions(task_obj: BaseTask, task_directory: str): save_path = os.path.join(task_directory, "prediction.json") records = [] if isinstance(task_obj, MathematicalWordProblemSolvingTask): for idx, inst in enumerate(task_obj.data): records.append({ "id": inst.id, "prediction_steps": task_obj.predictions_steps[idx], "prediction_final": task_obj.predictions_final[idx] }) elif isinstance(task_obj, TimeSeriesForecastingTask): for idx, inst in enumerate(task_obj.data): records.append({ "id": inst.id, "prediction": float(task_obj.predictions[idx]) }) else: for idx, inst in enumerate(task_obj.data): pred_val = task_obj.predictions[idx] if isinstance(pred_val, (np.floating, np.integer)): pred_val = float(pred_val) records.append({"id": inst.id, "prediction": pred_val}) with open(save_path, "w", encoding="utf-8") as fp: json.dump(records, fp, ensure_ascii=False, indent=2) TASK_MAPPING = { "MultipleChoiceQA": MultipleChoiceQA, "OpenQA": OpenQA, "Summarization": SummarizationTask, "Story Generation": StoryGenerationTask, "Translation": TranslationTask, "Dialogue": DialogueGenerationTask, "Code Generation": CodeGenerationTask, "Code Defect Detection": CodeDefectDetectionTask, "Code Repair": CodeRepairTask, "Code Explanation": CodeExplanationTask, "Proof": MathematicalProofGenerationTask, "Mathematical Word Problem Solving": MathematicalWordProblemSolvingTask, "Text to SQL": TextToSQLTask, "Paraphrase Generation": ParaphraseGenerationTask, "Grammar Correction": GrammarCorrectionTask, "Table-to-Text Generation": TableToTextGenerationTask, "Time Series": TimeSeriesForecastingTask, "Text Style Transfer": TextStyleTransferTask, "classification": ClassificationTask, "multi label classification": MultiLabelClassificationTask, "ner": NERTask, "extraction": MultiLabelClassificationTask, "relation extraction": MultiLabelClassificationTask, "event detection": MultiLabelClassificationTask, "parsing": MultiLabelClassificationTask, "multiple choice": ChoiceTask, } if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="NLP Predictor") parser.add_argument("--dataset_dir", required=True) parser.add_argument("--model_name", required=True) args = parser.parse_args() data_root = os.path.abspath(args.dataset_dir) model = LLMModel(args.model_name) task_dirs = sorted([d for d in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, d))]) for idx, task_folder in enumerate(task_dirs, start=1): folder_path = os.path.join(data_root, task_folder) annotation_path = os.path.join(folder_path, "annotation.json") with open(annotation_path, "r", encoding="utf-8") as f: task_data = json.load(f) task_type = task_data.get("type") task_name = task_data.get("task", task_folder) print(f"\nTask {idx}/{len(task_dirs)}: {task_name} (Type = {task_type})") task_class = TASK_MAPPING.get(task_type, OpenQA) task = task_class(task_data, model) task.run_inference() save_predictions(task, folder_path)