import re
import time

import openai
import pandas as pd
from tqdm import tqdm


class AutoTab:
    """Fill the empty output columns of a spreadsheet using an LLM.

    The input Excel file marks its columns with header prefixes: columns
    starting with ``"[Input] "`` are given to the model, columns starting
    with ``"[Output] "`` are the ones to predict.  Rows whose output columns
    are all filled in serve as in-context (few-shot) examples; the remaining
    rows are sent to an OpenAI-format chat completion endpoint one at a time
    and the parsed answers are written back and saved to ``out_file_path``.
    """

    # Header prefixes that mark a column's role in the spreadsheet.
    INPUT_PREFIX = "[Input] "
    OUTPUT_PREFIX = "[Output] "

    def __init__(
        self,
        in_file_path: str,
        out_file_path: str,
        instruction: str,
        max_examples: int,
        model_name: str,
        generation_config: dict,
        request_interval: float,
        save_every: int,
        api_key: str,
        base_url: str,
    ):
        """Store the configuration; no file or network access happens here.

        Args:
            in_file_path: Path of the Excel file to read.
            out_file_path: Path of the Excel file to write results to.
            instruction: Text placed at the top of every prompt.
            max_examples: Upper bound on the number of in-context examples.
            model_name: Model identifier passed to the chat completion API.
            generation_config: Extra keyword arguments forwarded verbatim to
                ``chat.completions.create``.
            request_interval: Seconds to sleep after each request.
            save_every: Number of rows per batch; the file is saved after
                every batch.
            api_key: API key for the OpenAI-format client.
            base_url: Base URL for the OpenAI-format client.
        """
        self.in_file_path = in_file_path
        self.out_file_path = out_file_path
        self.instruction = instruction
        self.max_examples = max_examples
        self.model_name = model_name
        self.generation_config = generation_config
        self.request_interval = request_interval
        self.save_every = save_every
        self.api_key = api_key
        self.base_url = base_url
        # Most recent prompt sent by predict_output(); kept for inspection.
        self.query_example = None

    @staticmethod
    def _tag(column: str, prefix: str) -> str:
        """Return *column* without its leading role *prefix*."""
        return column.removeprefix(prefix)

    # ─── IO ───────────────────────────────────────────────────────────────

    def load_excel(self) -> tuple[pd.DataFrame, list, list]:
        """Load the Excel file and identify input and output fields.

        Returns:
            A ``(dataframe, input_columns, output_columns)`` tuple, where the
            column lists hold the full header names including their prefix.
        """
        df = pd.read_excel(self.in_file_path)
        # Headers are not guaranteed to be strings (e.g. numeric headers),
        # so guard before calling str methods on them.
        input_fields = [
            col
            for col in df.columns
            if isinstance(col, str) and col.startswith(self.INPUT_PREFIX)
        ]
        output_fields = [
            col
            for col in df.columns
            if isinstance(col, str) and col.startswith(self.OUTPUT_PREFIX)
        ]
        return df, input_fields, output_fields

    # ─── LLM ──────────────────────────────────────────────────────────────

    def openai_request(self, query: str) -> str:
        """Send *query* as a single user message to an OpenAI-format API.

        Returns:
            The first choice's message text with surrounding whitespace
            removed, or ``""`` when the message carries no text content.
        """
        client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": query}],
            **self.generation_config,
        )
        # ``content`` may be None; treat that as an empty answer instead of
        # failing on ``None.strip()``.
        content = response.choices[0].message.content
        return (content or "").strip()

    # ─── In-Context Learning ──────────────────────────────────────────────

    def derive_incontext(
        self, data: pd.DataFrame, input_columns: list[str], output_columns: list[str]
    ) -> str:
        """Build the few-shot part of the prompt from the labelled rows.

        Each example lists its input fields and then its output fields, one
        ``<name>value`` line per field, followed by a blank line.

        Args:
            data: The full table.
            input_columns: Headers of the input columns (with prefix).
            output_columns: Headers of the output columns (with prefix).

        Returns:
            The concatenated examples; ``""`` when there are none.
        """
        # Read the examples from the labelled rows themselves, so that the
        # rows counted and the rows rendered are always the same rows.
        examples = data.dropna(subset=output_columns)
        n = min(self.max_examples, len(examples))

        parts = []
        for i in range(n):
            parts.extend(
                f"<{self._tag(col, self.INPUT_PREFIX)}>{examples[col].iloc[i]}\n"
                for col in input_columns
            )
            parts.extend(
                f"<{self._tag(col, self.OUTPUT_PREFIX)}>{examples[col].iloc[i]}\n"
                for col in output_columns
            )
            parts.append("\n")
        return "".join(parts)

    def predict_output(
        self, in_context: str, input_data: pd.Series, input_fields: list[str]
    ) -> str:
        """Query the model for one row and return its raw answer.

        Args:
            in_context: Few-shot examples from ``derive_incontext``.
            input_data: The row to predict, indexable by column header.
            input_fields: Headers of the input columns (with prefix).

        Returns:
            The model's response text.
        """
        row_lines = "".join(
            f"<{self._tag(col, self.INPUT_PREFIX)}>{input_data[col]}\n"
            for col in input_fields
        )
        query = self.instruction + "\n\n" + in_context + row_lines
        self.query_example = query
        return self.openai_request(query)

    def extract_fields(
        self, response: str, output_columns: list[str]
    ) -> dict[str, str]:
        """Parse ``<name>value`` lines out of the model response.

        Args:
            response: Raw response text.
            output_columns: Headers of the output columns (with prefix).

        Returns:
            A mapping from each output column header to the text that follows
            its ``<name>`` tag on the same line (stripped), or ``""`` when the
            tag does not appear in the response.
        """
        extracted = {}
        for col in output_columns:
            field = self._tag(col, self.OUTPUT_PREFIX)
            # Capture the rest of the line after the tag.  A trailing lazy
            # ``(.*?)`` would always match the empty string, and the field
            # name must be escaped because headers may contain regex
            # metacharacters such as parentheses.
            match = re.search(rf"<{re.escape(field)}>(.*)", response)
            extracted[col] = match.group(1).strip() if match else ""
        return extracted

    # ─── Engine ───────────────────────────────────────────────────────────

    def batch_prediction(self, start_index: int, end_index: int):
        """Predict and store outputs for rows ``start_index..end_index - 1``.

        Indices are positional.  Requires ``self.data``, ``self.in_context``,
        ``self.input_fields`` and ``self.output_fields`` to be set, which
        ``run`` does before calling this method.
        """
        for i in range(start_index, end_index):
            prediction = self.predict_output(
                self.in_context, self.data.iloc[i], self.input_fields
            )
            extracted_fields = self.extract_fields(prediction, self.output_fields)
            # ``.at`` is label-based while ``i`` is a position; translate so
            # the write hits the same row that was read with ``iloc``.
            row_label = self.data.index[i]
            for field_name in self.output_fields:
                self.data.at[row_label, field_name] = extracted_fields.get(
                    field_name, ""
                )
            time.sleep(self.request_interval)

    def run(self):
        """Load the table, predict every unlabelled row, and save the result.

        The output file is rewritten after each batch of ``save_every`` rows
        so that progress survives an interruption, and once more at the end
        so that a file is produced even when there is nothing to predict.
        """
        self.data, self.input_fields, self.output_fields = self.load_excel()
        self.in_context = self.derive_incontext(
            self.data, self.input_fields, self.output_fields
        )
        # Predictions are strings.  An output column that was read as numeric
        # (for instance one that is entirely empty) cannot hold them without
        # a dtype change, so make the output columns generic up front.
        self.data = self.data.astype({col: object for col in self.output_fields})

        self.num_data = len(self.data)
        self.num_examples = len(self.data.dropna(subset=self.output_fields))

        # NOTE(review): starting at ``num_examples`` assumes the labelled
        # example rows are the first rows of the sheet — confirm with the
        # expected file layout.
        for start_index in tqdm(
            range(self.num_examples, self.num_data, self.save_every),
            desc="Processing batches",
        ):
            end_index = min(start_index + self.save_every, self.num_data)
            self.batch_prediction(start_index, end_index)
            self.data.to_excel(self.out_file_path, index=False)

        self.data.to_excel(self.out_file_path, index=False)
        print(f"Results saved to {self.out_file_path}")