# Spaces: Sleeping
import argparse
import logging
from pathlib import Path
from typing import Union, Optional

import pandas as pd
# Configure root logging at import time: INFO threshold, each record rendered
# as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# Module-level logger shared by every function and method in this file.
logger = logging.getLogger(__name__)
class DataPreprocessor:
    """Clean tabular data files and write the results out as CSV.

    Every method is a static or class method, so it can be called either on
    the class (``DataPreprocessor.preprocess_msd(...)``) or on an instance
    (``DataPreprocessor().preprocess_msd(...)``).
    """

    @staticmethod
    def _preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
        """Apply the standard cleaning steps to a DataFrame.

        Steps, in order: lowercase every string cell, drop columns that are
        entirely null, replace remaining nulls with ``''``, drop duplicate
        rows.

        Args:
            df: Input DataFrame; it is not modified in place.

        Returns:
            A new, cleaned DataFrame.

        Raises:
            Exception: Any error raised by pandas is logged and re-raised
                unchanged.
        """
        def lower_if_str(value):
            return value.lower() if isinstance(value, str) else value

        try:
            # Convert text cells to lowercase for standardization.
            # DataFrame.map only exists on pandas >= 2.1; older releases
            # expose the same element-wise operation as applymap.
            if hasattr(pd.DataFrame, "map"):
                df = df.map(lower_if_str)
            else:
                df = df.applymap(lower_if_str)

            # Drop columns that are fully null
            df = df.dropna(axis=1, how='all')

            # Fill remaining NaN values with empty strings
            df = df.fillna('')

            # Remove duplicate rows (done last so that rows differing only
            # by letter case or by null-vs-empty collapse together)
            df = df.drop_duplicates()
            return df
        except Exception as e:
            logger.error(f"Error during DataFrame preprocessing: {str(e)}")
            raise

    @classmethod
    def preprocess_msd(cls,
                       file_path: Union[str, Path],
                       output_path: Union[str, Path],
                       sheet_name: Optional[Union[str, int]] = 0) -> pd.DataFrame:
        """Preprocess one sheet of an MSD Excel file and save it as CSV.

        The cleaned data is written to ``<output_path>/msd_processed.csv``.

        Args:
            file_path: Path to the Excel file.
            output_path: Directory for the output file; created if missing.
            sheet_name: Sheet name or index to load (default: 0). ``None``
                is rejected because pandas would then return a dict of all
                sheets rather than a single DataFrame.

        Returns:
            The preprocessed DataFrame that was written to disk.

        Raises:
            FileNotFoundError: If the input file doesn't exist.
            ValueError: If ``sheet_name`` is ``None``.
            PermissionError: If the output directory or file cannot be
                created or written.
        """
        try:
            # Convert to Path objects
            file_path = Path(file_path)
            output_path = Path(output_path)

            # Validate input file
            if not file_path.exists():
                raise FileNotFoundError(f"Input file not found: {file_path}")

            # Fail early with a clear message instead of an AttributeError
            # from trying to preprocess a dict of sheets.
            if sheet_name is None:
                raise ValueError(
                    "sheet_name must be a sheet name or index, not None"
                )

            # Ensure output directory exists
            output_path.mkdir(parents=True, exist_ok=True)

            logger.info(f"Processing MSD file: {file_path}")
            df = pd.read_excel(file_path, sheet_name=sheet_name)

            # Apply preprocessing
            df = cls._preprocess_dataframe(df)

            # Save processed file
            output_file = output_path / "msd_processed.csv"
            df.to_csv(output_file, index=False)
            logger.info(f"Saved processed file to: {output_file}")
            return df
        except Exception as e:
            logger.error(f"Error processing MSD file: {str(e)}")
            raise

    @classmethod
    def preprocess_cbip(cls,
                        input_dir: Union[str, Path],
                        output_dir: Union[str, Path]) -> None:
        """Preprocess every CSV file found under the CBIP directory.

        Files are discovered recursively, read as ``;``-delimited CSV,
        cleaned, and written to ``output_dir`` under their own file name
        (the sub-directory structure is flattened). A file that fails is
        logged and skipped; the remaining files are still processed.

        Args:
            input_dir: Directory containing input CSV files.
            output_dir: Directory for output files; created if missing.

        Raises:
            FileNotFoundError: If the input directory doesn't exist.
            PermissionError: If the output directory cannot be created.
                Errors while reading or writing an individual file are
                logged and do not propagate.
        """
        try:
            # Convert to Path objects
            input_dir = Path(input_dir)
            output_dir = Path(output_dir)

            # Validate input directory
            if not input_dir.exists():
                raise FileNotFoundError(f"Input directory not found: {input_dir}")

            # Ensure output directory exists
            output_dir.mkdir(parents=True, exist_ok=True)

            # Collect all CSV files; sorted so processing order (and thus
            # which file wins on a name collision) is deterministic.
            # NOTE(review): the search is recursive, so if output_dir lies
            # inside input_dir a re-run also picks up earlier outputs.
            csv_files = sorted(input_dir.rglob("*.csv"))
            if not csv_files:
                logger.warning(f"No CSV files found in: {input_dir}")
                return

            written_names = set()
            for file_path in csv_files:
                try:
                    logger.info(f"Processing CBIP file: {file_path}")

                    # Read CSV file
                    df = pd.read_csv(
                        file_path,
                        delimiter=';',
                        quotechar='"',
                        skip_blank_lines=True,
                    )

                    # Apply preprocessing
                    df = cls._preprocess_dataframe(df)

                    # Save processed file
                    output_file = output_dir / file_path.name
                    if file_path.name in written_names:
                        # Output names are flattened, so same-named files
                        # from different sub-directories collide.
                        logger.warning(
                            f"Overwriting {output_file}: another input file "
                            f"named {file_path.name} was already processed"
                        )
                    df.to_csv(output_file, index=False)
                    written_names.add(file_path.name)
                    logger.info(f"Saved processed file to: {output_file}")
                except Exception as e:
                    # Best effort: one bad file must not stop the batch.
                    logger.error(f"Error processing {file_path}: {str(e)}")
        except Exception as e:
            logger.error(f"Error processing CBIP directory: {str(e)}")
            raise
def main(argv: Optional[list] = None) -> None:
    """Parse command-line arguments and run both preprocessing steps.

    Output goes to ``processed_data/msd`` and ``processed_data/cbip`` under
    the current working directory.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        Exception: Any failure from the preprocessing steps is logged and
            re-raised.
    """
    try:
        # Create processed_data directory in current working directory
        output_base = Path.cwd() / "processed_data"
        msd_output = output_base / "msd"
        cbip_output = output_base / "cbip"

        parser = argparse.ArgumentParser(description='Process MSD and CBIP data files.')
        parser.add_argument('--msd-input', required=True, help='Path to MSD Excel file')
        parser.add_argument('--cbip-input', required=True, help='Input directory containing CBIP CSV files')
        args = parser.parse_args(argv)

        preprocessor = DataPreprocessor()

        # Process MSD file
        preprocessor.preprocess_msd(args.msd_input, msd_output)

        # Process CBIP directory
        preprocessor.preprocess_cbip(args.cbip_input, cbip_output)
    except Exception as e:
        logger.error(f"Main execution failed: {str(e)}")
        raise
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()