import logging

import pandas as pd
from sqlalchemy.orm import Session

from components.dbo.models.acronym import Acronym
from components.dbo.models.dataset import Dataset
from components.dbo.models.dataset_document import DatasetDocument
from schemas.acronym import AcronymCollectionResponse

logger = logging.getLogger(__name__)


class AcronymService:
    """
    Service for working with acronyms and abbreviations.
    """

    def __init__(self, db: Session):
        """
        Store the database handle used by every method of the service.

        Args:
            db: Object that is *called* (``self.db()``) to obtain a session
                usable as a context manager.
                NOTE(review): the ``Session`` annotation does not match that
                usage; this is presumably a session factory such as
                ``sessionmaker`` - confirm and tighten the annotation.
        """
        logger.info("Initializing AcronymService")
        self.db = db

    def from_pandas(self, df: pd.DataFrame) -> None:
        """
        Load acronyms and abbreviations from a pandas DataFrame.

        All rows are added to one session and committed together; if anything
        fails the session is rolled back and the exception is re-raised.

        Args:
            df: DataFrame with the columns document_id, short_form,
                full_form, type. A missing (NaN/None) document_id is stored
                as None.

        Raises:
            Exception: whatever was raised while building or committing the
                rows, re-raised after the rollback.
        """
        logger.info(f"Loading acronyms from DataFrame with {len(df)} rows")

        # The context manager owns the session lifecycle (as in the other
        # methods), so no explicit close() is needed here.
        with self.db() as session:
            try:
                for _, row in df.iterrows():
                    raw_document_id = row['document_id']
                    acronym = Acronym(
                        short_form=row['short_form'],
                        full_form=row['full_form'],
                        type=row['type'],
                        document_id=(
                            int(raw_document_id)
                            if pd.notna(raw_document_id)
                            else None
                        ),
                    )
                    session.add(acronym)

                session.commit()
                logger.info("Successfully loaded all acronyms")
            except Exception as e:
                session.rollback()
                logger.error(f"Error processing acronyms: {str(e)}")
                # Bare raise keeps the original traceback untouched.
                raise

    def get_abbreviations(self, document_id: int) -> list[Acronym]:
        """
        Get the acronyms and abbreviations that apply to a document.

        Args:
            document_id: ID of the document.

        Returns:
            Acronyms whose document_id equals ``document_id`` plus those
            whose document_id is NULL.
        """
        logger.info(f"Getting abbreviations for document {document_id}")
        with self.db() as session:
            result = (
                session.query(Acronym)
                .filter(
                    (Acronym.document_id == document_id)
                    | Acronym.document_id.is_(None)
                )
                .all()
            )
            logger.debug(f"Found {len(result)} abbreviations for document {document_id}")
            return result

    def get_abbreviations_by_dataset_id(self, dataset_id: int) -> list[Acronym]:
        """
        Get the acronyms and abbreviations for a dataset.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            The result of ``_get_acronyms_for_dataset(dataset_id)``.
        """
        logger.info(f"Getting abbreviations for dataset {dataset_id}")
        return self._get_acronyms_for_dataset(dataset_id)

    def get_current_acronyms(self) -> AcronymCollectionResponse:
        """
        Get the acronyms and abbreviations for the currently active dataset.

        Returns:
            A response describing the first dataset with ``is_active`` set,
            with its acronyms grouped by short form. If no such dataset
            exists, a placeholder response is returned (collection_id=0,
            empty names, updated_at=None, no acronyms).
        """
        logger.info("Getting acronyms for current active dataset")
        with self.db() as session:
            # `== True` is intentional: it builds a SQL expression, not a
            # Python comparison.
            active_dataset: Dataset | None = (
                session.query(Dataset)
                .filter(Dataset.is_active == True)  # noqa: E712
                .first()
            )

            if active_dataset is None:
                logger.warning("No active dataset found")
                return AcronymCollectionResponse(
                    collection_id=0,
                    collection_name="",
                    collection_filename="",
                    updated_at=None,
                    acronyms={},
                )

            result = self._get_acronyms_for_dataset(active_dataset.id)

            return AcronymCollectionResponse(
                collection_id=active_dataset.id,
                collection_name=active_dataset.name,
                collection_filename='',
                # TODO (from the original author, "What?"): presumably
                # questions whether date_created is the right value for
                # updated_at - confirm.
                updated_at=active_dataset.date_created,
                acronyms=self._compress_acronyms(result),
            )

    def _get_acronyms_for_dataset(self, dataset_id: int) -> list[Acronym]:
        """
        Get the list of acronyms for a dataset.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            list[Acronym]: Acronyms attached to any of the document ids
            selected for the dataset, plus acronyms whose document_id is
            NULL.
        """
        with self.db() as session:
            # NOTE(review): this filters on DatasetDocument.id while being
            # given a dataset id. If DatasetDocument links datasets to
            # documents this probably should be the dataset foreign key
            # (e.g. DatasetDocument.dataset_id) - verify against the model.
            document_rows = (
                session.query(DatasetDocument.document_id)
                .filter(DatasetDocument.id == dataset_id)
                .all()
            )
            # Each row is a one-element tuple holding the document id.
            document_ids = [row[0] for row in document_rows]

            result = (
                session.query(Acronym)
                .filter(
                    Acronym.document_id.in_(document_ids)
                    | Acronym.document_id.is_(None)
                )
                .all()
            )
            logger.debug(f"Found {len(result)} acronyms for dataset {dataset_id}")
            return result

    def _compress_acronyms(self, acronyms: list[Acronym]) -> dict[str, list[str]]:
        """
        Compress acronyms and abbreviations into a dictionary.

        Args:
            acronyms: Objects exposing ``short_form`` and ``full_form``.

        Returns:
            Mapping of each short form to the list of its full forms, in the
            order they appear in ``acronyms`` (duplicates are kept).
        """
        compressed: dict[str, list[str]] = {}
        # Single pass: group full forms under their short form instead of
        # rescanning the whole list once per unique short form.
        for acronym in acronyms:
            compressed.setdefault(acronym.short_form, []).append(acronym.full_form)

        logger.debug(f"Compressed {len(acronyms)} acronyms into {len(compressed)} unique short forms")
        return compressed