Spaces:
Sleeping
Sleeping
import logging | |
from dataclasses import dataclass | |
import pandas as pd | |
from components.parser.abbreviations.abbreviation import Abbreviation, AbbreviationType | |
logger = logging.getLogger(__name__) | |
@dataclass
class AbbreviationsCollection:
    """A list of ``Abbreviation`` objects with pandas conversion helpers.

    Declared as a dataclass so that the ``items`` field produces the
    ``AbbreviationsCollection(items)`` constructor that ``from_pandas``
    relies on.
    """

    # Abbreviations held by this collection.
    items: list[Abbreviation]

    def to_pandas(self) -> pd.DataFrame:
        """Convert the abbreviations into a DataFrame.

        Items whose ``abbreviation_type`` equals ``AbbreviationType.UNKNOWN``
        are left out. The numbers of kept and dropped items are logged at
        INFO level.

        Returns:
            pd.DataFrame: One row per kept abbreviation, built from dicts
            with the keys ``ShortWord``, ``LongText``, ``AbbreviationType``
            and ``DocumentId``.
        """
        logger.debug("Items: %s", self.items)

        all_data = [
            {
                'ShortWord': abbr.short_form,
                'LongText': abbr.full_form,
                'AbbreviationType': abbr.abbreviation_type,
                'DocumentId': abbr.document_id,
            }
            for abbr in self.items
            if abbr.abbreviation_type != AbbreviationType.UNKNOWN
        ]

        logger.info('Approved abbreviations: %s', len(all_data))
        logger.info(
            'Rejected abbreviations: %s', len(self.items) - len(all_data)
        )
        return pd.DataFrame(all_data)

    @classmethod
    def from_pandas(cls, df: pd.DataFrame) -> 'AbbreviationsCollection':
        """Create an abbreviations collection from a pandas DataFrame.

        Each row is read through the columns ``short``, ``full`` and
        ``document_id`` and passed to ``Abbreviation``. A row for which that
        fails (for example a missing column or a constructor error) is logged
        at WARNING level and skipped, so one bad row does not abort the load.

        NOTE(review): these column names differ from the ones written by
        ``to_pandas`` (``ShortWord``/``LongText``/``DocumentId``), so the two
        methods do not round-trip as written - confirm this is intended.

        Args:
            df: Source table with one abbreviation per row.

        Returns:
            AbbreviationsCollection: Collection holding every abbreviation
            that was created successfully, in row order.
        """
        all_data = []
        for _, row in df.iterrows():
            try:
                abbreviation = Abbreviation(
                    short=row['short'],
                    full=row['full'],
                    document_id=row['document_id'],
                )
            except Exception as e:
                # Deliberate best-effort: report the bad row and keep going.
                logger.warning(
                    'Failed to create abbreviation from row: %s. Error: %s',
                    row,
                    e,
                )
            else:
                all_data.append(abbreviation)

        logger.info(
            'Created abbreviations collection with %s items', len(all_data)
        )
        logger.debug(
            'First 5 abbreviations: %s',
            ', '.join(str(abbr) for abbr in all_data[:5]),
        )
        return cls(all_data)