# Extracted from a web file viewer (revision 57cf043, file size 5,783 bytes); the viewer's status banner and line-number gutter were page residue, not part of the module.
import logging
import pandas as pd
from sqlalchemy.orm import Session
from components.dbo.models.acronym import Acronym
from components.dbo.models.dataset import Dataset
from components.dbo.models.dataset_document import DatasetDocument
from schemas.acronym import AcronymCollectionResponse
logger = logging.getLogger(__name__)
class AcronymService:
    """Service for storing and retrieving acronyms and abbreviations.

    Every method obtains its own short-lived session by calling
    ``self.db()`` and using the result as a context manager.
    """

    def __init__(self, db: Session):
        """Remember the database handle used by all queries.

        Args:
            db: Object stored as ``self.db``. NOTE(review): the methods of
                this class call it (``self.db()``) and use the result in a
                ``with`` statement, so it behaves like a session factory
                rather than a ``Session`` instance -- confirm with the
                caller and tighten the annotation.
        """
        logger.info("Initializing AcronymService")
        self.db = db

    def from_pandas(self, df: pd.DataFrame) -> None:
        """Load acronyms and abbreviations from a pandas DataFrame.

        All rows are added in one transaction: either every row is
        committed or, on any error, the transaction is rolled back.

        Args:
            df: DataFrame with the columns ``document_id``, ``short_form``,
                ``full_form`` and ``type``. A missing (NA) ``document_id``
                is stored as ``None``; any other value is converted with
                ``int()``.

        Raises:
            Exception: Any error raised while building or saving the rows
                is logged and re-raised after the rollback.
        """
        logger.info("Loading acronyms from DataFrame with %d rows", len(df))
        with self.db() as session:
            try:
                for _, row in df.iterrows():
                    raw_document_id = row['document_id']
                    session.add(
                        Acronym(
                            short_form=row['short_form'],
                            full_form=row['full_form'],
                            type=row['type'],
                            document_id=(
                                int(raw_document_id)
                                if pd.notna(raw_document_id)
                                else None
                            ),
                        )
                    )
                session.commit()
            except Exception as e:
                session.rollback()
                # logger.exception keeps the traceback next to the message.
                logger.exception("Error processing acronyms: %s", e)
                # Bare raise preserves the original traceback.
                raise
            else:
                logger.info("Successfully loaded all acronyms")
            # No explicit close: leaving the ``with`` block releases the
            # session, as in the other methods of this class.

    def get_abbreviations(self, document_id: int) -> list[Acronym]:
        """Return the acronyms that apply to one document.

        Args:
            document_id: ID of the document.

        Returns:
            Acronyms whose ``document_id`` equals ``document_id`` together
            with acronyms whose ``document_id`` is NULL.
        """
        logger.info("Getting abbreviations for document %s", document_id)
        with self.db() as session:
            result = (
                session.query(Acronym)
                .filter(
                    (Acronym.document_id == document_id)
                    | Acronym.document_id.is_(None)
                )
                .all()
            )
            logger.debug(
                "Found %d abbreviations for document %s",
                len(result),
                document_id,
            )
            return result

    def get_abbreviations_by_dataset_id(self, dataset_id: int) -> list[Acronym]:
        """Return the acronyms that apply to a dataset.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            The list produced by ``_get_acronyms_for_dataset``.
        """
        logger.info("Getting abbreviations for dataset %s", dataset_id)
        return self._get_acronyms_for_dataset(dataset_id)

    def get_current_acronyms(self) -> AcronymCollectionResponse:
        """Return the acronyms of the currently active dataset.

        Returns:
            A response describing the first dataset with ``is_active`` set
            and its acronyms grouped by short form. When no dataset is
            active, an empty response (``collection_id=0``, empty strings,
            ``updated_at=None``, no acronyms) is returned instead.
        """
        logger.info("Getting acronyms for current active dataset")
        with self.db() as session:
            # ``== True`` is a SQL expression here, not a Python comparison.
            active_dataset: Dataset = (
                session.query(Dataset)
                .filter(Dataset.is_active == True)  # noqa: E712
                .first()
            )
            if not active_dataset:
                logger.warning("No active dataset found")
                return AcronymCollectionResponse(
                    collection_id=0,
                    collection_name="",
                    collection_filename="",
                    updated_at=None,
                    acronyms={},
                )
            # Copy the scalar fields while the session is open so the
            # acronym lookup below can run after this session is released
            # instead of holding two sessions at the same time.
            dataset_id = active_dataset.id
            dataset_name = active_dataset.name
            # TODO: the original author questioned whether date_created is
            # the right value for updated_at.
            dataset_updated_at = active_dataset.date_created

        result = self._get_acronyms_for_dataset(dataset_id)
        return AcronymCollectionResponse(
            collection_id=dataset_id,
            collection_name=dataset_name,
            collection_filename='',
            updated_at=dataset_updated_at,
            acronyms=self._compress_acronyms(result),
        )

    def _get_acronyms_for_dataset(self, dataset_id: int) -> list[Acronym]:
        """Return the list of acronyms for a dataset.

        Args:
            dataset_id: ID of the dataset.

        Returns:
            list[Acronym]: Acronyms attached to the selected documents plus
            acronyms whose ``document_id`` is NULL.
        """
        with self.db() as session:
            # NOTE(review): this filters on ``DatasetDocument.id`` using a
            # dataset ID. If the model has a separate dataset foreign key
            # that column is probably the intended one -- confirm against
            # the DatasetDocument model before changing it.
            document_rows = (
                session.query(DatasetDocument.document_id)
                .filter(DatasetDocument.id == dataset_id)
                .all()
            )
            # Each row is a one-element tuple holding the document ID.
            document_ids = [row[0] for row in document_rows]
            result = (
                session.query(Acronym)
                .filter(
                    Acronym.document_id.in_(document_ids)
                    | Acronym.document_id.is_(None)
                )
                .all()
            )
            logger.debug(
                "Found %d acronyms for dataset %s", len(result), dataset_id
            )
            return result

    def _compress_acronyms(self, acronyms: list[Acronym]) -> dict[str, list[str]]:
        """Group the full forms of the given acronyms by short form.

        Args:
            acronyms: Objects exposing ``short_form`` and ``full_form``.

        Returns:
            Mapping from each short form to the list of its full forms, in
            input order. Keys appear in first-seen order, so the result is
            reproducible between runs.
        """
        # Single pass: no rescan of the whole list per distinct short form.
        compressed: dict[str, list[str]] = {}
        for acronym in acronyms:
            compressed.setdefault(acronym.short_form, []).append(acronym.full_form)
        logger.debug(
            "Compressed %d acronyms into %d unique short forms",
            len(acronyms),
            len(compressed),
        )
        return compressed
|