# Dataset container classes: dataset rows and their text embeddings.
import logging
import pickle
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd

from common.constants import UNKNOWN
from components.embedding_extraction import EmbeddingExtractor

logger = logging.getLogger(__name__)
@dataclass
class DatasetRow:
    """A single row of the documents dataset.

    Field names are preserved exactly (including the historical
    misspelling ``Pargaraph``) because they become DataFrame column
    names consumed downstream — do not rename.

    The ``@dataclass`` decorator is required: the ``field(default=...)``
    declarations below and the ``asdict(row)`` call in
    ``DocumentsDataset.to_pandas`` only work on dataclass instances.
    """

    Index: int
    Text: str  # the text that gets vectorized
    DocName: str
    Title: str
    DocNumber: str
    # Optional structure markers; default to UNKNOWN when absent.
    LevelParagraph: str = field(default=UNKNOWN)
    Pargaraph: str = field(default=UNKNOWN)  # NOTE: misspelling kept for column compatibility
    Duplicate: str = field(default=UNKNOWN)
    PartLevel1: str = field(default=UNKNOWN)
    PartLevel2: str = field(default=UNKNOWN)
    Appendix: str = field(default=UNKNOWN)
    LevelParagraphAppendix: str = field(default=UNKNOWN)
    PargaraphAppendix: str = field(default=UNKNOWN)
    DuplicateAppendix: str = field(default=UNKNOWN)
    PartLevel1Appendix: str = field(default=UNKNOWN)
    Table: str = field(default=UNKNOWN)
class DocumentsDataset:
    """Container for dataset rows and their text embeddings.

    A freshly constructed dataset holds no vectors; call
    :meth:`vectorize_with` to compute embeddings for every row's text.
    """

    def __init__(self, rows: list[DatasetRow]):
        """
        Args:
            rows: Parsed dataset rows.
        """
        self.rows = rows
        # Embedding matrix aligned row-for-row with ``self.rows``;
        # None until vectorize_with() has been called.
        self.vectors: np.ndarray | None = None

    def vectorize_with(
        self,
        vectorizer: EmbeddingExtractor,
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> None:
        """Vectorize all row texts and store the result in ``self.vectors``.

        Args:
            vectorizer: Extractor that turns a list of texts into vectors.
            progress_callback: Optional hook forwarded to the vectorizer;
                presumably called as ``(done, total)`` — confirm against
                EmbeddingExtractor.
        """
        logger.info('Starting dataset vectorization')
        total = len(self.rows)
        texts = [row.Text for row in self.rows]
        self.vectors = vectorizer.vectorize(texts, progress_callback)
        logger.info(f'Completed vectorization of {total} rows')

    def to_pandas(self) -> pd.DataFrame:
        """Convert the dataset to a pandas DataFrame.

        Returns:
            pd.DataFrame: One column per DatasetRow field plus an
            ``Embedding`` column (per-row vector lists, or NaN when
            the dataset has not been vectorized).
        """
        df = pd.DataFrame([asdict(row) for row in self.rows])
        if self.vectors is not None:
            df['Embedding'] = self.vectors.tolist()
        else:
            df['Embedding'] = np.nan
        return df

    def to_pickle(self, path: Path) -> None:
        """Serialize the dataset (as a pandas DataFrame) to a pickle file."""
        logger.info(f'Saving dataset to {path}')
        with open(path, 'wb') as f:
            pickle.dump(self.to_pandas(), f)
        logger.info('Dataset saved successfully')

    @classmethod
    def from_pickle(cls, path: Path) -> 'DocumentsDataset':
        """Load a dataset from a pickle file.

        Fixed: ``@classmethod`` was missing, so calling
        ``DocumentsDataset.from_pickle(path)`` passed ``path`` as ``cls``
        and raised TypeError.

        SECURITY: ``pickle.load`` executes arbitrary code — only load
        files from trusted sources.

        NOTE(review): ``to_pickle`` writes a DataFrame, yet this method
        reads ``dataset.rows`` — the round-trip looks asymmetric; confirm
        which producer writes the files this loader consumes.

        Raises:
            Exception: Re-raised after logging when loading fails.
        """
        logger.info(f'Loading dataset from {path}')
        try:
            with open(path, 'rb') as f:
                dataset = pickle.load(f)
            logger.info(f'Loaded dataset with {len(dataset.rows)} rows')
            return dataset
        except Exception as e:
            logger.error(f'Failed to load dataset: {e}')
            raise