File size: 1,722 Bytes
542680b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import torch
from torch import nn
from torch.nn import init
from transformers import AutoTokenizer, AutoModel


class ProdFeatureEncoder(nn.Module):
    """
    Text encoder that maps a string to a fixed-size embedding vector.

    The input is tokenized and run through the pre-trained
    ``cointegrated/rubert-tiny`` checkpoint; the hidden state at sequence
    position 0 is then projected by a linear layer.

    Attributes:
        config (object): Configuration object; ``bert_output_size`` and
            ``embedding_size`` are read from it.
        tokenizer (AutoTokenizer): Tokenizer loaded from
            ``cointegrated/rubert-tiny``.
        model (AutoModel): Backbone loaded from ``cointegrated/rubert-tiny``.
        fc (nn.Linear): Projection from ``bert_output_size`` to
            ``embedding_size``; its weight is Xavier-uniform initialized.
        norm (nn.LayerNorm): Layer normalization over ``embedding_size``.
            NOTE(review): registered here but not applied by ``forward`` in
            this class -- confirm whether that is intentional.
    """

    def __init__(self, config):
        """
        Load the tokenizer and backbone and build the projection layers.

        Args:
            config (object): Must expose ``bert_output_size`` (input width of
                the projection) and ``embedding_size`` (output width).
        """
        super().__init__()
        self.config = config

        checkpoint = "cointegrated/rubert-tiny"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModel.from_pretrained(checkpoint)

        # Submodules are registered in the order model -> fc -> norm; keep it
        # stable so state_dict()/parameters() ordering does not change.
        in_features = self.config.bert_output_size
        out_features = self.config.embedding_size
        self.fc = nn.Linear(in_features, out_features)
        init.xavier_uniform_(self.fc.weight)
        self.norm = nn.LayerNorm(out_features)

    def forward(self, text: str):
        """
        Encode ``text`` and return its projected embedding.

        Args:
            text (str): Text to embed.

        Returns:
            torch.Tensor: Row 0 of the projected batch, i.e. the output of
            ``fc`` for the first tokenized sequence.
        """
        encoded = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        # Move every tokenizer output onto the backbone's device before the call.
        device = self.model.device
        inputs = {}
        for name, tensor in encoded.items():
            inputs[name] = tensor.to(device)

        hidden_states = self.model(**inputs).last_hidden_state
        # Keep only sequence position 0 of each item in the batch.
        first_position = hidden_states[:, 0, :]
        projected = self.fc(first_position)
        return projected[0]