import torch
import torch.nn as nn
from pymongo import MongoClient
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Connect to MongoDB Atlas
client = MongoClient(
    "mongodb+srv://waseoke:[email protected]/test?retryWrites=true&w=majority"
)
db = client["two_tower_model"]
user_embedding_collection = db["user_embeddings"]
product_embedding_collection = db["product_embeddings"]
train_dataset = db["train_dataset"]


# Autoencoder definition (512-dim -> 128-dim encoder)
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(512, 256),  # 512 -> 256
            nn.ReLU(),
            nn.Linear(256, 128),  # 256 -> 128
        )
        self.decoder = nn.Sequential(
            nn.Linear(128, 256),  # 128 -> 256
            nn.ReLU(),
            nn.Linear(256, 512),  # 256 -> 512
        )

    def forward(self, x):
        # Full reconstruction pass; recommend_shop_product() uses self.encoder directly.
        return self.decoder(self.encoder(x))


# Initialize the autoencoder (no trained weights are loaded in this script)
autoencoder = Autoencoder()
autoencoder.eval()  # evaluation mode
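# A minimal sketch of loading trained autoencoder weights, assuming they were
# saved to "autoencoder.pth" (hypothetical path, not produced in this file):
# autoencoder.load_state_dict(torch.load("autoencoder.pth", weights_only=True))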


# Load the trained product model
def load_trained_model(model_path="product_model.pth"):
    """
    Load the trained product-tower model (768 -> 256 -> 128).
    """
    model = torch.nn.Sequential(
        torch.nn.Linear(768, 256),  # 768: KoBERT embedding dimension
        torch.nn.ReLU(),
        torch.nn.Linear(256, 128),
    )
    model.load_state_dict(torch.load(model_path, weights_only=True))
    model.eval()  # evaluation mode
    return model


# Similarity helper
def calculate_similarity(input_embedding, target_embeddings):
    """
    Compute cosine similarity between the input embedding and the target embeddings.
    """
    similarities = cosine_similarity(input_embedding, target_embeddings).flatten()
    return similarities


def find_most_similar_anchor(user_id, model):
    """
    Return the anchor product most similar to the user's embedding.
    """
    # Normalize the user_id type (numeric strings -> int)
    if isinstance(user_id, str) and user_id.isdigit():
        user_id = int(user_id)

    # Fetch the user embedding from MongoDB
    user_data = user_embedding_collection.find_one({"user_id": user_id})

    if not user_data:
        raise ValueError(f"No embedding found for user_id: {user_id}")
    user_embedding = torch.tensor(
        user_data["embedding"][0], dtype=torch.float32
    ).unsqueeze(0)

    # Zero-pad the 512-dim user embedding to 768 dims to match the product model input
    padding = torch.zeros((1, 768 - 512))
    user_embedding = torch.cat((user_embedding, padding), dim=1)

    # Reduce the user embedding from 768 to 128 dims through the trained model
    user_embedding = model(user_embedding)

    # Build anchor candidates
    anchors, anchor_embeddings = [], []

    # Generate anchor embeddings with the product model (random inputs stand in for real anchor data)
    for _ in range(100):  # assume 100 anchor products
        random_input = torch.rand((1, 768))  # random vector matching the KoBERT input dimension
        anchor_embedding = model(random_input).detach().numpy().flatten()
        anchors.append(f"Product_{len(anchors) + 1}")  # anchor product name
        anchor_embeddings.append(anchor_embedding)

    anchor_embeddings = np.array(anchor_embeddings)

    print(f"User embedding dimension: {user_embedding.shape}")
    print(f"Anchor embedding dimension: {anchor_embeddings.shape}")

    # Compute cosine similarity
    similarities = calculate_similarity(
        user_embedding.detach().numpy().reshape(1, -1), anchor_embeddings
    )
    most_similar_index = np.argmax(similarities)

    return anchors[most_similar_index], anchor_embeddings[most_similar_index]


def find_most_similar_product(anchor_embedding, model):
    """
    Return the most similar product among the trained positive/negative products, given an anchor embedding.
    """
    train_embeddings, products = [], []
    # Generate candidate product embeddings with the product model
    for _ in range(100):  # assume 100 candidate products
        random_input = torch.rand((1, 768))  # random vector matching the KoBERT input dimension
        train_embedding = (
            model(random_input).detach().numpy().flatten()
        )  # embedding computed by the model
        products.append(f"Product_{len(products) + 1}")  # product name
        train_embeddings.append(train_embedding)

    train_embeddings = np.array(train_embeddings)

    print(f"Anchor embedding dimension: {anchor_embedding.shape}")
    print(f"Train embedding dimension: {train_embeddings.shape}")

    # Compute cosine similarity
    similarities = calculate_similarity(
        anchor_embedding.reshape(1, -1), train_embeddings
    )
    most_similar_index = np.argmax(similarities)

    return products[most_similar_index], train_embeddings[most_similar_index]


def recommend_shop_product(similar_product_embedding):
    """
    Compare the matched product embedding with shop product embeddings and return the final recommended product_id.
    """
    all_products = list(product_embedding_collection.find())
    shop_product_embeddings, shop_product_ids = [], []

    for product in all_products:
        shop_product_ids.append(product["product_id"])
        shop_product_embeddings.append(product["embedding"])

    # Convert shop product embeddings to a 2-D NumPy array
    shop_product_embeddings = np.array(shop_product_embeddings)
    shop_product_embeddings = shop_product_embeddings.reshape(
        shop_product_embeddings.shape[0], -1
    )

    # Reduce shop embeddings from 512 to 128 dims with the autoencoder's encoder
    shop_product_embeddings_reduced = (
        autoencoder.encoder(torch.tensor(shop_product_embeddings).float())
        .detach()
        .numpy()
    )

    # Reshape similar_product_embedding to shape (1, 128)
    similar_product_embedding = similar_product_embedding.reshape(1, -1)

    print(f"Similar product embedding dimension: {similar_product_embedding.shape}")
    print(f"Shop product embedding dimension: {shop_product_embeddings_reduced.shape}")

    # Compute cosine similarity
    similarities = calculate_similarity(
        similar_product_embedding, shop_product_embeddings_reduced
    )
    most_similar_index = np.argmax(similarities)

    return shop_product_ids[most_similar_index]
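

# A minimal end-to-end usage sketch of the pipeline above. The user_id value is a
# hypothetical example; it must exist in user_embeddings, and product_model.pth
# must be present on disk, for this to run.
if __name__ == "__main__":
    product_model = load_trained_model("product_model.pth")

    example_user_id = 1  # hypothetical user_id stored in user_embeddings
    anchor_name, anchor_embedding = find_most_similar_anchor(example_user_id, product_model)
    print(f"Most similar anchor: {anchor_name}")

    product_name, product_embedding = find_most_similar_product(anchor_embedding, product_model)
    print(f"Most similar trained product: {product_name}")

    recommended_product_id = recommend_shop_product(product_embedding)
    print(f"Recommended shop product_id: {recommended_product_id}")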