import torch
import torch.nn as nn
from pymongo import MongoClient
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Connect to MongoDB Atlas
client = MongoClient(
    "mongodb+srv://waseoke:[email protected]/test?retryWrites=true&w=majority"
)
db = client["two_tower_model"]
user_embedding_collection = db["user_embeddings"]
product_embedding_collection = db["product_embeddings"]
train_dataset = db["train_dataset"]
# Autoencoder definition (512-dim input -> 128-dim bottleneck)
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(512, 256),  # 512 -> 256
            nn.ReLU(),
            nn.Linear(256, 128),  # 256 -> 128
        )
        self.decoder = nn.Sequential(
            nn.Linear(128, 256),  # 128 -> 256
            nn.ReLU(),
            nn.Linear(256, 512),  # 256 -> 512
        )

    def forward(self, x):
        # Full reconstruction pass; callers that only need the reduced
        # representation use self.encoder(x) directly.
        return self.decoder(self.encoder(x))
# Instantiate the autoencoder and switch to inference mode.
# Note: no trained weights are loaded here, so the encoder runs with its
# random initialization unless a checkpoint is restored (see sketch below).
autoencoder = Autoencoder()
autoencoder.eval()
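# A minimal sketch of restoring pretrained autoencoder weights, assuming a
# checkpoint saved earlier with torch.save(autoencoder.state_dict(), ...).
# "autoencoder.pth" is a hypothetical filename, not part of the original code.
import os

if os.path.exists("autoencoder.pth"):
    autoencoder.load_state_dict(torch.load("autoencoder.pth", weights_only=True))
    autoencoder.eval()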
# Load the trained product model
def load_trained_model(model_path="product_model.pth"):
    """Load the trained product embedding model."""
    model = torch.nn.Sequential(
        torch.nn.Linear(768, 256),  # 768: KoBERT embedding dimension
        torch.nn.ReLU(),
        torch.nn.Linear(256, 128),
    )
    model.load_state_dict(torch.load(model_path, weights_only=True))
    model.eval()  # evaluation mode
    return model
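# For reference, a checkpoint compatible with load_trained_model() would come
# from saving the state_dict of an identically shaped Sequential model on the
# training side. This helper is a hedged sketch (not part of the original
# project) that writes an untrained checkpoint purely for smoke-testing:
def save_untrained_checkpoint(path="product_model.pth"):
    model = torch.nn.Sequential(
        torch.nn.Linear(768, 256),
        torch.nn.ReLU(),
        torch.nn.Linear(256, 128),
    )
    torch.save(model.state_dict(), path)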
# Similarity helper
def calculate_similarity(input_embedding, target_embeddings):
    """Compute the cosine similarity between an input embedding and a set of target embeddings."""
    similarities = cosine_similarity(input_embedding, target_embeddings).flatten()
    return similarities
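# Example (hedged, illustrative shapes only):
#   query = np.random.rand(1, 128)      # one (1, d) query embedding
#   targets = np.random.rand(100, 128)  # (n, d) candidate embeddings
#   scores = calculate_similarity(query, targets)  # -> (100,) score vector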
def find_most_similar_anchor(user_id, model):
    """Return the anchor product most similar to the user's embedding."""
    # Normalize the user_id type (stored as an int in MongoDB)
    if isinstance(user_id, str) and user_id.isdigit():
        user_id = int(user_id)

    # Fetch the user embedding
    user_data = user_embedding_collection.find_one({"user_id": user_id})
    if not user_data:
        raise ValueError(f"No embedding found for user_id: {user_id}")
    user_embedding = torch.tensor(
        user_data["embedding"][0], dtype=torch.float32
    ).unsqueeze(0)

    # Zero-pad the 512-dim user embedding to the model's 768-dim input
    padding = torch.zeros((1, 768 - 512))
    user_embedding = torch.cat((user_embedding, padding), dim=1)

    # Reduce the user embedding to 128 dimensions (768 -> 256 -> 128).
    # Apply the full model so the ReLU between the linear layers is not skipped.
    user_embedding = model(user_embedding)

    # Build anchor data
    anchors, anchor_embeddings = [], []

    # Placeholder anchors: assume 100 anchor products, embedded here from
    # random KoBERT-sized inputs via product_model.pth
    for _ in range(100):
        random_input = torch.rand((1, 768))  # random data matching the KoBERT dimension
        anchor_embedding = model(random_input).detach().numpy().flatten()
        anchors.append(f"Product_{len(anchors) + 1}")  # anchor product name
        anchor_embeddings.append(anchor_embedding)

    anchor_embeddings = np.array(anchor_embeddings)

    print(f"User embedding dimension: {user_embedding.shape}")
    print(f"Anchor embedding dimension: {anchor_embeddings.shape}")

    # Cosine similarity
    similarities = calculate_similarity(
        user_embedding.detach().numpy().reshape(1, -1), anchor_embeddings
    )
    most_similar_index = np.argmax(similarities)
    return anchors[most_similar_index], anchor_embeddings[most_similar_index]
def find_most_similar_product(anchor_embedding, model):
    """Return the product most similar to the anchor among the trained positive/negative products."""
    train_embeddings, products = [], []

    # Placeholder candidates: assume 100 product embeddings, generated from
    # random KoBERT-sized inputs
    for _ in range(100):
        random_input = torch.rand((1, 768))  # random data matching the KoBERT dimension
        train_embedding = (
            model(random_input).detach().numpy().flatten()
        )  # embed through the model
        products.append(f"Product_{len(products) + 1}")  # product name
        train_embeddings.append(train_embedding)

    train_embeddings = np.array(train_embeddings)

    print(f"Anchor embedding dimension: {anchor_embedding.shape}")
    print(f"Train embedding dimension: {train_embeddings.shape}")

    # Cosine similarity
    similarities = calculate_similarity(
        anchor_embedding.reshape(1, -1), train_embeddings
    )
    most_similar_index = np.argmax(similarities)
    return products[most_similar_index], train_embeddings[most_similar_index]
def recommend_shop_product(similar_product_embedding):
    """Compare the selected product embedding against shop product embeddings and return the final recommendation."""
    all_products = list(product_embedding_collection.find())
    shop_product_embeddings, shop_product_ids = [], []
    for product in all_products:
        shop_product_ids.append(product["product_id"])
        shop_product_embeddings.append(product["embedding"])

    # Convert shop embeddings to a 2-D NumPy array (one row per product)
    shop_product_embeddings = np.array(shop_product_embeddings)
    shop_product_embeddings = shop_product_embeddings.reshape(
        shop_product_embeddings.shape[0], -1
    )

    # Reduce dimensionality with the autoencoder (512 -> 128)
    shop_product_embeddings_reduced = (
        autoencoder.encoder(torch.tensor(shop_product_embeddings).float())
        .detach()
        .numpy()
    )

    # Reshape similar_product_embedding to (1, 128)
    similar_product_embedding = similar_product_embedding.reshape(1, -1)

    print(f"Similar product embedding dimension: {similar_product_embedding.shape}")
    print(f"Shop product embedding dimension: {shop_product_embeddings_reduced.shape}")

    # Cosine similarity
    similarities = calculate_similarity(
        similar_product_embedding, shop_product_embeddings_reduced
    )
    most_similar_index = np.argmax(similarities)
    return shop_product_ids[most_similar_index]
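# End-to-end usage sketch (hedged): "123" is a hypothetical user_id, and this
# assumes the MongoDB collections above are reachable and product_model.pth
# exists. Guarded so importing this module has no side effects.
if __name__ == "__main__":
    product_model = load_trained_model("product_model.pth")
    anchor_name, anchor_embedding = find_most_similar_anchor("123", product_model)
    product_name, product_embedding = find_most_similar_product(
        anchor_embedding, product_model
    )
    recommended_id = recommend_shop_product(product_embedding)
    print(f"Recommended shop product_id: {recommended_id}")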