|
|
|
import pandas as pd |
|
import numpy as np |
|
import ast |
|
from pymilvus import MilvusClient |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_rating_similarity(csv_path):
    """Load precomputed item-to-item rating similarities from a CSV file.

    The CSV is expected to contain two columns, ``asin`` and
    ``top50_similar``. Each ``top50_similar`` cell holds the string form of
    a Python list whose elements are ``(asin, sim_score)`` pairs.

    Loading is best-effort per row: a row whose ``top50_similar`` cell is
    missing, is not a valid Python literal, or does not contain indexable
    ``(asin, sim_score)`` items is kept with an empty neighbour mapping
    instead of aborting the whole load.

    Args:
        csv_path: Location of the similarity CSV, passed to
            ``pandas.read_csv``.

    Returns:
        dict: ``{asin: {other_asin: sim_score, ...}}``.
    """
    df = pd.read_csv(csv_path)

    rating_sim = {}
    # Walk the two columns directly; ``DataFrame.iterrows`` would build a
    # throwaway Series for every row.
    for asin, raw_neighbours in zip(df["asin"], df["top50_similar"]):
        try:
            # ``literal_eval`` only accepts Python literals, so the cell is
            # parsed without executing arbitrary code.
            neighbours = ast.literal_eval(raw_neighbours)
            rating_sim[asin] = {item[0]: item[1] for item in neighbours}
        except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
            # Unparseable or malformed cell (including NaN for an empty
            # cell): treat the item as having no known neighbours.
            rating_sim[asin] = {}
    return rating_sim
|
|
|
|
|
|
|
|
|
def _query_embedding_similarity(client, collection_name, asin, top_k):
    """Find the items whose stored embedding is closest to that of ``asin``.

    The embedding stored for ``asin`` in ``collection_name`` is fetched
    first and then used as the query vector of a COSINE search over the
    same collection, excluding ``asin`` itself.

    Args:
        client: Object exposing ``query`` and ``search`` with the keyword
            arguments used below (a ``pymilvus.MilvusClient`` in this file).
        collection_name: Name of the collection to query and search.
        asin: Identifier of the item to find neighbours for.
        top_k: Maximum number of neighbours to request.

    Returns:
        dict: ``{neighbour_asin: sim_score}``; empty when ``asin`` has no
        stored embedding or the search returns nothing.
    """
    stored = client.query(
        collection_name=collection_name,
        filter=f"asin == '{asin}'",
        output_fields=["embedding"],
    )
    if not stored:
        return {}

    search_results = client.search(
        collection_name=collection_name,
        data=[stored[0]["embedding"]],
        anns_field="embedding",
        search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=top_k,
        filter=f"asin != '{asin}'",
        # Ask for the asin explicitly so it is present in each hit's entity
        # whether or not asin is the collection's primary key.
        output_fields=["asin"],
    )
    if not search_results:
        return {}

    sim_dict = {}
    # One query vector was sent, so only the first result list is relevant.
    for hit in search_results[0]:
        entity = hit.get("entity") or {}
        # Fall back to the hit id for collections keyed directly by asin.
        neighbour = entity.get("asin", hit.get("id"))
        if neighbour is None:
            continue
        # Per the Milvus metric documentation, the "distance" reported for
        # the COSINE metric is the cosine similarity itself (larger means
        # more similar), so it is used as the score without inversion.
        sim_dict[neighbour] = hit["distance"]
    return sim_dict


def query_image_similarity(client, asin, top_k=50):
    """Return image-based neighbours of ``asin`` from Milvus.

    Looks up the image embedding of ``asin`` in the ``image_embeddings``
    collection and searches that collection for the most similar items.

    Args:
        client: Milvus client used for the lookup and the search.
        asin: Identifier of the item to find neighbours for.
        top_k: Maximum number of neighbours to return.

    Returns:
        dict: ``{asin: sim_score}`` where ``sim_score`` is the cosine
        similarity reported by the COSINE search.
    """
    return _query_embedding_similarity(client, "image_embeddings", asin, top_k)


def query_desc_similarity(client, asin, top_k=50):
    """Return description-based neighbours of ``asin`` from Milvus.

    Looks up the description embedding of ``asin`` in the
    ``metadata_embeddings`` collection and searches that collection for the
    most similar items.

    Args:
        client: Milvus client used for the lookup and the search.
        asin: Identifier of the item to find neighbours for.
        top_k: Maximum number of neighbours to return.

    Returns:
        dict: ``{asin: sim_score}`` where ``sim_score`` is the cosine
        similarity reported by the COSINE search.
    """
    return _query_embedding_similarity(client, "metadata_embeddings", asin, top_k)
|
|
|
|
|
def query_milvus_similarity(client, asin, similarity_type="image", top_k=50):
    """Route a Milvus similarity lookup to the handler for one modality.

    Args:
        client: Milvus client forwarded to the selected handler.
        asin: Identifier of the item to find neighbours for.
        similarity_type: ``"image"`` for image-based neighbours or
            ``"description"`` for description-based neighbours.
        top_k: Maximum number of neighbours, forwarded to the handler.

    Returns:
        Whatever the selected handler returns; an empty dict when
        ``similarity_type`` is neither of the two supported values.
    """
    if similarity_type == "description":
        return query_desc_similarity(client, asin, top_k)
    if similarity_type != "image":
        # Unsupported modality: report no neighbours.
        return {}
    return query_image_similarity(client, asin, top_k)
|
|
|
|
|
|
|
|
|
def get_hybrid_similarity(
    asin1, asin2, rating_sim_dict, weights, client, milvus_cache=None
):
    """Compute the weighted hybrid similarity of ``asin2`` to ``asin1``.

    The score is the weighted sum of three components, each defaulting to 0
    when ``asin2`` is not among the known neighbours of ``asin1``:

    * ``rating``: looked up in ``rating_sim_dict[asin1]``;
    * ``image`` and ``description``: looked up in the neighbour mapping
      returned by ``query_milvus_similarity`` for ``asin1``.

    A Milvus-backed component is only fetched when its weight is non-zero,
    so a configuration that disables a component costs no query for it.

    Args:
        asin1: Item whose neighbour lists are consulted.
        asin2: Item whose similarity to ``asin1`` is wanted.
        rating_sim_dict: ``{asin: {other_asin: sim_score}}`` rating
            similarities.
        weights: Component weights, e.g.
            ``{"rating": 0.6, "image": 0.2, "description": 0.2}``; a missing
            key counts as weight 0.
        client: Milvus client forwarded to ``query_milvus_similarity``.
        milvus_cache: Optional dict keyed by ``(asin, similarity_type)``
            holding neighbour mappings already fetched. It is updated in
            place; when omitted, a cache local to this call is used.

    Returns:
        The hybrid similarity score.
    """
    if milvus_cache is None:
        milvus_cache = {}

    rating_score = rating_sim_dict.get(asin1, {}).get(asin2, 0)
    hybrid_score = weights.get("rating", 0) * rating_score

    # Same accumulation order as rating + image + description.
    for similarity_type in ("image", "description"):
        weight = weights.get(similarity_type, 0)
        if not weight:
            # A zero weight contributes nothing to the sum, so skip the
            # lookup (and the Milvus round trip behind it) entirely.
            continue
        cache_key = (asin1, similarity_type)
        if cache_key not in milvus_cache:
            milvus_cache[cache_key] = query_milvus_similarity(
                client, asin1, similarity_type=similarity_type, top_k=50
            )
        hybrid_score += weight * milvus_cache[cache_key].get(asin2, 0)

    return hybrid_score
|
|
|
|
|
|
|
|
|
def recommend_for_user(
    user_id,
    user_rating_df,
    rating_sim_dict,
    weights,
    client,
    milvus_cache=None,
    top_n=10,
):
    """Recommend unrated items to a user by average hybrid similarity.

    Every key of ``rating_sim_dict`` that the user has not rated is a
    candidate. A candidate's score is the mean of
    ``get_hybrid_similarity(rated, candidate, ...)`` over all items the user
    has rated, or 0 when the user has no rated items.

    Args:
        user_id: Identifier matched against the ``user_id`` column.
        user_rating_df: DataFrame with at least ``user_id`` and ``asin``
            columns.
        rating_sim_dict: Precomputed rating similarities,
            ``{asin: {other_asin: sim_score}}``.
        weights: Component weights forwarded to ``get_hybrid_similarity``.
        client: Milvus client forwarded to ``get_hybrid_similarity``.
        milvus_cache: Optional cache dict shared with
            ``get_hybrid_similarity``; created here when omitted.
        top_n: Maximum number of recommendations to return.

    Returns:
        list: Up to ``top_n`` ``(asin, score)`` tuples, highest score first;
        equal scores are ordered by the string form of the asin.
    """
    if milvus_cache is None:
        # Create the cache once here. Passing None through would make every
        # get_hybrid_similarity call start from its own empty cache and
        # repeat the same Milvus lookups for each candidate.
        milvus_cache = {}

    user_rows = user_rating_df[user_rating_df["user_id"] == user_id]
    rated_items = set(user_rows["asin"])
    candidate_items = set(rating_sim_dict) - rated_items

    scores = {}
    for candidate in candidate_items:
        total_score = 0
        for rated in rated_items:
            total_score += get_hybrid_similarity(
                rated, candidate, rating_sim_dict, weights, client, milvus_cache
            )
        scores[candidate] = total_score / len(rated_items) if rated_items else 0

    # Break ties on the asin so the result does not depend on set iteration
    # order, which can differ from one run to the next.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], str(item[0])))
    # Clamp so a negative top_n yields nothing instead of slicing off the end.
    return ranked[: max(top_n, 0)]
|
|
|
|
|
|
|
|
|
|
|
|
|
# User rating history; recommend_for_user reads its user_id and asin columns.
user_rating_df = pd.read_csv("ratings.csv")

# Precomputed item-to-item rating similarities.
rating_sim_dict = compute_rating_similarity("items_similar.csv")

# Relative weight of each similarity component in the hybrid score.
weights = {"rating": 0.6, "image": 0.2, "description": 0.2}

# Milvus client opened on a local database file.
client = MilvusClient(uri="./Amazon_electronics.db")

# Shared across all similarity lookups so each (asin, modality) pair is
# fetched from Milvus at most once.
milvus_cache = {}

target_user = "A192HO2ICJ75VU"
try:
    recommendations = recommend_for_user(
        target_user,
        user_rating_df,
        rating_sim_dict,
        weights,
        client,
        milvus_cache,
        top_n=10,
    )
finally:
    # Release the Milvus connection even if scoring fails.
    client.close()

print(f"为用户 {target_user} 推荐的商品列表:")
for asin, score in recommendations:
    print(f"ASIN: {asin},得分: {score:.4f}")
|
|
|
|
|
|