File size: 19,315 Bytes

b321188

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "46689feb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bb5f4eed",
   "metadata": {},
   "outputs": [],
   "source": [
    "matedata = pd.read_json(\"meta_Electronics.json.gz\", lines=True, compression=\"gzip\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbfd4fdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "matedata.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c28a3490",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata_asin_image = matedata[[\"asin\", \"imageURLHighRes\"]]\n",
    "metadata_asin_image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ae58599",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the list to the image url\n",
    "metadata_asin_image.loc[:, \"url\"] = metadata_asin_image[\"imageURLHighRes\"].apply(\n",
    "    lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dcfbafb",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata_asin_image[[\"asin\", \"url\"]].to_csv(\n",
    "    \"metadata_asin_image.csv\",\n",
    "    index=False,\n",
    "    mode=\"w\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4010aa2c",
   "metadata": {},
   "source": [
    "# Start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13615a01",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "metadata_asin_image=pd.read_csv(\"metadata_asin_image.csv\")\n",
    "metadata_asin_image.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2295fcf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata_asin_image.iloc[0]['asin']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70068796",
   "metadata": {},
   "outputs": [],
   "source": [
    "import validators\n",
    "import requests\n",
    "import os\n",
    "import time\n",
    "\n",
    "for _, line in metadata_asin_image.iterrows():\n",
    "    asin = line[\"asin\"]\n",
    "    image_url = line[\"url\"]\n",
    "\n",
    "    if not validators.url(image_url):\n",
    "        print(f\"Invalid Image URL for ASIN: {asin}, URL: {image_url}\")\n",
    "        continue\n",
    "\n",
    "    # Download the image to \"./imgs/\" directory\n",
    "    image_path = f\"./imgs/{asin}.jpg\"\n",
    "    # Check if the image already exists\n",
    "    if os.path.exists(image_path):\n",
    "        print(f\"Image already exists: {image_path}\")\n",
    "        continue\n",
    "    \n",
    "    # Add retry logic for downloading images\n",
    "    max_retries = 3\n",
    "    retry_delay = 1  # seconds\n",
    "    for attempt in range(max_retries):\n",
    "        try:\n",
    "            time.sleep(0.05)\n",
    "            response = requests.get(image_url, timeout=10)\n",
    "            response.raise_for_status()  # Raise exception for HTTP errors\n",
    "            with open(image_path, \"wb\") as f:\n",
    "                f.write(response.content)\n",
    "            break\n",
    "        except (requests.exceptions.RequestException, IOError) as e:\n",
    "            if attempt < max_retries - 1:\n",
    "                print(\n",
    "                    f\"Error downloading image from {image_url}: {e}. Retrying in {retry_delay}s... (Attempt {attempt + 1}/{max_retries})\"\n",
    "                )\n",
    "                time.sleep(retry_delay)\n",
    "                retry_delay *= 2  # Exponential backoff\n",
    "            else:\n",
    "                print(f\"Failed to download image after {max_retries} attempts: {e}\")\n",
    "                continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33e2bd83",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import asyncio\n",
    "import random\n",
    "import aiohttp\n",
    "import validators\n",
    "import pandas as pd\n",
    "from tqdm import tqdm  # import the progress bar package\n",
    "\n",
    "async def download_image(line, session, sema):\n",
    "    \"\"\"\n",
    "    异步下载单个图片，并保存到指定路径中，同时加入重试机制。\n",
    "    \"\"\"\n",
    "    async with sema:\n",
    "        asin = line[\"asin\"]\n",
    "        image_url = line[\"url\"]\n",
    "\n",
    "        # 校验 URL 是否有效\n",
    "        if not validators.url(image_url):\n",
    "            print(f\"Invalid Image URL for ASIN: {asin}, URL: {image_url}\")\n",
    "            return\n",
    "\n",
    "        # 构造图片保存路径\n",
    "        image_path = f\"./imgs/{asin}.jpg\"\n",
    "        if os.path.exists(image_path):\n",
    "            print(f\"Image already exists: {image_path}\")\n",
    "            return\n",
    "\n",
    "        max_retries = 3\n",
    "        retry_delay = 1  # 重试延时（秒）\n",
    "\n",
    "        for attempt in range(max_retries):\n",
    "            try:\n",
    "                # 添加一个短暂的异步延时\n",
    "                await asyncio.sleep(0.05 * random.random())\n",
    "                # 发起 GET 请求下载图片\n",
    "                async with session.get(image_url, timeout=10) as response:\n",
    "                    if response.status != 200:\n",
    "                        raise aiohttp.ClientResponseError(\n",
    "                            request_info=response.request_info,\n",
    "                            history=response.history,\n",
    "                            status=response.status,\n",
    "                            message=\"Non 200 response\",\n",
    "                            headers=response.headers\n",
    "                        )\n",
    "                    content = await response.read()\n",
    "                    with open(image_path, \"wb\") as f:\n",
    "                        f.write(content)\n",
    "                    print(f\"Downloaded image for ASIN: {asin}\")\n",
    "                    break\n",
    "            except Exception as e:\n",
    "                if attempt < max_retries - 1:\n",
    "                    print(f\"Error downloading image from {image_url}: {e}. Retrying in {retry_delay}s... (Attempt {attempt + 1}/{max_retries})\")\n",
    "                    await asyncio.sleep(retry_delay)\n",
    "                    retry_delay *= 2\n",
    "                else:\n",
    "                    print(f\"Failed to download image after {max_retries} attempts: {e}\")\n",
    "\n",
    "async def main():\n",
    "    sema = asyncio.Semaphore(10)\n",
    "    async with aiohttp.ClientSession() as session:\n",
    "        tasks = [download_image(line, session, sema) for _, line in metadata_asin_image.iterrows()]\n",
    "        # Use asyncio.as_completed to update the progress bar each time a task completes.\n",
    "        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc=\"Downloading images\"):\n",
    "            await coro\n",
    "\n",
    "await main()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "575f4549",
   "metadata": {},
   "source": [
    "# Handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1527a782",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from PIL import Image\n",
    "import timm\n",
    "from sklearn.preprocessing import normalize\n",
    "from timm.data import resolve_data_config\n",
    "from timm.data.transforms_factory import create_transform\n",
    "import requests\n",
    "from io import BytesIO\n",
    "import numpy as np\n",
    "import time\n",
    "import os\n",
    "\n",
    "\n",
    "class FeatureExtractor:\n",
    "    def __init__(self, modelname=\"vit_base_patch16_224\"):\n",
    "        # Load the pre-trained model\n",
    "        self.model = timm.create_model(\n",
    "            model_name=modelname, pretrained=True, num_classes=0, global_pool=\"avg\"\n",
    "        )\n",
    "        self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "        self.model = self.model.to(self.device)\n",
    "        self.model.eval()\n",
    "\n",
    "        # Get the input size required by the model\n",
    "        self.input_size = self.model.default_cfg[\"input_size\"]\n",
    "\n",
    "        config = resolve_data_config({}, model=modelname)\n",
    "        # Get the preprocessing function provided by TIMM for the model\n",
    "        self.preprocess = create_transform(**config)\n",
    "        print(f\"Model {modelname} loaded with input size: {self.input_size}\")\n",
    "\n",
    "    def __call__(self, imagepath):\n",
    "        # Preprocess the input image\n",
    "        assert isinstance(imagepath, str), \"Image path must be a string\"\n",
    "        assert len(imagepath) > 0, \"Image path cannot be empty\"\n",
    "\n",
    "        if not os.path.isfile(imagepath):\n",
    "            raise FileNotFoundError(f\"File not found: {imagepath}\")\n",
    "        \n",
    "        try:\n",
    "            # Open the image file\n",
    "            input_image = Image.open(imagepath).convert(\"RGB\")\n",
    "        except Exception as e:\n",
    "            raise IOError(f\"!!! Error opening image file {imagepath}: {e}\")\n",
    "        \n",
    "        input_image = Image.open(imagepath).convert(\"RGB\")\n",
    "\n",
    "        input_image = self.preprocess(input_image)\n",
    "\n",
    "        # Convert the image to a PyTorch tensor and add a batch dimension\n",
    "        input_tensor = input_image.unsqueeze(0)\n",
    "\n",
    "        with torch.no_grad():\n",
    "            forward_vector = self.model(input_tensor.to(self.device))\n",
    "        # Get the feature vector\n",
    "        # feature_vector = forward_vector.data.cpu().numpy()\n",
    "\n",
    "        # return feature_vector\n",
    "        return normalize(forward_vector.reshape(1, -1).cpu(), norm=\"l2\").flatten()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33f9175f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymilvus import MilvusClient, DataType\n",
    "\n",
    "client = MilvusClient(uri=\"./Amazon_electronics.db\")\n",
    "# if client.has_collection(collection_name=\"image_embeddings\"):\n",
    "#     client.drop_collection(collection_name=\"image_embeddings\")\n",
    "schema = MilvusClient.create_schema()\n",
    "schema.add_field(field_name=\"asin\", datatype=DataType.VARCHAR, max_length=64, is_primary=True)\n",
    "schema.add_field(field_name=\"embedding\", datatype=DataType.FLOAT_VECTOR, dim=768)\n",
    "client.create_collection(collection_name=\"image_embeddings\", schema=schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3751162",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check image open is ok\n",
    "for _, line in metadata_asin_image.iterrows():\n",
    "    asin = line[\"asin\"]\n",
    "    image_path = f\"./imgs/{asin}.jpg\"\n",
    "\n",
    "    #try open image\n",
    "    try:\n",
    "        if os.path.exists(image_path):\n",
    "            Image.open(image_path).convert(\"RGB\")\n",
    "            continue\n",
    "    except Exception as e:\n",
    "        raise IOError(f\"!!! Error opening image file {image_path}: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20253eb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 初始化特征提取器\n",
    "extractor = FeatureExtractor()\n",
    "\n",
    "\n",
    "def process_images(filepaths, batch_size=32):\n",
    "    \"\"\"\n",
    "    批量处理图片，充分利用 GPU 资源。\n",
    "    \"\"\"\n",
    "    for i in range(0, len(filepaths), batch_size):\n",
    "        batch = filepaths[i : i + batch_size]\n",
    "        data=[]\n",
    "        for filepath in batch:\n",
    "            asin = os.path.basename(filepath).split(\".\")[0]\n",
    "            image_embedding = extractor(filepath)\n",
    "            data.append({\n",
    "                \"asin\": asin,\n",
    "                \"embedding\": image_embedding\n",
    "            })\n",
    "        # 将数据插入到 Milvus 中\n",
    "        client.insert(collection_name=\"image_embeddings\", data=data)\n",
    "\n",
    "\n",
    "# 获取所有图片路径\n",
    "root = \"./imgs\"\n",
    "filepaths = [\n",
    "    os.path.join(dirpath, filename)\n",
    "    for dirpath, _, filenames in os.walk(root)\n",
    "    for filename in filenames\n",
    "    if filename.endswith(\".jpg\")\n",
    "]\n",
    "\n",
    "# 使用多线程处理图片\n",
    "chunks = [filepaths[i : i + 32] for i in range(0, len(filepaths), 32)]\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    list(\n",
    "        tqdm(\n",
    "            executor.map(process_images, chunks),\n",
    "            total=len(chunks),\n",
    "            desc=\"Processing images\",\n",
    "        )\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bac1432",
   "metadata": {},
   "source": [
    "# Cal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d76299a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymilvus import MilvusClient, DataType\n",
    "from IPython.display import display\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa40682e",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = MilvusClient(uri=\"./Amazon_electronics.db\")\n",
    "extractor = FeatureExtractor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbbd0987",
   "metadata": {},
   "outputs": [],
   "source": [
    "index_params = MilvusClient.prepare_index_params()\n",
    "\n",
    "index_params.add_index(\n",
    "    field_name=\"embedding\",\n",
    "    metric_type=\"COSINE\",\n",
    "    index_type=\"IVF_FLAT\",\n",
    "    index_name=\"embedding_index\",\n",
    "    params={\"nlist\": 128}\n",
    ")\n",
    "\n",
    "client.create_index(\n",
    "    collection_name=\"image_embeddings\",\n",
    "    index_params=index_params,\n",
    "    sync=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b40d702",
   "metadata": {},
   "outputs": [],
   "source": [
    "res = client.list_indexes(\n",
    "    collection_name=\"image_embeddings\"\n",
    ")\n",
    "\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37d4f0e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "query_image = \"imgs/0043396828.jpg\"\n",
    "display(Image.open(query_image))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "117f4a60",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = client.search(\n",
    "    collection_name=\"image_embeddings\",\n",
    "    data=[extractor(query_image)],\n",
    "    output_fields=[\"asin\"],\n",
    "    search_params={\"metric_type\": \"COSINE\"},\n",
    "    limit=20,\n",
    ")\n",
    "\n",
    "for result in results[0]:\n",
    "    print(f\"ASIN: {result['entity'].get('asin')}, Score: {result['distance']}\")\n",
    "    # Display the image\n",
    "    display(Image.open(f\"./imgs/{result['entity'].get('asin')}.jpg\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01d164f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymilvus import MilvusClient\n",
    "import numpy as np\n",
    "\n",
    "client = MilvusClient(uri=\"./Amazon_electronics.db\")\n",
    "collection_name = \"image_embeddings\"\n",
    "\n",
    "# milvus_cache 用于缓存每个asin的embedding，避免重复查询\n",
    "milvus_cache = {}\n",
    "\n",
    "def get_embedding(asin, client, collection_name, milvus_cache):\n",
    "    \"\"\"\n",
    "    从缓存中获取或通过 Milvus 查询指定asin的embedding向量\n",
    "    \"\"\"\n",
    "    if asin in milvus_cache:\n",
    "        # 已缓存，则直接返回\n",
    "        return milvus_cache[asin]\n",
    "    \n",
    "    # 使用 Milvus 的 query 接口，按主键查询当前 asin 的记录\n",
    "    # 注意：根据你使用的 Milvus SDK，查询语法可能稍有不同，请参考官方文档\n",
    "    query_expr = f\"asin == '{asin}'\"\n",
    "    results = client.query(collection_name=collection_name, filter=query_expr, output_fields=[\"embedding\"])\n",
    "    \n",
    "    if results and len(results) > 0:\n",
    "        embedding = results[0][\"embedding\"]\n",
    "        milvus_cache[asin] = embedding  # 缓存起来\n",
    "        return embedding\n",
    "    else:\n",
    "        return None\n",
    "\n",
    "def cosine_similarity(vec1, vec2):\n",
    "    \"\"\"\n",
    "    计算两个向量之间的余弦相似度\n",
    "    \"\"\"\n",
    "    vec1 = np.array(vec1)\n",
    "    vec2 = np.array(vec2)\n",
    "    norm1 = np.linalg.norm(vec1)\n",
    "    norm2 = np.linalg.norm(vec2)\n",
    "    if norm1 == 0 or norm2 == 0:\n",
    "        return 0.0\n",
    "    return np.dot(vec1, vec2) / (norm1 * norm2)\n",
    "\n",
    "def get_asin_similarity(asin1, asin2, client, collection_name, milvus_cache):\n",
    "    \"\"\"\n",
    "    计算两个 asin 之间基于图片 embedding 的相似度：\n",
    "     1. 如果对应asin的向量存在于缓存中，直接使用；\n",
    "     2. 否则通过 Milvus 查询拿到 embedding 向量；\n",
    "     3. 使用余弦相似度计算两者的相似度。\n",
    "    \"\"\"\n",
    "    emb1 = get_embedding(asin1, client, collection_name, milvus_cache)\n",
    "    emb2 = get_embedding(asin2, client, collection_name, milvus_cache)\n",
    "    \n",
    "    if emb1 is None or emb2 is None:\n",
    "        print(f\"无法获取商品 {asin1 if emb1 is None else asin2} 的 embedding\")\n",
    "        return 0.0\n",
    "    return cosine_similarity(emb1, emb2)\n",
    "\n",
    "# 使用示例\n",
    "asin_a = \"059403390X\"   # 示例 asin\n",
    "asin_b = \"146476560X\"   # 示例 asin\n",
    "\n",
    "sim_score = get_asin_similarity(asin_a, asin_b, client, collection_name, milvus_cache)\n",
    "print(f\"商品 {asin_a} 和商品 {asin_b} 的图片相似度为: {sim_score:.4f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv (3.12.3)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}