codeShare
/

JupyterNotebooks

Safetensors

Model card Files Files and versions Community

codeShare committed on Sep 14, 2024

Commit

fd7fc65

verified ·

1 Parent(s): a070d20

Upload sd_token_similarity_calculator.ipynb

Browse files

Files changed (1) hide show

sd_token_similarity_calculator.ipynb +120 -52

sd_token_similarity_calculator.ipynb CHANGED Viewed

@@ -3,8 +3,7 @@
   "nbformat_minor": 0,
   "metadata": {
     "colab": {
-      "provenance": [],
-      "gpuType": "T4"
     },
     "kernelspec": {
       "name": "python3",
@@ -12,8 +11,7 @@
     },
     "language_info": {
       "name": "python"
-    },
-    "accelerator": "GPU"
   },
   "cells": [
     {
@@ -155,30 +153,115 @@
         "#print(get_token(35894))\n"
       ],
       "metadata": {
-        "id": "Ch9puvwKH1s3",
-        "collapsed": true,
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "outputId": "42e8d455-ca0a-4c78-dba7-a32d9dee9b41"
       },
-      "execution_count": 1,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Cloning into 'sd_tokens'...\n",
-            "remote: Enumerating objects: 99, done.\u001b[K\n",
-            "remote: Counting objects: 100% (96/96), done.\u001b[K\n",
-            "remote: Compressing objects: 100% (96/96), done.\u001b[K\n",
-            "remote: Total 99 (delta 34), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
-            "Unpacking objects: 100% (99/99), 1.35 MiB | 1.60 MiB/s, done.\n",
-            "Filtering content: 100% (22/22), 2.47 GiB | 36.54 MiB/s, done.\n",
-            "/content/sd_tokens\n"
-          ]
-        }
-      ]
     },
     {
       "cell_type": "code",
@@ -1097,27 +1180,14 @@
     {
       "cell_type": "code",
       "source": [
         "\n",
-        "# @title Import text-to-image-prompts .json files\n",
-        "%cd /content/\n",
-        "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
-        "\n",
-        "#Initialize\n",
         "import os\n",
-        "def my_mkdirs(folder):\n",
-        "  if os.path.exists(folder)==False:\n",
-        "    os.makedirs(folder)"
-      ],
-      "metadata": {
-        "id": "Qy51FFu8aVNA"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title Make your own text_encodings .db file for later use\n",
         "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
@@ -1125,20 +1195,18 @@
         "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
         "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
         "\n",
-        "#Import the vocab.json\n",
-        "import json\n",
-        "import pandas as pd\n",
         "\n",
         "my_mkdirs('/content/text_encodings/')\n",
         "filename = ''\n",
         "\n",
-        "for  file_index in range(34):\n",
         "  if file_index <1: continue\n",
         "  filename = f'🦜 fusion-t2i-prompt-features-{file_index}'\n",
         "  #🦜 fusion-t2i-prompt-features-1.json\n",
         "\n",
         "  # Read suffix.json\n",
-        "  %cd /content/text-to-image-prompts/civitai-prompts/green/\n",
         "  with open(filename + '.json', 'r') as f:\n",
         "      data = json.load(f)\n",
         "  _df = pd.DataFrame({'count': data})['count']\n",
@@ -1153,7 +1221,7 @@
         "  %cd /content/text_encodings/\n",
         "  import shelve\n",
         "  d = shelve.open(filename)\n",
-        "  for index in range(NUM_ITEMS):\n",
         "    inputs = tokenizer(text = '' + prompts[f'{index}'], padding=True, return_tensors=\"pt\").to(device)\n",
         "    text_features = model.get_text_features(**inputs).to(device)\n",
         "    text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True).to(device)\n",

   "nbformat_minor": 0,
   "metadata": {
     "colab": {
+      "provenance": []
     },
     "kernelspec": {
       "name": "python3",
     },
     "language_info": {
       "name": "python"
+    }
   },
   "cells": [
     {
         "#print(get_token(35894))\n"
       ],
       "metadata": {
+        "id": "w8O0TX7PBh5m"
       },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title Load/initialize values (new version - ignore this cell)\n",
+        "#Imports\n",
+        "import json , os , shelve , torch\n",
+        "import pandas as pd\n",
+        "#----#\n",
+        "\n",
+        "def my_mkdirs(folder):\n",
+        "  '''Create `folder`, including missing parent folders, unless the path already exists.'''\n",
+        "  if not os.path.exists(folder):\n",
+        "    os.makedirs(folder)\n",
+        "\n",
+        "def _modulus(_id,id_max):\n",
+        "  '''Wrap `_id` into the range (0, id_max] by removing whole multiples of `id_max`.\n",
+        "\n",
+        "  Values that are already <= id_max (zero and negatives included) are\n",
+        "  returned unchanged, so an exact multiple of id_max maps to id_max, not 0.\n",
+        "  Raises ValueError when wrapping is needed but id_max <= 0, because\n",
+        "  repeated subtraction of a non-positive id_max would never terminate.\n",
+        "  '''\n",
+        "  if _id <= id_max:\n",
+        "    return _id\n",
+        "  if id_max <= 0:\n",
+        "    raise ValueError(f'id_max must be positive to wrap {_id}, got {id_max}')\n",
+        "  #Same result as subtracting id_max until the value fits, in constant time\n",
+        "  remainder = _id % id_max\n",
+        "  return remainder if remainder else id_max\n",
+        "\n",
+        "def getPrompts(_path):\n",
+        "  '''Load every prompt file under `_path`/text together with its stored text encodings.\n",
+        "\n",
+        "  Each JSON file is read as a mapping whose keys are parsed with int():\n",
+        "  key 0 is a header holding an integer (kept as _NUM_ITEMS, presumably the\n",
+        "  item count), key 1 is a header holding the name of the shelve file inside\n",
+        "  `_path`/text_encodings, and every later key is a prompt.\n",
+        "  Entries of all files are renumbered into one running index.\n",
+        "\n",
+        "  Returns (prompts, text_encodings, RANGE), where RANGE is the number of\n",
+        "  entries indexed across all files, header entries included.\n",
+        "\n",
+        "  Side effect: the %cd magics leave the working directory changed.\n",
+        "  '''\n",
+        "  path = _path + '/text'\n",
+        "  path_enc = _path + '/text_encodings'\n",
+        "  #-----#\n",
+        "  index = 0\n",
+        "  prompts = {}\n",
+        "  text_encodings = {}\n",
+        "  _text_encodings = {}\n",
+        "  #-----#\n",
+        "  for filename in os.listdir(f'{path}'):\n",
+        "\n",
+        "    print(f'reading {filename}....')\n",
+        "    %cd {path}\n",
+        "    with open(f'{filename}', 'r') as f:\n",
+        "      data = json.load(f)\n",
+        "    #------#\n",
+        "    _df = pd.DataFrame({'count': data})['count']\n",
+        "    _prompts = {\n",
+        "        key : value for key, value in _df.items()\n",
+        "    }\n",
+        "    for key in _prompts:\n",
+        "      _index = int(key)\n",
+        "      value = _prompts[key]\n",
+        "\n",
+        "      #Header entry 0 : an integer value, parsed but not used further here\n",
+        "      if _index <= 0 :\n",
+        "        _NUM_ITEMS = int(value)\n",
+        "        index = index + 1\n",
+        "        continue\n",
+        "      #Header entry 1 : name of the shelve file holding this file's encodings\n",
+        "      if _index <= 1 :\n",
+        "        _file_name = f'{value}'\n",
+        "        %cd {path_enc}\n",
+        "        _text_encodings = shelve.open(_file_name)\n",
+        "        #Store text_encodings for the header items\n",
+        "        text_encodings[f'{index-1}'] = _text_encodings[f'{_index-1}']\n",
+        "        text_encodings[f'{index}'] = _text_encodings[f'{_index}']\n",
+        "        #------#\n",
+        "        index = index + 1\n",
+        "        continue\n",
+        "      #------#\n",
+        "      #Read the text_encodings + prompts\n",
+        "      text_encodings[f'{index}'] = _text_encodings[f'{_index}']\n",
+        "      prompts[f'{index}'] = _prompts[f'{_index}']\n",
+        "      index = index + 1\n",
+        "      #-------#\n",
+        "    #--------#\n",
+        "    _text_encodings.close() #close the text_encodings file\n",
+        "  #----------#\n",
+        "  #Return the running total; the old return used an undefined name (NameError)\n",
+        "  RANGE = index\n",
+        "  return prompts , text_encodings , RANGE\n",
+        "  #--------#\n",
+        "\n",
+        "#for key in prompts:\n",
+        "#  value = prompts[key]\n",
+        "#  if int(key)>=1000:break\n",
+        "#  print(value)\n",
+        "#------#\n"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "rUXQ73IbonHY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title  Load the tokens into the colab (new version - ignore this cell)\n",
+        "%cd /content/\n",
+        "import os\n",
+        "#Clone only once, so re-running this cell does not fail on an already existing folder\n",
+        "if not os.path.isdir('/content/text-to-image-prompts'):\n",
+        "  !git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
+        "#------#\n",
+        "path = '/content/text-to-image-prompts/civitai-prompts/green'\n",
+        "prompts , text_encodings, RANGE = getPrompts(path)"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "ZMG4CThUAmwW"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
     {
       "cell_type": "code",
       "source": [
+        "# @title Make your own text_encodings .db file for later use (using GPU is recommended)\n",
         "\n",
+        "import json\n",
+        "import pandas as pd\n",
         "import os\n",
+        "import shelve\n",
+        "import torch\n",
+        "\n",
         "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
         "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
         "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
         "\n",
+        "%cd /content/\n",
         "\n",
         "my_mkdirs('/content/text_encodings/')\n",
         "filename = ''\n",
         "\n",
+        "for  file_index in range(34 + 1):\n",
         "  if file_index <1: continue\n",
         "  filename = f'🦜 fusion-t2i-prompt-features-{file_index}'\n",
         "  #🦜 fusion-t2i-prompt-features-1.json\n",
         "\n",
         "  # Read suffix.json\n",
+        "  %cd /content/text-to-image-prompts/civitai-prompts/green/text\n",
         "  with open(filename + '.json', 'r') as f:\n",
         "      data = json.load(f)\n",
         "  _df = pd.DataFrame({'count': data})['count']\n",
         "  %cd /content/text_encodings/\n",
         "  import shelve\n",
         "  d = shelve.open(filename)\n",
+        "  for index in range(NUM_ITEMS + 1):\n",
         "    inputs = tokenizer(text = '' + prompts[f'{index}'], padding=True, return_tensors=\"pt\").to(device)\n",
         "    text_features = model.get_text_features(**inputs).to(device)\n",
         "    text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True).to(device)\n",