codeShare committed on
Commit
fd7fc65
·
verified ·
1 Parent(s): a070d20

Upload sd_token_similarity_calculator.ipynb

Browse files
Files changed (1) hide show
  1. sd_token_similarity_calculator.ipynb +120 -52
sd_token_similarity_calculator.ipynb CHANGED
@@ -3,8 +3,7 @@
3
  "nbformat_minor": 0,
4
  "metadata": {
5
  "colab": {
6
- "provenance": [],
7
- "gpuType": "T4"
8
  },
9
  "kernelspec": {
10
  "name": "python3",
@@ -12,8 +11,7 @@
12
  },
13
  "language_info": {
14
  "name": "python"
15
- },
16
- "accelerator": "GPU"
17
  },
18
  "cells": [
19
  {
@@ -155,30 +153,115 @@
155
  "#print(get_token(35894))\n"
156
  ],
157
  "metadata": {
158
- "id": "Ch9puvwKH1s3",
159
- "collapsed": true,
160
- "colab": {
161
- "base_uri": "https://localhost:8080/"
162
- },
163
- "outputId": "42e8d455-ca0a-4c78-dba7-a32d9dee9b41"
164
  },
165
- "execution_count": 1,
166
- "outputs": [
167
- {
168
- "output_type": "stream",
169
- "name": "stdout",
170
- "text": [
171
- "Cloning into 'sd_tokens'...\n",
172
- "remote: Enumerating objects: 99, done.\u001b[K\n",
173
- "remote: Counting objects: 100% (96/96), done.\u001b[K\n",
174
- "remote: Compressing objects: 100% (96/96), done.\u001b[K\n",
175
- "remote: Total 99 (delta 34), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
176
- "Unpacking objects: 100% (99/99), 1.35 MiB | 1.60 MiB/s, done.\n",
177
- "Filtering content: 100% (22/22), 2.47 GiB | 36.54 MiB/s, done.\n",
178
- "/content/sd_tokens\n"
179
- ]
180
- }
181
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  },
183
  {
184
  "cell_type": "code",
@@ -1097,27 +1180,14 @@
1097
  {
1098
  "cell_type": "code",
1099
  "source": [
 
1100
  "\n",
1101
- "# @title Import text-to-image-prompts .json files\n",
1102
- "%cd /content/\n",
1103
- "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
1104
- "\n",
1105
- "#Initialize\n",
1106
  "import os\n",
1107
- "def my_mkdirs(folder):\n",
1108
- " if os.path.exists(folder)==False:\n",
1109
- " os.makedirs(folder)"
1110
- ],
1111
- "metadata": {
1112
- "id": "Qy51FFu8aVNA"
1113
- },
1114
- "execution_count": null,
1115
- "outputs": []
1116
- },
1117
- {
1118
- "cell_type": "code",
1119
- "source": [
1120
- "# @title Make your own text_encodings .db file for later use\n",
1121
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1122
  "from transformers import AutoTokenizer\n",
1123
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
@@ -1125,20 +1195,18 @@
1125
  "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
1126
  "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
1127
  "\n",
1128
- "#Import the vocab.json\n",
1129
- "import json\n",
1130
- "import pandas as pd\n",
1131
  "\n",
1132
  "my_mkdirs('/content/text_encodings/')\n",
1133
  "filename = ''\n",
1134
  "\n",
1135
- "for file_index in range(34):\n",
1136
  " if file_index <1: continue\n",
1137
  " filename = f'🦜 fusion-t2i-prompt-features-{file_index}'\n",
1138
  " #🦜 fusion-t2i-prompt-features-1.json\n",
1139
  "\n",
1140
  " # Read suffix.json\n",
1141
- " %cd /content/text-to-image-prompts/civitai-prompts/green/\n",
1142
  " with open(filename + '.json', 'r') as f:\n",
1143
  " data = json.load(f)\n",
1144
  " _df = pd.DataFrame({'count': data})['count']\n",
@@ -1153,7 +1221,7 @@
1153
  " %cd /content/text_encodings/\n",
1154
  " import shelve\n",
1155
  " d = shelve.open(filename)\n",
1156
- " for index in range(NUM_ITEMS):\n",
1157
  " inputs = tokenizer(text = '' + prompts[f'{index}'], padding=True, return_tensors=\"pt\").to(device)\n",
1158
  " text_features = model.get_text_features(**inputs).to(device)\n",
1159
  " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True).to(device)\n",
 
3
  "nbformat_minor": 0,
4
  "metadata": {
5
  "colab": {
6
+ "provenance": []
 
7
  },
8
  "kernelspec": {
9
  "name": "python3",
 
11
  },
12
  "language_info": {
13
  "name": "python"
14
+ }
 
15
  },
16
  "cells": [
17
  {
 
153
  "#print(get_token(35894))\n"
154
  ],
155
  "metadata": {
156
+ "id": "w8O0TX7PBh5m"
 
 
 
 
 
157
  },
158
+ "execution_count": null,
159
+ "outputs": []
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "source": [
164
+ "# @title Load/initialize values (new version - ignore this cell)\n",
165
+ "#Imports\n",
166
+ "import json , os , shelve , torch\n",
167
+ "import pandas as pd\n",
168
+ "#----#\n",
169
+ "\n",
170
+ "def my_mkdirs(folder):\n",
171
+ " if os.path.exists(folder)==False:\n",
172
+ " os.makedirs(folder)\n",
173
+ "\n",
174
+ "def _modulus(_id,id_max):\n",
175
+ " id = _id\n",
176
+ " while(id>id_max):\n",
177
+ " id = id-id_max\n",
178
+ " return id\n",
179
+ "\n",
180
+ "def getPrompts(_path):\n",
181
+ " path = _path + '/text'\n",
182
+ " path_enc = _path + '/text_encodings'\n",
183
+ " #-----#\n",
184
+ " index = 0\n",
185
+ " file_index = 0\n",
186
+ " prompts = {}\n",
187
+ " text_encodings = {}\n",
188
+ " _text_encodings = {}\n",
189
+ " #-----#\n",
190
+ " for filename in os.listdir(f'{path}'):\n",
191
+ "\n",
192
+ " print(f'reading {filename}....')\n",
193
+ " _index = 0\n",
194
+ " %cd {path}\n",
195
+ " with open(f'{filename}', 'r') as f:\n",
196
+ " data = json.load(f)\n",
197
+ " #------#\n",
198
+ " _df = pd.DataFrame({'count': data})['count']\n",
199
+ " _prompts = {\n",
200
+ " key : value for key, value in _df.items()\n",
201
+ " }\n",
202
+ " for key in _prompts:\n",
203
+ " _index = int(key)\n",
204
+ " value = _prompts[key]\n",
205
+ "\n",
206
+ " #Read the 'header' file in the JSON\n",
207
+ " if _index <= 0 :\n",
208
+ " _NUM_ITEMS = int(value)\n",
209
+ " index = index + 1\n",
210
+ " continue\n",
211
+ " if _index <= 1 :\n",
212
+ " _file_name = f'{value}'\n",
213
+ " %cd {path_enc}\n",
214
+ " _text_encodings = shelve.open(_file_name)\n",
215
+ " #Store text_encodings for the header items\n",
216
+ " text_encodings[f'{index-1}'] = _text_encodings[f'{_index-1}']\n",
217
+ " text_encodings[f'{index}'] = _text_encodings[f'{_index}']\n",
218
+ " #------#\n",
219
+ " index = index + 1\n",
220
+ " continue\n",
221
+ " #------#\n",
222
+ " #Read the text_encodings + prompts\n",
223
+ " text_encodings[f'{index}'] = _text_encodings[f'{_index}']\n",
224
+ " prompts[f'{index}'] = _prompts[f'{_index}']\n",
225
+ " index = index + 1\n",
226
+ " continue\n",
227
+ " #-------#\n",
228
+ " #--------#\n",
229
+ " _text_encodings.close() #close the text_encodings file\n",
230
+ " file_index = file_index + 1\n",
231
+ " #----------#\n",
232
+ " RANGE = index\n",
233
+ " return prompts , text_encodings , NUM_TOKENS\n",
234
+ " #--------#\n",
235
+ "\n",
236
+ "#for key in prompts:\n",
237
+ "# value = prompts[key]\n",
238
+ "# if int(key)>=1000:break\n",
239
+ "# print(value)\n",
240
+ "#------#\n"
241
+ ],
242
+ "metadata": {
243
+ "cellView": "form",
244
+ "id": "rUXQ73IbonHY"
245
+ },
246
+ "execution_count": null,
247
+ "outputs": []
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "source": [
252
+ "# @title Load the tokens into the colab (new version - ignore this cell)\n",
253
+ "%cd /content/\n",
254
+ "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
255
+ "#------#\n",
256
+ "path = '/content/text-to-image-prompts/civitai-prompts/green'\n",
257
+ "prompts , text_encodings, RANGE = getPrompts(path)"
258
+ ],
259
+ "metadata": {
260
+ "cellView": "form",
261
+ "id": "ZMG4CThUAmwW"
262
+ },
263
+ "execution_count": null,
264
+ "outputs": []
265
  },
266
  {
267
  "cell_type": "code",
 
1180
  {
1181
  "cell_type": "code",
1182
  "source": [
1183
+ "# @title Make your own text_encodings .db file for later use (using GPU is recommended)\n",
1184
  "\n",
1185
+ "import json\n",
1186
+ "import pandas as pd\n",
 
 
 
1187
  "import os\n",
1188
+ "import shelve\n",
1189
+ "import torch\n",
1190
+ "\n",
 
 
 
 
 
 
 
 
 
 
 
1191
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1192
  "from transformers import AutoTokenizer\n",
1193
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
 
1195
  "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
1196
  "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
1197
  "\n",
1198
+ "%cd /content/\n",
 
 
1199
  "\n",
1200
  "my_mkdirs('/content/text_encodings/')\n",
1201
  "filename = ''\n",
1202
  "\n",
1203
+ "for file_index in range(34 + 1):\n",
1204
  " if file_index <1: continue\n",
1205
  " filename = f'🦜 fusion-t2i-prompt-features-{file_index}'\n",
1206
  " #🦜 fusion-t2i-prompt-features-1.json\n",
1207
  "\n",
1208
  " # Read suffix.json\n",
1209
+ " %cd /content/text-to-image-prompts/civitai-prompts/green/text\n",
1210
  " with open(filename + '.json', 'r') as f:\n",
1211
  " data = json.load(f)\n",
1212
  " _df = pd.DataFrame({'count': data})['count']\n",
 
1221
  " %cd /content/text_encodings/\n",
1222
  " import shelve\n",
1223
  " d = shelve.open(filename)\n",
1224
+ " for index in range(NUM_ITEMS + 1):\n",
1225
  " inputs = tokenizer(text = '' + prompts[f'{index}'], padding=True, return_tensors=\"pt\").to(device)\n",
1226
  " text_features = model.get_text_features(**inputs).to(device)\n",
1227
  " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True).to(device)\n",