Upload sd_token_similarity_calculator.ipynb
sd_token_similarity_calculator.ipynb (CHANGED)
@@ -46,7 +46,8 @@
  "NUM_PREFIX = 13662\n",
  "NUM_SUFFIX = 32901\n",
  "\n",
- "
+ "PREFIX_ENC_VOCAB = 'encoded_prefix_to_girl'\n",
+ "SUFFIX_ENC_VOCAB = 'encoded_suffix'\n",
  "\n",
  "#Import the vocab.json\n",
  "import json\n",
@@ -117,6 +118,22 @@
  "  return ' ' #<---- return whitespace if other id like emojis etc.\n",
  "#--------#\n",
  "\n",
+ "#get token from id (excluding tokens with special symbols)\n",
+ "def get_suffix(id):\n",
+ "  _id = f'{id}'\n",
+ "  if int(id) <= NUM_SUFFIX:\n",
+ "    return suffix[_id]\n",
+ "  return ' ' #<---- return whitespace if out of bounds\n",
+ "#--------#\n",
+ "\n",
+ "#get token from id (excluding tokens with special symbols)\n",
+ "def get_prefix(id):\n",
+ "  _id = f'{id}'\n",
+ "  if int(id) <= NUM_PREFIX:\n",
+ "    return prefix[_id]\n",
+ "  return ' ' #<---- return whitespace if out of bounds\n",
+ "#--------#\n",
+ "\n",
  "#print(get_token(35894))\n"
  ],
  "metadata": {
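For readers following the change outside the notebook, here is a self-contained sketch of the two lookup helpers added in this hunk. The `prefix` and `suffix` dictionaries and the vocab file names are assumptions inferred from the surrounding cells, not part of this commit:

# Standalone sketch of the added id-to-token helpers (assumed context, not the committed cell).
import json

NUM_PREFIX = 13662
NUM_SUFFIX = 32901

# Hypothetical file names; the notebook loads these dictionaries from its own vocab JSON files.
with open('prefix.json', 'r') as f:
    prefix = json.load(f)
with open('suffix.json', 'r') as f:
    suffix = json.load(f)

def get_prefix(token_id):
    # Return the prefix token string for an in-range id, else whitespace.
    if int(token_id) <= NUM_PREFIX:
        return prefix[f'{token_id}']
    return ' '

def get_suffix(token_id):
    # Return the suffix token string for an in-range id, else whitespace.
    if int(token_id) <= NUM_SUFFIX:
        return suffix[f'{token_id}']
    return ' '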
@@ -135,7 +152,7 @@
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
  "\n",
  "# @markdown Write name of token to match against\n",
- "token_name = \"
+ "token_name = \"prs \" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
  "\n",
  "prompt = token_name\n",
  "# @markdown (optional) Mix the token with something else\n",
@@ -308,7 +325,7 @@
  "#Get image\n",
  "# You can use \"http://images.cocodataset.org/val2017/000000039769.jpg\" for testing\n",
  "image_url = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for local upload (scroll down to see it)\"}\n",
- "colab_image_path = \"\" # @param {\"type\":\"string\",\"placeholder\": \"eval. as '/content/sd_tokens/' + **your input**\"}\n",
+ "colab_image_path = \"imperial.png\" # @param {\"type\":\"string\",\"placeholder\": \"eval. as '/content/sd_tokens/' + **your input**\"}\n",
  "# @markdown --------------------------\n",
  "\n",
  "image_path = \"\"\n",
@@ -332,6 +349,8 @@
  "else:\n",
  "  image_A = Image.open(requests.get(image_url, stream=True).raw)\n",
  "#------#\n",
+ "from google.colab.patches import cv2_imshow\n",
+ "cv2_imshow(image_A)\n",
  "\n"
  ],
  "metadata": {
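One caveat with the new preview lines: google.colab.patches.cv2_imshow is written for NumPy arrays in BGR channel order, while image_A is opened with PIL here, so a conversion along these lines may be needed. This is a hedged sketch, not part of the commit:

# Hedged sketch: convert the PIL image to a BGR NumPy array before displaying it.
import numpy as np
from google.colab.patches import cv2_imshow  # Colab-only helper

image_bgr = np.array(image_A.convert('RGB'))[:, :, ::-1]  # RGB -> BGR for cv2-style display
cv2_imshow(image_bgr)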
@@ -340,6 +359,89 @@
  "execution_count": null,
  "outputs": []
  },
+ {
+ "cell_type": "code",
+ "source": [
+ "# @title Order pre-made text_encodings to image similarity\n",
+ "from transformers import AutoTokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+ "from transformers import CLIPProcessor, CLIPModel\n",
+ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+ "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+ "\n",
+ "# Get image features\n",
+ "inputs = processor(images=image_A, return_tensors=\"pt\")\n",
+ "image_features = model.get_image_features(**inputs)\n",
+ "image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)\n",
+ "name_A = \"the image\"\n",
+ "\n",
+ "# Load the .db file for prefix encodings\n",
+ "import shelve\n",
+ "d = shelve.open(PREFIX_ENC_VOCAB)\n",
+ "dots = results_sim = torch.zeros(NUM_PREFIX)\n",
+ "for index in range(NUM_PREFIX):\n",
+ "  text_features = d[f'{index}']\n",
+ "  logit_scale = model.logit_scale.exp()\n",
+ "  torch.matmul(text_features, image_features.t()) * logit_scale\n",
+ "  sim = torch.nn.functional.cosine_similarity(text_features, image_features) * logit_scale\n",
+ "  dots[index] = sim\n",
+ "#----#\n",
+ "prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+ "d.close() #close the file\n",
+ "\n",
+ "# Load the .db file for suffix encodings\n",
+ "import shelve\n",
+ "d = shelve.open(SUFFIX_ENC_VOCAB)\n",
+ "dots = results_sim = torch.zeros(NUM_SUFFIX)\n",
+ "for index in range(NUM_SUFFIX):\n",
+ "  text_features = d[f'{index}']\n",
+ "  logit_scale = model.logit_scale.exp()\n",
+ "  torch.matmul(text_features, image_features.t()) * logit_scale\n",
+ "  sim = torch.nn.functional.cosine_similarity(text_features, image_features) * logit_scale\n",
+ "  dots[index] = sim\n",
+ "#----#\n",
+ "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+ "d.close() #close the file"
+ ],
+ "metadata": {
+ "id": "gaOB8rsOneIa"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# @title Show the 10 most similiar suffix and prefix text-encodings to the image encoding\n",
+ "\n",
+ "_suffixes = '{'\n",
+ "for index in range(20):\n",
+ "  id = f'{suffix_indices[index]}'\n",
+ "  sim = suffix_sorted[index]\n",
+ "  name = get_suffix(id)\n",
+ "  _suffixes = _suffixes + name + '|'\n",
+ "#------#\n",
+ "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
+ "print('most similiar suffix tokens to image : ' + _suffixes)\n",
+ "\n",
+ "#-------#\n",
+ "\n",
+ "_prefixes = '{'\n",
+ "for index in range(20):\n",
+ "  id = f'{prefix_indices[index]}'\n",
+ "  sim = prefix_sorted[index]\n",
+ "  name = get_prefix(id)\n",
+ "  _prefixes = _prefixes + name + '|'\n",
+ "#------#\n",
+ "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
+ "print('most similiar prefix tokens to image : ' + _prefixes)\n"
+ ],
+ "metadata": {
+ "id": "eZqMUhP0qYaK"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
  {
  "cell_type": "code",
  "source": [
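The two cells added above score every pre-computed text encoding against the image encoding and then print the top matches. Two details worth noting: the standalone torch.matmul(...) * logit_scale line discards its result, and multiplying the cosine similarity by logit_scale rescales the scores without changing their order. A trimmed sketch of the same ranking step, under the same assumptions (a shelve .db mapping stringified indices to CLIP text-feature tensors, and the L2-normalized image_features from the cell itself):

# Hedged sketch of the ranking step; not the committed cell.
import shelve
import torch

def rank_encodings(db_path, num_items, image_features):
    # Score each stored text encoding against the image encoding and sort descending.
    scores = torch.zeros(num_items)
    with shelve.open(db_path) as db, torch.no_grad():
        for index in range(num_items):
            text_features = db[f'{index}']
            text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
            # Plain cosine similarity; logit_scale only rescales and does not affect the ordering.
            scores[index] = torch.nn.functional.cosine_similarity(text_features, image_features)
    return torch.sort(scores, dim=0, descending=True)

# Usage mirroring the committed cell:
# prefix_sorted, prefix_indices = rank_encodings(PREFIX_ENC_VOCAB, NUM_PREFIX, image_features)
# suffix_sorted, suffix_indices = rank_encodings(SUFFIX_ENC_VOCAB, NUM_SUFFIX, image_features)

The follow-up cell then simply walks the top entries of suffix_indices and prefix_indices through get_suffix / get_prefix; note that despite the "10 most similiar" title it iterates over the first 20 indices.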
@@ -718,6 +820,35 @@
  "id": "hyK423TQCRup"
  }
  },
+ {
+ "cell_type": "code",
+ "source": [
+ "# @title Make your own text_encodings .db file for later use (rate is roughly 1K encodings per minute, so plan accordingly)\n",
+ "from transformers import AutoTokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+ "from transformers import CLIPProcessor, CLIPModel\n",
+ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+ "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+ "\n",
+ "# Save results as .db file\n",
+ "import shelve\n",
+ "d = shelve.open('my_text_encodings')\n",
+ "for index in range(NUM_PREFIX):\n",
+ "  inputs = tokenizer(text = get_prefix(index)+'girl ', padding=True, return_tensors=\"pt\")\n",
+ "  text_features = model.get_text_features(**inputs)\n",
+ "  d[f'{index}'] = text_features\n",
+ "#----#\n",
+ "\n",
+ "d.close() #close the file\n",
+ "\n",
+ ""
+ ],
+ "metadata": {
+ "id": "9ZiTsF9jV0TV"
+ },
+ "execution_count": 10,
+ "outputs": []
+ },
  {
  "cell_type": "markdown",
  "source": [
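The "make your own text_encodings .db" cell encodes each prefix token followed by the literal string 'girl ' (hence the PREFIX_ENC_VOCAB name 'encoded_prefix_to_girl') and stores the raw features with shelve. Below is a hedged, self-contained sketch of the same builder; wrapping the encode in torch.no_grad() and normalizing before storing are my additions, not part of the commit:

# Hedged sketch: pre-compute CLIP text encodings for every prefix token and persist them.
# Assumes get_prefix() and NUM_PREFIX from the cells above; the db name is arbitrary.
import shelve
import torch
from transformers import AutoTokenizer, CLIPModel

tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")

with shelve.open('my_text_encodings') as db, torch.no_grad():
    for index in range(NUM_PREFIX):
        inputs = tokenizer(text=get_prefix(index) + 'girl ', padding=True, return_tensors="pt")
        text_features = model.get_text_features(**inputs)
        # Store L2-normalized features so the scoring cell can skip per-item normalization.
        db[f'{index}'] = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

At the quoted rate of roughly 1K encodings per minute, a full pass over the 13662 prefix tokens takes on the order of 14 minutes.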
@@ -929,7 +1060,9 @@
  "\n",
  "//---//\n",
  "\n",
- "https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images
+ "https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images/\n",
+ "\n",
+ "https://arxiv.org/pdf/2303.03032"
  ],
  "metadata": {
  "id": "njeJx_nSSA8H"