ashwath-vaithina-ibm committed on
Commit
17d4ee3
·
verified ·
1 Parent(s): 750f8f4

Delete customize

Browse files
customize/customize_embeddings.py DELETED
@@ -1,49 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- # Copyright 2021, IBM Corporation.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """
19
- Python function to customize json sentences locally.
20
- """
21
-
22
- __author__ = "Vagner Santana, Melina Alberio, Cassia Sanctos and Tiago Machado"
23
- __copyright__ = "IBM Corporation 2024"
24
- __credits__ = ["Vagner Santana, Melina Alberio, Cassia Sanctos, Tiago Machado"]
25
- __license__ = "Apache 2.0"
26
- __version__ = "0.0.1"
27
-
28
- import os
29
- import json
30
- import pandas as pd
31
- import numpy as np
32
- import customize_helper
33
-
34
- # Sentence transformer model HF
35
model_path = 'models/all-MiniLM-L6-v2'
# Use the last path component as the model id so that nested paths
# ("a/b/model") and paths without a slash ("model") both work.
model_id = os.path.basename(os.path.normpath(model_path))

# INPUT FILE
# Default file with empty embeddings
json_in_file = 'prompt-sentences-main/prompt_sentences.json'
# Strip only the trailing extension; splitting on ".json" would cut at the
# first occurrence anywhere in the path.
json_in_file_name = os.path.splitext(json_in_file)[0]

# OUTPUT FILE
json_out_file_name = f'{json_in_file_name}-{model_id}.json'

# Read the input inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(json_in_file) as json_in:
    prompt_json = json.load(json_in)

# Pipeline: fill missing embeddings, derive centroids, write the result.
prompt_json_embeddings = customize_helper.populate_embeddings(prompt_json, model_path)
prompt_json_centroids = customize_helper.populate_centroids(prompt_json_embeddings)
customize_helper.save_json(prompt_json_centroids, json_out_file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
customize/customize_helper.py DELETED
@@ -1,129 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- # Copyright 2021, IBM Corporation.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """
19
- Python helper function to customize json sentences locally.
20
- """
21
-
22
- __author__ = "Vagner Santana, Melina Alberio, Cassia Sanctos and Tiago Machado"
23
- __copyright__ = "IBM Corporation 2024"
24
- __credits__ = ["Vagner Santana, Melina Alberio, Cassia Sanctos, Tiago Machado"]
25
- __license__ = "Apache 2.0"
26
- __version__ = "0.0.1"
27
-
28
- import os
29
- import json
30
- import pandas as pd
31
- import numpy as np
32
- import math
33
- from sentence_transformers import SentenceTransformer
34
-
35
- # Requests embeddings for a given sentence
36
# Models already loaded by query_model, keyed by model path, so that
# encoding many sentences does not reload the model for each one.
_MODEL_CACHE = {}


def query_model(texts, model_path):
    """Encode ``texts`` with the sentence-transformer found at ``model_path``.

    Args:
        texts: The sentence (or sentences) to embed; passed unchanged to
            ``SentenceTransformer.encode``.
        model_path: Path or identifier handed to ``SentenceTransformer``.

    Returns:
        The value returned by ``SentenceTransformer.encode`` for ``texts``.
    """
    model = _MODEL_CACHE.get(model_path)
    if model is None:
        model = SentenceTransformer(model_path)
        _MODEL_CACHE[model_path] = model
    return model.encode(texts)
45
-
46
- # Returns euclidean distance between two embeddings
47
def get_distance(embedding1, embedding2):
    """Return the Euclidean distance between two embeddings.

    Both arguments may be flat sequences, NumPy arrays, or single-column
    pandas DataFrames (the form ``get_centroid`` passes in); they are
    flattened before comparison so every component contributes.

    Args:
        embedding1: First embedding.
        embedding2: Second embedding.

    Returns:
        The Euclidean distance as a float, or ``math.inf`` when the two
        embeddings do not have the same number of components.
    """
    # Flatten explicitly: iterating a DataFrame yields its column labels,
    # not its rows, so a plain loop would only see the first component.
    first = np.asarray(embedding1, dtype=float).ravel()
    second = np.asarray(embedding2, dtype=float).ravel()
    if first.size != second.size:
        return math.inf

    delta = second - first
    return math.sqrt(float(np.dot(delta, delta)))
55
-
56
- # Returns the centroid for a given value
57
def get_centroid(v, dimension=384, k=10):
    """Return the centroid of the prompt embeddings stored in ``v``.

    The mean of all embeddings is computed first. When there are more than
    ``k`` embeddings, the centroid is recomputed from only the ``k``
    embeddings closest (Euclidean distance) to that first mean.

    Args:
        v: Mapping with a ``'prompts'`` list; each prompt is a mapping with
            an ``'embedding'`` sequence of numbers.
        dimension: Length of the returned centroid. Embeddings shorter than
            this are treated as zero-padded.
        k: Maximum number of nearest embeddings used for the final centroid.

    Returns:
        A list of ``dimension`` floats. A zero vector is returned when no
        prompt carries an embedding.

    Raises:
        ValueError: If ``k`` is smaller than 1.
        IndexError: If an embedding has more than ``dimension`` components.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Prompts whose embedding is still empty have no position in the space;
    # counting them would drag the mean towards the origin.
    vectors = [p['embedding'] for p in v['prompts'] if len(p['embedding']) > 0]
    if not vectors:
        return [0.0] * dimension

    points = np.zeros((len(vectors), dimension), dtype=float)
    for row, vector in zip(points, vectors):
        if len(vector) > dimension:
            raise IndexError("embedding has more components than dimension")
        row[:len(vector)] = vector

    centroid = points.mean(axis=0)

    # Update centroid considering only the k-near elements
    if len(points) > k:
        distances = np.linalg.norm(points - centroid, axis=1)
        # Stable sort keeps the input order among equidistant embeddings.
        nearest = np.argsort(distances, kind='stable')[:k]
        centroid = points[nearest].mean(axis=0)

    return centroid.tolist()
93
-
94
def populate_embeddings(prompt_json, model_path):
    """Fill in the missing prompt embeddings of ``prompt_json`` in place.

    Every prompt under ``'positive_values'`` and ``'negative_values'`` that
    has non-empty ``'text'`` and an empty ``'embedding'`` list is encoded
    with ``query_model`` and its embedding stored as a plain list.

    Args:
        prompt_json: Mapping with ``'positive_values'`` and
            ``'negative_values'`` lists; each entry holds a ``'prompts'``
            list of mappings with ``'text'`` and ``'embedding'`` keys.
        model_path: Model path forwarded to ``query_model``.

    Returns:
        The same ``prompt_json`` object, updated in place.
    """
    for group in ('positive_values', 'negative_values'):
        for v in prompt_json[group]:
            for p in v['prompts']:
                # only considering missing embeddings
                if p['text'] == '' or p['embedding'] != []:
                    continue
                embedding = query_model(p['text'], model_path)
                # An error payload (a mapping with an 'error' key) leaves
                # the embedding empty so a later run can retry it.
                if isinstance(embedding, dict) and 'error' in embedding:
                    continue
                p['embedding'] = embedding.tolist()
    return prompt_json
118
-
119
def populate_centroids(prompt_json, dimension=384, k=10):
    """Attach a ``'centroid'`` to every value in ``prompt_json`` in place.

    Args:
        prompt_json: Mapping with ``'positive_values'`` and
            ``'negative_values'`` lists whose entries are accepted by
            ``get_centroid``.
        dimension: Centroid length forwarded to ``get_centroid``.
        k: Nearest-neighbour count forwarded to ``get_centroid``.

    Returns:
        The same ``prompt_json`` object, updated in place.
    """
    for group in ('positive_values', 'negative_values'):
        for v in prompt_json[group]:
            v['centroid'] = get_centroid(v, dimension=dimension, k=k)
    return prompt_json
125
-
126
- # Saving the embeddings for a specific LLM
127
def save_json(prompt_json, json_out_file_name):
    """Write ``prompt_json`` as JSON to ``json_out_file_name``.

    Args:
        prompt_json: JSON-serializable object to store.
        json_out_file_name: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``prompt_json`` contains a non-serializable value; in
            that case the target file is left untouched.
    """
    # Serialize before opening: opening with 'w' truncates the target, so a
    # failure halfway through would leave a corrupt, partial file behind.
    serialized = json.dumps(prompt_json)
    with open(json_out_file_name, 'w', encoding='utf-8') as outfile:
        outfile.write(serialized)