import string
from tqdm import tqdm
import pickle
import scipy
import scipy.spatial  # needed for scipy.spatial.distance.cosine in similarity()
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA

# Experiment 1: WEAT word sets (Caliskan et al.) -- male/female first names (A/B),
# career/family words (C/D), math/arts words (E/F), science/arts words (G/H)
WEAT_words = {
    'A': ['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
    'B': ['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
    'C': ['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
    'D': ['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
    'E': ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
    'F': ['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
    'G': ['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
    'H': ['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
}


def has_punct(w):
    if any([c in string.punctuation for c in w]):
        return True
    return False


def has_digit(w):
    if any([c in '0123456789' for c in w]):
        return True
    return False


def limit_vocab(wv, w2i, vocab, exclude=None):
    # keep only frequent, lower-cased words without digits or punctuation
    # (optionally excluding a given word list), and rebuild wv / w2i for them
    vocab_limited = []
    for w in tqdm(vocab[:50000]):
        if w.lower() != w:
            continue
        if len(w) >= 20:
            continue
        if has_digit(w):
            continue
        if '_' in w:
            p = [has_punct(subw) for subw in w.split('_')]
            if not any(p):
                vocab_limited.append(w)
            continue
        if has_punct(w):
            continue
        vocab_limited.append(w)
    if exclude:
        vocab_limited = list(set(vocab_limited) - set(exclude))
    print("size of vocabulary:", len(vocab_limited))
    wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
    for i, w in enumerate(vocab_limited):
        wv_limited[i, :] = wv[w2i[w], :]
    w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
    return vocab_limited, wv_limited, w2i_limited
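
# Example usage (sketch, commented out): `vocab`, `wv` and `w2i` are assumed to be
# loaded elsewhere from a pretrained embedding (e.g. via pickle), and `gender_specific`
# is an assumed list of gendered words to exclude.
# vocab_limited, wv_limited, w2i_limited = limit_vocab(wv, w2i, vocab, exclude=gender_specific)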


def norm_stand(wv):
    W_norm = np.zeros(wv.shape)
    d = (np.sum(wv ** 2, 1) ** 0.5)
    W_norm = (wv.T / d).T
    return W_norm


def normalize(wv):
    # normalize vectors
    norms = np.apply_along_axis(LA.norm, 1, wv)
    wv = wv / norms[:, np.newaxis]
    return wv
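
# Quick sanity check (sketch, commented out): after normalize() every row should
# have unit L2 norm.
# _demo = normalize(np.random.rand(5, 50))
# assert np.allclose(LA.norm(_demo, axis=1), 1.0)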


def topK(w, wv, w2i, vocab, k=10):
    # extract the word vector for word w
    idx = w2i[w]
    vec = wv[idx, :]
    # compute similarity of w with all words in the vocabulary
    sim = wv.dot(vec)
    # sim = []
    # for i in range(len(wv)):
    #     sim.append(1 - scipy.spatial.distance.cosine(wv[i, :], vec))
    # sim = np.array(sim)
    # sort similarities in descending order
    sort_sim = (sim.argsort())[::-1]
    # choose topK
    best = sort_sim[:(k + 1)]
    return [vocab[i] for i in best if i != idx]


def similarity(w1, w2, wv, w2i):
    i1 = w2i[w1]
    i2 = w2i[w2]
    vec1 = wv[i1, :]
    vec2 = wv[i2, :]
    return 1 - scipy.spatial.distance.cosine(vec1, vec2)


def drop(u, v):
    # remove from u its projection onto v
    return u - v * u.dot(v) / v.dot(v)
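
# Example usage (sketch, commented out): remove the component of a word vector along
# a bias direction. `gender_direction` is an assumed unit vector, e.g. the first
# principal component returned by doPCA() below.
# debiased_vec = drop(wv[w2i['engineer'], :], gender_direction)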

from sklearn import preprocessing


def doPCA(pairs, wv, w2i):
    matrix = []
    cnt = 0
    if type(pairs[0]) is list:
        for a, b in pairs:
            if not (a in w2i and b in w2i):
                continue
            center = (wv[w2i[a], :] + wv[w2i[b], :]) / 2
            matrix.append(wv[w2i[a], :] - center)
            matrix.append(wv[w2i[b], :] - center)
            cnt += 1
    else:
        for a in pairs:
            if a not in w2i:
                continue
            matrix.append(wv[w2i[a], :])
            cnt += 1
    embeds = np.array(matrix)
    wv_mean = np.mean(embeds, axis=0)
    wv_hat = np.zeros(embeds.shape).astype(float)
    for i in range(len(embeds)):
        wv_hat[i, :] = embeds[i, :] - wv_mean
    matrix = np.array(wv_hat)
    pca = PCA()
    pca.fit(matrix)
    print('pairs used in PCA: ', cnt)
    return pca
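
# Example usage (sketch, commented out): estimate a gender direction from definitional
# pairs, as in Bolukbasi et al.; `definitional_pairs` here is an assumed, abbreviated list.
# definitional_pairs = [['woman', 'man'], ['girl', 'boy'], ['she', 'he'],
#                       ['mother', 'father'], ['daughter', 'son'], ['female', 'male']]
# gender_direction = doPCA(definitional_pairs, wv, w2i).components_[0]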


# get tuples of bias-by-projection values and counts of masculine/feminine nearest
# neighbours for each word (used for the bias-by-neighbors analysis)
import operator


def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num=100):
    tuples = []
    sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
    female = [item[0] for item in sorted_g[:size]]
    male = [item[0] for item in sorted_g[-size:]]
    # vocab = male + female
    selected = female + male if size > 0 else vocab
    for w in selected:
        top = topK(w, wv, w2i, vocab, k=neighbours_num + 5)[:neighbours_num]
        m = 0
        f = 0
        for t in top:
            if gender_bias_bef[t] > 0:
                m += 1
            else:
                f += 1
        tuples.append((w, gender_bias_bef[w], m, f))
    return tuples
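
# Example usage (sketch, commented out): bias-by-projection is the projection of each
# (normalized) word vector on the gender direction; bias_by_neighbors then counts
# male/female-biased words among each word's nearest neighbours. `gender_direction`
# and the *_limited objects are assumed to come from the examples above.
# wv_n = normalize(wv_limited)
# gender_bias_bef = {w: wv_n[w2i_limited[w], :].dot(gender_direction) for w in vocab_limited}
# tuples = bias_by_neighbors(wv_n, w2i_limited, vocab_limited, gender_bias_bef, size=500)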


def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):
    wv = normalize(wv)
    tuples = []
    for w in words:
        if w not in gender_bias_dict:
            continue
        top = topK(w, wv, w2i, vocab, k=105)[:100]
        m = 0
        f = 0
        for t in top:
            if gender_bias_dict[t] > 0:
                m += 1
            else:
                f += 1
        tuples.append((w, gender_bias_dict[w], m, f))
    return tuples


# compute correlation between bias-by-projection and bias-by-neighbors
import scipy.stats


def pearson(a, b):
    return scipy.stats.pearsonr(a, b)


def compute_corr(tuples, i1, i2):
    a = []
    b = []
    for t in tuples:
        a.append(t[i1])
        b.append(t[i2])
    assert len(a) == len(b)
    print('pearson: ', scipy.stats.pearsonr(a, b))
    print('spearman: ', scipy.stats.spearmanr(a, b))
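
# Example usage (sketch, commented out): correlate bias-by-projection (index 1 in each
# tuple) with the number of male-biased neighbours (index 2), using the tuples built above.
# compute_corr(tuples, 1, 2)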


# Auxiliary functions
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters=2):
    # perform TSNE
    X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
    for x, p, y in zip(X_embedded, y_pred, y_true):
        if p:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='c')
            else:
                ax.scatter(x[0], x[1], marker='x', c='c')
        else:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='darkviolet')
            else:
                ax.scatter(x[0], x[1], marker='x', c='darkviolet')
    ax.text(.01, .9, title, transform=ax.transAxes, fontsize=15)


def extract_vectors(words, wv, w2i):
    X = [wv[w2i[x], :] for x in words]
    return X


def cluster_and_visualize(words, X, random_state, y_true, num=2):
    y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
    # fig, axs = plt.subplots(figsize=(6, 3))
    # visualize(X, y_true, y_pred, axs, 'Original', random_state)
    correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)]
    print('precision', max(sum(correct) / float(len(correct)), 1 - sum(correct) / float(len(correct))))
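
# Example usage (sketch, commented out): cluster the most male- and female-biased words
# and check how well the clusters align with the original bias labels. `male` and `female`
# are assumed lists of the most biased words (e.g. taken from sorted gender_bias_bef).
# words = male + female
# X = extract_vectors(words, wv_limited, w2i_limited)
# y_true = [1] * len(male) + [0] * len(female)
# cluster_and_visualize(words, X, random_state=1, y_true=y_true)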

from sklearn import svm


def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
    X_train = [wv[w2i[w], :] for w in males[:size_train] + females[:size_train]]
    Y_train = [1] * size_train + [0] * size_train
    X_test = [wv[w2i[w], :] for w in males[size_train:] + females[size_train:]]
    Y_test = [1] * size_test + [0] * size_test
    clf = svm.SVC(gamma='auto')
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_test)
    accuracy = [1 if y == z else 0 for y, z in zip(preds, Y_test)]
    acc = float(sum(accuracy)) / len(accuracy)
    print('accuracy:', acc)
    return acc
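
# Example usage (sketch, commented out): train an SVM on the most biased words per gender
# and test on the rest, as an indication of how much gender information remains in the
# vectors. `males`/`females` are assumed lists with size_train + size_test words each.
# train_and_predict(wv_limited, w2i_limited, vocab_limited,
#                   size_train=1000, size_test=4000, males=males, females=females)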


# Auxiliary functions for experiments by Caliskan et al.
import itertools
import random
import scipy.special  # scipy.special.comb is used in p_value_sample


def s_word(w, A, B, wv, w2i, vocab, all_s_words):
    # differential association of word w with attribute sets A and B (cached in all_s_words)
    if w in all_s_words:
        return all_s_words[w]
    mean_a = []
    mean_b = []
    for a in A:
        mean_a.append(similarity(w, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(w, b, wv, w2i))
    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))
    all_s_words[w] = mean_a - mean_b
    return all_s_words[w]


def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):
    # WEAT test statistic: sum of s_word over X minus sum over Y
    total = 0
    for x in X:
        total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
    for y in Y:
        total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)
    return total


def p_value_exhust(X, Y, A, B, wv, w2i, vocab):
    # exact permutation test over all equal-size splits of X union Y
    if len(X) > 10:
        print('might take too long, use sampled version: p_value')
        return
    assert len(X) == len(Y)
    all_s_words = {}
    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
    union = set(X + Y)
    subset_size = int(len(union) / 2)
    larger = 0
    total = 0
    for subset in set(itertools.combinations(union, subset_size)):
        total += 1
        Xi = list(set(subset))
        Yi = list(union - set(subset))
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    print('num of samples', total)
    return larger / float(total)


def association_diff(t, A, B, wv, w2i):
    mean_a = []
    mean_b = []
    for a in A:
        mean_a.append(similarity(t, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(t, b, wv, w2i))
    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))
    return mean_a - mean_b


def effect_size(X, Y, A, B, wv, w2i, vocab):
    assert len(X) == len(Y)
    assert len(A) == len(B)
    norm_x = []
    norm_y = []
    for x in X:
        norm_x.append(association_diff(x, A, B, wv, w2i))
    for y in Y:
        norm_y.append(association_diff(y, A, B, wv, w2i))
    std = np.std(norm_x + norm_y, ddof=1)
    norm_x = sum(norm_x) / float(len(norm_x))
    norm_y = sum(norm_y) / float(len(norm_y))
    return (norm_x - norm_y) / std
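
# Example usage (sketch, commented out): WEAT for male/female first names vs. career/family
# words, using the word sets defined at the top of this file. The full `wv`, `w2i` and
# `vocab` (before limit_vocab) are assumed, since the name lists are capitalized.
# eff = effect_size(WEAT_words['C'], WEAT_words['D'], WEAT_words['A'], WEAT_words['B'],
#                   wv, w2i, vocab)
# p = p_value_exhust(WEAT_words['C'], WEAT_words['D'], WEAT_words['A'], WEAT_words['B'],
#                    wv, w2i, vocab)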


def p_value_sample(X, Y, A, B, wv, w2i, vocab):
    # sampled permutation test (for larger word sets)
    random.seed(10)
    np.random.seed(10)
    all_s_words = {}
    assert len(X) == len(Y)
    length = len(X)
    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
    num_of_samples = min(1000000, int(scipy.special.comb(length * 2, length) * 100))
    print('num of samples', num_of_samples)
    larger = 0
    for i in range(num_of_samples):
        permute = np.random.permutation(X + Y)
        Xi = list(permute[:length])
        Yi = list(permute[length:])
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    return larger / float(num_of_samples)