File size: 8,028 Bytes
dd49f8a
 
794f79d
 
dd49f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794f79d
 
 
 
dd49f8a
 
 
 
 
 
 
 
 
 
 
 
794f79d
 
dd49f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794f79d
dd49f8a
 
 
 
 
794f79d
dd49f8a
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
# coding: utf-8
import os 
# Absolute directory containing this script; the data and result paths used
# further down are built by joining relative paths onto it.
script_dir = os.path.dirname(os.path.abspath(__file__))

import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
import random
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
from functools import partial

# Manager-backed lists shared between the parent process and pool workers:
#   similarity_list - (real, cosine, manhattan, euclidean) tuples appended by
#                     parallelSimilarity
#   proteinListNew  - protein identifiers of the similarity matrix in use
manager = Manager()
similarity_list, proteinListNew = manager.list(), manager.list()

# Module-level settings read by the functions below. They start as empty
# placeholders; presumably the calling code assigns real values before
# calculate_all_correlations() is invoked - TODO confirm against the caller.
representation_dataframe = protein_names = representation_name = similarity_tasks = ""
detailed_output = False

def parallelSimilarity(paramList):
    """Compute embedding similarities for one protein pair and record them.

    Reads the module-level ``proteinListNew``, ``protein_names`` and
    ``representation_dataframe`` and appends to the shared
    ``similarity_list``.

    Args:
        paramList: Indexable ``(i, j, aspect, real)``. ``i`` and ``j`` are
            positions in ``proteinListNew``; ``real`` is the reference
            similarity stored alongside the computed values. The ``aspect``
            entry is not used here.

    Returns:
        The shared ``similarity_list``. A tuple
        ``(real, cosine, manhattanSim, euclidianSim)`` is appended to it only
        when ``j > i`` and both proteins are found in ``protein_names``.
    """
    i = paramList[0]
    j = paramList[1]
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            protein_embedding_dataframe = representation_dataframe
            prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
            row1 = prot1vec.reshape(1, -1)
            row2 = prot2vec.reshape(1, -1)

            # cosine returns a matrix shaped by the first dimension of its inputs
            cos = cosine(row1, row2).item()

            l1_norm1 = norm(prot1vec, 1)
            l1_norm2 = norm(prot2vec, 1)
            if l1_norm1 == 0 and l1_norm2 == 0:
                # Both vectors are all-zero: the normalised distances would be
                # 0/0, so the pair is defined as maximally similar instead of
                # dividing first and overwriting the resulting NaN.
                manhattanSim = 1.0
                euclidianSim = 1.0
            else:
                manhattanDist = cdist(row1, row2, 'cityblock')
                manhattanSim = 1 - (manhattanDist / (l1_norm1 + l1_norm2)).item()
                euclideanDist = cdist(row1, row2, 'euclidean')
                euclidianSim = 1 - (euclideanDist / (norm(prot1vec, 2) + norm(prot2vec, 2))).item()

            real = paramList[3]
            # The reference and computed values are stored in one tuple so they
            # stay aligned even though workers append in arbitrary order.
            similarity_list.append((real, cos, manhattanSim, euclidianSim))
    return similarity_list

def calculateCorrelationforOntology(aspect,matrix_type):
    """Correlate reference protein similarities with embedding similarities.

    Loads the similarity matrix CSV selected by ``aspect`` and
    ``matrix_type``, builds ``(i, j, aspect, real)`` tasks for protein pairs,
    runs ``parallelSimilarity`` over them in a process pool and computes the
    Spearman correlation between the reference values and each computed
    similarity.

    The shared ``similarity_list`` and ``proteinListNew`` are cleared and
    refilled by this call.

    Args:
        aspect: Aspect label inserted into the input file names (the caller
            in this file passes "MF", "BP" or "CC").
        matrix_type: One of "All", "500", "Sparse" or "200". "Sparse" reads
            the same matrix file as "500" but takes its pairs from a saved
            coordinate array instead of enumerating every pair with j > i.

    Returns:
        Tuple ``(cosineCorr, manhattanCorr, euclidianCorr)`` of
        ``scipy.stats.spearmanr`` results.

    Raises:
        KeyError: If ``matrix_type`` is not one of the supported keys.
    """
    print("\n\nSemantic similarity correlation calculation for aspect: " + aspect + " using matrix/dataset: " + matrix_type + " ...\n")
    # Clear the shared lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixNameDict = {}
    similarityMatrixNameDict["All"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix.csv")
    similarityMatrixNameDict["500"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv")
    similarityMatrixNameDict["Sparse"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv")
    similarityMatrixNameDict["200"] = os.path.join(script_dir, "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv")

    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = list(human_proteinSimilarityMatrix.columns)

    # Publish the protein names to the workers in a single proxy call.
    proteinListNew.extend(proteinList)

    if matrix_type == "Sparse":
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = np.load(sparsified_path)
        candidate_count = len(protParamList)
    else:
        # combinations() yields exactly the (i, j) pairs with j > i, in the
        # same order as filtering the full product, without building the
        # n*n list first.
        protein_count = len(proteinList)
        protParamList = itertools.combinations(range(protein_count), 2)
        candidate_count = protein_count * (protein_count - 1) // 2

    # Prepare the parameters that the worker processes will consume.
    # Names are read from the local list (same content as proteinListNew) to
    # avoid one manager round trip per lookup.
    protParamListNew = []
    for tup in tqdm(protParamList, total=candidate_count):
        i = tup[0]
        j = tup[1]
        real = human_proteinSimilarityMatrix.loc[proteinList[i], proteinList[j]]
        protParamListNew.append((i, j, aspect, real))

    total_task_num = len(protParamListNew)
    pool = Pool()
    try:
        # Workers record their results in similarity_list; the values yielded
        # here are only consumed to drive the progress bar.
        for _ in tqdm(pool.imap_unordered(parallelSimilarity,protParamListNew), total=total_task_num , position=0, leave=True ):
            pass
    except BaseException:
        # Do not leave worker processes running when a task fails.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    # One slice copies the shared list into this process in a single call
    # instead of fetching every element through the proxy.
    collected = similarity_list[:]

    real_distance_list = [value[0] for value in collected]
    cosine_distance_list = [value[1] for value in collected]
    manhattan_distance_list = [value[2] for value in collected]
    euclidian_distance_list = [value[3] for value in collected]

    distance_lists = [real_distance_list,cosine_distance_list,manhattan_distance_list,euclidian_distance_list]
    if detailed_output:
        report_detailed_distance_scores(representation_name,matrix_type,aspect,distance_lists)

    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)

    return (cosineCorr,manhattanCorr,euclidianCorr)

def report_detailed_distance_scores(representation_name,similarity_matrix_type,aspect,distance_lists):
    """Pickle ``distance_lists`` into the results directory.

    The output file name is assembled from ``aspect``,
    ``similarity_matrix_type`` and ``representation_name`` and resolved
    relative to ``script_dir``.
    """
    relative_name = "../results/Semantic_sim_inference_detailed_distance_scores"+aspect+"_"+similarity_matrix_type+"_"+representation_name+".pkl"
    target_path = os.path.join(script_dir, relative_name)
    with open(target_path, "wb") as pickle_file:
        pickle.dump(distance_lists, pickle_file)

def calculate_all_correlations():
    """Write one correlation CSV per entry of ``similarity_tasks``.

    For every similarity matrix type in the module-level
    ``similarity_tasks``, a results file named after the matrix type and
    ``representation_name`` is created with a header line. One row is then
    appended for each of the aspects "MF", "BP" and "CC", holding the
    correlation and p-value (rounded to 5 decimals) for the cosine,
    Manhattan and Euclidean similarities returned by
    ``calculateCorrelationforOntology``.
    """
    header = "Semantic Aspect,CosineSim_Correlation,CosineSim_Correlation p-value, ManhattanSim_Correlation,ManhattanSim_Correlation p-value, EuclidianSim_Correlation,EuclidianSim_Correlation p-value \n"
    for similarity_matrix_type in similarity_tasks:
        saveFileName = os.path.join(script_dir, "../results/Semantic_sim_inference_"+similarity_matrix_type+"_"+representation_name+".csv")
        # Close the header handle before the file is reopened for appending,
        # so the header is on disk ahead of the first row.
        with open(saveFileName,'w') as f:
            f.write(header)
        for aspect in ["MF","BP","CC"]:
            corr = calculateCorrelationforOntology(aspect,similarity_matrix_type)
            values = [corr[0][0], corr[0][1], corr[1][0], corr[1][1], corr[2][0], corr[2][1]]
            row = "" + aspect + "," + ",".join(str(round(value,5)) for value in values) + "\n"
            # Append each row as soon as its aspect finishes, so earlier
            # results survive a failure in a later aspect.
            with open(saveFileName,'a') as f:
                f.write(row)