fadliaulawi
committed on
Commit
•
1bd03b4
1
Parent(s):
51409c3
Change validation logic
Browse files - validate.py +42 -3
validate.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
from collections import defaultdict
|
2 |
from dotenv import load_dotenv
|
3 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
4 |
from langchain_openai import ChatOpenAI
|
@@ -21,7 +20,7 @@ class Validation():
|
|
21 |
else:
|
22 |
self.llm = ChatOpenAI(temperature=0, model_name=llm, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
|
23 |
|
24 |
-
def validate(self, df, api):
|
25 |
|
26 |
df = df.fillna('')
|
27 |
df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
|
@@ -83,6 +82,7 @@ class Validation():
|
|
83 |
df.reset_index(drop=True, inplace=True)
|
84 |
df_clean = df.copy()
|
85 |
|
|
|
86 |
# Validate genes and SNPs with APIs
|
87 |
if api:
|
88 |
dbsnp = {}
|
@@ -110,14 +110,34 @@ class Validation():
|
|
110 |
else:
|
111 |
df = df.drop(i)
|
112 |
|
|
|
113 |
# Check with GWAS ground truth
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
for i in df.index:
|
115 |
gene = df.loc[i, 'Genes']
|
116 |
snp = df.loc[i, 'rsID']
|
117 |
perms = permutate(gene)
|
118 |
|
119 |
for perm in perms:
|
120 |
-
if perm in ground_truth and snp in ground_truth[perm]:
|
121 |
df.loc[i, 'Genes'] = perm
|
122 |
if gene != perm:
|
123 |
print(f'{gene} corrected to {perm} with {snp}')
|
@@ -128,6 +148,25 @@ class Validation():
|
|
128 |
print(f'{gene} and {snp} not found')
|
129 |
df = df.drop(i)
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
df.reset_index(drop=True, inplace=True)
|
132 |
|
133 |
return df, df_clean
|
|
|
|
|
1 |
from dotenv import load_dotenv
|
2 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
3 |
from langchain_openai import ChatOpenAI
|
|
|
20 |
else:
|
21 |
self.llm = ChatOpenAI(temperature=0, model_name=llm, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
|
22 |
|
23 |
+
def validate(self, df, text, api):
|
24 |
|
25 |
df = df.fillna('')
|
26 |
df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
|
|
|
82 |
df.reset_index(drop=True, inplace=True)
|
83 |
df_clean = df.copy()
|
84 |
|
85 |
+
# WARNING: DEPRECATED
|
86 |
# Validate genes and SNPs with APIs
|
87 |
if api:
|
88 |
dbsnp = {}
|
|
|
110 |
else:
|
111 |
df = df.drop(i)
|
112 |
|
113 |
+
# WARNING: DEPRECATED
|
114 |
# Check with GWAS ground truth
|
115 |
+
if False:
|
116 |
+
for i in df.index:
|
117 |
+
gene = df.loc[i, 'Genes']
|
118 |
+
snp = df.loc[i, 'rsID']
|
119 |
+
perms = permutate(gene)
|
120 |
+
|
121 |
+
for perm in perms:
|
122 |
+
if perm in ground_truth and snp in ground_truth[perm]:
|
123 |
+
df.loc[i, 'Genes'] = perm
|
124 |
+
if gene != perm:
|
125 |
+
print(f'{gene} corrected to {perm} with {snp}')
|
126 |
+
else:
|
127 |
+
print(f'{gene} and {snp} safe')
|
128 |
+
break
|
129 |
+
else:
|
130 |
+
print(f'{gene} and {snp} not found')
|
131 |
+
df = df.drop(i)
|
132 |
+
|
133 |
+
# Check with Text
|
134 |
for i in df.index:
|
135 |
gene = df.loc[i, 'Genes']
|
136 |
snp = df.loc[i, 'rsID']
|
137 |
perms = permutate(gene)
|
138 |
|
139 |
for perm in perms:
|
140 |
+
if perm in text and snp in text:
|
141 |
df.loc[i, 'Genes'] = perm
|
142 |
if gene != perm:
|
143 |
print(f'{gene} corrected to {perm} with {snp}')
|
|
|
148 |
print(f'{gene} and {snp} not found')
|
149 |
df = df.drop(i)
|
150 |
|
151 |
+
# Drop entries with both fields empty, and entries with one empty field whose other value already appears in a complete entry
|
152 |
+
genes = []
|
153 |
+
snps = []
|
154 |
+
for i in df.index:
|
155 |
+
gene = df.loc[i, 'Genes']
|
156 |
+
snp = df.loc[i, 'rsID']
|
157 |
+
|
158 |
+
if len(gene) == 0 and len(snp) == 0:
|
159 |
+
df = df.drop(i)
|
160 |
+
elif len(gene) == 0:
|
161 |
+
if snp in snps:
|
162 |
+
df = df.drop(i)
|
163 |
+
elif len(snp) == 0:
|
164 |
+
if gene in genes:
|
165 |
+
df = df.drop(i)
|
166 |
+
else:
|
167 |
+
genes.append(gene)
|
168 |
+
snps.append(snp)
|
169 |
+
|
170 |
df.reset_index(drop=True, inplace=True)
|
171 |
|
172 |
return df, df_clean
|