fadliaulawi
committed on
Commit
•
9c8e6da
1
Parent(s):
745c0a6
Enable API validation
Browse files- process.py +50 -50
process.py
CHANGED
@@ -203,7 +203,7 @@ class Process():
|
|
203 |
df['SNPs'] = df['SNPs'].str.lower()
|
204 |
|
205 |
# Check if there are two gene names
|
206 |
-
sym = [',', '
|
207 |
for i in df.index:
|
208 |
gene = df.loc[i, 'Genes']
|
209 |
for s in sym:
|
@@ -238,60 +238,60 @@ class Process():
|
|
238 |
df_clean = df.copy()
|
239 |
|
240 |
# # Validate genes and SNPs with APIs
|
241 |
-
|
242 |
|
243 |
-
|
244 |
-
|
245 |
|
246 |
-
|
247 |
-
|
248 |
|
249 |
-
|
250 |
-
|
251 |
|
252 |
-
|
253 |
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
|
296 |
# df.reset_index(drop=True, inplace=True)
|
297 |
df_no_llm = df.copy()
|
|
|
203 |
df['SNPs'] = df['SNPs'].str.lower()
|
204 |
|
205 |
# Check if there are two gene names
|
206 |
+
sym = [',', '/', '|']
|
207 |
for i in df.index:
|
208 |
gene = df.loc[i, 'Genes']
|
209 |
for s in sym:
|
|
|
238 |
df_clean = df.copy()
|
239 |
|
240 |
# # Validate genes and SNPs with APIs
|
241 |
+
def permutate(word, substitutions=None):
    """Return every spelling of ``word`` obtained by optionally swapping characters.

    Each character that appears as a key in ``substitutions`` may either be
    kept or replaced by its mapped value; all other characters are kept.
    Results are ordered with the unmodified ``word`` first and the leftmost
    character varying slowest, so callers that stop at the first match
    prefer the spelling closest to the input.

    Args:
        word: The string to expand (may be empty).
        substitutions: Mapping of single character -> replacement string.
            When omitted, the ``mistakes`` table from the enclosing scope
            is used.

    Returns:
        A list of ``2 ** k`` strings, where ``k`` is the number of
        characters of ``word`` present in ``substitutions``. An empty
        ``word`` yields ``['']``.
    """
    from itertools import product

    if substitutions is None:
        # Resolved at call time, so the table may be defined after this
        # helper in the enclosing scope.
        substitutions = mistakes

    # One tuple of alternatives per position; the original character always
    # comes first so the unmodified word is the first result.
    choices = [
        (ch, substitutions[ch]) if ch in substitutions else (ch,)
        for ch in word
    ]

    return [''.join(combo) for combo in product(*choices)]
|
253 |
|
254 |
+
def call(url, timeout=30, retry_delay=1, max_retries=None):
    """Send a GET request to ``url``, retrying on request failures.

    Args:
        url: The address to fetch.
        timeout: Seconds to wait for the server before treating the attempt
            as failed. Without a timeout a stalled connection would block
            indefinitely.
        retry_delay: Seconds to wait after a failed attempt before retrying,
            so an unavailable service is not hit in a tight loop.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries without limit.

    Returns:
        The ``requests`` response object of the first successful attempt.

    Raises:
        requests.RequestException: The last failure, once ``max_retries``
            retries have been used up (never raised when it is ``None``).
    """
    failures = 0

    while True:
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)
        else:
            # Pause after every successful request to throttle the call rate.
            time.sleep(1)
            return res
|
265 |
+
|
266 |
+
mistakes = {'I': '1', 'O': '0'} # Common mistakes need to be maintained
|
267 |
+
dbsnp = {}
|
268 |
+
|
269 |
+
for i in df.index:
|
270 |
+
snp = df.loc[i, 'SNPs']
|
271 |
+
gene = df.loc[i, 'Genes']
|
272 |
+
|
273 |
+
if snp not in dbsnp:
|
274 |
+
res = call(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')
|
275 |
+
try:
|
276 |
+
res = res.json()
|
277 |
+
dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]
|
278 |
+
except:
|
279 |
+
dbsnp[snp] = []
|
280 |
+
|
281 |
+
res = call(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]
|
282 |
+
if 'error' not in res:
|
283 |
+
dbsnp[snp].extend([r['name'] for r in res['genes']])
|
284 |
+
|
285 |
+
dbsnp[snp] = list(set(dbsnp[snp]))
|
286 |
+
|
287 |
+
if gene not in dbsnp[snp]:
|
288 |
+
for other in permutate(gene):
|
289 |
+
if other in dbsnp[snp]:
|
290 |
+
df.loc[i, 'Genes'] = other
|
291 |
+
print(f'{gene} corrected to {other}')
|
292 |
+
break
|
293 |
+
else:
|
294 |
+
df = df.drop(i)
|
295 |
|
296 |
# df.reset_index(drop=True, inplace=True)
|
297 |
df_no_llm = df.copy()
|