fadliaulawi
committed on
Commit
•
593dcaa
1
Parent(s):
a4eafc6
Change ground truth preparation
Browse files
- app.py +0 -3
- utils.py +0 -45
- validate.py +32 -4
app.py
CHANGED
@@ -12,9 +12,6 @@ from stqdm import stqdm
|
|
12 |
from tempfile import NamedTemporaryFile
|
13 |
from utils import *
|
14 |
|
15 |
-
if not os.path.exists('resources/ground-truth.json'):
|
16 |
-
generate_gene_rsid()
|
17 |
-
|
18 |
from process import Process
|
19 |
from validate import Validation
|
20 |
|
|
|
12 |
from tempfile import NamedTemporaryFile
|
13 |
from utils import *
|
14 |
|
|
|
|
|
|
|
15 |
from process import Process
|
16 |
from validate import Validation
|
17 |
|
utils.py
CHANGED
@@ -46,48 +46,3 @@ def call(url):
|
|
46 |
print(e)
|
47 |
|
48 |
return res
|
49 |
-
|
50 |
-
raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
|
51 |
-
gwas_path = "resources/gwas_catalog.tsv"
|
52 |
-
gwas_cln_path = "resources/gwas_cleaned.csv"
|
53 |
-
ground_truth_path = "resources/ground_truth.json"
|
54 |
-
|
55 |
-
def download_gwas():
|
56 |
-
|
57 |
-
response = requests.get(raw_url)
|
58 |
-
if response.status_code == 200:
|
59 |
-
with open(gwas_path, 'wb') as file:
|
60 |
-
file.write(response.content)
|
61 |
-
print('File downloaded successfully')
|
62 |
-
else:
|
63 |
-
print('Failed to download file')
|
64 |
-
|
65 |
-
gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
|
66 |
-
gwas.to_csv(gwas_cln_path)
|
67 |
-
|
68 |
-
def generate_gene_rsid():
|
69 |
-
|
70 |
-
if not os.path.exists(gwas_cln_path):
|
71 |
-
download_gwas()
|
72 |
-
|
73 |
-
data = pd.read_csv(gwas_cln_path)[['MAPPED_GENE', 'SNPS']]
|
74 |
-
data.dropna(inplace=True, ignore_index=True)
|
75 |
-
data['MAPPED_GENE'] = data['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
|
76 |
-
|
77 |
-
result = defaultdict(list)
|
78 |
-
for i in data.index:
|
79 |
-
gene = data.loc[i, 'MAPPED_GENE']
|
80 |
-
snp = data.loc[i, 'SNPS']
|
81 |
-
|
82 |
-
result[gene].append(snp)
|
83 |
-
result[snp].append(gene)
|
84 |
-
|
85 |
-
while '-' in gene:
|
86 |
-
genes = gene.split('-')
|
87 |
-
for gene in genes:
|
88 |
-
result[gene].append(snp)
|
89 |
-
result[snp].append(gene)
|
90 |
-
gene = genes[-1]
|
91 |
-
|
92 |
-
with open(ground_truth_path, 'w') as fp:
|
93 |
-
json.dump(result, fp)
|
|
|
46 |
print(e)
|
47 |
|
48 |
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
validate.py
CHANGED
@@ -1,19 +1,47 @@
|
|
|
|
1 |
from dotenv import load_dotenv
|
2 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
3 |
from langchain_openai import ChatOpenAI
|
4 |
from prompt import *
|
5 |
from utils import call, permutate
|
|
|
6 |
|
7 |
import os
|
8 |
-
import json
|
9 |
import pandas as pd
|
10 |
import re
|
|
|
11 |
|
12 |
load_dotenv()
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
class Validation():
|
19 |
|
|
|
1 |
+
from collections import defaultdict
|
2 |
from dotenv import load_dotenv
|
3 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
4 |
from langchain_openai import ChatOpenAI
|
5 |
from prompt import *
|
6 |
from utils import call, permutate
|
7 |
+
from io import StringIO
|
8 |
|
9 |
import os
|
|
|
10 |
import pandas as pd
|
11 |
import re
|
12 |
+
import requests
|
13 |
|
14 |
load_dotenv()
|
15 |
|
16 |
+
raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
|
17 |
+
gwas_path = "resources/gwas_catalog.tsv"
|
18 |
+
|
19 |
+
|
20 |
+
if os.path.exists(gwas_path):
|
21 |
+
gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
|
22 |
+
else:
|
23 |
+
data = requests.get(raw_url).content.decode('utf-8')
|
24 |
+
gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
|
25 |
+
|
26 |
+
gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
|
27 |
+
gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
|
28 |
+
gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
|
29 |
+
|
30 |
+
|
31 |
+
ground_truth = defaultdict(list)
|
32 |
+
for i in gwas_gene_rsid.index:
|
33 |
+
gene = gwas_gene_rsid.loc[i, 'MAPPED_GENE']
|
34 |
+
snp = gwas_gene_rsid.loc[i, 'SNPS']
|
35 |
+
|
36 |
+
ground_truth[gene].append(snp)
|
37 |
+
ground_truth[snp].append(gene)
|
38 |
+
|
39 |
+
while '-' in gene:
|
40 |
+
genes = gene.split('-')
|
41 |
+
for gene in genes:
|
42 |
+
ground_truth[gene].append(snp)
|
43 |
+
ground_truth[snp].append(gene)
|
44 |
+
gene = genes[-1]
|
45 |
|
46 |
class Validation():
|
47 |
|