fadliaulawi committed on
Commit
593dcaa
1 Parent(s): a4eafc6

Change ground truth preparation

Browse files
Files changed (3) hide show
  1. app.py +0 -3
  2. utils.py +0 -45
  3. validate.py +32 -4
app.py CHANGED
@@ -12,9 +12,6 @@ from stqdm import stqdm
12
  from tempfile import NamedTemporaryFile
13
  from utils import *
14
 
15
- if not os.path.exists('resources/ground-truth.json'):
16
- generate_gene_rsid()
17
-
18
  from process import Process
19
  from validate import Validation
20
 
 
12
  from tempfile import NamedTemporaryFile
13
  from utils import *
14
 
 
 
 
15
  from process import Process
16
  from validate import Validation
17
 
utils.py CHANGED
@@ -46,48 +46,3 @@ def call(url):
46
  print(e)
47
 
48
  return res
49
-
50
- raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
51
- gwas_path = "resources/gwas_catalog.tsv"
52
- gwas_cln_path = "resources/gwas_cleaned.csv"
53
- ground_truth_path = "resources/ground_truth.json"
54
-
55
- def download_gwas():
56
-
57
- response = requests.get(raw_url)
58
- if response.status_code == 200:
59
- with open(gwas_path, 'wb') as file:
60
- file.write(response.content)
61
- print('File downloaded successfully')
62
- else:
63
- print('Failed to download file')
64
-
65
- gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
66
- gwas.to_csv(gwas_cln_path)
67
-
68
- def generate_gene_rsid():
69
-
70
- if not os.path.exists(gwas_cln_path):
71
- download_gwas()
72
-
73
- data = pd.read_csv(gwas_cln_path)[['MAPPED_GENE', 'SNPS']]
74
- data.dropna(inplace=True, ignore_index=True)
75
- data['MAPPED_GENE'] = data['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
76
-
77
- result = defaultdict(list)
78
- for i in data.index:
79
- gene = data.loc[i, 'MAPPED_GENE']
80
- snp = data.loc[i, 'SNPS']
81
-
82
- result[gene].append(snp)
83
- result[snp].append(gene)
84
-
85
- while '-' in gene:
86
- genes = gene.split('-')
87
- for gene in genes:
88
- result[gene].append(snp)
89
- result[snp].append(gene)
90
- gene = genes[-1]
91
-
92
- with open(ground_truth_path, 'w') as fp:
93
- json.dump(result, fp)
 
46
  print(e)
47
 
48
  return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
validate.py CHANGED
@@ -1,19 +1,47 @@
 
1
  from dotenv import load_dotenv
2
  from langchain_google_genai import ChatGoogleGenerativeAI
3
  from langchain_openai import ChatOpenAI
4
  from prompt import *
5
  from utils import call, permutate
 
6
 
7
  import os
8
- import json
9
  import pandas as pd
10
  import re
 
11
 
12
  load_dotenv()
13
 
14
- ground_truth = {}
15
- with open('resources/ground_truth.json') as f:
16
- ground_truth = json.loads(f.readlines()[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  class Validation():
19
 
 
1
+ from collections import defaultdict
2
  from dotenv import load_dotenv
3
  from langchain_google_genai import ChatGoogleGenerativeAI
4
  from langchain_openai import ChatOpenAI
5
  from prompt import *
6
  from utils import call, permutate
7
+ from io import StringIO
8
 
9
  import os
 
10
  import pandas as pd
11
  import re
12
+ import requests
13
 
14
  load_dotenv()
15
 
16
+ raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
17
+ gwas_path = "resources/gwas_catalog.tsv"
18
+
19
+
20
+ if os.path.exists(gwas_path):
21
+ gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
22
+ else:
23
+ data = requests.get(raw_url).content.decode('utf-8')
24
+ gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
25
+
26
+ gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
27
+ gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
28
+ gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
29
+
30
+
31
+ ground_truth = defaultdict(list)
32
+ for i in gwas_gene_rsid.index:
33
+ gene = gwas_gene_rsid.loc[i, 'MAPPED_GENE']
34
+ snp = gwas_gene_rsid.loc[i, 'SNPS']
35
+
36
+ ground_truth[gene].append(snp)
37
+ ground_truth[snp].append(gene)
38
+
39
+ while '-' in gene:
40
+ genes = gene.split('-')
41
+ for gene in genes:
42
+ ground_truth[gene].append(snp)
43
+ ground_truth[snp].append(gene)
44
+ gene = genes[-1]
45
 
46
  class Validation():
47