Spaces:

KalbeDigitalLab
/

nutrigenme-paper-extractor

Running

App Files Community

fadliaulawi committed on Sep 6

Commit

593dcaa

•

1 Parent(s): a4eafc6

Change ground truth preparation

Browse files

Files changed (3) hide show

app.py +0 -3
utils.py +0 -45
validate.py +32 -4

app.py CHANGED Viewed

@@ -12,9 +12,6 @@ from stqdm import stqdm
 from tempfile import NamedTemporaryFile
 from utils import *
-if not os.path.exists('resources/ground-truth.json'):
-    generate_gene_rsid()
 from process import Process
 from validate import Validation

 from tempfile import NamedTemporaryFile
 from utils import *
 from process import Process
 from validate import Validation

utils.py CHANGED Viewed

@@ -46,48 +46,3 @@ def call(url):
             print(e)
     return res
-raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
-gwas_path = "resources/gwas_catalog.tsv"
-gwas_cln_path = "resources/gwas_cleaned.csv"
-ground_truth_path = "resources/ground_truth.json"
-def download_gwas():
-    response = requests.get(raw_url)
-    if response.status_code == 200:
-        with open(gwas_path, 'wb') as file:
-            file.write(response.content)
-        print('File downloaded successfully')
-    else:
-        print('Failed to download file')
-    gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
-    gwas.to_csv(gwas_cln_path)
-def generate_gene_rsid():
-    if not os.path.exists(gwas_cln_path):
-        download_gwas()
-    data = pd.read_csv(gwas_cln_path)[['MAPPED_GENE', 'SNPS']]
-    data.dropna(inplace=True, ignore_index=True)
-    data['MAPPED_GENE'] = data['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
-    result = defaultdict(list)
-    for i in data.index:
-        gene = data.loc[i, 'MAPPED_GENE']
-        snp = data.loc[i, 'SNPS']
-        result[gene].append(snp)
-        result[snp].append(gene)
-        while '-' in gene:
-            genes = gene.split('-')
-            for gene in genes:
-                result[gene].append(snp)
-                result[snp].append(gene)
-            gene = genes[-1]
-    with open(ground_truth_path, 'w') as fp:
-        json.dump(result, fp)


46	print(e)
47
48	return res

validate.py CHANGED Viewed

@@ -1,19 +1,47 @@
 from dotenv import load_dotenv
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_openai import ChatOpenAI
 from prompt import *
 from utils import call, permutate
 import os
-import json
 import pandas as pd
 import re
 load_dotenv()
-ground_truth = {}
-with open('resources/ground_truth.json') as f:
-    ground_truth = json.loads(f.readlines()[0])
 class Validation():

+from collections import defaultdict
 from dotenv import load_dotenv
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_openai import ChatOpenAI
 from prompt import *
 from utils import call, permutate
+from io import StringIO
 import os
 import pandas as pd
 import re
+import requests
 load_dotenv()
+raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
+gwas_path = "resources/gwas_catalog.tsv"
+if os.path.exists(gwas_path):
+    gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+else:
+    data = requests.get(raw_url).content.decode('utf-8')
+    gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
+gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
+gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
+ground_truth = defaultdict(list)
+for i in gwas_gene_rsid.index:
+    gene = gwas_gene_rsid.loc[i, 'MAPPED_GENE']
+    snp = gwas_gene_rsid.loc[i, 'SNPS']
+    ground_truth[gene].append(snp)
+    ground_truth[snp].append(gene)
+    while '-' in gene:
+        genes = gene.split('-')
+        for gene in genes:
+            ground_truth[gene].append(snp)
+            ground_truth[snp].append(gene)
+        gene = genes[-1]
 class Validation():