fadliaulawi
committed on
Commit
•
07396d9
1
Parent(s):
2b126e8
Add database integration
Browse files
app.py
CHANGED
@@ -11,7 +11,6 @@ from langchain_text_splitters import TokenTextSplitter
|
|
11 |
from stqdm import stqdm
|
12 |
from tempfile import NamedTemporaryFile
|
13 |
from utils import *
|
14 |
-
|
15 |
from process import Process
|
16 |
from validate import Validation
|
17 |
|
@@ -119,13 +118,16 @@ if uploaded_files and submit:
|
|
119 |
df, df_clean = validation.validate(dataframe, api)
|
120 |
df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
|
121 |
|
|
|
|
|
|
|
122 |
st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
|
123 |
-
st.write(f"Extracted {len(df)} rows")
|
124 |
st.dataframe(df)
|
125 |
|
126 |
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
|
127 |
-
df.to_excel(writer, sheet_name='
|
128 |
-
df_clean.to_excel(writer, sheet_name='
|
129 |
dataframe.to_excel(writer, sheet_name='Original')
|
130 |
writer.close()
|
131 |
|
|
|
11 |
from stqdm import stqdm
|
12 |
from tempfile import NamedTemporaryFile
|
13 |
from utils import *
|
|
|
14 |
from process import Process
|
15 |
from validate import Validation
|
16 |
|
|
|
118 |
df, df_clean = validation.validate(dataframe, api)
|
119 |
df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
|
120 |
|
121 |
+
# Integrate with Database
|
122 |
+
df = integrate(df)
|
123 |
+
|
124 |
st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
|
125 |
+
st.write(f"Extracted **{len(df)}** rows")
|
126 |
st.dataframe(df)
|
127 |
|
128 |
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
|
129 |
+
df.to_excel(writer, sheet_name='Validated + Database')
|
130 |
+
df_clean.to_excel(writer, sheet_name='Cleaned')
|
131 |
dataframe.to_excel(writer, sheet_name='Original')
|
132 |
writer.close()
|
133 |
|
utils.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
-
import json
|
2 |
import os
|
3 |
import pandas as pd
|
|
|
4 |
import requests
|
5 |
import time
|
6 |
|
7 |
from collections import defaultdict
|
|
|
8 |
|
9 |
# Common mistakes need to be maintained
|
10 |
mistakes = {
|
@@ -50,3 +51,60 @@ def call(url):
|
|
50 |
print(e)
|
51 |
|
52 |
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import pandas as pd
|
3 |
+
import re
|
4 |
import requests
|
5 |
import time
|
6 |
|
7 |
from collections import defaultdict
|
8 |
+
from io import StringIO
|
9 |
|
10 |
# Common mistakes need to be maintained
|
11 |
mistakes = {
|
|
|
51 |
print(e)
|
52 |
|
53 |
return res
|
54 |
+
|
55 |
+
raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
gwas_path = "resources/gwas_catalog.tsv"

# Load the raw GWAS catalog: prefer the local TSV snapshot, otherwise download
# the full catalog from the EBI endpoint. Only the columns used downstream are kept.
# NOTE(review): this runs at import time and may hit the network — consider lazy loading.
if os.path.exists(gwas_path):
    gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
else:
    data = requests.get(raw_url).content.decode('utf-8')
    gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]

# Genes and SNPs from GWAS. Work on an independent frame (not a view of `gwas`)
# so the in-place normalization below cannot trigger chained-assignment issues;
# `.dropna().reset_index(drop=True)` replaces `dropna(ignore_index=True)`, which
# only exists in pandas >= 2.0.
gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']].dropna().reset_index(drop=True)
gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())

# Generate the bidirectional gene <-> SNP mapping. A MAPPED_GENE cell may list
# several genes separated by ',' or '-', so split on both before mapping.
_gene_sep = re.compile(r"[,\-]")
ground_truth = defaultdict(list)
for gene_field, snp in gwas_gene_rsid.itertuples(index=False):
    for gene in _gene_sep.split(gene_field):
        ground_truth[gene].append(snp)
        ground_truth[snp].append(gene)
|
81 |
+
|
82 |
+
def integrate(df, gwas_df=None):
    """Augment extractor results with matching rows from the GWAS catalog.

    For every (gene, SNP) pair in `df`, find GWAS catalog rows whose mapped
    gene and SNP contain those values as literal substrings, relabel the GWAS
    columns to the extractor's schema, and append them to `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Extractor output; must contain 'Genes' and 'rsID' columns, and its
        columns must be a subset of the relabeled GWAS columns plus
        'Beta Value' / 'Source'.
    gwas_df : pd.DataFrame, optional
        GWAS catalog frame to search. Defaults to the module-level `gwas`
        loaded at import time.

    Returns
    -------
    pd.DataFrame
        `df` plus matching catalog rows (Source='Database'), index reset.
    """
    if gwas_df is None:
        gwas_df = gwas

    # Collect the GWAS rows matching each extracted (gene, SNP) pair.
    # `regex=False` makes the match a literal substring test — gene/SNP names
    # may contain regex metacharacters (e.g. parentheses) that would otherwise
    # raise or silently mis-match.
    matches = []
    for i in df.index:
        gene, snp = df.loc[i, 'Genes'], df.loc[i, 'rsID']
        df_gwas = gwas_df[gwas_df['MAPPED_GENE'].str.contains(gene, na=False, regex=False) &
                          gwas_df['SNPS'].str.contains(snp, na=False, regex=False)]
        matches.append(df_gwas)

    df_db = pd.concat(matches) if matches else pd.DataFrame()

    # No catalog hits: return the input unchanged (apart from a fresh index).
    # Without this guard, renaming/dropping columns on an empty, column-less
    # frame raises KeyError.
    if df_db.empty:
        return df.reset_index(drop=True)

    # Relabel GWAS columns to the extractor's schema.
    df_db.rename(columns={
        'DISEASE/TRAIT': 'Traits',
        'MAPPED_GENE': 'Genes',
        'SNPS': 'rsID',
        'P-VALUE': 'P Value',
        'OR or BETA': 'OR Value'
    }, inplace=True)
    df_db.drop(columns=['CHR_ID'], inplace=True)
    # The catalog reports a single 'OR or BETA' figure; mirror it into both columns.
    df_db['Beta Value'] = df_db['OR Value']
    df_db['Source'] = 'Database'

    # Align column order with the extractor frame, then combine.
    df_db = df_db[df.columns]
    df = pd.concat([df, df_db])
    df.reset_index(drop=True, inplace=True)

    return df
|