fadliaulawi committed on
Commit
07396d9
1 Parent(s): 2b126e8

Add database integration

Browse files
Files changed (2) hide show
  1. app.py +6 -4
  2. utils.py +59 -1
app.py CHANGED
@@ -11,7 +11,6 @@ from langchain_text_splitters import TokenTextSplitter
11
  from stqdm import stqdm
12
  from tempfile import NamedTemporaryFile
13
  from utils import *
14
-
15
  from process import Process
16
  from validate import Validation
17
 
@@ -119,13 +118,16 @@ if uploaded_files and submit:
119
  df, df_clean = validation.validate(dataframe, api)
120
  df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
121
 
 
 
 
122
  st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
123
- st.write(f"Extracted {len(df)} rows")
124
  st.dataframe(df)
125
 
126
  with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
127
- df.to_excel(writer, sheet_name='Result Cleaned Validated')
128
- df_clean.to_excel(writer, sheet_name='Result Cleaned')
129
  dataframe.to_excel(writer, sheet_name='Original')
130
  writer.close()
131
 
 
11
  from stqdm import stqdm
12
  from tempfile import NamedTemporaryFile
13
  from utils import *
 
14
  from process import Process
15
  from validate import Validation
16
 
 
118
  df, df_clean = validation.validate(dataframe, api)
119
  df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
120
 
121
+ # Integrate with Database
122
+ df = integrate(df)
123
+
124
  st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
125
+ st.write(f"Extracted **{len(df)}** rows")
126
  st.dataframe(df)
127
 
128
  with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
129
+ df.to_excel(writer, sheet_name='Validated + Database')
130
+ df_clean.to_excel(writer, sheet_name='Cleaned')
131
  dataframe.to_excel(writer, sheet_name='Original')
132
  writer.close()
133
 
utils.py CHANGED
@@ -1,10 +1,11 @@
1
- import json
2
  import os
3
  import pandas as pd
 
4
  import requests
5
  import time
6
 
7
  from collections import defaultdict
 
8
 
9
  # Common mistakes need to be maintained
10
  mistakes = {
@@ -50,3 +51,60 @@ def call(url):
50
  print(e)
51
 
52
  return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import pandas as pd
3
+ import re
4
  import requests
5
  import time
6
 
7
  from collections import defaultdict
8
+ from io import StringIO
9
 
10
  # Common mistakes need to be maintained
11
  mistakes = {
 
51
  print(e)
52
 
53
  return res
54
+
55
# Download location of the GWAS association dump and the optional local copy.
raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
gwas_path = "resources/gwas_catalog.tsv"

# The only catalogue columns this module reads.
_GWAS_COLUMNS = ['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']


def _load_gwas():
    """Return the GWAS table restricted to ``_GWAS_COLUMNS``.

    The local file at ``gwas_path`` is used when it exists; otherwise the
    table is downloaded from ``raw_url``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    if os.path.exists(gwas_path):
        source = gwas_path
    else:
        # A timeout keeps a stalled connection from hanging the import, and
        # the status check avoids parsing an error page as TSV.
        response = requests.get(raw_url, timeout=60)
        response.raise_for_status()
        source = StringIO(response.content.decode('utf-8'))
    return pd.read_csv(source, delimiter='\t')[_GWAS_COLUMNS]


def _build_ground_truth(pairs):
    """Build a two-way lookup: gene -> list of SNPs and SNP -> list of genes."""
    mapping = defaultdict(list)
    for gene_field, snp in zip(pairs['MAPPED_GENE'], pairs['SNPS']):
        # NOTE(review): spaces were already removed, so this also splits
        # hyphenated names (e.g. "HLA-DQA1" -> "HLA", "DQA1") - confirm that
        # this is intended before changing the pattern.
        for gene in re.split(r"[,\-]", gene_field):
            mapping[gene].append(snp)
            mapping[snp].append(gene)
    return mapping


# Load Raw GWAS files
gwas = _load_gwas()

# Load Genes and SNPs from GWAS
# dropna() returns a new frame, so the assignment below never writes into a
# slice of `gwas` (the original in-place calls triggered chained-assignment
# warnings).
gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']].dropna().reset_index(drop=True)
gwas_gene_rsid['MAPPED_GENE'] = (
    gwas_gene_rsid['MAPPED_GENE'].str.replace(' ', '', regex=False).str.upper()
)

# Generate Genes and SNPs mapping
ground_truth = _build_ground_truth(gwas_gene_rsid)
81
+
82
def integrate(df, catalog=None):
    """Append catalogue rows that match the extracted gene/rsID pairs.

    For every row of ``df`` the catalogue is searched for entries whose
    ``MAPPED_GENE`` contains the row's ``Genes`` value and whose ``SNPS``
    contains the row's ``rsID`` value. Both are literal substring matches.
    Matching entries are renamed to the column names used by ``df``, tagged
    with ``Source == 'Database'`` and appended below the original rows.

    Args:
        df: Extraction result; must provide ``Genes`` and ``rsID`` columns
            unless it is empty.
        catalog: Table with the columns ``DISEASE/TRAIT``, ``CHR_ID``,
            ``MAPPED_GENE``, ``SNPS``, ``P-VALUE`` and ``OR or BETA``.
            Defaults to the module-level ``gwas`` table.

    Returns:
        A new DataFrame with the rows of ``df`` followed by the matched
        catalogue rows, using the columns of ``df`` and a fresh 0..n-1 index.
    """
    if catalog is None:
        catalog = gwas

    # Nothing to look up; the original crashed here on the missing CHR_ID column.
    if df.empty:
        return df.reset_index(drop=True)

    # Loop through extractor result
    matches = []
    for gene, snp in zip(df['Genes'], df['rsID']):
        # NaN or empty values would either raise inside str.contains or
        # match every catalogue row, so they are skipped.
        if not (isinstance(gene, str) and isinstance(snp, str) and gene and snp):
            continue
        # regex=False: gene names such as "ABC(1)" are data, not patterns.
        hits = catalog[
            catalog['MAPPED_GENE'].str.contains(gene, na=False, regex=False)
            & catalog['SNPS'].str.contains(snp, na=False, regex=False)
        ]
        if not hits.empty:
            matches.append(hits)

    if not matches:
        return df.reset_index(drop=True)

    # Adjust new column
    # A single concat after the loop avoids re-copying the accumulated frame
    # on every iteration.
    df_db = pd.concat(matches).rename(columns={
        'DISEASE/TRAIT': 'Traits',
        'MAPPED_GENE': 'Genes',
        'SNPS': 'rsID',
        'P-VALUE': 'P Value',
        'OR or BETA': 'OR Value',
    })
    df_db = df_db.drop(columns=['CHR_ID'], errors='ignore')
    # The catalogue has one combined "OR or BETA" column, so the same value
    # is exposed under both names.
    df_db['Beta Value'] = df_db['OR Value']
    df_db['Source'] = 'Database'

    # Combine raw and database
    # reindex fills columns the catalogue cannot provide with NaN instead of
    # raising KeyError.
    df_db = df_db.reindex(columns=df.columns)
    return pd.concat([df, df_db], ignore_index=True)