File size: 3,296 Bytes
36bfc91
 
07396d9
8503206
 
 
36bfc91
07396d9
36bfc91
8503206
36bfc91
 
 
 
 
 
5dd932f
 
 
 
237bb18
 
 
36bfc91
 
 
 
5dd932f
 
36bfc91
8503206
2bc0f8a
 
 
8503206
 
 
 
 
 
 
 
 
36bfc91
 
8503206
 
 
 
 
 
 
 
 
 
 
 
 
36bfc91
07396d9
2bc0f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07396d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c49858
 
07396d9
 
 
3c49858
07396d9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import pandas as pd
import re
import requests
import time

from collections import defaultdict
from io import StringIO

# Groups of characters that are commonly mistaken for one another.
# Maintain this list: add a character to a group (or add a new group) when
# a new confusion is found.
_CONFUSABLE_GROUPS = ('17ILT', '0DOV', '4AX', '5S', 'FH', '9P')

# Each character maps to the other members of its group, in group order.
mistakes = {
    char: [other for other in group if other != char]
    for group in _CONFUSABLE_GROUPS
    for char in group
}

# Download endpoint used when no local catalog file is available.
raw_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
# Local tab-separated copy of the catalog, preferred over downloading.
gwas_path = "resources/gwas_catalog.tsv"

def permutate(word):
    """Return every spelling of ``word`` allowed by the ``mistakes`` table.

    Each character may stay as it is or be swapped for any of its entries
    in ``mistakes``; characters without an entry are kept unchanged. The
    untouched word is always the first element, and an empty word yields
    ``['']``.
    """
    variants = ['']

    # Build the spellings from the last character back to the first, so that
    # each step prepends one more position to all suffixes built so far.
    for char in reversed(word):
        candidates = [char]
        if char in mistakes:
            candidates.extend(mistakes[char])
        variants = [head + tail for head in candidates for tail in variants]

    return variants

def call(url, timeout=60, retry_delay=1):
    """GET ``url``, retrying until the request goes through.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before the attempt counts as
            failed, so a hung connection is retried instead of blocking
            forever.
        retry_delay: Seconds to wait after a failed attempt before trying
            again.

    Returns:
        The ``requests.Response`` of the first attempt that did not raise.
        The HTTP status code is not checked here.
    """
    while True:
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Only network-level failures are retried; anything else is a
            # programming error and should surface instead of looping.
            print(e)
            # Pause before retrying so a failing server is not hammered.
            time.sleep(retry_delay)
        else:
            # Pause after a successful call to space out consecutive requests.
            time.sleep(1)
            return res

def generate_raw_files():
    """Load the GWAS catalog and build a gene <-> SNP lookup table.

    The catalog is read from ``gwas_path`` when that file exists; otherwise
    it is downloaded from ``raw_url``.

    Returns:
        tuple: ``(gwas, ground_truth)``. ``gwas`` is the catalog restricted
        to the trait, chromosome, gene, SNP, p-value and OR/beta columns.
        ``ground_truth`` is a ``defaultdict(list)`` that maps every gene
        token to the SNP tokens found on the same row and every SNP token
        to the gene tokens of that row (repeated pairs are kept).

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    columns = ['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']

    # Load Raw GWAS files
    if os.path.exists(gwas_path):
        source = gwas_path
    else:
        response = requests.get(raw_url)
        # Fail loudly rather than trying to parse an error page as TSV.
        response.raise_for_status()
        source = StringIO(response.content.decode('utf-8'))
    gwas = pd.read_csv(source, delimiter='\t')[columns]

    # Load Genes and SNPs from GWAS. dropna() returns a new frame, so the
    # catalog itself is never modified through a slice.
    pairs = gwas[['MAPPED_GENE', 'SNPS']].dropna()

    # NOTE(review): gene strings are upper-cased before splitting, so the
    # lowercase 'x' in this pattern can only ever match inside SNP strings.
    # Confirm whether that is intended.
    separators = re.compile(r"[,x\-]")

    def split_tokens(text):
        # Drop surrounding whitespace and empty pieces so that keys such as
        # ' rs123' or '' never end up in the lookup table.
        stripped = (piece.strip() for piece in separators.split(text))
        return [piece for piece in stripped if piece]

    # Generate Genes and SNPs mapping
    ground_truth = defaultdict(list)
    for gene_field, snp_field in zip(pairs['MAPPED_GENE'], pairs['SNPS']):
        genes = split_tokens(gene_field.replace(' ', '').upper())
        snps = split_tokens(snp_field)

        for gene in genes:
            for snp in snps:
                ground_truth[gene].append(snp)
                ground_truth[snp].append(gene)

    return gwas, ground_truth

# Runs when the module is imported: generate_raw_files() reads the local
# catalog file or, if it is missing, downloads it. `gwas` is the catalog
# table that integrate() searches; `ground_truth` is the gene <-> SNP lookup.
gwas, ground_truth = generate_raw_files()

def integrate(df):
    """Append GWAS catalog rows that match the extracted gene/rsID pairs.

    For every row of ``df`` the module-level ``gwas`` table is searched for
    entries whose ``MAPPED_GENE`` contains the row's ``Genes`` value and
    whose ``SNPS`` contains its ``rsID`` value (literal substring match).
    The hits are renamed to the extractor's column names, tagged with
    ``Source == 'Database'`` and appended below the original rows.

    Args:
        df: Extractor result with at least ``Genes`` and ``rsID`` columns.

    Returns:
        A new DataFrame with the rows of ``df`` followed by the matching
        catalog rows, limited to the columns of ``df`` and re-indexed from
        zero. Columns of ``df`` that the catalog does not provide are left
        empty (NaN) on the database rows.
    """
    # Loop through extractor result
    matches = []
    for gene, snp in zip(df['Genes'], df['rsID']):
        # A missing gene or rsID (NaN/None) cannot be looked up.
        if not isinstance(gene, str) or not isinstance(snp, str):
            continue
        # regex=False: extracted values are literal text, so characters such
        # as '(' or '.' must not be interpreted as a pattern.
        hit = (gwas['MAPPED_GENE'].str.contains(gene, na=False, regex=False)
               & gwas['SNPS'].str.contains(snp, na=False, regex=False))
        matches.append(gwas[hit])

    # Concatenate once instead of growing a frame inside the loop.
    df_db = pd.concat(matches) if matches else pd.DataFrame()
    if df_db.empty:
        # Nothing found in the catalog: return the input rows re-indexed.
        return df.reset_index(drop=True)

    # Adjust new column
    df_db = df_db.rename(columns={
        'DISEASE/TRAIT': 'Traits',
        'MAPPED_GENE': 'Genes',
        'SNPS': 'rsID',
        'P-VALUE': 'P Value',
        'OR or BETA': 'OR Value'
    }).drop(columns=['CHR_ID'], errors='ignore')
    # The catalog has a single "OR or BETA" column, so both get its value.
    df_db['Beta Value'] = df_db.get('OR Value')
    df_db['Source'] = 'Database'

    # Combine raw and database. reindex keeps the database rows even when df
    # has columns the catalog lacks; DataFrame.get would return None there
    # and the rows would be dropped silently by concat.
    df_db = df_db.reindex(columns=df.columns)
    return pd.concat([df, df_db]).reset_index(drop=True)