File size: 2,094 Bytes
ba600a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


def trainingDataFromUTagsJSON(data: dict) -> pd.DataFrame:
    """
    Get the training data from the UTags JSON file
    ==============================================
    Builds one row per entry of ``data['diseases']``: a ``disease`` label
    column followed by multi-hot indicator columns for the entry's symptoms
    and causes, named ``symptoms_<label>`` and ``causes_<label>``. The
    indicator columns are stored as pandas sparse columns.

    If encoding one of the label columns fails, the error is printed and
    that column is left out of the result (the raw column has already been
    removed from the frame at that point).

    Parameters:
    -----------
        data:
            description: UTags JSON file. Must provide a 'diseases'
                sequence whose items expose `disease_persian`,
                `symptom_eng` and `cause_eng` as attributes; the first
                element of `disease_persian` is used as the disease label.
                NOTE(review): `symptom_eng` / `cause_eng` are presumably
                iterables of labels per disease -- confirm against caller.
            type: dict
    -----------
    Returns:
    --------
        data:
            description: Training data
            type: pd.DataFrame
    """
    diseases = data['diseases']

    df = pd.DataFrame()
    df['disease'] = [disease.disease_persian[0] for disease in diseases]  # disease[UTag]
    df['symptoms'] = [disease.symptom_eng for disease in diseases]
    df['causes'] = [disease.cause_eng for disease in diseases]

    mlb = MultiLabelBinarizer(sparse_output=True)

    # Snapshot the column names: df is mutated (pop) and rebound (join)
    # inside the loop, so we must not iterate over its live column index.
    for col in list(df.columns):
        if col == 'disease':
            continue

        try:
            # fit_transform re-fits the binarizer, so classes_ below always
            # belongs to the column currently being encoded.
            encoded = mlb.fit_transform(df.pop(col))  # type: ignore
            df = df.join(
                pd.DataFrame.sparse.from_spmatrix(
                    encoded,
                    index=df.index,
                    # Build the names explicitly instead of relying on
                    # list + ndarray broadcasting, which raises when the
                    # labels are not strings (e.g. integer codes).
                    columns=[f'{col}_{label}' for label in mlb.classes_],
                ),
            )
        except Exception as error:
            # Deliberate best-effort: report the problem and keep going
            # with the remaining columns.
            print(f'Error: {error} at column: {col}, skipping...')

    return df


def trainingDataFromPromptsForBERT(data: dict) -> pd.DataFrame:
    """
    Build the BERT training data from the prompts JSON file
    =======================================================
    Flattens ``data['diseasesPrompts']`` into one row per sentence, pairing
    every sentence with the disease of the prompt entry it came from.

    Parameters:
    -----------
        data:
            description: Prompts JSON file; needs a 'diseasesPrompts' list
                whose entries carry a 'sentences' list and a 'disease' value
            type: dict
    -----------
    Returns:
    --------
        data:
            description: Training data with the columns 'sentence' and
                'disease', in the order the sentences appear in the input
            type: pd.DataFrame
    """
    # One (sentence, disease) pair per sentence, preserving input order.
    pairs = [
        (text, entry['disease'])
        for entry in data['diseasesPrompts']
        for text in entry['sentences']
    ]

    return pd.DataFrame(pairs, columns=['sentence', 'disease'])