Spaces:
Runtime error
Runtime error
File size: 2,094 Bytes
ba600a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
def _disease_field(entry, name):
    """
    Read one field from a disease entry
    ===================================
    Parameters:
    -----------
    entry:
        description: Disease entry, either a dict (as decoded from JSON)
            or an object exposing the field as an attribute
        type: dict | object
    name:
        description: Name of the field to read
        type: str
    -----------
    Returns:
    --------
    value:
        description: ``entry[name]`` for dict entries, otherwise
            ``getattr(entry, name)``
        type: object
    """
    if isinstance(entry, dict):
        return entry[name]
    return getattr(entry, name)


def trainingDataFromUTagsJSON(data: dict) -> pd.DataFrame:
    """
    Get the training data from the UTags JSON file
    ==============================================
    Builds one row per entry of ``data['diseases']``. The ``disease``
    column holds the first element of the entry's ``disease_persian``
    field. The ``symptom_eng`` and ``cause_eng`` fields are multi-hot
    encoded with ``MultiLabelBinarizer`` into sparse indicator columns
    named ``symptoms_<label>`` and ``causes_<label>``.

    If encoding one of the two label groups fails, the error is printed
    and that group is left out of the result (best effort).
    Parameters:
    -----------
    data:
        description: UTags JSON file; must contain a ``diseases``
            iterable whose entries are dicts or attribute-style objects
            with ``disease_persian``, ``symptom_eng`` and ``cause_eng``
            (presumably lists of label strings -- confirm against the
            UTags schema)
        type: dict
    -----------
    Returns:
    --------
    data:
        description: Training data
        type: pd.DataFrame
    """
    diseases = list(data['diseases'])

    df = pd.DataFrame()
    df['disease'] = [
        _disease_field(entry, 'disease_persian')[0] for entry in diseases
    ]

    # Output column prefix -> per-disease label collections to binarize.
    label_groups = {
        'symptoms': [_disease_field(entry, 'symptom_eng') for entry in diseases],
        'causes': [_disease_field(entry, 'cause_eng') for entry in diseases],
    }

    mlb = MultiLabelBinarizer(sparse_output=True)
    for col, labels in label_groups.items():
        try:
            # fit_transform runs first so mlb.classes_ describes this group
            # by the time the column names are built.
            encoded = pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(labels),
                index=df.index,
                columns=[f'{col}_{label}' for label in mlb.classes_],
            )
            df = df.join(encoded)
        except Exception as error:
            print(f'Error: {error} at column: {col}, skipping...')
    return df
def trainingDataFromPromptsForBERT(data: dict) -> pd.DataFrame:
    """
    Get the training data from the prompts JSON file
    ================================================
    Flattens ``data['diseasesPrompts']`` into one row per sentence, each
    paired with the ``disease`` value of the prompt it came from. Rows
    keep the input order; prompts with no sentences contribute no rows.
    Parameters:
    -----------
    data:
        description: Prompts JSON file; must contain ``diseasesPrompts``,
            an iterable of entries with ``sentences`` and ``disease`` keys
        type: dict
    -----------
    Returns:
    --------
    data:
        description: Training data with columns ``sentence`` and
            ``disease``
        type: pd.DataFrame
    """
    rows = [
        (sentence, prompt['disease'])
        for prompt in data['diseasesPrompts']
        for sentence in prompt['sentences']
    ]
    return pd.DataFrame(rows, columns=['sentence', 'disease'])