Unani-Medicine-AI-Engine / tasks /data /dataEngineering.py
HaiderSultanArc's picture
AI Engine API
ba600a6
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
def trainingDataFromUTagsJSON(data: dict) -> pd.DataFrame:
    """
    Get the training data from the UTags JSON file
    ==============================================
    Builds one row per entry of ``data['diseases']``. The ``disease``
    column holds the label; the ``symptoms`` and ``causes`` collections are
    multi-hot encoded into sparse indicator columns named
    ``symptoms_<label>`` and ``causes_<label>``.

    Parameters:
    -----------
    data:
        description: UTags JSON content. ``data['diseases']`` is iterated
            and every item is read by attribute: ``disease_persian``
            (indexable, first element used as the label), ``symptom_eng``
            and ``cause_eng`` (each an iterable of labels).
        type: dict
    -----------
    Returns:
    --------
    data:
        description: Training data with a ``disease`` column followed by
            the sparse indicator columns. A feature column whose encoding
            fails is reported on stdout and left out of the result.
        type: pd.DataFrame
    """
    # Materialize once so a one-shot iterable is not exhausted by the first
    # of the three passes below.
    diseases = list(data['diseases'])

    df = pd.DataFrame()
    # Only the first entry of disease_persian is used as the label.
    df['disease'] = [disease.disease_persian[0] for disease in diseases]
    df['symptoms'] = [disease.symptom_eng for disease in diseases]
    df['causes'] = [disease.cause_eng for disease in diseases]
    # cause_persian is currently not used as a feature.

    mlb = MultiLabelBinarizer(sparse_output=True)
    feature_columns = [col for col in df.columns if col != 'disease']
    for col in feature_columns:
        try:
            # pop() removes the raw list column in place; it is replaced by
            # the indicator columns joined below.
            encoded = mlb.fit_transform(df.pop(col))  # type: ignore
            df = df.join(
                pd.DataFrame.sparse.from_spmatrix(
                    encoded,
                    index=df.index,
                    # Build names explicitly: adding a list to the classes_
                    # ndarray only works for string labels and raises for
                    # e.g. integer labels, which used to drop the column.
                    columns=[f'{col}_{label}' for label in mlb.classes_],
                ),
            )
        except Exception as error:
            # Best effort: a column that cannot be encoded is skipped (it
            # has already been popped) instead of aborting the whole build.
            print(f'Error: {error} at column: {col}, skipping...')
    return df
def trainingDataFromPromptsForBERT(data: dict) -> pd.DataFrame:
    """
    Get the training data from the prompts JSON file
    ================================================
    Flattens the prompts into one row per sentence, pairing every sentence
    with the disease of the prompt it belongs to.

    Parameters:
    -----------
    data:
        description: Prompts JSON content. ``data['diseasesPrompts']`` is
            iterated; each prompt is indexed with ``'sentences'`` and, for
            every sentence found, with ``'disease'``.
        type: dict
    -----------
    Returns:
    --------
    data:
        description: Training data with the columns ``sentence`` and
            ``disease``, in prompt order and then sentence order.
        type: pd.DataFrame
    """
    # The disease is looked up per sentence, so a prompt without sentences
    # contributes no rows and its 'disease' key is never read.
    rows = [
        (text, entry['disease'])
        for entry in data['diseasesPrompts']
        for text in entry['sentences']
    ]
    return pd.DataFrame(rows, columns=['sentence', 'disease'])