import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


def trainingDataFromUTagsJSON(data: dict) -> pd.DataFrame:
    """
    Get the training data from the UTags JSON file
    ==============================================

    Builds one row per entry of ``data['diseases']``. The ``disease`` column
    holds the first element of each entry's ``disease_persian``; the
    ``symptom_eng`` and ``cause_eng`` label collections are expanded into
    sparse 0/1 indicator columns named ``symptoms_<label>`` and
    ``causes_<label>``.

    Parameters:
    -----------
    data:
        description: Parsed UTags data. ``data['diseases']`` must be an
            iterable of objects exposing the attributes ``disease_persian``,
            ``symptom_eng`` and ``cause_eng`` (they are read with attribute
            access, not key access).
        type: dict

    Returns:
    --------
    data:
        description: Training data; a ``disease`` column followed by the
            indicator columns for symptoms and then causes. A label column
            that cannot be encoded is reported on stdout and left out of
            the result.
        type: pd.DataFrame
    """
    diseases = data['diseases']

    df = pd.DataFrame()
    df['disease'] = [disease.disease_persian[0] for disease in diseases]
    df['symptoms'] = [disease.symptom_eng for disease in diseases]
    df['causes'] = [disease.cause_eng for disease in diseases]
    # NOTE(review): a 'cause_persian' column was previously commented out
    # here; it is not part of the output.

    mlb = MultiLabelBinarizer(sparse_output=True)

    # Snapshot the columns to encode up front: the loop below removes and adds
    # columns, so iterating over the live ``df.columns`` would be fragile.
    label_columns = [col for col in df.columns if col != 'disease']

    for col in label_columns:
        # The raw label column is always removed, even if encoding fails
        # below, so the result never contains un-encoded list cells.
        labels = df.pop(col)
        try:
            encoded = mlb.fit_transform(labels)  # type: ignore
            indicators = pd.DataFrame.sparse.from_spmatrix(
                encoded,
                index=df.index,
                # Build names per class so that non-string labels (e.g. ints)
                # get a valid column name as well.
                columns=[f'{col}_{label}' for label in mlb.classes_],
            )
            df = df.join(indicators)
        except Exception as error:
            # Deliberately best-effort: report the problem and carry on with
            # the remaining columns instead of aborting the whole conversion.
            print(f'Error: {error} at column: {col}, skipping...')

    return df


def trainingDataFromPromptsForBERT(data: dict) -> pd.DataFrame:
    """
    Get the training data from the prompts JSON file
    ================================================

    Flattens ``data['diseasesPrompts']`` into one row per sentence, pairing
    each sentence with the ``disease`` value of the prompt it belongs to.

    Parameters:
    -----------
    data:
        description: Parsed prompts data. Each item of
            ``data['diseasesPrompts']`` is indexed with the keys
            ``'sentences'`` (iterable of sentences) and ``'disease'``.
        type: dict

    Returns:
    --------
    data:
        description: Training data with the columns ``sentence`` and
            ``disease``; empty (but with both columns) when there are no
            sentences.
        type: pd.DataFrame
    """
    rows = [
        (sentence, prompt['disease'])
        for prompt in data['diseasesPrompts']
        for sentence in prompt['sentences']
    ]
    return pd.DataFrame(rows, columns=['sentence', 'disease'])