# Unani-Medicine-AI-Engine / tasks / data / dataAugmentation.py
# HaiderSultanArc's picture
# AI Engine API
# ba600a6
import pandas as pd
def augmentDataWithVectorSpaceAlgorithm(data: pd.DataFrame) -> pd.DataFrame:
    """
    Augment the Data
    =================
    Derive extra rows by switching off one active flag (symptom or cause)
    at a time.

    Parameters:
    -----------
    data:
        description: Data to augment. Every column after the first one is
            treated as a flag column; a value equal to 1 marks the flag as
            active. Presumably the first column is the `disease` label --
            TODO confirm against callers.
        type: pd.DataFrame
    -----------
    Returns:
    --------
    data:
        description: A new DataFrame with a fresh 0..n-1 index. The frame
            passed in is not modified.
        type: pd.DataFrame
    --------------------------------------------------------------------------------------------
    Working:
    --------
    - For every row and every flag column holding 1, derive a copy of that
      row with the flag set to 0 (ordered by source row, then by column)
    - Append the derived rows to a copy of the data
    - Remove the rows whose numeric columns sum to 0
    - Append that frame to the original data
    - Drop every row whose non-`disease` columns occur more than once
      (`keep=False`, i.e. no representative of a duplicate group survives)
    - Reset the index and return the resulting DataFrame
    --------------------------------------------------------------------------------------------
    NOTE(review): every original row with a non-zero numeric sum is present
    twice when the duplicates are dropped, so with `keep=False` it is
    removed from the result together with any derived row that matches an
    original row or another derived row. The result therefore holds only
    the novel, unique derived rows (plus an original all-zero row, if it is
    unique). An earlier description of this function said the new rows are
    added to the original data; confirm with the callers which behaviour is
    wanted before changing it.
    """
    # The symptom/cause split has no effect on the result (both groups are
    # processed identically), so all flag columns are handled in one pass.
    # Column 0 is never toggled.
    flagColumns = data.columns[1:]

    # Positional index: keeps the derived rows traceable to their source
    # row even when the caller's index is not unique or not sorted.
    source = data.reset_index(drop=True)

    pieces = []
    for column in flagColumns:
        hits = source.loc[source[column] == 1].copy()
        if hits.empty:
            continue
        hits[column] = 0
        pieces.append(hits)

    if pieces:
        # A stable sort on the source-row position restores the
        # "row by row, column by column" order of the derived rows.
        derived = pd.concat(pieces).sort_index(kind="mergesort")
        df = pd.concat([source, derived], ignore_index=True)
    else:
        df = source.copy()

    print(f"data before drop_duplicates: {df}")

    # Discard rows that carry no active flag at all (numeric sum of 0).
    df = df[df.sum(axis=1, numeric_only=True) != 0]

    combined = pd.concat([source, df], ignore_index=True)
    combined = combined.drop_duplicates(
        subset=df.columns.difference(['disease']),
        keep=False,
    )
    combined = combined.reset_index(drop=True)

    print(f"final data: {combined}")

    return combined