import pandas as pd


def augmentDataWithVectorSpaceAlgorithm(data: pd.DataFrame) -> pd.DataFrame:
    """
    Generate "one flag removed" variants of the rows in data
    ========================================================

    Parameters:
    -----------
    data:
        description: One row per record. The first column is skipped
                     (presumably the 'disease' label - confirm against the
                     caller); every later column is treated as a 0/1 flag.
        type: pd.DataFrame
    -----------

    Returns:
    --------
    data:
        description: A new DataFrame with a fresh 0..n-1 index. The input
                     frame is not modified.
        type: pd.DataFrame

    --------------------------------------------------------------------------------------------

    Working:
    --------
    - For every row and every flag column whose value is 1, build a copy of
      the row with that single flag set to 0 (rows are produced in input-row
      order, and within a row in column order).
    - Stack the input rows and the generated rows, then drop every row whose
      numeric columns sum to 0.
    - Stack the input rows on top once more and drop *all* rows whose values
      (ignoring the 'disease' column) occur more than once (keep=False).
    - Return what is left with a reset index.

    Net effect: the result holds the generated rows that are unique among
    themselves and do not match any input row. Input rows always occur at
    least twice in the final stack and are therefore dropped; the only
    exception is an input row whose numeric sum is 0 and whose values are
    unique, because it was filtered out of the first stack.

    NOTE(review): earlier documentation described the return value as
    "original data + new rows". The implementation has never returned the
    original rows; that behaviour is preserved here - confirm which one the
    callers actually want.

    --------------------------------------------------------------------------------------------
    """
    # Get the number of columns with symptoms_ prefix
    numberOfSymptoms = sum(1 for col in data.columns if col.startswith('symptoms_'))

    # NOTE(review): with the label in column 0 the last symptoms_ column falls
    # into `causes` (the slice end is one short). Both groups are processed
    # identically, so together they simply cover every column after the first.
    symptoms = data.columns[1:numberOfSymptoms]
    causes = data.columns[numberOfSymptoms:]
    flag_columns = [*symptoms, *causes]

    # Work on a positionally indexed copy: the index is used below to restore
    # row-major order, and the caller's frame stays untouched.
    base = data.reset_index(drop=True)

    # One vectorised pass per column instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
    pieces = []
    for column in flag_columns:
        is_set = base[column] == 1
        if not is_set.any():
            continue
        variant = base.loc[is_set].copy()
        variant[column] = 0
        pieces.append(variant)

    if pieces:
        # Pieces are column-major; a stable sort on the row position yields
        # the row-major order (row by row, column by column within a row).
        generated = pd.concat(pieces).sort_index(kind="stable")
        df = pd.concat([base, generated], ignore_index=True)
    else:
        df = base

    print(f"data before drop_duplicates: {df}")

    # Discard rows in which no numeric flag is left.
    df = df[df.sum(axis=1, numeric_only=True) != 0]

    # Stacking the input rows again makes every surviving input row occur at
    # least twice, so keep=False removes them together with any generated row
    # that collides with an input row or with another generated row.
    stacked = pd.concat([base, df], ignore_index=True)
    result = stacked.drop_duplicates(
        subset=stacked.columns.difference(['disease']), keep=False
    )
    result = result.reset_index(drop=True)

    print(f"final data: {result}")
    return result