Spaces:
Runtime error
Runtime error
File size: 2,314 Bytes
ba600a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import pandas as pd
def augmentDataWithVectorSpaceAlgorithm(data: pd.DataFrame) -> pd.DataFrame:
"""
Augment the Data
=================
Parameters:
-----------
data:
description: Data to augment
type: pd.DataFrame
-----------
Returns:
--------
data:
description: Augmented data
type: pd.DataFrame
--------------------------------------------------------------------------------------------
Working:
--------
- Create a DataFrame from data
- Remove a symptom or cause from the new DataFrame
- Check if the resulting row is present in the original data
- If not present, add the resulting row to the new DataFrame
- Repeat steps 1-3 for all symptoms and causes
- Remove the rows with sum = 0
- Remove the same rows from the new DataFrame
- Add the new DataFrame to the original data
- Return the Resulting DataFrame
--------------------------------------------------------------------------------------------
"""
# Get the number of columns with symptoms_ prefix
numberOfSymptoms = len([col for col in data.columns if col.startswith('symptoms_')])
symptoms = data.columns[1:numberOfSymptoms]
causes = data.columns[numberOfSymptoms:]
df = data
for index, row in data.iterrows():
for symptom in symptoms:
if row[symptom] == 1: # type: ignore
row[symptom] = 0
df = df.append(row, ignore_index=True) # type: ignore
row[symptom] = 1
df.append(row, ignore_index=True)
for cause in causes:
if row[cause] == 1: # type: ignore
row[cause] = 0
df = df.append(row, ignore_index=True) # type: ignore
row[cause] = 1
df.append(row, ignore_index=True)
print(f"data before drop_duplicates: {df}")
df = df[(df.sum(axis=1, numeric_only=True) != 0)]
data = data.append(df, ignore_index=True) # type: ignore
data = data.drop_duplicates(subset=df.columns.difference(['disease']), keep=False)
data.reset_index(drop=True, inplace=True)
print(f"final data: {data}")
return data |