File size: 1,696 Bytes
7841ce0
41c5156
 
 
 
7aebf2b
de2a82e
 
7841ce0
aba41f2
 
7aebf2b
aba41f2
 
 
 
 
 
41c5156
 
de2a82e
41c5156
 
 
 
 
7841ce0
41c5156
 
 
 
 
7841ce0
41c5156
 
 
 
 
 
 
 
 
 
 
 
 
de2a82e
41c5156
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from typing import Tuple
import pandas as pd
import random
from datetime import datetime, timedelta

from dataset.download import presentation_data_schema
from whale_viewer import WHALE_CLASSES

def generate_fake_data(df:pd.DataFrame, num_fake:int) -> pd.DataFrame:
    """
    Extend a DataFrame with randomly generated observations.

    Every synthetic row holds, in this order: a random latitude, a random
    longitude, a species drawn from WHALE_CLASSES, an email drawn from a
    small fixed list, and a random date. The rows are put into a DataFrame
    whose columns come from presentation_data_schema, cast with
    ``.astype(presentation_data_schema)``, and concatenated after ``df``.

    Args:
        df (pd.DataFrame): Existing observations. The object passed in is
            not modified; a new DataFrame is returned.
        num_fake (int): How many synthetic rows to create.
    Returns:
        pd.DataFrame: The rows of ``df`` followed by the synthetic rows,
            re-indexed from 0 (``ignore_index=True``).
    """

    # NOTE(review): these literals look like placeholder/redacted addresses;
    # confirm they are the intended values.
    placeholder_emails = [
        '[email protected]', '[email protected]',
        '[email protected]', '[email protected]'
    ]

    # Dates are drawn between 1 Jan 2018 and 1 Jan 2025; both ends can be
    # produced because random.randint includes its upper bound.
    earliest = datetime(2018, 1, 1)
    span_days = (datetime(2025, 1, 1) - earliest).days

    def make_row() -> list:
        """Build one synthetic observation as a flat list of five values."""
        # Latitude is limited to [-60, 60] to stay away from the poles.
        # Coordinates are not checked against land.
        latitude = random.uniform(-60, 60)
        longitude = random.uniform(-180, 180)
        whale = random.choice(WHALE_CLASSES)
        contact = random.choice(placeholder_emails)
        observed = earliest + timedelta(days=random.randint(0, span_days))
        return [latitude, longitude, whale, contact, observed]

    rows = [make_row() for _ in range(num_fake)]

    # Values are positional, so the row layout above has to line up with the
    # column order that presentation_data_schema yields.
    fake_df = pd.DataFrame(rows, columns=presentation_data_schema)
    fake_df = fake_df.astype(presentation_data_schema)
    return pd.concat([df, fake_df], ignore_index=True)