update utils generate_embeddings_io

utils/utils.py  CHANGED  (+636 -636)

@@ -1,636 +1,636 @@
Every line of the file is marked removed and re-added, but the only line whose content actually changes is line 227 in generate_embeddings_stream_io, where a previously truncated to_csv call is completed:

-            df.to_csv(f'embeddings_downsample_{downsample_rate}_{
+            df.to_csv(f'embeddings_downsample_{downsample_rate}_{N}_frames.csv', index=False)

The updated utils/utils.py in full:
import os
import io
import pickle
import copy
from collections import Counter
from pathlib import Path
from tempfile import NamedTemporaryFile
import regex as re
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from tqdm import tqdm
from PIL import Image
from transformers import AutoProcessor, AutoModel
import streamlit as st
from .data_loading import load_multiple_annotations, load_multiple_annotations_io
from .data_processing import generate_label_array
from .seqIo import seqIo_reader
from .mp4Io import mp4Io_reader

SLIP_MODEL_ID = "google/siglip-so400m-patch14-384"
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"

def create_annot_fname_dict(annot_fnames: list[str]) -> dict:
    fs = re.compile(r'.*(_\d+)$')

    unique_files = set()
    for file in annot_fnames:
        file_name = os.fsdecode(file)
        base_name, _ = os.path.splitext(file_name)
        if fs.match(base_name):
            ind = len(fs.match(base_name).group(1))
            unique_files.add(base_name[:-ind])
        else:
            unique_files.add(base_name)

    annot_fname_dict = {}
    for unique_file in unique_files:
        annot_fname_dict.update({unique_file: [file for file in annot_fnames if unique_file in file]})
    return annot_fname_dict

def create_annot_fname_dict_io(annot_fnames: list[str], annot_files: list) -> dict:
    annot_file_dict = {}
    for file in annot_files:
        annot_file_dict.update({file.name: file})
    fs = re.compile(r'.*(_\d+)$')

    unique_files = set()
    for file in annot_fnames:
        file_name = os.fsdecode(file)
        base_name, _ = os.path.splitext(file_name)
        if fs.match(base_name):
            ind = len(fs.match(base_name).group(1))
            unique_files.add(base_name[:-ind])
        else:
            unique_files.add(base_name)

    annot_fname_dict = {}
    for unique_file in unique_files:
        annot_list = [file for file in annot_fnames if unique_file in file]
        annot_list.sort()
        annot_file_list = [annot_file_dict[annot_file_name] for annot_file_name in annot_list]
        annot_fname_dict.update({unique_file: annot_file_list})
    return annot_fname_dict

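# Usage sketch (hypothetical filenames): numbered parts of one recording are
# grouped under their common stem.
#   create_annot_fname_dict(['clip_1.annot', 'clip_2.annot', 'intro.annot'])
#   -> {'clip': ['clip_1.annot', 'clip_2.annot'], 'intro': ['intro.annot']}
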
def get_io_reader(uploaded_file):
    # NOTE: redefined below with .mp4 support; in Python the later definition wins.
    assert uploaded_file.name[-3:] == 'seq', 'Not a seq file'
    with NamedTemporaryFile(suffix="seq", delete=False) as temp:
        temp.write(uploaded_file.getvalue())
    sr = seqIo_reader(temp.name)
    return sr

def load_slip_model(device):
    return AutoModel.from_pretrained(SLIP_MODEL_ID).to(device)

def load_slip_preprocessor():
    return AutoProcessor.from_pretrained(SLIP_MODEL_ID)

def load_clip_model(device):
    return AutoModel.from_pretrained(CLIP_MODEL_ID).to(device)

def load_clip_preprocessor():
    return AutoProcessor.from_pretrained(CLIP_MODEL_ID)

def encode_image(image, device, model, processor):
    with torch.no_grad():
        #convert_models_to_fp32(model)
        inputs = processor(images=image, return_tensors="pt").to(device)
        image_features = model.get_image_features(**inputs)
    return image_features.cpu().numpy().flatten()

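# Usage sketch (assumes a local image file 'frame.png'):
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model, processor = load_clip_model(device), load_clip_preprocessor()
#   vec = encode_image(Image.open('frame.png'), device, model, processor)  # 1-D np.ndarray
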
def generate_embeddings_stream(fnames: list[str],
                               model='SLIP',
                               downsample_rate=4,
                               save_csv=False) -> tuple[list, list]:
    # set up model and device (only 'SLIP' and 'CLIP' are supported)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    if model == 'SLIP':
        embed_model = load_slip_model(device)
        processor = load_slip_preprocessor()
    elif model == 'CLIP':
        embed_model = load_clip_model(device)
        processor = load_clip_preprocessor()

    all_video_embeddings = []
    all_video_frames = []
    for fname in fnames:
        # read in file
        is_seq = False
        if fname[-3:] == 'seq': is_seq = True

        if is_seq:
            sr = seqIo_reader(fname)
        else:
            sr = mp4Io_reader(fname)
        N = sr.header['numFrames']

        # set up embeddings and frame arrays
        embeddings = []
        frames = list(range(N))[::downsample_rate]
        print(frames)

        # create progress bar
        i = 0
        pbar_text = lambda i: f'Creating embeddings for {fname}. {i}/{len(frames)} frames.'
        pbar = st.progress(0, text=pbar_text(0))

        # convert each frame to embeddings
        for f in tqdm(frames):
            img, _ = sr.getFrame(f)
            img_arr = np.array(img)
            if is_seq:
                img_rgb = Image.fromarray(img_arr, 'L').convert('RGB')
            else:
                img_rgb = Image.fromarray(img_arr).convert('RGB')

            embeddings.append(encode_image(img_rgb, device, embed_model, processor))

            # update progress bar
            i += 1
            pbar.progress(i/len(frames), pbar_text(i))

        # save csv of single file
        if save_csv:
            df = pd.DataFrame(embeddings)
            df['Frame'] = frames

            # save csv
            basename = Path(fname).stem
            df.to_csv(f'{basename}_embeddings_downsample_{downsample_rate}.csv', index=False)

        all_video_embeddings.append(np.array(embeddings))
        all_video_frames.append(frames)
    return all_video_embeddings, all_video_frames

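# Frame selection sketch: with N = 10 frames and downsample_rate = 4,
#   list(range(10))[::4] -> [0, 4, 8]
# so roughly every fourth frame is embedded.
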
def get_io_reader(uploaded_file):
    if uploaded_file.name[-3:] == 'seq':
        with NamedTemporaryFile(suffix="seq", delete=False) as temp:
            temp.write(uploaded_file.getvalue())
        sr = seqIo_reader(temp.name)
    else:
        with NamedTemporaryFile(suffix="mp4", delete=False) as temp:
            temp.write(uploaded_file.getvalue())
        sr = mp4Io_reader(temp.name)
    return sr

def generate_embeddings_stream_io(uploaded_files: list,
                                  model='SLIP',
                                  downsample_rate=4,
                                  save_csv=False) -> tuple[list, list]:
    # set up model and device (only 'SLIP' and 'CLIP' are supported)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    if model == 'SLIP':
        embed_model = load_slip_model(device)
        processor = load_slip_preprocessor()
    elif model == 'CLIP':
        embed_model = load_clip_model(device)
        processor = load_clip_preprocessor()

    all_video_embeddings = []
    all_video_frames = []
    for file in uploaded_files:
        is_seq = False
        if file.name[-3:] == 'seq': is_seq = True

        # read in file
        sr = get_io_reader(file)
        N = sr.header['numFrames']

        # set up embeddings and frame arrays
        embeddings = []
        frames = list(range(N))[::downsample_rate]
        print(frames)

        # create progress bar
        i = 0
        pbar_text = lambda i: f'Creating embeddings for {file.name}. {i}/{len(frames)} frames.'
        pbar = st.progress(0, text=pbar_text(0))

        # convert each frame to embeddings
        for f in tqdm(frames):
            img, _ = sr.getFrame(f)
            img_arr = np.array(img)
            if is_seq:
                img_rgb = Image.fromarray(img_arr, 'L').convert('RGB')
            else:
                img_rgb = Image.fromarray(img_arr).convert('RGB')

            embeddings.append(encode_image(img_rgb, device, embed_model, processor))

            # update progress bar
            i += 1
            pbar.progress(i/len(frames), pbar_text(i))

        # save csv of single file
        if save_csv:
            df = pd.DataFrame(embeddings)
            df['Frame'] = frames

            # save csv
            df.to_csv(f'embeddings_downsample_{downsample_rate}_{N}_frames.csv', index=False)

        all_video_embeddings.append(np.array(embeddings))
        all_video_frames.append(frames)
    return all_video_embeddings, all_video_frames

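# With downsample_rate=4 and a 1000-frame upload, the fixed to_csv line above
# writes 'embeddings_downsample_4_1000_frames.csv'.
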
def create_embeddings_csv(out: str,
                          fnames: list[str],
                          embeddings: list[np.ndarray],
                          frames: list[list[int]],
                          annotations: list[list[str]],
                          test_fnames: None | list[str],
                          views: None | list[str],
                          conditions: None | list[str],
                          downsample_rate=4,
                          filesystem=None):
    """
    Builds a DataFrame containing all of the generated embeddings and provided
    information, intended to be written out as a .csv file.

    Parameters:
    -----------
    out : str
        The name of the resulting file.
    fnames : list[str]
        Video sources for each of the embedding arrays.
    embeddings : list[np.ndarray]
        The generated embeddings from the images.
    downsample_rate : int
        The downsample_rate used for generating the embeddings.
    """
    assert len(fnames) == len(embeddings)
    assert len(embeddings) == len(frames)
    all_embeddings = np.vstack(embeddings)
    df = pd.DataFrame(all_embeddings)

    labels = []
    for i, annot_fnames in enumerate(annotations):
        _, ext = os.path.splitext(annot_fnames[0])
        if ext == '.annot':
            annot, _, _, sr = load_multiple_annotations(annot_fnames, filesystem=filesystem)
            annot_labels = generate_label_array(annot, downsample_rate, len(frames[i]))
        elif ext == '.csv':
            if not filesystem:
                annot_df = pd.read_csv(annot_fnames[0], header=None)
            else:
                with filesystem.open(annot_fnames[0], 'r') as csv_file:
                    annot_df = pd.read_csv(csv_file, header=None)
            annot_labels = annot_df[0].to_list()[::downsample_rate]
            assert len(annot_labels) == len(frames[i]), "There is a mismatch between the number of frames and number of labels. Make sure that the passed in csv file has no header."
        else:
            raise ValueError(f'Incompatible file for annotations used. Got a file of type "{ext}".')
        assert len(annot_labels) == len(frames[i]), "There is a mismatch between the number of frames and number of labels. Make sure you have passed in the correct files."
        print(annot_labels)
        labels.append(annot_labels)
    all_labels = np.hstack(labels)
    print(len(all_labels))
    df['Label'] = all_labels

    all_frames = np.hstack(frames)
    df['Frame'] = all_frames
    sources = [[fname for _ in range(len(frames[i]))] for i, fname in enumerate(fnames)]
    all_sources = np.hstack(sources)
    df['Source'] = all_sources

    if test_fnames:
        t_split = lambda x: x in test_fnames
        test = [[t_split(fname) for _ in range(len(frames[i]))] for i, fname in enumerate(fnames)]
    else:
        test = [[True for _ in range(len(frames[i]))] for i, _ in enumerate(fnames)]
    all_test = np.hstack(test)
    df['Test'] = all_test

    if views:
        view = [[views[i] for _ in range(len(frames[i]))] for i in range(len(fnames))]
    else:
        view = [[None for _ in range(len(frames[i]))] for i in range(len(fnames))]
    all_view = np.hstack(view)
    df['View'] = all_view

    if conditions:
        condition = [[conditions[i] for _ in range(len(frames[i]))] for i in range(len(fnames))]
    else:
        condition = [[None for _ in range(len(frames[i]))] for i in range(len(fnames))]
    all_condition = np.hstack(condition)
    df['Condition'] = all_condition
    return df

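# Usage sketch (hypothetical names; the labels CSV holds one per-frame label per
# row, no header, at the original frame rate):
#   emb = [np.random.rand(3, 512)]                        # one video, 3 kept frames
#   pd.Series(['walk'] * 8 + ['rest'] * 4).to_csv('demo_labels.csv',
#                                                 index=False, header=False)
#   df = create_embeddings_csv(out='demo.csv', fnames=['demo.mp4'], embeddings=emb,
#                              frames=[[0, 4, 8]], annotations=[['demo_labels.csv']],
#                              test_fnames=None, views=['top'], conditions=['fed'],
#                              downsample_rate=4)
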
def create_embeddings_csv_io(out: str,
                             fnames: list[str],
                             embeddings: list[np.ndarray],
                             frames: list[list[int]],
                             annotations: list,
                             test_fnames: None | list[str],
                             views: None | list[str],
                             conditions: None | list[str],
                             downsample_rate=4):
    """
    Builds a DataFrame containing all of the generated embeddings and provided
    information from uploaded (in-memory) annotation files, intended to be
    written out as a .csv file.

    Parameters:
    -----------
    out : str
        The name of the resulting file.
    fnames : list[str]
        Video sources for each of the embedding arrays.
    embeddings : list[np.ndarray]
        The generated embeddings from the images.
    downsample_rate : int
        The downsample_rate used for generating the embeddings.
    """
    assert len(fnames) == len(embeddings)
    assert len(embeddings) == len(frames)
    all_embeddings = np.vstack(embeddings)
    df = pd.DataFrame(all_embeddings)

    labels = []
    for i, uploaded_annots in enumerate(annotations):
        print(i)
        _, ext = os.path.splitext(uploaded_annots[0].name)
        if ext == '.annot':
            annot, _, _, sr = load_multiple_annotations_io(uploaded_annots)
            annot_labels = generate_label_array(annot, downsample_rate, len(frames[i]))
        elif ext == '.csv':
            annot_df = pd.read_csv(uploaded_annots[0], header=None)
            annot_labels = annot_df[0].to_list()[::downsample_rate]
            assert len(annot_labels) == len(frames[i]), "There is a mismatch between the number of frames and number of labels. Make sure that the passed in csv file has no header."
        else:
            raise ValueError(f'Incompatible file for annotations used. Got a file of type "{ext}".')
        assert len(annot_labels) == len(frames[i]), "There is a mismatch between the number of frames and number of labels. Make sure you have passed in the correct files."
        print(annot_labels)
        labels.append(annot_labels)
    all_labels = np.hstack(labels)
    print(len(all_labels))
    df['Label'] = all_labels

    all_frames = np.hstack(frames)
    df['Frame'] = all_frames
    sources = [[fname for _ in range(len(frames[i]))] for i, fname in enumerate(fnames)]
    all_sources = np.hstack(sources)
    df['Source'] = all_sources

    if test_fnames:
        t_split = lambda x: x in test_fnames
        test = [[t_split(fname) for _ in range(len(frames[i]))] for i, fname in enumerate(fnames)]
    else:
        test = [[True for _ in range(len(frames[i]))] for i, _ in enumerate(fnames)]
    all_test = np.hstack(test)
    df['Test'] = all_test

    if views:
        view = [[views[i] for _ in range(len(frames[i]))] for i in range(len(fnames))]
    else:
        view = [[None for _ in range(len(frames[i]))] for i in range(len(fnames))]
    all_view = np.hstack(view)
    df['View'] = all_view

    if conditions:
        condition = [[conditions[i] for _ in range(len(frames[i]))] for i in range(len(fnames))]
    else:
        condition = [[None for _ in range(len(frames[i]))] for i in range(len(fnames))]
    all_condition = np.hstack(condition)
    df['Condition'] = all_condition
    return df

def process_dataset_in_mem(embeddings_df: pd.DataFrame,
                           specified_classes=None,
                           classes_to_remove=None,
                           max_class_size=None,
                           animal_state=None,
                           view=None,
                           shuffle_data=False,
                           test_videos=None):
    """
    Processes output generated from embeddings paired with images and behavior labels.

    Parameters:
    -----------
    embeddings_df : pandas.DataFrame
        The original data. This should contain embeddings, a column named
        `'Label'` and a column named `'Images'`.
    specified_classes : None | list[str]
        An optional input. Defines labels which should be kept as is in the `'Label'`
        column and which should be changed to a default `other` label.
    classes_to_remove : None | list[str]
        An optional input. Drops rows from the dataframe which contain a label in the
        list.
    max_class_size : None | int
        An optional input. Determines the maximum number of rows a single label can
        appear in for each unique label in the `'Label'` column.
    animal_state : None | str
        An optional input. Drops rows from the dataframe which do not contain a match
        for `animal_state` in the text field within the `'Images'` column.
    view : None | str
        An optional input. Drops rows from the dataframe which do not contain a match
        for `view` in the text field within the `'Images'` column.
    shuffle_data : bool
        Determines whether the dataframe should have its rows shuffled.
    test_videos : None | list[str]
        An optional input. Determines which rows should be in the `test` dataframe and
        which should be in the `train` dataframe, by keeping or dropping rows whose
        `'Images'` text field matches a `str` in `test_videos`.

    Returns:
    --------
    balanced_train_embeddings : numpy.ndarray
        A processed array whose rows contain the embeddings for each of the images
        at the corresponding index within `balanced_train_images`.
    balanced_train_labels : list[str]
        A list of labels for each of the images at the corresponding index within
        `balanced_train_images`.
    balanced_train_images : list[str]
        A list of paths to images, with each image at an index corresponding to a label
        with the same index in `balanced_train_labels` and the same row index within
        `balanced_train_embeddings`.
    test_embeddings : numpy.ndarray
        A processed array whose rows contain the embeddings for each of the images
        at the corresponding index within `test_images`.
    test_labels : list[str]
        A list of labels for each of the images at the corresponding index within
        `test_images`.
    test_images : list[str]
        A list of paths to images, with each image at an index corresponding to a label
        with the same index in `test_labels` and the same row index within
        `test_embeddings`.
    """
    # Convert embeddings, labels, and images to a DataFrame for easy manipulation
    df = copy.deepcopy(embeddings_df)
    df_keys = [str(x) for x in df.keys()]
    # Filter by fed or fasted
    if 'Condition' in df_keys and animal_state:
        df = df[df['Condition'].str.contains(animal_state, na=False)]

    if 'View' in df_keys and view:
        df = df[df['View'].str.contains(view, na=False)]

    # Extract unique video names excluding the frame number
    #unique_video_names = df['Images'].apply(lambda x: '_'.join(x.split('_')[:-1])).unique()
    #print("\nUnique video names:\n", unique_video_names)

    if classes_to_remove:
        df = df[~df['Label'].str.contains('|'.join(classes_to_remove), na=False)]
    # NOTE: this branch is unreachable; any truthy classes_to_remove is caught above.
    elif classes_to_remove and 'all' in classes_to_remove:
        df = df[df['Label'].str.contains('|'.join(classes_to_remove), na=False)]

    # Further filter to include only specified_classes
    if specified_classes:
        single_match = lambda x: list(set(x.split('||')) & set(specified_classes))[0]
        df['Label'] = df['Label'].apply(lambda x: single_match(x) if not set(x.split('||')).isdisjoint(specified_classes) else 'other')
        specified_classes.append('other')

    # Separate the DataFrame into test and training sets based on test_videos
    if 'Test' in df_keys and test_videos:
        test_df = df[df['Test']]
        train_df = df[~df['Test']]
    elif test_videos:
        test_df = df[df['Images'].str.contains('|'.join(test_videos), na=False)]
        train_df = df[~df['Images'].str.contains('|'.join(test_videos), na=False)]
    else:
        test_df = pd.DataFrame(columns=df.columns)
        train_df = df

    # Print the number of frames in each class before balancing
    label_counts = train_df['Label'].value_counts()
    print("\nNumber of training frames in each class before balancing:")
    print(label_counts)

    if max_class_size:
        balanced_train_df = pd.concat([
            group.sample(n=min(len(group), max_class_size), random_state=1)
            for label, group in train_df.groupby('Label')
        ])
    else:
        balanced_train_df = train_df

    # Shuffle the training DataFrame
    if shuffle_data:
        balanced_train_df = balanced_train_df.sample(frac=1).reset_index(drop=True)

    # Convert training set back to numpy array and list
    if "Images" not in df_keys:
        balanced_train_embeddings = balanced_train_df.drop(columns=['Label', 'Frame', 'Source', 'Test', 'View', 'Condition']).to_numpy()
        balanced_train_labels = balanced_train_df['Label'].tolist()
        balanced_train_images = balanced_train_df['Frame'].tolist()

        # Convert test set back to numpy array and list
        test_embeddings = test_df.drop(columns=['Label', 'Frame', 'Source', 'Test', 'View', 'Condition']).to_numpy()
        test_labels = test_df['Label'].tolist()
        test_images = test_df['Frame'].tolist()
    else:
        # Convert training set back to numpy array and list
        balanced_train_embeddings = balanced_train_df.drop(columns=['Label', 'Images']).to_numpy()
        balanced_train_labels = balanced_train_df['Label'].tolist()
        balanced_train_images = balanced_train_df['Images'].tolist()

        # Convert test set back to numpy array and list
        if 'Test' in test_df:
            test_embeddings = test_df.drop(columns=['Label', 'Images', 'Test']).to_numpy()
        else:
            test_embeddings = test_df.drop(columns=['Label', 'Images']).to_numpy()

        test_labels = test_df['Label'].tolist()
        test_images = test_df['Images'].tolist()

    # Print the number of frames in each class after balancing
    if specified_classes or max_class_size:
        balanced_label_counts = Counter(balanced_train_labels)
        print("\nNumber of training frames in each class after balancing:")
        print(balanced_label_counts)

    test_label_counts = test_df['Label'].value_counts()
    # print("\nNumber of testing frames in each class:")
    print(test_label_counts)

    return balanced_train_embeddings, balanced_train_labels, balanced_train_images, test_embeddings, test_labels, test_images

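# Usage sketch, continuing from the create_embeddings_csv example above:
#   (train_X, train_y, train_frames,
#    test_X, test_y, test_frames) = process_dataset_in_mem(
#       df, specified_classes=['walk'], max_class_size=500, shuffle_data=True)
# Labels other than 'walk' are collapsed into 'other'.
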
def multiclass_merge_and_filter_bouts(multiclass_vector, bout_threshold, proximity_threshold):
    # Get the unique labels in the multiclass vector (excluding zero, assuming zero is the background/no label)
    unique_labels = np.unique(multiclass_vector)
    unique_labels = unique_labels[unique_labels != 0]

    # Initialize a vector to store the merged and filtered multiclass vector
    merged_vector = np.zeros_like(multiclass_vector)

    for label in unique_labels:
        # Create a binary vector for the current label
        binary_vector = (multiclass_vector == label)

        # Find the start and end indices of all sequences of 1's for this label
        starts = np.where(np.diff(np.concatenate(([0], binary_vector))) == 1)[0]
        ends = np.where(np.diff(np.concatenate((binary_vector, [0]))) == -1)[0]

        # Step 1: Merge close short bouts
        i = 0
        while i < len(starts) - 1:
            # Check if the gap between the end of the current bout and the start of the next bout
            # is within the proximity threshold
            if starts[i + 1] - ends[i] <= proximity_threshold:
                # Merge the two bouts by setting all elements between the start of the first
                # and the end of the second bout to 1
                binary_vector[ends[i]:starts[i + 1]] = 1
                # Remove the next bout from consideration
                starts = np.delete(starts, i + 1)
                ends = np.delete(ends, i)
            else:
                i += 1

        # Update the starts and ends after merging
        starts = np.where(np.diff(np.concatenate(([0], binary_vector))) == 1)[0]
        ends = np.where(np.diff(np.concatenate((binary_vector, [0]))) == -1)[0]

        # Step 2: Remove standalone short bouts
        for i in range(len(starts)):
            # Check the length of the bout
            length_of_bout = ends[i] - starts[i] + 1

            # If the length is less than the threshold, set those elements to 0
            if length_of_bout < bout_threshold:
                binary_vector[starts[i]:ends[i] + 1] = 0

        # Combine the binary vector with the merged_vector, ensuring only the current label is set
        merged_vector[binary_vector] = label

    # Return the filtered multiclass vector
    return merged_vector

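# Worked example with bout_threshold=2 and proximity_threshold=2:
#   multiclass_merge_and_filter_bouts(np.array([0, 1, 1, 0, 1, 0, 0, 2]), 2, 2)
#   -> array([0, 1, 1, 1, 1, 0, 0, 0])
# The two nearby label-1 bouts merge across the 1-frame gap; the lone
# single-frame label-2 bout is shorter than bout_threshold and is removed.
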
def get_unique_labels(label_list: list[str]):
    label_set = set()
    for label in label_list:
        individual_labels = label.split('||')
        for individual_label in individual_labels:
            label_set.add(individual_label)
    return list(label_set)

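# Example: get_unique_labels(['walk||sniff', 'rest']) -> ['walk', 'sniff', 'rest']
# (set-backed, so the order of the returned labels is not guaranteed)
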
def get_train_test_split(train_embeds, numerical_labels, test_size=0.05, random_state=42):
    return train_test_split(train_embeds, numerical_labels, test_size=test_size, random_state=random_state)

def train_model(X_train, y_train, random_state=42):
    # Train SVM Classifier
    svm_clf = SVC(kernel='rbf', random_state=random_state, probability=True)
    svm_clf.fit(X_train, y_train)
    return svm_clf

def pickle_model(model):
    pickled = io.BytesIO()
    pickle.dump(model, pickled)
    return pickled

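# Training sketch, continuing from the process_dataset_in_mem example:
#   classes = get_unique_labels(train_y)
#   y_num = [classes.index(lbl) for lbl in train_y]
#   X_tr, X_val, y_tr, y_val = get_train_test_split(train_X, y_num)
#   clf = train_model(X_tr, y_tr)
#   print(classification_report(y_val, clf.predict(X_val)))
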
def get_seq_io_reader(uploaded_file):
    assert uploaded_file.name[-3:] == 'seq', 'Not a seq file'
    with NamedTemporaryFile(suffix="seq", delete=False) as temp:
        temp.write(uploaded_file.getvalue())
    sr = seqIo_reader(temp.name)
    return sr

def seq_to_arr(sr):
    N = sr.header['numFrames']
    images = []
    for f in range(N):
        I, ts = sr.getFrame(f)
        images.append(I)
    return np.array(images)

def get_2d_embedding(embeddings: pd.DataFrame):
    tsne = TSNE(n_jobs=4, n_components=2, random_state=42, perplexity=50)
    embedding_2d = tsne.fit_transform(np.array(embeddings))
    return embedding_2d
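# 2-D projection sketch (t-SNE requires more samples than its perplexity of 50):
#   xy = get_2d_embedding(pd.DataFrame(train_X))   # shape (n_samples, 2)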