File size: 4,460 Bytes
61488b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from collections import namedtuple

import numpy as np

from scipy.interpolate import interp1d
import torch 
import matplotlib.pyplot as plt
# Planar step vector for every nucleotide symbol.
# A, T, G and C are unit-length directions at -60, +60, -30 and +30 degrees
# respectively; "N" is the (integer-typed) zero vector.
mapping_easy = {
    symbol: np.array(step)
    for symbol, step in (
        ("A", [0.5, -0.8660254037844386]),
        ("T", [0.5, 0.8660254037844386]),
        ("G", [0.8660254037844386, -0.5]),
        ("C", [0.8660254037844386, 0.5]),
        ("N", [0, 0]),
    )
}

# A point (x, y) in the plane, i.e. the complex number x + iy.
Coord = namedtuple("Coord", "x y")
# Coordinates for a CGR encoding, stored as the triple (N, x, y).
CGRCoords = namedtuple("CGRCoords", "N x y")
# Position of each nucleotide in the 2D plane: the four (+-1, +-1) corners.
DEFAULT_COORDS = dict(
    A=Coord(x=1, y=1),
    C=Coord(x=-1, y=1),
    G=Coord(x=-1, y=-1),
    T=Coord(x=1, y=-1),
)


def _dna_to_coordinates(dna_sequence: str, mapping: dict[str, np.ndarray]) -> np.ndarray:
    """Translate a DNA string into an array with one coordinate row per symbol.

    The sequence is upper-cased before lookup. Any symbol that is not a key of
    ``mapping`` receives the coordinates stored under ``mapping["N"]``.
    """
    rows = []
    for symbol in dna_sequence.upper():
        rows.append(mapping.get(symbol, mapping["N"]))
    return np.array(rows)


def _get_cumulative_coords(mapped_coords):
    """Return the running sum of ``mapped_coords`` taken along axis 0.

    Row ``i`` of the result is the sum of input rows ``0`` through ``i``.
    """
    return np.cumsum(mapped_coords, axis=0)


def generate_2d_sequence(seq):
    """Convert a DNA string into standardized cumulative 2D walk coordinates.

    Each symbol is mapped to a step through ``mapping_easy``, the steps are
    accumulated with ``_get_cumulative_coords``, and each axis of the
    resulting path is standardized (mean subtracted, then divided by the
    standard deviation).

    Args:
        seq: DNA sequence string; case is ignored.

    Returns:
        Tuple ``(y_scaled, x_scaled)`` of 1-D NumPy arrays holding one value
        per symbol. Note the order: y comes first. The arrays keep the
        length of ``seq``; they are not resampled to a fixed length.

    Raises:
        IndexError: for an empty ``seq``, because the coordinate array is then
            one-dimensional and cannot be indexed by column.

    Note:
        An axis whose cumulative values are all equal has zero standard
        deviation, so its scaled values come out as NaN (NumPy warns rather
        than raising).
    """
    path = _get_cumulative_coords(_dna_to_coordinates(seq.upper(), mapping_easy))

    # Standardize each axis independently.
    x_track = path[:, 0]
    y_track = path[:, 1]
    x_train_scaled = (x_track - x_track.mean()) / x_track.std()
    y_train_scaled = (y_track - y_track.mean()) / y_track.std()

    # NOTE: an earlier version also resampled one track to 1000 points and
    # wrapped it in a torch tensor here, but that result was never returned.
    # It only cost time and could raise from the reshape when resampling
    # failed, so it has been removed; the returned values are unchanged.
    return y_train_scaled, x_train_scaled


def generate_2d_sequence_small(seq, n_points=400):
    """Encode a DNA string as a fixed-length tensor of its standardized y track.

    Each symbol is mapped to a step through ``mapping_easy``, the steps are
    accumulated with ``_get_cumulative_coords``, the y axis of the path is
    standardized (mean subtracted, divided by the standard deviation) and then
    linearly resampled onto ``n_points`` evenly spaced positions.

    Args:
        seq: DNA sequence string; case is ignored.
        n_points: Length of the returned tensor. Defaults to 400.

    Returns:
        1-D ``torch.Tensor`` with ``n_points`` elements.

    Raises:
        IndexError: for an empty ``seq``, because the coordinate array is then
            one-dimensional and cannot be indexed by column.
        Exception: if resampling fails, the error is printed and the final
            ``reshape`` then raises because the data still has the wrong
            length (a ``RuntimeError`` from torch, presumably).
    """
    path = _get_cumulative_coords(_dna_to_coordinates(seq.upper(), mapping_easy))

    # Only the y axis is used for this representation.
    y_track = path[:, 1]
    y_scaled = (y_track - y_track.mean()) / y_track.std()

    # Falls back to the un-resampled track so that a failed resampling is
    # caught by the reshape below instead of silently producing bad data.
    resampled = y_scaled
    # Compare against the actual target length. The old guard tested for 1000,
    # so a 1000-symbol sequence skipped resampling and then failed to reshape.
    if len(y_scaled) != n_points:
        try:
            t = np.linspace(0, 1, len(y_scaled))
            t_new = np.linspace(0, 1, n_points)
            resampled = interp1d(t, y_scaled, kind="linear")(t_new)
        except Exception as e:
            print(f"Interpolation error: {e}")

    # reshape doubles as a length check on the resampled data.
    return torch.Tensor(resampled).reshape(n_points)
    
def plot_seq_full_label(df, filter):
    """Plot one randomly sampled sequence for each selected label, side by side.

    Args:
        df: Table exposing a ``label_id`` column and a ``seq`` column of DNA
            strings (used like a pandas DataFrame: boolean row selection and
            ``.sample``).
        filter: Collection of label ids to plot. One subplot column is created
            per entry; entries that do not occur in ``df.label_id`` leave
            their axes empty. (The name shadows the builtin but is kept so
            keyword callers keep working.)

    Returns:
        The matplotlib figure holding the subplots.
    """
    ncols = len(filter)
    unique_ids = df.label_id.unique()
    print(unique_ids)
    unique_ids_plot = [label for label in unique_ids if label in filter]
    print(unique_ids_plot)

    # squeeze=False keeps `axs` a 2-D array even when there is one column.
    # Without it a single-entry filter yields a bare Axes object and indexing
    # it fails.
    fig, axs = plt.subplots(ncols=ncols, squeeze=False)
    for ax, label in zip(axs[0], unique_ids_plot):
        # Pick one random sequence carrying this label and draw its encoding.
        seq = df[df["label_id"] == label].sample(n=1)["seq"].values[0]
        ax.plot(generate_2d_sequence_small(seq).numpy())
    return fig