csuhan committed on
Commit
05f14fa
1 Parent(s): 916cf13

Delete lib

Browse files
lib/__pycache__/point_utils.cpython-310.pyc DELETED
Binary file (6.74 kB)
 
lib/point_utils.py DELETED
@@ -1,191 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from torch.autograd import Function
4
- import pointnet2_cuda
5
-
6
class KNN(nn.Module):
    """Brute-force k-nearest-neighbour search built on ``torch.cdist``."""

    def __init__(self, neighbors, transpose_mode=True):
        # NOTE: ``transpose_mode`` is accepted for API compatibility but is not
        # read anywhere in this class.
        super(KNN, self).__init__()
        self.neighbors = neighbors

    @torch.no_grad()
    def forward(self, support, query):
        """
        Args:
            support ([tensor]): [B, N, C]
            query ([tensor]): [B, M, C]
        Returns:
            tuple: ``(dist, idx)``.
                dist: the K smallest distances exactly as ``topk`` returns them
                    along the support axis, i.e. [B, K, M] (not transposed).
                idx ([int]): neighbor idx, transposed to [B, M, K], int32.
        """
        pairwise = torch.cdist(support, query)  # [B, N, M]
        nearest = torch.topk(pairwise, self.neighbors, dim=1, largest=False)
        neighbor_idx = nearest.indices.transpose(1, 2).contiguous().int()
        return nearest.values, neighbor_idx
23
-
24
-
25
class GroupingOperation(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` group-points kernels."""

    @staticmethod
    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """
        :param ctx:
        :param features: (B, C, N) tensor of features to group
        :param idx: (B, npoint, nsample) tensor containing the indices of features to group with
        :return:
            output: (B, C, npoint, nsample) tensor
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()

        B, nfeatures, nsample = idx.size()
        _, C, N = features.size()
        # torch.empty replaces the deprecated legacy typed constructor and puts
        # the buffer on the same device as the input.
        output = torch.empty((B, C, nfeatures, nsample), dtype=torch.float32, device=features.device)

        pointnet2_cuda.group_points_wrapper(B, C, N, nfeatures, nsample, features, idx, output)

        ctx.for_backwards = (idx, N)
        return output

    @staticmethod
    @torch.cuda.amp.custom_bwd  # pairs with custom_fwd so backward sees the same autocast state
    def backward(ctx, grad_out: torch.Tensor):
        """
        :param ctx:
        :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward
        :return:
            grad_features: (B, C, N) gradient of the features
            None: ``idx`` receives no gradient
        """
        idx, N = ctx.for_backwards

        B, C, npoint, nsample = grad_out.size()
        # A gradient buffer must not itself require grad; the kernel fills it in place.
        grad_features = torch.zeros((B, C, N), dtype=torch.float32, device=grad_out.device)
        grad_out_data = grad_out.detach().contiguous()
        pointnet2_cuda.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features)
        return grad_features, None


grouping_operation = GroupingOperation.apply
66
-
67
-
68
class KNNGroup(nn.Module):
    """Gather the k nearest support points (and optionally features) around each query point."""

    def __init__(self, nsample: int,
                 relative_xyz=True,
                 normalize_dp=False,
                 return_only_idx=False,
                 **kwargs
                 ):
        """
        Args:
            nsample (int): number of neighbours gathered per query point
            relative_xyz (bool, optional): subtract the query coordinates from the
                grouped coordinates. Defaults to True.
            normalize_dp (bool, optional): divide the grouped coordinates by their
                largest per-batch norm. Defaults to False.
            return_only_idx (bool, optional): make ``forward`` return just the
                neighbour indices. Defaults to False.
            **kwargs: accepted and ignored.
        """
        super().__init__()
        self.nsample = nsample
        self.knn = KNN(nsample, transpose_mode=True)
        self.relative_xyz = relative_xyz
        self.normalize_dp = normalize_dp
        self.return_only_idx = return_only_idx

    def forward(self, query_xyz: torch.Tensor, support_xyz: torch.Tensor, features: torch.Tensor = None):
        """
        :param query_xyz: centres around which neighbours are searched
            (presumably (B, npoint, 3) -- confirm against callers)
        :param support_xyz: point set the neighbours are drawn from
            (presumably (B, N, 3) -- confirm against callers)
        :param features: (B, C, N) descriptors of the support points, or None
        :return:
            ``idx`` alone when ``return_only_idx`` is set; otherwise the pair
            ``(grouped_xyz, grouped_features)`` where ``grouped_features`` is
            None if no features were given
        """
        _, idx = self.knn(support_xyz, query_xyz)
        if self.return_only_idx:
            return idx

        idx = idx.int()
        support_t = support_xyz.transpose(1, 2).contiguous()
        grouped_xyz = grouping_operation(support_t, idx)  # (B, 3, npoint, nsample)

        if self.relative_xyz:
            # relative position w.r.t. the query point of each group
            grouped_xyz -= query_xyz.transpose(1, 2).unsqueeze(-1)
        if self.normalize_dp:
            grouped_xyz /= torch.amax(torch.sqrt(torch.sum(grouped_xyz**2, dim=1)), dim=(1, 2)).view(-1, 1, 1, 1)

        grouped_features = None if features is None else grouping_operation(features, idx)
        return grouped_xyz, grouped_features
113
-
114
-
115
class FurthestPointSampling(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` furthest-point-sampling kernel."""

    @staticmethod
    def forward(ctx, xyz: torch.Tensor, npoint: int) -> torch.Tensor:
        """
        Uses iterative furthest point sampling to select a set of npoint features that have the largest
        minimum distance
        :param ctx:
        :param xyz: (B, N, 3) where N > npoint
        :param npoint: int, number of features in the sampled set
        :return:
            output: (B, npoint) tensor containing the set (idx)
        """
        assert xyz.is_contiguous()

        B, N, _ = xyz.size()
        # Allocate on xyz's device: the legacy torch.cuda.*Tensor constructors
        # always use the *current* CUDA device, which breaks when xyz lives on
        # another GPU.
        output = torch.empty((B, npoint), dtype=torch.int32, device=xyz.device)
        temp = torch.full((B, N), 1e10, dtype=torch.float32, device=xyz.device)

        pointnet2_cuda.furthest_point_sampling_wrapper(
            B, N, npoint, xyz, temp, output)
        return output

    @staticmethod
    def backward(ctx, a=None):
        # Sampling indices carry no gradient for either forward input.
        return None, None


furthest_point_sample = FurthestPointSampling.apply
144
-
145
-
146
class PointPatchEmbed(nn.Module):
    """Sample centre points, group their k nearest neighbours and embed each group with a 2-D conv."""

    def __init__(self,
                 sample_ratio=0.0625,
                 sample_number=1024,
                 group_size=32,
                 in_channels=6,
                 channels=1024,
                 kernel_size=1,
                 stride=1,
                 normalize_dp=False,
                 relative_xyz=True,
                 ):
        super().__init__()
        # sample_ratio is stored but not read by forward(); sample_number is used.
        self.sample_ratio = sample_ratio
        self.sample_number = sample_number
        self.group_size = group_size

        self.sample_fn = furthest_point_sample
        self.grouper = KNNGroup(self.group_size, relative_xyz=relative_xyz, normalize_dp=normalize_dp)

        self.conv1 = nn.Conv2d(in_channels, channels, kernel_size=kernel_size, stride=stride)

    def forward(self, x):
        """
        :param x: point tensor; channels from index 3 onward are used as coordinates
            (presumably (B, N, 6) -- confirm against callers)
        :return: conv features max-pooled over the neighbour axis (last dim kept with size 1)
        """
        # coordinates
        p = x[:, :, 3:].contiguous()

        # unpacking doubles as a check that p has three dimensions
        B, N, _ = p.shape[:3]
        centre_idx = self.sample_fn(p, self.sample_number).long()
        gather_index = centre_idx.unsqueeze(-1).expand(-1, -1, 3)
        center_p = torch.gather(p, 1, gather_index)

        # query neighbors; features are passed channel-first
        channel_first = x.permute(0, 2, 1).contiguous()
        _, fj = self.grouper(center_p, p, channel_first)

        # embed each neighbour, then max-pool over the neighbour axis
        fj = self.conv1(fj).max(dim=-1, keepdim=True)[0]

        return fj
185
-
186
-
187
if __name__ == '__main__':
    # Smoke test: run one forward pass on random points (requires a CUDA device).
    # The leftover pdb breakpoint was removed so the script terminates on its own,
    # and the local no longer shadows the builtin ``input``.
    model = PointPatchEmbed(channels=256).cuda()
    points = torch.rand(4, 16384, 6).cuda()
    ou = model(points)
    print(ou.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/pointnet2_modules.py DELETED
@@ -1,160 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
-
5
- from . import pointnet2_utils
6
- from . import pytorch_utils as pt_utils
7
- from typing import List
8
-
9
-
10
- class _PointnetSAModuleBase(nn.Module):
11
-
12
- def __init__(self):
13
- super().__init__()
14
- self.npoint = None
15
- self.groupers = None
16
- self.mlps = None
17
- self.pool_method = 'max_pool'
18
-
19
- def forward(self, xyz: torch.Tensor, features: torch.Tensor = None, new_xyz=None) -> (torch.Tensor, torch.Tensor):
20
- """
21
- :param xyz: (B, N, 3) tensor of the xyz coordinates of the features
22
- :param features: (B, N, C) tensor of the descriptors of the the features
23
- :param new_xyz:
24
- :return:
25
- new_xyz: (B, npoint, 3) tensor of the new features' xyz
26
- new_features: (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors
27
- """
28
- new_features_list = []
29
-
30
- xyz_flipped = xyz.transpose(1, 2).contiguous()
31
- if new_xyz is None:
32
- new_xyz = pointnet2_utils.gather_operation(
33
- xyz_flipped,
34
- pointnet2_utils.furthest_point_sample(xyz, self.npoint)
35
- ).transpose(1, 2).contiguous() if self.npoint is not None else None
36
-
37
- for i in range(len(self.groupers)):
38
- new_features = self.groupers[i](xyz, new_xyz, features) # (B, C, npoint, nsample)
39
-
40
- new_features = self.mlps[i](new_features) # (B, mlp[-1], npoint, nsample)
41
- if self.pool_method == 'max_pool':
42
- new_features = F.max_pool2d(
43
- new_features, kernel_size=[1, new_features.size(3)]
44
- ) # (B, mlp[-1], npoint, 1)
45
- elif self.pool_method == 'avg_pool':
46
- new_features = F.avg_pool2d(
47
- new_features, kernel_size=[1, new_features.size(3)]
48
- ) # (B, mlp[-1], npoint, 1)
49
- else:
50
- raise NotImplementedError
51
-
52
- new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint)
53
- new_features_list.append(new_features)
54
-
55
- return new_xyz, torch.cat(new_features_list, dim=1)
56
-
57
-
58
class PointnetSAModuleMSG(_PointnetSAModuleBase):
    """Pointnet set abstraction layer with multiscale grouping"""

    def __init__(self, *, npoint: int, radii: List[float], nsamples: List[int], mlps: List[List[int]], bn: bool = True,
                 use_xyz: bool = True, pool_method='max_pool', instance_norm=False):
        """
        :param npoint: int; when None every scale uses GroupAll instead of a ball query
        :param radii: list of float, list of radii to group with
        :param nsamples: list of int, number of samples in each ball query
        :param mlps: list of list of int, spec of the pointnet before the global pooling for each scale
            (not modified; a copy is widened by 3 input channels when ``use_xyz`` is set)
        :param bn: whether to use batchnorm
        :param use_xyz: whether the grouped xyz offsets are concatenated to the features
        :param pool_method: max_pool / avg_pool
        :param instance_norm: whether to use instance_norm
        """
        super().__init__()

        assert len(radii) == len(nsamples) == len(mlps)

        self.npoint = npoint
        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for radius, nsample, spec in zip(radii, nsamples, mlps):
            self.groupers.append(
                pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz)
                if npoint is not None else pointnet2_utils.GroupAll(use_xyz)
            )
            # Copy before widening: `mlp_spec[0] += 3` on the caller's list would
            # silently change their config and compound if the spec is reused.
            mlp_spec = list(spec)
            if use_xyz:
                mlp_spec[0] += 3

            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn, instance_norm=instance_norm))
        self.pool_method = pool_method
93
-
94
-
95
class PointnetSAModule(PointnetSAModuleMSG):
    """Pointnet set abstraction layer"""

    def __init__(self, *, mlp: List[int], npoint: int = None, radius: float = None, nsample: int = None,
                 bn: bool = True, use_xyz: bool = True, pool_method='max_pool', instance_norm=False):
        """
        Single-scale convenience wrapper: every per-scale argument is wrapped in a
        one-element list and forwarded to PointnetSAModuleMSG.

        :param mlp: list of int, spec of the pointnet before the global max_pool
        :param npoint: int, number of features
        :param radius: float, radius of ball
        :param nsample: int, number of samples in the ball query
        :param bn: whether to use batchnorm
        :param use_xyz:
        :param pool_method: max_pool / avg_pool
        :param instance_norm: whether to use instance_norm
        """
        single_scale = dict(mlps=[mlp], radii=[radius], nsamples=[nsample])
        super().__init__(
            npoint=npoint,
            bn=bn,
            use_xyz=use_xyz,
            pool_method=pool_method,
            instance_norm=instance_norm,
            **single_scale
        )
114
-
115
-
116
class PointnetFPModule(nn.Module):
    r"""Propagates the features of one set to another"""

    def __init__(self, *, mlp: List[int], bn: bool = True):
        """
        :param mlp: list of int
        :param bn: whether to use batchnorm
        """
        super().__init__()
        self.mlp = pt_utils.SharedMLP(mlp, bn=bn)

    def forward(
            self, unknown: torch.Tensor, known: torch.Tensor, unknow_feats: torch.Tensor, known_feats: torch.Tensor
    ) -> torch.Tensor:
        """
        :param unknown: (B, n, 3) tensor of the xyz positions of the unknown features
        :param known: (B, m, 3) tensor of the xyz positions of the known features,
            or None to broadcast ``known_feats`` over all n positions
        :param unknow_feats: (B, C1, n) tensor of the features to be propagated to, or None
        :param known_feats: (B, C2, m) tensor of features to be propagated
        :return:
            new_features: (B, mlp[-1], n) tensor of the features of the unknown features
        """
        if known is None:
            batch, channels = known_feats.size()[0:2]
            interpolated_feats = known_feats.expand(batch, channels, unknown.size(1))
        else:
            dist, idx = pointnet2_utils.three_nn(unknown, known)
            # inverse-distance weights, normalised over the three neighbours
            dist_recip = 1.0 / (dist + 1e-8)
            norm = torch.sum(dist_recip, dim=2, keepdim=True)
            weight = dist_recip / norm

            interpolated_feats = pointnet2_utils.three_interpolate(known_feats, idx, weight)

        if unknow_feats is None:
            new_features = interpolated_feats
        else:
            new_features = torch.cat([interpolated_feats, unknow_feats], dim=1)  # (B, C2 + C1, n)

        # the shared MLP is a 2-D conv stack, so add and then drop a trailing axis
        return self.mlp(new_features.unsqueeze(-1)).squeeze(-1)
157
-
158
-
159
- if __name__ == "__main__":
160
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/pointnet2_utils.py DELETED
@@ -1,290 +0,0 @@
1
- import torch
2
- from torch.autograd import Variable
3
- from torch.autograd import Function
4
- import torch.nn as nn
5
- from typing import Tuple
6
-
7
- import pointnet2_cuda as pointnet2
8
-
9
-
10
class FurthestPointSampling(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` furthest-point-sampling kernel."""

    @staticmethod
    def forward(ctx, xyz: torch.Tensor, npoint: int) -> torch.Tensor:
        """
        Uses iterative furthest point sampling to select a set of npoint features that have the largest
        minimum distance
        :param ctx:
        :param xyz: (B, N, 3) where N > npoint
        :param npoint: int, number of features in the sampled set
        :return:
            output: (B, npoint) tensor containing the set
        """
        assert xyz.is_contiguous()

        B, N, _ = xyz.size()
        # Allocate next to the input: legacy torch.cuda.*Tensor constructors use
        # the *current* CUDA device, which is wrong when xyz is on another GPU.
        output = torch.empty((B, npoint), dtype=torch.int32, device=xyz.device)
        temp = torch.full((B, N), 1e10, dtype=torch.float32, device=xyz.device)

        pointnet2.furthest_point_sampling_wrapper(B, N, npoint, xyz, temp, output)
        return output

    @staticmethod
    def backward(ctx, a=None):
        # Sampling indices carry no gradient for either forward input.
        return None, None


furthest_point_sample = FurthestPointSampling.apply
37
-
38
-
39
class GatherOperation(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` gather-points kernels."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """
        :param ctx:
        :param features: (B, C, N)
        :param idx: (B, npoint) index tensor of the features to gather
        :return:
            output: (B, C, npoint)
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()

        B, npoint = idx.size()
        _, C, N = features.size()
        # Allocate on the input's device rather than the current CUDA device.
        output = torch.empty((B, C, npoint), dtype=torch.float32, device=features.device)

        pointnet2.gather_points_wrapper(B, C, N, npoint, features, idx, output)

        ctx.for_backwards = (idx, C, N)
        return output

    @staticmethod
    def backward(ctx, grad_out):
        """
        :param ctx:
        :param grad_out: (B, C, npoint) gradient of the forward output
        :return:
            grad_features: (B, C, N) gradient of the features
            None: ``idx`` receives no gradient
        """
        idx, C, N = ctx.for_backwards
        B, npoint = idx.size()

        # Plain tensor instead of the deprecated Variable wrapper; the kernel fills it in place.
        grad_features = torch.zeros((B, C, N), dtype=torch.float32, device=grad_out.device)
        grad_out_data = grad_out.detach().contiguous()
        pointnet2.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, idx, grad_features)
        return grad_features, None


gather_operation = GatherOperation.apply
74
-
75
-
76
class ThreeNN(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` three-nearest-neighbour kernel."""

    @staticmethod
    def forward(ctx, unknown: torch.Tensor, known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Find the three nearest neighbors of unknown in known
        :param ctx:
        :param unknown: (B, N, 3)
        :param known: (B, M, 3)
        :return:
            dist: (B, N, 3) l2 distance to the three nearest neighbors
            idx: (B, N, 3) index of 3 nearest neighbors
        """
        assert unknown.is_contiguous()
        assert known.is_contiguous()

        B, N, _ = unknown.size()
        m = known.size(1)
        # Allocate on the input's device rather than the current CUDA device.
        dist2 = torch.empty((B, N, 3), dtype=torch.float32, device=unknown.device)
        idx = torch.empty((B, N, 3), dtype=torch.int32, device=unknown.device)

        pointnet2.three_nn_wrapper(B, N, m, unknown, known, dist2, idx)
        # the kernel buffer is named dist2 and is square-rooted here to give the distance
        return torch.sqrt(dist2), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        # No gradient is propagated to either point set.
        return None, None


three_nn = ThreeNN.apply
106
-
107
-
108
class ThreeInterpolate(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` three-point interpolation kernels."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
        """
        Performs weight linear interpolation on 3 features
        :param ctx:
        :param features: (B, C, M) Features descriptors to be interpolated from
        :param idx: (B, n, 3) three nearest neighbors of the target features in features
        :param weight: (B, n, 3) weights
        :return:
            output: (B, C, n) tensor of the interpolated features
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()
        assert weight.is_contiguous()

        B, c, m = features.size()
        n = idx.size(1)
        ctx.three_interpolate_for_backward = (idx, weight, m)
        # Allocate on the input's device rather than the current CUDA device.
        output = torch.empty((B, c, n), dtype=torch.float32, device=features.device)

        pointnet2.three_interpolate_wrapper(B, c, m, n, features, idx, weight, output)
        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        :param ctx:
        :param grad_out: (B, C, n) tensor with gradients of outputs
        :return:
            grad_features: (B, C, M) tensor with gradients of features
            None: ``idx`` receives no gradient
            None: ``weight`` receives no gradient
        """
        idx, weight, m = ctx.three_interpolate_for_backward
        B, c, n = grad_out.size()

        # Plain tensor instead of the deprecated Variable wrapper; the kernel fills it in place.
        grad_features = torch.zeros((B, c, m), dtype=torch.float32, device=grad_out.device)
        grad_out_data = grad_out.detach().contiguous()

        pointnet2.three_interpolate_grad_wrapper(B, c, n, m, grad_out_data, idx, weight, grad_features)
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply
154
-
155
-
156
class GroupingOperation(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` group-points kernels."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """
        :param ctx:
        :param features: (B, C, N) tensor of features to group
        :param idx: (B, npoint, nsample) tensor containing the indices of features to group with
        :return:
            output: (B, C, npoint, nsample) tensor
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()

        B, nfeatures, nsample = idx.size()
        _, C, N = features.size()
        # Allocate on the input's device rather than the current CUDA device.
        output = torch.empty((B, C, nfeatures, nsample), dtype=torch.float32, device=features.device)

        pointnet2.group_points_wrapper(B, C, N, nfeatures, nsample, features, idx, output)

        ctx.for_backwards = (idx, N)
        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :param ctx:
        :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward
        :return:
            grad_features: (B, C, N) gradient of the features
            None: ``idx`` receives no gradient
        """
        idx, N = ctx.for_backwards

        B, C, npoint, nsample = grad_out.size()
        # Plain tensor instead of the deprecated Variable wrapper; the kernel fills it in place.
        grad_features = torch.zeros((B, C, N), dtype=torch.float32, device=grad_out.device)

        grad_out_data = grad_out.detach().contiguous()
        pointnet2.group_points_grad_wrapper(B, C, N, npoint, nsample, grad_out_data, idx, grad_features)
        return grad_features, None


grouping_operation = GroupingOperation.apply
198
-
199
-
200
class BallQuery(Function):
    """Autograd wrapper around the ``pointnet2_cuda`` ball-query kernel."""

    @staticmethod
    def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor:
        """
        :param ctx:
        :param radius: float, radius of the balls
        :param nsample: int, maximum number of features in the balls
        :param xyz: (B, N, 3) xyz coordinates of the features
        :param new_xyz: (B, npoint, 3) centers of the ball query
        :return:
            idx: (B, npoint, nsample) tensor with the indices of the features that form the query balls
        """
        assert new_xyz.is_contiguous()
        assert xyz.is_contiguous()

        B, N, _ = xyz.size()
        npoint = new_xyz.size(1)
        # Zero-initialised so slots the kernel leaves untouched hold index 0;
        # allocated on xyz's device rather than the current CUDA device.
        idx = torch.zeros((B, npoint, nsample), dtype=torch.int32, device=xyz.device)

        pointnet2.ball_query_wrapper(B, N, npoint, radius, nsample, new_xyz, xyz, idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        # One None per forward argument: indices are not differentiable.
        return None, None, None, None


ball_query = BallQuery.apply
229
-
230
-
231
class QueryAndGroup(nn.Module):
    """Ball-query the neighbours of each centroid and gather their offsets and features."""

    def __init__(self, radius: float, nsample: int, use_xyz: bool = True):
        """
        :param radius: float, radius of ball
        :param nsample: int, maximum number of features to gather in the ball
        :param use_xyz: whether the centred xyz offsets are part of the output
        """
        super().__init__()
        self.radius = radius
        self.nsample = nsample
        self.use_xyz = use_xyz

    def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None) -> Tuple[torch.Tensor]:
        """
        :param xyz: (B, N, 3) xyz coordinates of the features
        :param new_xyz: (B, npoint, 3) centroids
        :param features: (B, C, N) descriptors of the features
        :return:
            new_features: (B, 3 + C, npoint, nsample); a single tensor despite the annotation
        """
        idx = ball_query(self.radius, self.nsample, xyz, new_xyz)
        grouped_xyz = grouping_operation(xyz.transpose(1, 2).contiguous(), idx)  # (B, 3, npoint, nsample)
        # centre every neighbourhood on its centroid
        grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1)

        if features is None:
            assert self.use_xyz, "Cannot have not features and not use xyz as a feature!"
            return grouped_xyz

        grouped_features = grouping_operation(features, idx)
        if not self.use_xyz:
            return grouped_features
        return torch.cat([grouped_xyz, grouped_features], dim=1)  # (B, C + 3, npoint, nsample)
265
-
266
-
267
class GroupAll(nn.Module):
    """Treat the whole point set as a single group."""

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz

    def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None):
        """
        :param xyz: (B, N, 3) xyz coordinates of the features
        :param new_xyz: ignored
        :param features: (B, C, N) descriptors of the features
        :return:
            new_features: (B, C + 3, 1, N); only xyz when ``features`` is None,
            only features when ``use_xyz`` is False
        """
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
        if features is None:
            # xyz is returned here regardless of use_xyz
            return grouped_xyz

        grouped_features = features.unsqueeze(2)
        if not self.use_xyz:
            return grouped_features
        return torch.cat([grouped_xyz, grouped_features], dim=1)  # (B, 3 + C, 1, N)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/pytorch_utils.py DELETED
@@ -1,236 +0,0 @@
1
- import torch.nn as nn
2
- from typing import List, Tuple
3
-
4
-
5
class SharedMLP(nn.Sequential):
    """Stack of Conv2d units whose widths are given by consecutive entries of ``args``."""

    def __init__(
            self,
            args: List[int],
            *,
            bn: bool = False,
            activation=nn.ReLU(inplace=True),
            preact: bool = False,
            first: bool = False,
            name: str = "",
            instance_norm: bool = False,
    ):
        super().__init__()

        for i in range(len(args) - 1):
            # Only the very first layer of a pre-activation stack marked `first`
            # goes without norm and activation.
            normalised = not first or not preact or (i != 0)
            layer = Conv2d(
                args[i],
                args[i + 1],
                bn=normalised and bn,
                activation=activation if normalised else None,
                preact=preact,
                instance_norm=instance_norm
            )
            self.add_module(name + 'layer{}'.format(i), layer)
33
-
34
-
35
- class _ConvBase(nn.Sequential):
36
-
37
- def __init__(
38
- self,
39
- in_size,
40
- out_size,
41
- kernel_size,
42
- stride,
43
- padding,
44
- activation,
45
- bn,
46
- init,
47
- conv=None,
48
- batch_norm=None,
49
- bias=True,
50
- preact=False,
51
- name="",
52
- instance_norm=False,
53
- instance_norm_func=None
54
- ):
55
- super().__init__()
56
-
57
- bias = bias and (not bn)
58
- conv_unit = conv(
59
- in_size,
60
- out_size,
61
- kernel_size=kernel_size,
62
- stride=stride,
63
- padding=padding,
64
- bias=bias
65
- )
66
- init(conv_unit.weight)
67
- if bias:
68
- nn.init.constant_(conv_unit.bias, 0)
69
-
70
- if bn:
71
- if not preact:
72
- bn_unit = batch_norm(out_size)
73
- else:
74
- bn_unit = batch_norm(in_size)
75
- if instance_norm:
76
- if not preact:
77
- in_unit = instance_norm_func(out_size, affine=False, track_running_stats=False)
78
- else:
79
- in_unit = instance_norm_func(in_size, affine=False, track_running_stats=False)
80
-
81
- if preact:
82
- if bn:
83
- self.add_module(name + 'bn', bn_unit)
84
-
85
- if activation is not None:
86
- self.add_module(name + 'activation', activation)
87
-
88
- if not bn and instance_norm:
89
- self.add_module(name + 'in', in_unit)
90
-
91
- self.add_module(name + 'conv', conv_unit)
92
-
93
- if not preact:
94
- if bn:
95
- self.add_module(name + 'bn', bn_unit)
96
-
97
- if activation is not None:
98
- self.add_module(name + 'activation', activation)
99
-
100
- if not bn and instance_norm:
101
- self.add_module(name + 'in', in_unit)
102
-
103
-
104
- class _BNBase(nn.Sequential):
105
-
106
- def __init__(self, in_size, batch_norm=None, name=""):
107
- super().__init__()
108
- self.add_module(name + "bn", batch_norm(in_size))
109
-
110
- nn.init.constant_(self[0].weight, 1.0)
111
- nn.init.constant_(self[0].bias, 0)
112
-
113
-
114
class BatchNorm1d(_BNBase):
    """``_BNBase`` specialised to ``nn.BatchNorm1d``; ``name`` is keyword-only."""

    def __init__(self, in_size: int, *, name: str = ""):
        norm_cls = nn.BatchNorm1d
        super().__init__(in_size, batch_norm=norm_cls, name=name)
118
-
119
-
120
class BatchNorm2d(_BNBase):
    """``_BNBase`` specialised to ``nn.BatchNorm2d``."""

    def __init__(self, in_size: int, name: str = ""):
        norm_cls = nn.BatchNorm2d
        super().__init__(in_size, batch_norm=norm_cls, name=name)
124
-
125
-
126
class Conv1d(_ConvBase):
    """``_ConvBase`` wired to ``nn.Conv1d``, this module's ``BatchNorm1d`` and ``nn.InstanceNorm1d``."""

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            kernel_size: int = 1,
            stride: int = 1,
            padding: int = 0,
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=nn.init.kaiming_normal_,
            bias: bool = True,
            preact: bool = False,
            name: str = "",
            instance_norm=False
    ):
        # Everything is forwarded by keyword; only the layer classes are fixed here.
        super().__init__(
            in_size=in_size,
            out_size=out_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            activation=activation,
            bn=bn,
            init=init,
            conv=nn.Conv1d,
            batch_norm=BatchNorm1d,
            bias=bias,
            preact=preact,
            name=name,
            instance_norm=instance_norm,
            instance_norm_func=nn.InstanceNorm1d
        )
161
-
162
-
163
class Conv2d(_ConvBase):
    """``_ConvBase`` wired to ``nn.Conv2d``, this module's ``BatchNorm2d`` and ``nn.InstanceNorm2d``."""

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            kernel_size: Tuple[int, int] = (1, 1),
            stride: Tuple[int, int] = (1, 1),
            padding: Tuple[int, int] = (0, 0),
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=nn.init.kaiming_normal_,
            bias: bool = True,
            preact: bool = False,
            name: str = "",
            instance_norm=False
    ):
        # Everything is forwarded by keyword; only the layer classes are fixed here.
        super().__init__(
            in_size=in_size,
            out_size=out_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            activation=activation,
            bn=bn,
            init=init,
            conv=nn.Conv2d,
            batch_norm=BatchNorm2d,
            bias=bias,
            preact=preact,
            name=name,
            instance_norm=instance_norm,
            instance_norm_func=nn.InstanceNorm2d
        )
198
-
199
-
200
class FC(nn.Sequential):
    """Linear layer with optional batch norm and activation, in pre- or post-activation order."""

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=None,
            preact: bool = False,
            name: str = ""
    ):
        """
        :param in_size: input width of the linear layer
        :param out_size: output width of the linear layer
        :param activation: module applied as the activation, or None to skip it
        :param bn: add a BatchNorm1d; the linear layer then has no bias
        :param init: optional callable applied to the linear weight
        :param preact: place norm/activation before the linear layer instead of after
        :param name: prefix for the registered sub-module names
        """
        super().__init__()

        fc = nn.Linear(in_size, out_size, bias=not bn)
        if init is not None:
            init(fc.weight)
        if not bn:
            # in-place initialiser, as _ConvBase already uses; `nn.init.constant`
            # without the underscore is deprecated
            nn.init.constant_(fc.bias, 0)

        # norm width depends on which side of the linear layer it sits
        norm_width = in_size if preact else out_size
        companions = []
        if bn:
            companions.append(('bn', BatchNorm1d(norm_width)))
        if activation is not None:
            companions.append(('activation', activation))

        if preact:
            for suffix, module in companions:
                self.add_module(name + suffix, module)

        self.add_module(name + 'fc', fc)

        if not preact:
            for suffix, module in companions:
                self.add_module(name + suffix, module)
236
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/setup.py DELETED
@@ -1,23 +0,0 @@
1
- from setuptools import setup
2
- from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3
-
4
# C++/CUDA sources compiled into the `pointnet2_cuda` extension module.
_POINTNET2_SOURCES = [
    'src/pointnet2_api.cpp',

    'src/ball_query.cpp',
    'src/ball_query_gpu.cu',
    'src/group_points.cpp',
    'src/group_points_gpu.cu',
    'src/interpolate.cpp',
    'src/interpolate_gpu.cu',
    'src/sampling.cpp',
    'src/sampling_gpu.cu',
]

# Per-compiler flags: debug symbols for the host compiler, -O2 for nvcc.
_COMPILE_ARGS = {'cxx': ['-g'],
                 'nvcc': ['-O2']}

setup(
    name='pointnet2',
    ext_modules=[
        CUDAExtension('pointnet2_cuda', _POINTNET2_SOURCES,
                      extra_compile_args=_COMPILE_ARGS)
    ],
    cmdclass={'build_ext': BuildExtension}
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/ball_query.cpp DELETED
@@ -1,24 +0,0 @@
1
- #include <torch/serialize/tensor.h>
2
- #include <vector>
3
- #include <ATen/cuda/CUDAContext.h>
4
- #include <ATen/cuda/CUDAEvent.h>
5
- #include <cuda.h>
6
- #include <cuda_runtime_api.h>
7
- #include "ball_query_gpu.h"
8
-
9
- #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
10
- #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
11
- #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
12
-
13
- int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
14
- at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
15
- CHECK_INPUT(new_xyz_tensor);
16
- CHECK_INPUT(xyz_tensor);
17
- const float *new_xyz = new_xyz_tensor.data<float>();
18
- const float *xyz = xyz_tensor.data<float>();
19
- int *idx = idx_tensor.data<int>();
20
-
21
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
22
- ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
23
- return 1;
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/ball_query_gpu.cu DELETED
@@ -1,67 +0,0 @@
1
- #include <math.h>
2
- #include <stdio.h>
3
- #include <stdlib.h>
4
-
5
- #include "ball_query_gpu.h"
6
- #include "cuda_utils.h"
7
-
8
-
9
// For each query centre, collects up to `nsample` indices of points whose
// squared distance to the centre is below radius^2, in scan order.
//   new_xyz: (B, M, 3)
//   xyz:     (B, N, 3)
// output:
//   idx:     (B, M, nsample)
// One thread handles one (batch, query) pair. On the first hit every slot is
// filled with that index, so unused slots end up repeating the first
// neighbour. If no point is in range, idx is left untouched.
__global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
    const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
    const int batch = blockIdx.y;
    const int query = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || query >= m) return;

    const float *centre = new_xyz + (batch * m * 3 + query * 3);
    const float *cloud = xyz + (batch * n * 3);
    int *slots = idx + (batch * m * nsample + query * nsample);

    const float radius2 = radius * radius;
    const float cx = centre[0];
    const float cy = centre[1];
    const float cz = centre[2];

    int found = 0;
    for (int k = 0; k < n; ++k) {
        const float px = cloud[k * 3 + 0];
        const float py = cloud[k * 3 + 1];
        const float pz = cloud[k * 3 + 2];
        const float d2 = (cx - px) * (cx - px) + (cy - py) * (cy - py) + (cz - pz) * (cz - pz);
        if (!(d2 < radius2)) continue;

        if (found == 0) {
            // First hit: pre-fill every slot so the output is fully defined.
            for (int l = 0; l < nsample; ++l) {
                slots[l] = k;
            }
        }
        slots[found] = k;
        ++found;
        if (found >= nsample) break;
    }
}
46
-
47
-
48
// Launches ball_query_kernel_fast on `stream`.
//   new_xyz: (B, M, 3)
//   xyz:     (B, N, 3)
// output:
//   idx:     (B, M, nsample)
// Grid x tiles the M queries in chunks of THREADS_PER_BLOCK; grid y is the
// batch. A launch error is printed to stderr and terminates the process.
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
    const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) {
    const dim3 grid(DIVUP(m, THREADS_PER_BLOCK), b);
    const dim3 block(THREADS_PER_BLOCK);

    ball_query_kernel_fast<<<grid, block, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/ball_query_gpu.h DELETED
@@ -1,15 +0,0 @@
1
- #ifndef _BALL_QUERY_GPU_H
2
- #define _BALL_QUERY_GPU_H
3
-
4
- #include <torch/serialize/tensor.h>
5
- #include <vector>
6
- #include <cuda.h>
7
- #include <cuda_runtime_api.h>
8
-
9
- int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
10
- at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
11
-
12
- void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
13
- const float *xyz, const float *new_xyz, int *idx, cudaStream_t stream);
14
-
15
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/cuda_utils.h DELETED
@@ -1,15 +0,0 @@
1
- #ifndef _CUDA_UTILS_H
2
- #define _CUDA_UTILS_H
3
-
4
- #include <cmath>
5
-
6
// Upper bound on the block size returned by opt_n_threads().
#define TOTAL_THREADS 1024
// Block size used by the element-wise kernel launchers.
#define THREADS_PER_BLOCK 256
// Ceiling division for a non-negative m and positive n.
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))

// Returns the largest power of two that is <= work_size, clamped to
// [1, TOTAL_THREADS].
//
// Computed with integer doubling rather than log(work_size) / log(2): the
// floating-point quotient can land just below an exact integer exponent, and
// a non-positive work_size would feed log() a value whose result cannot be
// converted to int. Non-positive inputs now simply yield 1.
inline int opt_n_threads(int work_size) {
    int threads = 1;
    while (threads < TOTAL_THREADS && threads * 2 <= work_size) {
        threads *= 2;
    }
    return threads;
}
15
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/group_points.cpp DELETED
@@ -1,34 +0,0 @@
1
- #include <torch/serialize/tensor.h>
2
- #include <cuda.h>
3
- #include <cuda_runtime_api.h>
4
- #include <vector>
5
- #include "group_points_gpu.h"
6
- #include <ATen/cuda/CUDAContext.h>
7
- #include <ATen/cuda/CUDAEvent.h>
8
-
9
-
10
-
11
- int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
12
- at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
13
-
14
- float *grad_points = grad_points_tensor.data<float>();
15
- const int *idx = idx_tensor.data<int>();
16
- const float *grad_out = grad_out_tensor.data<float>();
17
-
18
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
19
- group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points, stream);
20
- return 1;
21
- }
22
-
23
-
24
- int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
25
- at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
26
-
27
- const float *points = points_tensor.data<float>();
28
- const int *idx = idx_tensor.data<int>();
29
- float *out = out_tensor.data<float>();
30
-
31
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
32
- group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out, stream);
33
- return 1;
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/group_points_gpu.cu DELETED
@@ -1,86 +0,0 @@
1
- #include <stdio.h>
2
- #include <stdlib.h>
3
-
4
- #include "cuda_utils.h"
5
- #include "group_points_gpu.h"
6
-
7
-
8
// Backward of grouping: scatters each output gradient back to the source
// point it was gathered from.
//   grad_out:    (B, C, npoints, nsample)
//   idx:         (B, npoints, nsample)
// output:
//   grad_points: (B, C, N)
// One thread per (batch, channel, point, sample). atomicAdd is used because
// several samples may refer to the same source point.
__global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
    const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    const int point = flat / nsample;
    if (batch >= b || channel >= c || point >= npoints) return;

    const int sample = flat % nsample;
    const float *g = grad_out + (batch * c * npoints * nsample + channel * npoints * nsample + point * nsample + sample);
    const int *src = idx + (batch * npoints * nsample + point * nsample + sample);

    atomicAdd(grad_points + batch * c * n + channel * n + src[0], g[0]);
}
26
-
27
// Launches group_points_grad_kernel_fast on `stream`.
//   grad_out:    (B, C, npoints, nsample)
//   idx:         (B, npoints, nsample)
// output:
//   grad_points: (B, C, N)
// Grid x tiles the npoints * nsample elements, grid y is the channel, grid z
// the batch. A launch error is printed to stderr and terminates the process.
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
    const dim3 grid(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    group_points_grad_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
45
-
46
-
47
// Forward of grouping: gathers, for every (point, sample) pair, the feature
// of the source point selected by idx.
//   points: (B, C, N)
//   idx:    (B, npoints, nsample)
// output:
//   out:    (B, C, npoints, nsample)
// One thread per (batch, channel, point, sample).
__global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;
    const int point = flat / nsample;
    if (batch >= b || channel >= c || point >= npoints) return;

    const int sample = flat % nsample;
    const int *src = idx + (batch * npoints * nsample + point * nsample + sample);

    const int read_at = batch * c * n + channel * n + src[0];
    const int write_at = batch * c * npoints * nsample + channel * npoints * nsample + point * nsample + sample;
    out[write_at] = points[read_at];
}
67
-
68
-
69
// Launches group_points_kernel_fast on `stream`.
//   points: (B, C, N)
//   idx:    (B, npoints, nsample)
// output:
//   out:    (B, C, npoints, nsample)
// Grid x tiles the npoints * nsample elements, grid y is the channel, grid z
// the batch. A launch error is printed to stderr and terminates the process.
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
    const float *points, const int *idx, float *out, cudaStream_t stream) {
    const dim3 grid(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    group_points_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, npoints, nsample, points, idx, out);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/group_points_gpu.h DELETED
@@ -1,22 +0,0 @@
1
- #ifndef _GROUP_POINTS_GPU_H
2
- #define _GROUP_POINTS_GPU_H
3
-
4
- #include <torch/serialize/tensor.h>
5
- #include <cuda.h>
6
- #include <cuda_runtime_api.h>
7
- #include <vector>
8
-
9
-
10
- int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
11
- at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
12
-
13
- void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
14
- const float *points, const int *idx, float *out, cudaStream_t stream);
15
-
16
- int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
17
- at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
18
-
19
- void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
20
- const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
21
-
22
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/interpolate.cpp DELETED
@@ -1,53 +0,0 @@
1
- #include <torch/serialize/tensor.h>
2
- #include <vector>
3
- #include <math.h>
4
- #include <stdio.h>
5
- #include <stdlib.h>
6
- #include <cuda.h>
7
- #include <cuda_runtime_api.h>
8
- #include "interpolate_gpu.h"
9
- #include <ATen/cuda/CUDAContext.h>
10
- #include <ATen/cuda/CUDAEvent.h>
11
-
12
-
13
- void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
14
- at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
15
- const float *unknown = unknown_tensor.data<float>();
16
- const float *known = known_tensor.data<float>();
17
- float *dist2 = dist2_tensor.data<float>();
18
- int *idx = idx_tensor.data<int>();
19
-
20
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
21
- three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx, stream);
22
- }
23
-
24
-
25
- void three_interpolate_wrapper_fast(int b, int c, int m, int n,
26
- at::Tensor points_tensor,
27
- at::Tensor idx_tensor,
28
- at::Tensor weight_tensor,
29
- at::Tensor out_tensor) {
30
-
31
- const float *points = points_tensor.data<float>();
32
- const float *weight = weight_tensor.data<float>();
33
- float *out = out_tensor.data<float>();
34
- const int *idx = idx_tensor.data<int>();
35
-
36
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
37
- three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out, stream);
38
- }
39
-
40
- void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
41
- at::Tensor grad_out_tensor,
42
- at::Tensor idx_tensor,
43
- at::Tensor weight_tensor,
44
- at::Tensor grad_points_tensor) {
45
-
46
- const float *grad_out = grad_out_tensor.data<float>();
47
- const float *weight = weight_tensor.data<float>();
48
- float *grad_points = grad_points_tensor.data<float>();
49
- const int *idx = idx_tensor.data<int>();
50
-
51
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
52
- three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points, stream);
53
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/interpolate_gpu.cu DELETED
@@ -1,161 +0,0 @@
1
- #include <math.h>
2
- #include <stdio.h>
3
- #include <stdlib.h>
4
-
5
- #include "cuda_utils.h"
6
- #include "interpolate_gpu.h"
7
-
8
-
9
// For every query point, finds the three closest reference points by a linear
// scan and stores their squared distances and indices, nearest first.
//   unknown: (B, N, 3)
//   known:   (B, M, 3)
// output:
//   dist2:   (B, N, 3)
//   idx:     (B, N, 3)
// One thread per (batch, query). When fewer than three reference points
// exist, the unfilled slots keep the 1e40 sentinel distance and index 0.
__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    const int batch = blockIdx.y;
    const int query = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || query >= n) return;

    const float *q = unknown + (batch * n * 3 + query * 3);
    const float *refs = known + (batch * m * 3);
    float *out_d = dist2 + (batch * n * 3 + query * 3);
    int *out_i = idx + (batch * n * 3 + query * 3);

    const float qx = q[0];
    const float qy = q[1];
    const float qz = q[2];

    // Running top-3, kept sorted ascending by distance.
    double d_1st = 1e40, d_2nd = 1e40, d_3rd = 1e40;
    int i_1st = 0, i_2nd = 0, i_3rd = 0;
    for (int k = 0; k < m; ++k) {
        const float rx = refs[k * 3 + 0];
        const float ry = refs[k * 3 + 1];
        const float rz = refs[k * 3 + 2];
        const float d = (qx - rx) * (qx - rx) + (qy - ry) * (qy - ry) + (qz - rz) * (qz - rz);
        if (d < d_1st) {
            // New nearest: shift the previous first and second down.
            d_3rd = d_2nd; i_3rd = i_2nd;
            d_2nd = d_1st; i_2nd = i_1st;
            d_1st = d; i_1st = k;
        } else if (d < d_2nd) {
            d_3rd = d_2nd; i_3rd = i_2nd;
            d_2nd = d; i_2nd = k;
        } else if (d < d_3rd) {
            d_3rd = d; i_3rd = k;
        }
    }

    out_d[0] = d_1st; out_d[1] = d_2nd; out_d[2] = d_3rd;
    out_i[0] = i_1st; out_i[1] = i_2nd; out_i[2] = i_3rd;
}
53
-
54
-
55
// Launches three_nn_kernel_fast on `stream`.
//   unknown: (B, N, 3)
//   known:   (B, M, 3)
// output:
//   dist2:   (B, N, 3)
//   idx:     (B, N, 3)
// Grid x tiles the N queries in chunks of THREADS_PER_BLOCK; grid y is the
// batch. A launch error is printed to stderr and terminates the process.
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
    const float *known, float *dist2, int *idx, cudaStream_t stream) {
    const dim3 grid(DIVUP(n, THREADS_PER_BLOCK), b);
    const dim3 block(THREADS_PER_BLOCK);

    three_nn_kernel_fast<<<grid, block, 0, stream>>>(b, n, m, unknown, known, dist2, idx);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
75
-
76
-
77
// Forward interpolation: each output feature is the weighted sum of three
// source features selected by idx.
//   points: (B, C, M)
//   idx:    (B, N, 3)
//   weight: (B, N, 3)
// output:
//   out:    (B, C, N)
// One thread per (batch, channel, output point).
__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || channel >= c || point >= n) return;

    const float *w = weight + (batch * n * 3 + point * 3);
    const float *src = points + (batch * c * m + channel * m);
    const int *nbr = idx + (batch * n * 3 + point * 3);
    float *dst = out + (batch * c * n + channel * n);

    dst[point] = w[0] * src[nbr[0]] + w[1] * src[nbr[1]] + w[2] * src[nbr[2]];
}
98
-
99
// Launches three_interpolate_kernel_fast on `stream`.
//   points: (B, C, M)
//   idx:    (B, N, 3)
//   weight: (B, N, 3)
// output:
//   out:    (B, C, N)
// Grid x tiles the N output points, grid y is the channel, grid z the batch.
// A launch error is printed to stderr and terminates the process.
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
    const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream) {
    const dim3 grid(DIVUP(n, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    three_interpolate_kernel_fast<<<grid, block, 0, stream>>>(b, c, m, n, points, idx, weight, out);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
118
-
119
-
120
// Backward interpolation: distributes each output gradient to its three
// source points, scaled by the matching interpolation weight.
//   grad_out:    (B, C, N)
//   weight:      (B, N, 3)
// output:
//   grad_points: (B, C, M)
// idx is read as three entries per (batch, point), like weight. One thread
// per (batch, channel, output point); atomicAdd is used because several
// output points may share a source point.
__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || channel >= c || point >= n) return;

    const float *g = grad_out + (batch * c * n + channel * n + point);
    const float *w = weight + (batch * n * 3 + point * 3);
    float *dst = grad_points + (batch * c * m + channel * m);
    const int *nbr = idx + (batch * n * 3 + point * 3);

    for (int j = 0; j < 3; ++j) {
        atomicAdd(dst + nbr[j], g[0] * w[j]);
    }
}
143
-
144
// Launches three_interpolate_grad_kernel_fast on `stream`.
//   grad_out:    (B, C, N)
//   weight:      (B, N, 3)
// output:
//   grad_points: (B, C, M)
// Grid x tiles the N points, grid y is the channel, grid z the batch.
// A launch error is printed to stderr and terminates the process.
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
    const int *idx, const float *weight, float *grad_points, cudaStream_t stream) {
    const dim3 grid(DIVUP(n, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    three_interpolate_grad_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, m, grad_out, idx, weight, grad_points);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/interpolate_gpu.h DELETED
@@ -1,30 +0,0 @@
1
- #ifndef _INTERPOLATE_GPU_H
2
- #define _INTERPOLATE_GPU_H
3
-
4
- #include <torch/serialize/tensor.h>
5
- #include<vector>
6
- #include <cuda.h>
7
- #include <cuda_runtime_api.h>
8
-
9
-
10
- void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
11
- at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
12
-
13
- void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
14
- const float *known, float *dist2, int *idx, cudaStream_t stream);
15
-
16
-
17
- void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
18
- at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
19
-
20
- void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
21
- const float *points, const int *idx, const float *weight, float *out, cudaStream_t stream);
22
-
23
-
24
- void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
25
- at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
26
-
27
- void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
28
- const int *idx, const float *weight, float *grad_points, cudaStream_t stream);
29
-
30
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/pointnet2_api.cpp DELETED
@@ -1,24 +0,0 @@
1
- #include <torch/serialize/tensor.h>
2
- #include <torch/extension.h>
3
-
4
- #include "ball_query_gpu.h"
5
- #include "group_points_gpu.h"
6
- #include "sampling_gpu.h"
7
- #include "interpolate_gpu.h"
8
-
9
-
10
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
11
- m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");
12
-
13
- m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
14
- m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");
15
-
16
- m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
17
- m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");
18
-
19
- m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
20
-
21
- m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
22
- m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
23
- m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/sampling.cpp DELETED
@@ -1,45 +0,0 @@
1
- #include <torch/serialize/tensor.h>
2
- #include <ATen/cuda/CUDAContext.h>
3
- #include <vector>
4
- #include <ATen/cuda/CUDAContext.h>
5
- #include <ATen/cuda/CUDAEvent.h>
6
- #include "sampling_gpu.h"
7
-
8
-
9
-
10
- int gather_points_wrapper_fast(int b, int c, int n, int npoints,
11
- at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){
12
- const float *points = points_tensor.data<float>();
13
- const int *idx = idx_tensor.data<int>();
14
- float *out = out_tensor.data<float>();
15
-
16
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
17
- gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out, stream);
18
- return 1;
19
- }
20
-
21
-
22
- int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
23
- at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
24
-
25
- const float *grad_out = grad_out_tensor.data<float>();
26
- const int *idx = idx_tensor.data<int>();
27
- float *grad_points = grad_points_tensor.data<float>();
28
-
29
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
30
- gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points, stream);
31
- return 1;
32
- }
33
-
34
-
35
- int furthest_point_sampling_wrapper(int b, int n, int m,
36
- at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
37
-
38
- const float *points = points_tensor.data<float>();
39
- float *temp = temp_tensor.data<float>();
40
- int *idx = idx_tensor.data<int>();
41
-
42
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
43
- furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
44
- return 1;
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/sampling_gpu.cu DELETED
@@ -1,253 +0,0 @@
1
- #include <stdio.h>
2
- #include <stdlib.h>
3
-
4
- #include "cuda_utils.h"
5
- #include "sampling_gpu.h"
6
-
7
-
8
// Forward gather: copies, per channel, the feature of each selected point.
//   points: (B, C, N)
//   idx:    (B, M)
// output:
//   out:    (B, C, M)
// One thread per (batch, channel, selected point).
__global__ void gather_points_kernel_fast(int b, int c, int n, int m,
    const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || channel >= c || point >= m) return;

    float *dst = out + (batch * c * m + channel * m + point);
    const int *sel = idx + (batch * m + point);
    const float *src = points + (batch * c * n + channel * n);

    dst[0] = src[sel[0]];
}
25
-
26
// Launches gather_points_kernel_fast on `stream`.
//   points: (B, C, N)
//   idx:    (B, npoints)
// output:
//   out:    (B, C, npoints)
// Grid x tiles the npoints selections, grid y is the channel, grid z the
// batch. A launch error is printed to stderr and terminates the process.
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
    const float *points, const int *idx, float *out, cudaStream_t stream) {
    const dim3 grid(DIVUP(npoints, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    gather_points_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, npoints, points, idx, out);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
45
-
46
// Backward gather: adds each output gradient onto the source point it was
// read from.
//   grad_out:    (B, C, M)
//   idx:         (B, M)
// output:
//   grad_points: (B, C, N)
// One thread per (batch, channel, selected point); atomicAdd is used because
// idx may select the same source point more than once.
__global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, float *__restrict__ grad_points) {
    const int batch = blockIdx.z;
    const int channel = blockIdx.y;
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch >= b || channel >= c || point >= m) return;

    const float *g = grad_out + (batch * c * m + channel * m + point);
    const int *sel = idx + (batch * m + point);
    float *dst = grad_points + (batch * c * n + channel * n);

    atomicAdd(dst + sel[0], g[0]);
}
64
-
65
// Launches gather_points_grad_kernel_fast on `stream`.
//   grad_out:    (B, C, npoints)
//   idx:         (B, npoints)
// output:
//   grad_points: (B, C, N)
// Grid x tiles the npoints selections, grid y is the channel, grid z the
// batch. A launch error is printed to stderr and terminates the process.
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
    const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) {
    const dim3 grid(DIVUP(npoints, THREADS_PER_BLOCK), c, b);
    const dim3 block(THREADS_PER_BLOCK);

    gather_points_grad_kernel_fast<<<grid, block, 0, stream>>>(b, c, n, npoints, grad_out, idx, grad_points);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
84
-
85
-
86
// One step of an arg-max reduction: folds slot idx2 into slot idx1.
// dists[idx1] becomes the larger of the two distances and dists_i[idx1] the
// index that goes with it; on a tie the index already in idx1 is kept.
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
    const float kept = dists[idx1];
    const float cand = dists[idx2];
    const int kept_i = dists_i[idx1];
    const int cand_i = dists_i[idx2];

    dists[idx1] = max(kept, cand);
    dists_i[idx1] = (cand > kept) ? cand_i : kept_i;
}
92
-
93
// Iterative furthest point sampling; one thread block per batch element.
//   dataset: (B, N, 3)
//   temp:    (B, N)  running minimum squared distance of every point to the
//                    points selected so far; it is read before being written,
//                    so the caller must initialise it (presumably to a large
//                    value -- verify against the Python caller)
// output:
//   idxs:    (B, M)
// Point 0 is always selected first. Each round, every thread scans a strided
// share of the points, updates temp, and keeps its local furthest candidate;
// a shared-memory reduction then picks the block-wide furthest point.
// The halving reduction assumes block_size is a power of two, which holds for
// every instantiation made by the launcher in this file.
template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(int b, int n, int m,
    const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
    if (m <= 0) return;

    __shared__ float dists[block_size];
    __shared__ int dists_i[block_size];

    const int batch = blockIdx.x;
    const int tid = threadIdx.x;
    const int step = block_size;

    const float *pts = dataset + (batch * n * 3);
    float *min_d = temp + (batch * n);
    int *picked = idxs + (batch * m);

    int last = 0;
    if (tid == 0)
        picked[0] = last;
    __syncthreads();

    for (int j = 1; j < m; j++) {
        const float lx = pts[last * 3 + 0];
        const float ly = pts[last * 3 + 1];
        const float lz = pts[last * 3 + 2];

        // Per-thread furthest candidate over this thread's share of points.
        int far_i = 0;
        float far_d = -1;
        for (int k = tid; k < n; k += step) {
            const float px = pts[k * 3 + 0];
            const float py = pts[k * 3 + 1];
            const float pz = pts[k * 3 + 2];

            const float d = (px - lx) * (px - lx) + (py - ly) * (py - ly) + (pz - lz) * (pz - lz);
            const float d2 = min(d, min_d[k]);
            min_d[k] = d2;
            if (d2 > far_d) {
                far_i = k;
                far_d = d2;
            }
        }
        dists[tid] = far_d;
        dists_i[tid] = far_i;
        __syncthreads();

        // Block-wide arg-max: halve the active range each step.
        for (int half = block_size / 2; half > 0; half >>= 1) {
            if (tid < half) {
                __update(dists, dists_i, tid, tid + half);
            }
            __syncthreads();
        }

        // NOTE(review): there is no barrier between this read of dists_i[0]
        // and the next round's write to dists_i[tid]; confirm this cannot
        // race for thread 0 versus slower warps.
        last = dists_i[0];
        if (tid == 0)
            picked[j] = last;
    }
}
210
-
211
// Launches furthest_point_sampling_kernel on `stream`, one block per batch
// element.
//   dataset: (B, N, 3)
//   temp:    (B, N)
// output:
//   idxs:    (B, M)
// The block size comes from opt_n_threads(n) and selects the matching
// template instantiation; any other value falls back to the 512
// instantiation. A launch error is printed to stderr and terminates the
// process.
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
    const float *dataset, float *temp, int *idxs, cudaStream_t stream) {
    const unsigned int n_threads = opt_n_threads(n);

// Expands to the launch of the instantiation for block size BS.
#define FPS_LAUNCH(BS) \
    furthest_point_sampling_kernel<BS><<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs)

    switch (n_threads) {
        case 1024: FPS_LAUNCH(1024); break;
        case 512:  FPS_LAUNCH(512);  break;
        case 256:  FPS_LAUNCH(256);  break;
        case 128:  FPS_LAUNCH(128);  break;
        case 64:   FPS_LAUNCH(64);   break;
        case 32:   FPS_LAUNCH(32);   break;
        case 16:   FPS_LAUNCH(16);   break;
        case 8:    FPS_LAUNCH(8);    break;
        case 4:    FPS_LAUNCH(4);    break;
        case 2:    FPS_LAUNCH(2);    break;
        case 1:    FPS_LAUNCH(1);    break;
        default:   FPS_LAUNCH(512);
    }

#undef FPS_LAUNCH

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(status));
    exit(-1);
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/pointnet2/src/sampling_gpu.h DELETED
@@ -1,29 +0,0 @@
1
- #ifndef _SAMPLING_GPU_H
2
- #define _SAMPLING_GPU_H
3
-
4
- #include <torch/serialize/tensor.h>
5
- #include <ATen/cuda/CUDAContext.h>
6
- #include<vector>
7
-
8
-
9
- int gather_points_wrapper_fast(int b, int c, int n, int npoints,
10
- at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
11
-
12
- void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
13
- const float *points, const int *idx, float *out, cudaStream_t stream);
14
-
15
-
16
- int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
17
- at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
18
-
19
- void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
20
- const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream);
21
-
22
-
23
- int furthest_point_sampling_wrapper(int b, int n, int m,
24
- at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
25
-
26
- void furthest_point_sampling_kernel_launcher(int b, int n, int m,
27
- const float *dataset, float *temp, int *idxs, cudaStream_t stream);
28
-
29
- #endif