// fast_xcor.h
#ifndef __FAST_XCOR_H__
#define __FAST_XCOR_H__
#include <iostream>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/bounds_check.h"
using namespace tensorflow;
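// Device-templated functor that carries out the actual cross-correlation work.
// The OpKernel below forwards the problem size and raw buffers to this functor;
// a per-device implementation (CPU, and GPU under GOOGLE_CUDA) is presumably
// provided in the accompanying implementation files.
//   n_n, n_h, n_w, n_c : batch, height, width and channels of the input.
//   n_d                : side length of the displacement window (2 * delta + 1).
//   s_h, s_w           : vertical and horizontal correlation strides.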
template <typename Device, typename T>
struct FastXCor {
  void operator()(const Device& d,
                  const int n_n,
                  const int n_h,
                  const int n_w,
                  const int n_c,
                  const int n_d,
                  const int s_h,
                  const int s_w,
                  const T* input_data,
                  const T* filter_data,
                  T* output_data);
};
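
// For reference, a CPU specialization of the functor would plausibly look like
// the sketch below. This is NOT taken from the accompanying implementation
// file; the displacement indexing, stride semantics and zero handling at the
// borders are assumptions made for illustration only.
//
// template <typename T>
// struct FastXCor<Eigen::ThreadPoolDevice, T> {
//   void operator()(const Eigen::ThreadPoolDevice& d,
//                   const int n_n, const int n_h, const int n_w, const int n_c,
//                   const int n_d, const int s_h, const int s_w,
//                   const T* input_data, const T* filter_data, T* output_data) {
//     const int delta = (n_d - 1) / 2;
//     for (int n = 0; n < n_n; ++n) {
//       for (int y = 0; y < n_h; ++y) {
//         for (int x = 0; x < n_w; ++x) {
//           for (int i = 0; i < n_d; ++i) {
//             for (int j = 0; j < n_d; ++j) {
//               // Position in the second feature map, shifted by the displacement.
//               const int fy = y + (i - delta) * s_h;
//               const int fx = x + (j - delta) * s_w;
//               T acc = T(0);
//               if (fy >= 0 && fy < n_h && fx >= 0 && fx < n_w) {
//                 // Dot product over channels between the two NHWC feature maps.
//                 for (int c = 0; c < n_c; ++c) {
//                   acc += input_data[((n * n_h + y) * n_w + x) * n_c + c] *
//                          filter_data[((n * n_h + fy) * n_w + fx) * n_c + c];
//                 }
//               }
//               output_data[(((n * n_h + y) * n_w + x) * n_d + i) * n_d + j] = acc;
//             }
//           }
//         }
//       }
//     }
//   }
// };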
//#if GOOGLE_CUDA
//// Partially specialize the functor for Eigen::GpuDevice.
//template <typename T>
//struct FastXCor<Eigen::GpuDevice, T> {
//  void operator()(const Eigen::GpuDevice& d,
//                  const int n_n,
//                  const int n_h,
//                  const int n_w,
//                  const int n_c,
//                  const int n_d,
//                  const int s_h,
//                  const int s_w,
//                  const T* input_data,
//                  const T* filter_data,
//                  T* output_data);
//};
//#endif
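// If CUDA support is enabled, the GpuDevice specialization above would be
// uncommented here; its definition would normally live in a separate .cu.cc
// file compiled by nvcc, which is not part of this header.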
template <typename Device, typename T>
class FastXCorOp : public OpKernel {
 private:
  int delta_;     // Half-width of the displacement window; its side length is 2 * delta + 1.
  int stride_h_;  // Vertical stride, passed through to the FastXCor functor as s_h.
  int stride_w_;  // Horizontal stride, passed through to the FastXCor functor as s_w.

 public:
  explicit FastXCorOp(OpKernelConstruction* context) : OpKernel(context) {
    // Read the op attributes once at construction time.
    OP_REQUIRES_OK(context, context->GetAttr("delta", &delta_));
    OP_REQUIRES_OK(context, context->GetAttr("stride_h", &stride_h_));
    OP_REQUIRES_OK(context, context->GetAttr("stride_w", &stride_w_));
  }
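
  // Reads the two input feature maps, allocates an output of shape
  // [batch, height, width, 2*delta+1, 2*delta+1], and hands the raw buffers
  // to the device-specific FastXCor functor.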
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter = context->input(1);

    // Input is expected in NHWC layout: [batch, height, width, channels].
    const int n_n = input.shape().dim_size(0);
    const int n_h = input.shape().dim_size(1);
    const int n_w = input.shape().dim_size(2);
    const int n_c = input.shape().dim_size(3);
    // Side length of the square displacement window.
    const int n_d = 2 * delta_ + 1;
    // printf("Dimension : (%d,%d,%d,%d,%d)\n", n_n, n_h, n_w, n_d, n_d);

    // Output shape: [batch, height, width, window, window].
    TensorShape output_shape({n_n, n_h, n_w, n_d, n_d});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
    // Debugging aid: fill the freshly allocated output with ones.
    // if (!output) {
    //   fprintf(stderr, "Output Tensor Allocation Failed?\n");
    // } else {
    //   printf("Output Tensor Allocation Should Have Succeeded");
    //   int n_size = (n_n * n_h * n_w * n_d * n_d);
    //   auto output_flat = output->flat<float>();
    //   for (int i = 0; i < n_size; i++) {
    //     printf("%d ", i);
    //     output_flat(i) = 1;
    //   }
    // }

    // Dispatch the actual correlation work to the device-specific functor.
    FastXCor<Device, T>()(
        context->eigen_device<Device>(),
        n_n, n_h, n_w, n_c, n_d,
        stride_h_, stride_w_,
        input.flat<T>().data(),
        filter.flat<T>().data(),
        output->flat<T>().data());
  }
};
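
// The op and kernel registrations are not part of this header; they would
// typically appear in the accompanying .cc file, roughly along the lines of
// the sketch below. The input names and the float-only type constraint are
// assumptions for illustration; the attribute names match those read above.
//
// REGISTER_OP("FastXCor")
//     .Input("input: T")
//     .Input("filter: T")
//     .Output("output: T")
//     .Attr("T: {float}")
//     .Attr("delta: int")
//     .Attr("stride_h: int")
//     .Attr("stride_w: int");
//
// REGISTER_KERNEL_BUILDER(
//     Name("FastXCor").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//     FastXCorOp<Eigen::ThreadPoolDevice, float>);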
#endif  // __FAST_XCOR_H__