/
parameters.proto
212 lines (186 loc) · 7.45 KB
/
parameters.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
// Parameter schema for the fpgaconvnet.protos package: network-level
// settings, per-layer parameters and optimizer options.
syntax = "proto2";
package fpgaconvnet.protos;
// Resource-utilisation limits applied by the optimiser.
//
// Each threshold is the maximum utilisation of a resource among the designs
// that the optimiser considers. A higher value increases compilation time
// and the probability of compilation failure, but also increases
// utilisation and performance.
//
// Tweak these parameters only when designs are failing to compile at
// mapping or MPPR.
message OptimizerOptions {
  // Threshold for DSP resources (presumably a fraction in [0, 1] - confirm).
  optional float dsp_threshold = 1 [default = 0.8];

  // Threshold for BRAM resources (presumably a fraction in [0, 1] - confirm).
  optional float bram_threshold = 2 [default = 0.9];
}
// Fixed-point precision, expressed as a number of integer bits and a number
// of fractional bits. Used by Network.default_precision.
message Precision {
  // Number of integer bits.
  optional uint32 integer_bits = 1 [default = 8];

  // Number of fractional bits.
  optional uint32 fractional_bits = 2 [default = 8];
}
// Top-level description of a network to be compiled.
message Network {
  // The layers that will be compiled. The tool currently supports only
  // sequential networks (like AlexNet and VGG) and not irregular networks
  // (like GoogLeNet and ResNet).
  repeated LayerParameter layer = 1;

  // Frequency of the design in MHz. Note that all kernels across all FPGAs
  // run at the same frequency.
  optional uint32 frequency = 2 [default = 100];

  // Number of FPGAs available; see allow_runtime_reconfiguration for how
  // this value is used and constrained.
  optional uint32 num_fpga_available = 3 [default = 1];

  // Defines whether design-space exploration should consider separating
  // layers across several FPGA bitstreams.
  //
  //  * true:  the I/O interface is DDR4 and the design targets only a
  //           single FPGA.
  //  * false: the I/O interface is PCIe and the design can utilise an FPGA
  //           pipeline of as many FPGAs as specified by
  //           [num_fpga_available].
  //
  // If this is true, [num_fpga_available] must be set to 1. This is a
  // current limitation of the tool. See the report's section on future work
  // for information about multiple FPGAs with reconfiguration.
  optional bool allow_runtime_reconfiguration = 5 [default = false];

  // The tool currently supports only fixed-point precision. This field
  // defines the precision that will be used in layers. This field must be
  // specified.
  optional Precision default_precision = 6;

  optional OptimizerOptions optimizer_options = 7;

  // =================================
  // For internal usage
  // =================================
  //
  // When used as a bitstream subnetwork, [num_fpga_used] is the number of
  // FPGAs used for just that single bitstream.
  //
  // When used for the entire network, [num_fpga_used] is the maximum of the
  // [num_fpga_used] values across all bitstreams used.
  optional uint32 num_fpga_used = 4 [default = 1];
}
// Description of a single layer of a Network.
message LayerParameter {
  // ---- These fields should be specified. ----

  optional uint32 input_height = 3;
  optional uint32 input_width = 4;
  optional uint32 num_inputs = 5;

  // Activation applied by the layer.
  // NOTE(review): `None` is a reserved word in Python, which makes the
  // generated accessor awkward there; it is kept as-is because renaming an
  // enum value would break existing users.
  enum Activation {
    None = 1;
    Relu = 2;
  }

  optional uint32 num_outputs = 6;
  optional Activation activation = 8 [default = None];

  // Layer-type-specific parameters; at most one member is set.
  oneof params {
    ConvolutionParameter conv = 1;
    PoolingParameter pool = 2;
    LrnParameter lrn = 14;
  }

  // ---- Used internally. ----
  //
  // Users are _not_ expected to fill in the fields below; values supplied
  // here will be ignored by the runtime.

  optional uint32 output_height = 9;
  optional uint32 output_width = 10;
  optional uint32 layer_id = 11;
  optional bool is_first_layer = 12 [default = false];
  optional bool is_last_layer = 13 [default = false];
  optional uint32 fpga_id = 15 [default = 0];
  optional uint32 output_vector_width = 16;
  optional uint32 bitstream_id = 17 [default = 0];
}
// Parameters of a convolution layer (the `conv` member of
// LayerParameter.params).
message ConvolutionParameter {
  // ---- These fields should be specified. ----

  optional uint32 kernel_size = 7;
  optional uint32 stride = 8 [default = 1];
  optional uint32 pad = 9 [default = 0];

  // ---- These fields are used by the compiler. ----

  optional uint32 weight_address_base = 11;

  // ---- These fields are used by the optimizer. ----
  //
  // Constraints:
  //
  //   conv_folding_factor * worker_factor <= bram_factor
  //                                       <= num_inputs * num_outputs
  //   1 (no lookahead) <= look_ahead
  //                    <= input_height * input_width (full lookahead)
  //
  // Consequently, the amount of BRAM used by each multiplier is
  // proportional to:
  //
  //   (bram_factor / (worker_factor * conv_folding_factor))
  //       * ceil(kernel_size^2 / kernel_folding_factor)
  //
  // The total amount of block RAM required is then:
  //
  //   bram_factor * kernel_size * kernel_size
  //
  // The number of kernels to be stored in BRAM at any point in time
  // (presumably this describes bram_factor - confirm). If
  // bram_input_factor == num_inputs and bram_output_factor == num_outputs,
  // all the weights are stored in BRAM and no off-chip RAM reads are
  // required; fpgaConvNet will then generate a graph that reads the weights
  // directly from the host to the FPGA.
  //
  // NOTE(review): bram_input_factor and bram_output_factor are not fields
  // of this message; presumably they refer to an earlier split of
  // bram_factor - confirm and update this comment.
  //
  // bram_factor has to be at least as large as
  // `worker_factor * conv_folding_factor`, simply because lower values only
  // cause unnecessary stalls and extra (wasted) memory cycles.
  //
  // It should also, ideally, be a multiple of
  // `worker_factor * conv_folding_factor`: fpgaConvNet is not smart enough
  // to optimize non-aligned memory accesses (yet). There isn't much reason
  // to optimize the non-multiple case - the total number of iterations will
  // not improve (and might even degrade due to unnecessary off-chip memory
  // accesses!).
  //
  // bram_input_factor and bram_output_factor do not affect how biases are
  // aligned. Those are stored in BRAM in all cases.
  //
  // look_ahead affects the frequency of memory lookups. A higher lookahead
  // means less frequent off-chip memory lookups, but higher on-chip memory
  // usage.

  optional bool should_fit_on_chip = 10 [default = false];

  // Assigns to fully on-chip when this is not set.
  optional uint32 bram_factor = 4;

  // Must be a factor of image size.
  optional uint32 look_ahead = 5 [default = 1];

  // 1 <= x <= num_inputs
  optional uint32 worker_factor = 1;

  // 1 <= x <= num_outputs
  //  - number of convoluters per worker.
  optional uint32 conv_folding_factor = 2;

  // 1 <= x <= kernel_dim * kernel_dim
  //  - number of multipliers per convoluter.
  //  - must be a factor of kernel_dim * kernel_dim
  // (kernel_dim is presumably kernel_size - confirm.)
  optional uint32 kernel_folding_factor = 3;

  optional uint32 group = 15 [default = 1];
}
// Parameters of a pooling layer (the `pool` member of
// LayerParameter.params).
message PoolingParameter {
  // Kind of pooling to apply.
  enum PoolingType {
    Max = 1;
    Average = 2;
  }

  optional PoolingType type = 1 [default = Max];

  // Pooling dimension. Only 2D pooling is supported.
  optional uint32 dim = 2;

  // When stride is not given, stride will be assigned to dim.
  optional uint32 stride = 3;

  // 1 <= x <= num_inputs
  optional uint32 channel_folding_factor = 4;
}
// Parameters of an LRN layer (the `lrn` member of LayerParameter.params).
message LrnParameter {
  // Region over which normalisation is applied.
  enum LrnRegion {
    WITHIN_CHANNEL = 1;
    ACROSS_CHANNELS = 2;
  }

  // Approximation method selectable through approx_method.
  enum ApproxMethod {
    BINOMIAL = 1;
    LUT = 2;
  }

  optional uint32 local_size = 1;
  optional float alpha = 2;
  optional float beta = 3;
  optional float k = 8 [default = 1];
  optional LrnRegion norm_region = 4 [default = ACROSS_CHANNELS];
  optional ApproxMethod approx_method = 5 [default = BINOMIAL];

  // How many powers to approximate.
  optional uint32 approx_degree = 6 [default = 1];

  // Determines parallelism - channel_folding_factor == num_inputs means
  // fully unrolled.
  optional uint32 channel_folding_factor = 7;
}