// protos/fpgaconvnet/protos/parameters.proto

syntax = "proto2";
package fpgaconvnet.protos;

/* Resource-utilisation limits that the optimiser applies to the designs
 * it considers. Referenced from Network.optimizer_options.
 */
message OptimizerOptions
{
    /* The maximum utilisation of resources of designs that the optimiser
     * considers. A higher value increases compilation time and the
     * probability of compilation failure, but also increases utilisation
     * & performance.
     *
     * You should tweak these parameters only when designs are failing to
     * compile at mapping or MPPR.
     *
     * NOTE(review): the defaults (0.8 and 0.9) suggest these are fractions
     * of the available DSP / BRAM resources in the range [0, 1] - confirm.
     */
    optional float dsp_threshold  = 1 [default = 0.8];
    optional float bram_threshold = 2 [default = 0.9];
}

/* Number of integer and fractional bits of the fixed-point representation
 * used in layers (see Network.default_precision). Both widths default to 8.
 *
 * NOTE(review): whether [integer_bits] includes a sign bit is not stated
 * in this file - confirm.
 */
message Precision
{
    optional uint32 integer_bits    = 1 [default = 8];
    optional uint32 fractional_bits = 2 [default = 8];
}

/* Top-level description of a network to be compiled: its layers plus the
 * design-wide settings (clock frequency, FPGA count, reconfiguration,
 * precision and optimiser options).
 */
message Network
{
    /* The layers that will be compiled. The tool currently supports only
     * sequential layers (like AlexNet and VGG) and not irregular nets
     * (like GoogLeNet and ResNet).
     */
    repeated LayerParameter layer = 1;

    /* Frequency of design in MHz. Note that all kernels across all FPGAs
     * will run at the same frequency.
     */
    optional uint32 frequency = 2 [default = 100];

    /* Upper bound on the number of FPGAs that a pipeline may utilise (see
     * [allow_runtime_reconfiguration] below). Defaults to 1.
     */
    optional uint32 num_fpga_available = 3 [default = 1];

    /* Defines whether design-space exploration should consider separate
     * layers across several FPGA bitstreams. When this is true, the
     * I/O interface is DDR4 and the design targets only a single FPGA.
     * When this is false, the I/O interface is PCIe and the design can
     * utilise an FPGA pipeline comprising as many FPGAs as specified by
     * [num_fpga_available].
     *
     * If this is true, [num_fpga_available] must be set to 1. This is a
     * current limitation of the tool. See the report's section on future
     * work for information about multiple FPGAs with reconfiguration.
     */
    optional bool   allow_runtime_reconfiguration = 5 [default = false];

    /* Our tool currently supports only fixed point precision. This field
     * defines the precision that will be used in layers. This field
     * must be specified.
     */
    optional Precision default_precision = 6;

    /* Resource-utilisation thresholds for the optimiser; see
     * OptimizerOptions for the individual fields and their defaults.
     */
    optional OptimizerOptions optimizer_options = 7;

    /* =================================
     *        For internal usage
     * =================================
     *
     * When used as a bitstream subnetwork, [num_fpga_used] signifies the
     * number of FPGAs used for just that single bitstream.
     *
     * When used for the entire network, [num_fpga_used] is the maximum
     * of all the [num_fpga_used] across all bitstreams used.
     */
    optional uint32 num_fpga_used = 4 [default = 1];
}


/* Description of a single layer of a Network: its input/output dimensions,
 * its activation, the layer-type-specific parameters, and a set of fields
 * that are filled in internally.
 */
message LayerParameter
{
    /* These fields should be specified. */
    optional uint32 input_height = 3;
    optional uint32 input_width = 4;
    optional uint32 num_inputs = 5;

    /* Activation applied by the layer; [activation] defaults to None. */
    enum Activation
    {
        None = 1;
        Relu = 2;
    }
    optional uint32 num_outputs = 6;
    optional Activation activation = 8 [default = None];

    /* Layer-type-specific parameters. As a oneof, at most one of
     * convolution, pooling or LRN parameters can be set at a time.
     */
    oneof params {
        ConvolutionParameter conv = 1;
        PoolingParameter pool = 2;
        LrnParameter lrn = 14;
    }

    /* Used internally - users are _not_ expected to fill these fields in;
     * values supplied here will be ignored by the runtime.
     */
    optional uint32 output_height = 9;
    optional uint32 output_width = 10;
    optional uint32 layer_id = 11;
    optional bool is_first_layer = 12 [default = false];
    optional bool is_last_layer = 13 [default = false];
    optional uint32 fpga_id = 15 [default = 0];
    optional uint32 output_vector_width = 16;
    optional uint32 bitstream_id = 17 [default = 0];
}


/* Parameters of a convolution layer (the `conv` member of
 * LayerParameter.params): the kernel geometry supplied by the user, plus
 * the fields used by the compiler and by the optimizer.
 */
message ConvolutionParameter
{
    /* These fields should be specified. [stride] and [pad] fall back to
     * their declared defaults (1 and 0) when omitted.
     */
    optional uint32 kernel_size = 7;
    optional uint32 stride = 8 [default = 1];
    optional uint32 pad = 9 [default = 0];

    /* These fields are used by the compiler. */
    optional uint32 weight_address_base = 11;

    /*
     * These fields are used by the optimizer.
     *
     * Constraints:
     *
     * conv_folding_factor * worker_factor <= bram_factor <= num_inputs * num_outputs
     * 1 (no lookahead) <= look_ahead <= input_height * input_width (full lookahead)
     *
     * Consequently, the amount of BRAM used by each multiplier would be
     * proportional to:
     *
     * `(bram_factor / (worker_factor * conv_folding_factor))
     *   * ceil(kernel_size^2 / kernel_folding_factor)`.
     *
     * The total amount of block RAM required will then be:
     *
     * `bram_factor * kernel_size * kernel_size`
     *
     * bram_factor is the number of kernels to be stored in BRAM at any
     * point in time. If bram_input_factor == num_inputs and
     * bram_output_factor == num_outputs, all the weights are stored in
     * BRAM and no off-chip RAM reads will be required. fpgaConvNet will
     * then generate a graph that reads the weights directly from the host
     * to the FPGA.
     *
     * NOTE(review): bram_input_factor and bram_output_factor are not
     * fields of this message. Presumably they predate the single
     * bram_factor field, in which case the condition above would read
     * bram_factor == num_inputs * num_outputs - TODO confirm.
     *
     * bram_factor has to be at least as large as
     * `worker_factor * conv_folding_factor`, simply because setting it to
     * a lower value causes unnecessary stalls and extra (wasted) memory
     * cycles.
     *
     * It should also, ideally, be a multiple of
     * `worker_factor * conv_folding_factor`: fpgaConvNet is not smart
     * enough to optimize non-aligned memory access (yet). There isn't much
     * reason to optimize non-multiple memory access cases - the total
     * number of iterations will not improve (and might even degrade due to
     * unnecessary off-chip memory accesses!)
     *
     * bram_input_factor and bram_output_factor do not affect how biases
     * are aligned. Those are stored in BRAM in all cases.
     *
     * look_ahead affects the frequency of memory lookups. Higher lookahead
     * means less frequent off-chip memory lookup, but higher on-chip memory
     * usage.
     */
    optional bool should_fit_on_chip = 10 [default = false];
    optional uint32 bram_factor = 4;  /* Assigns to fully on-chip when this is not set. */
    optional uint32 look_ahead = 5 [default = 1]; /* Must be a factor of image size. */

    optional uint32 worker_factor = 1;         // 1 <= x <= num_inputs
    optional uint32 conv_folding_factor = 2;    // 1 <= x <= num_outputs
                                              // - number of convoluters per worker.
    optional uint32 kernel_folding_factor = 3;  // 1 <= x <= kernel_size * kernel_size
                                              // - number of multipliers per convoluter.
                                              // - must be a factor of kernel_size * kernel_size
    // NOTE(review): presumably the number of convolution groups (as in
    // grouped convolution); not described elsewhere in this file - confirm.
    optional uint32 group = 15 [default = 1];

}


/* Parameters of a pooling layer (the `pool` member of
 * LayerParameter.params).
 */
message PoolingParameter
{
    /* Pooling operation; [type] defaults to Max. */
    enum PoolingType
    {
        Max = 1;
        Average = 2;
    }

    optional PoolingType type = 1 [default = Max];
    /* Only 2D pooling is supported.
     * NOTE(review): presumably [dim] is the side length of the square
     * pooling window - confirm.
     */
    optional uint32 dim = 2;
    /* When stride is not given, stride will be assigned to dim. */
    optional uint32 stride = 3;

    optional uint32 channel_folding_factor = 4; /* 1 <= x <= num_inputs */
}


/* Parameters of an LRN layer (the `lrn` member of LayerParameter.params).
 *
 * NOTE(review): [local_size], [alpha], [beta] and [k] are presumably the
 * usual local-response-normalisation window size and coefficients; their
 * exact meaning is not defined in this file - confirm.
 */
message LrnParameter
{
    /* Region over which normalisation is applied; [norm_region] defaults
     * to ACROSS_CHANNELS.
     */
    enum LrnRegion
    {
        WITHIN_CHANNEL = 1;
        ACROSS_CHANNELS = 2;
    }
    /* Approximation method; [approx_method] defaults to BINOMIAL. */
    enum ApproxMethod
    {
        BINOMIAL = 1;
        LUT = 2;
    }

    optional uint32 local_size = 1;
    optional float alpha = 2;
    optional float beta = 3;
    optional float k = 8 [default = 1];
    optional LrnRegion norm_region = 4 [default = ACROSS_CHANNELS];
    optional ApproxMethod approx_method = 5 [default = BINOMIAL];
    optional uint32 approx_degree = 6 [default = 1];  /* How many powers to approximate. */

    /* Determines parallelism - channel_folding_factor == num_inputs means fully unrolled. */
    optional uint32 channel_folding_factor = 7;
}