

# Chapter 8: Hierarchical Design

Team 11: 鄭昊昕 游子慶 林稜凱 游翊

# Outline

- Arrays Shared Between Blocks
- Blocks with Common Interface Control Variables
- Reconvergence: Balancing the Latency Between Blocks

# Outline

- Arrays Shared Between Blocks
- Blocks with Common Interface Control Variables
- Reconvergence: Balancing the Latency Between Blocks

# Two Blocks in Cascade but Access in Reverse Order

## Example 129 Out of Order Array Access

```
1 #include "ac_int.h"
2
3 void BLOCK0 (ac_int<4,false> din[3],
4             ac_int<4,false> dout[3]) {
5
6     WRITE:for(ac_int<3,false> i=0; i <3; i++) {
7         dout[i] = din[i]+1;
8     }
9 }
10
11 void BLOCK1 (ac_int<4,false> din[3],
12             ac_int<4,false> dout[3]) {
13
14     WRITE:for(ac_int<3,true> i=2; i >= 0; i--) {
15         dout[i] = din[i]-1;
16     }
17 }
18
19 #pragma hls_design top
20 void top (ac_int<4,false> din[3],
21           ac_int<4,false> dout[3]) {
22
23     ac_int<4,false> tmp[3];
24     BLOCK0(din,tmp);
25     BLOCK1(tmp,dout);
26 }
```

← ascending order

← descending order



# Out of Order Array Access

```
void BLOCK0(ap_uint<4> din[3], ap_uint<4> dout[3]) {
    WRITE:for (ap_uint<3> i = 0; i < 3; i++) {
#pragma HLS PIPELINE II = 1
        dout[i] = din[i] + 1;
    }
}

void BLOCK1(ap_uint<4> din[3], ap_uint<4> dout[3]) {
#pragma HLS PIPELINE II = 1
    WRITE:for (ap_int<3> i = 2; i >= 0; i--) {
        dout[i] = din[i] - 1;
    }
}

void top(ap_uint<4> din[3], ap_uint<4> dout[3]) {
    ap_uint<4> tmp[3];
    BLOCK0(din, tmp);
    BLOCK1(tmp, dout);
}
```

# Out of Order Array Access - Scheduling



# Out of Order Array Access - Testbench

```
#define len 3

int main(void) {
    ap_uint<4> din[3] = {0, 1, 2}, dout[3];
    int pass = 1;

    top(din, dout);

    cout << ">> Start Checking" << endl;
    cout << "-----" << endl;
    for (int i = 0; i < len; i++) {
        cout << "Output: " << dout[i] << "\tExpect: " << i << endl;
        if (dout[i] != i) pass = 0;
    }

    cout << "-----" << endl;
    if (!pass) {
        cout << "Failed" << endl;
        return 1;
    }
    cout << "Passed Congrats" << endl;
    return 0;
}
```

## Performance Estimates

### Timing

#### Summary

| Clock  | Target   | Estimated | Uncertainty |
|--------|----------|-----------|-------------|
| ap_clk | 10.00 ns | 5.369 ns  | 1.25 ns     |

### Latency

#### Summary

| Latency (cycles) |     | Latency (absolute) |           | Interval (cycles) |     |          |
|------------------|-----|--------------------|-----------|-------------------|-----|----------|
| min              | max | min                | max       | min               | max | Type     |
| 7                | 7   | 70.000 ns          | 70.000 ns | 4                 | 4   | dataflow |

#### Detail

##### Instance

##### Loop

## Utilization Estimates

### Summary

| Name            | BRAM_18K | DSP48E | FF     | LUT   | URAM |
|-----------------|----------|--------|--------|-------|------|
| DSP             | -        | -      | -      | -     | -    |
| Expression      | -        | -      | 0      | 2     | -    |
| FIFO            | 0        | -      | 5      | 17    | -    |
| Instance        | -        | -      | 33     | 224   | -    |
| Memory          | -        | -      | -      | -     | -    |
| Multiplexer     | -        | -      | -      | -     | -    |
| Register        | -        | -      | -      | -     | -    |
| Total           | 0        | 0      | 38     | 243   | 0    |
| Available       | 280      | 220    | 106400 | 53200 | 0    |
| Utilization (%) | 0        | 0      | ~0     | ~0    | 0    |

### Detail

##### Instance

##### DSP48E

##### Memory

##### FIFO

##### Expression

##### Multiplexer

##### Register

## Interface

### Summary

| RTL Ports      | Dir | Bits | Protocol   | Source Object | C Type       |
|----------------|-----|------|------------|---------------|--------------|
| in_V_V_dout    | in  | 4    | ap_fifo    | in_V_V        | pointer      |
| in_V_V_empty_n | in  | 1    | ap_fifo    | in_V_V        | pointer      |
| in_V_V_read    | out | 1    | ap_fifo    | in_V_V        | pointer      |
| out_V_V_din    | out | 5    | ap_fifo    | out_V_V       | pointer      |
| out_V_V_full_n | in  | 1    | ap_fifo    | out_V_V       | pointer      |
| out_V_V_write  | out | 1    | ap_fifo    | out_V_V       | pointer      |
| ap_clk         | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_rst         | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_start       | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_done        | out | 1    | ap_ctrl_hs | top           | return value |
| ap_ready       | out | 1    | ap_ctrl_hs | top           | return value |
| ap_idle        | out | 1    | ap_ctrl_hs | top           | return value |

# Arrays Mapped to Registers

```
class simple_chanstruct {
    hls::stream<chanStruct> tmp;

    void BLOCK0(hls::stream<uint4>& din, hls::stream<chanStruct>& dout) {
        chanStruct reg;
#pragma HLS ARRAY_PARTITION variable=reg complete dim=1
        WRITE:for (ap_uint<3> i = 0; i < 3; i++) {
#pragma HLS PIPELINE II = 1
            reg.data[1] = din.read();
        }
        dout.write(reg);
    }

    void BLOCK1(hls::stream<chanStruct>& din, hls::stream<uint4>& dout) {
        chanStruct reg;
#pragma HLS ARRAY_PARTITION variable=reg complete dim=1
        reg = din.read();
        READ:for (ap_int<4> i = 2; i >= 0; i--) {
#pragma HLS PIPELINE II = 1
            dout.write(reg.data[i]);
        }
    }

public:
    simple_chanstruct() {}

    void run(hls::stream<uint4>& din, hls::stream<uint4>& dout) {
#pragma HLS ARRAY_PARTITION variable=tmp complete dim=1
#pragma HLS STREAM variable=tmp depth=3
        BLOCK0(din, tmp);
        BLOCK1(tmp, dout);
    }
};
```

# Arrays Mapped to Registers - Block Diagram & Scheduling



# Arrays Mapped to Registers - Scheduling

Block 0



# Arrays Mapped to Registers - Testbench

```
5 #define len 9
6
7 using namespace std;
8
9 int main(void) {
10     hls::stream<uint4> din, dout;
11     uint4 output;
12     int pass = 1;
13
14     for (int i = 0; i < len; i++) {
15         din.write(i);
16     }
17
18     while (!din.empty()) {
19         top(din, dout);
20     }
21
22     cout << ">> Start Checking" << endl;
23     cout << "-----" << endl;
24     for (int i = 0; i < len; i++) {
25         output = dout.read();
26         uint4 expect = (i / 3) * 3 + 2 - i % 3;
27         cout << "Output: " << output << "\tExpect: " << expect << endl;
28         if (output != expect) pass = 0;
29     }
30     cout << "-----" << endl;
31
32     if (!pass) {
33         cout << "Failed" << endl;
34         return 1;
35     }
36
37     cout << "Passed Congrats" << endl;
38     return 0;
39 }
```

## Performance Estimates

### Timing

#### Summary

| Clock  | Target  | Estimated | Uncertainty |
|--------|---------|-----------|-------------|
| ap_clk | 5.00 ns | 4.375 ns  | 0.62 ns     |

### Latency

#### Summary

| Latency (cycles) | Latency (absolute) | Interval (cycles) |       |
|------------------|--------------------|-------------------|-------|
| min              | max                | min               | max   |
| 5                | 525.000 ns         | 25.000 ns         | 5     |
|                  |                    |                   | 5none |

## Utilization Estimates

#### Summary

| Name            | BRAM_18KDSP48E | FF        | LUT   | URAM |
|-----------------|----------------|-----------|-------|------|
| DSP             | -              | -         | -     | -    |
| Expression      | -              | -         | 0     | 2    |
| FIFO            | 0              | -         | 15    | 48   |
| Instance        | -              | -         | 22    | 145  |
| Memory          | -              | -         | -     | -    |
| Multiplexer     | -              | -         | -     | 128  |
| Register        | -              | -         | 7     | -    |
| Total           | 0              | 0         | 44    | 323  |
| Available       | 280            | 220106400 | 53200 | 0    |
| Utilization (%) | 0              | 0         | ~0    | ~0   |

## Interface

#### Summary

| RTL Ports       | Dir | Bits | Protocol   | Source Object | C Type       |
|-----------------|-----|------|------------|---------------|--------------|
| ap_clk          | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_rst          | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_start        | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_done         | out | 1    | ap_ctrl_hs | top           | return value |
| ap_idle         | out | 1    | ap_ctrl_hs | top           | return value |
| ap_ready        | out | 1    | ap_ctrl_hs | top           | return value |
| din_V_V_dout    | in  | 4    | ap_fifo    | din_V_V       | pointer      |
| din_V_V_empty_n | in  | 1    | ap_fifo    | din_V_V       | pointer      |
| din_V_V_read    | out | 1    | ap_fifo    | din_V_V       | pointer      |
| dout_V_V_din    | out | 4    | ap_fifo    | dout_V_V      | pointer      |
| dout_V_V_full_n | in  | 1    | ap_fifo    | dout_V_V      | pointer      |
| dout_V_V_write  | out | 1    | ap_fifo    | dout_V_V      | pointer      |

# Waveform

Stream  
Struct



# Outline

- Arrays Shared Between Blocks
- Blocks with Common Interface Control Variables
- Reconvergence: Balancing the Latency Between Blocks

# Passing Control Variables Between Blocks

```
3 void BLOCK0(hls::stream<uint4>& din,
4             hls::stream<uint4>& dout,
5             hls::stream<ctrl_type>& ctrl_in,
6             hls::stream<ctrl_type>& ctrl_out) {
7 #pragma HLS PIPELINE II = 1
8     ctrl_type temp_int = ctrl_in.read();
9     ctrl_out.write(temp_int); // one write
10    WRITE:for (ctrl_type i = 0; i != temp_int; i++) {
11        dout.write(din.read() * 13);
12    }
13 }
14
15 void BLOCK1(hls::stream<uint4>& din, hls::stream<uint4>& dout, hls::stream<ctrl_type>&
16             ctrl) {
17 #pragma HLS PIPELINE II = 1
18     ctrl_type temp_int = ctrl.read(); // one read
19     READ:for (ctrl_type i = 0; i != temp_int; i++) {
20         dout.write(din.read());
21     }
22
23 void top(hls::stream<uint4>& din, hls::stream<uint4>& dout, hls::stream<ctrl_type>& ctrl) {
24 #pragma HLS dataflow
25     hls::stream<uint4> data_int;
26     hls::stream<ctrl_type> ctrl_int;
27
28     BLOCK0(din, data_int, ctrl, ctrl_int);
29     BLOCK1(data_int, dout, ctrl_int);
30 }
```

temporary variable to avoid generating multiple read strobes

# Scheduling



The scheduling seems bizarre.  
We tried two designs:

1. With dataflow pragma in top function
  2. Without dataflow pragma: No weird stall, but the FIFO has to be as large as the input size, since in the setting second block has to wait until first block finishes.



## Performance Estimates

### Timing

#### Summary

| Clock  | Target   | Estimated | Uncertainty |
|--------|----------|-----------|-------------|
| ap_clk | 10.00 ns | 7.268 ns  | 1.25 ns     |

### Latency

#### Summary

| Latency (cycles) | Latency (absolute) |     | Interval (cycles) |     | Type |
|------------------|--------------------|-----|-------------------|-----|------|
|                  | min                | max | min               | max |      |
| ?                | ?                  | ?   | ?                 | ?   | ?    |

#### Detail

- + Instance
- + Loop

## Utilization Estimates

### Summary

| Name            | BRAM_18K | DSP48E | FF     | LUT   | URAM |
|-----------------|----------|--------|--------|-------|------|
| DSP             | -        | -      | -      | -     | -    |
| Expression      | -        | -      | 0      | 2     | -    |
| FIFO            | 0        | -      | 10     | 32    | -    |
| Instance        | -        | -      | 32     | 210   | -    |
| Memory          | -        | -      | -      | -     | -    |
| Multiplexer     | -        | -      | -      | -     | -    |
| Register        | -        | -      | -      | -     | -    |
| Total           | 0        | 0      | 42     | 244   | 0    |
| Available       | 280      | 220    | 106400 | 53200 | 0    |
| Utilization (%) | 0        | 0      | ~0     | ~0    | 0    |

## Interface

### Summary

| RTL Ports        | Dir | Bits | Protocol   | Source Object | C Type       |
|------------------|-----|------|------------|---------------|--------------|
| din_V_V_dout     | in  | 4    | ap_fifo    | din_V_V       | pointer      |
| din_V_V_empty_n  | in  | 1    | ap_fifo    | din_V_V       | pointer      |
| din_V_V_read     | out | 1    | ap_fifo    | din_V_V       | pointer      |
| dout_V_V_din     | out | 4    | ap_fifo    | dout_V_V      | pointer      |
| dout_V_V_full_n  | in  | 1    | ap_fifo    | dout_V_V      | pointer      |
| dout_V_V_write   | out | 1    | ap_fifo    | dout_V_V      | pointer      |
| ctrl_V_V_dout    | in  | 4    | ap_fifo    | ctrl_V_V      | pointer      |
| ctrl_V_V_empty_n | in  | 1    | ap_fifo    | ctrl_V_V      | pointer      |
| ctrl_V_V_read    | out | 1    | ap_fifo    | ctrl_V_V      | pointer      |
| ap_clk           | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_rst           | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_start         | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_done          | out | 1    | ap_ctrl_hs | top           | return value |
| ap_ready         | out | 1    | ap_ctrl_hs | top           | return value |
| ap_idle          | out | 1    | ap_ctrl_hs | top           | return value |

# Waveform

## dataflow



# Waveform

no dataflow



# Connecting Interface Control Variables to Multiple Blocks

```
4 void BLOCK0 (hls::stream<uint4> &din,
5          hls::stream<uint4> &dout,
6          ap_uint<WIDTH> &ctrl) {
7     WRITE:for (ap_uint<8> i = 0; i < 255; i++) {
8         if (i == ctrl)
9             break;
10        dout.write(din.read() * 13);
11    }
12 } pass on all interfaces as a reference
13
14 void BLOCK1 (hls::stream<uint4> &din,
15          hls::stream<uint4> &dout,
16          ap_uint<WIDTH> &ctrl) {
17     READ:for (ap_uint<WIDTH> i = 0; i < 255; i++) {
18         if(i == ctrl)
19             break;
20         dout.write(din.read());
21     }
22 }
23
24 void top (hls::stream<uint4> &din,
25          hls::stream<uint4> &dout,
26          ap_uint<WIDTH> &ctrl) {
27 #pragma HLS STREAM variable=data_int depth=1
28     hls::stream<uint4> data_int;
29     BLOCK0(din, data_int, ctrl);
30     BLOCK1(data_int, dout, ctrl);
31 }
32 }
```

(to avoid creating hierarchy)

# Waveform



There seems to be an additional module synthesized to control the timing of block 1.



## Performance Estimates

### Timing

#### Summary

| Clock  | Target   | Estimated | Uncertainty |
|--------|----------|-----------|-------------|
| ap_clk | 10.00 ns | 7.268 ns  | 1.25 ns     |

### Latency

#### Summary

| Latency (cycles) |     | Latency (absolute) |          | Interval (cycles) |     |      |
|------------------|-----|--------------------|----------|-------------------|-----|------|
| min              | max | min                | max      | min               | max | Type |
| 5                | 767 | 50.000 ns          | 7.670 us | 5                 | 767 | none |

#### Detail

## Utilization Estimates

### Summary

| Name            | BRAM_18K | DSP48E | FF     | LUT   | URAM |
|-----------------|----------|--------|--------|-------|------|
| DSP             | -        | -      | -      | -     | -    |
| Expression      | -        | -      | 0      | 96    | -    |
| FIFO            | 0        | -      | 5      | 16    | -    |
| Instance        | -        | -      | -      | -     | -    |
| Memory          | -        | -      | -      | -     | -    |
| Multiplexer     | -        | -      | -      | 63    | -    |
| Register        | -        | -      | 32     | -     | -    |
| <b>Total</b>    | 0        | 0      | 37     | 175   | 0    |
| Available       | 280      | 220    | 106400 | 53200 | 0    |
| Utilization (%) | 0        | 0      | ~0     | ~0    | 0    |

#### Detail

## Interface

### Summary

| RTL Ports       | Dir | Bits | Protocol   | Source Object | C Type       |
|-----------------|-----|------|------------|---------------|--------------|
| ap_clk          | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_RST          | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_start        | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_done         | out | 1    | ap_ctrl_hs | top           | return value |
| ap_idle         | out | 1    | ap_ctrl_hs | top           | return value |
| ap_ready        | out | 1    | ap_ctrl_hs | top           | return value |
| din_V_V_dout    | in  | 4    | ap_fifo    | din_V_V       | pointer      |
| din_V_V_empty_n | in  | 1    | ap_fifo    | din_V_V       | pointer      |
| din_V_V_read    | out | 1    | ap_fifo    | din_V_V       | pointer      |
| dout_V_V_din    | out | 4    | ap_fifo    | dout_V_V      | pointer      |
| dout_V_V_full_n | in  | 1    | ap_fifo    | dout_V_V      | pointer      |
| dout_V_V_write  | out | 1    | ap_fifo    | dout_V_V      | pointer      |
| ctrl_V          | in  | 8    | ap_none    | ctrl_V        | pointer      |

# Duplicating Control IO

```
10 class control_mult {
11     hls::stream<ctr_type> ctrl;
12     hls::stream<ctr_type> ctrl1;
13
14 #pragma hls_design
15     void CONTROL(hls::stream<ctr_type>& ctrl, hls::stream<ctr_type>& ctrl0,
16                 hls::stream<ctr_type>& ctrl1) {
17         ctr_type ctrl_int = ctrl.read(); // one read
18         ctrl0.write(ctrl_int);      control copy block
19         ctrl1.write(ctrl_int);
20     }
21
22 #pragma hls_design
23     void BLOCK0(hls::stream<dat_type>& din, hls::stream<dat_type>& dout,
24                 hls::stream<ctr_type>& ctrl) {
25         ctr_type ctrl_int = ctrl.read();
26         WRITE:for (ctr_type i = 0; i < 255; i++) {
27             if (i == ctrl_int) break;
28             dout.write(din.read() * 13);
29         }
30     }
31
32 #pragma hls_design
33     void BLOCK1(hls::stream<dat_type>& din, hls::stream<dat_type>& dout,
34                 hls::stream<ctr_type>& ctrl) {
35         ctr_type ctrl_int = ctrl.read();
36         WRITE:for (ctr_type i = 0; i < 255; i++) {
37             if (i == ctrl_int) break;
38             dout.write(din.read() * 111);
39         }
40     }
42     public:
43         control_mult() {}
44
45 #pragma hls_design interface
46     void run(hls::stream<dat_type>& din0,
47               hls::stream<dat_type>& din1,
48               hls::stream<dat_type>& dout0,
49               hls::stream<dat_type>& dout1,
50               hls::stream<ctr_type>& ctrl) {
51     #pragma HLS STREAM variable=ctrl0 depth=1
52     #pragma HLS STREAM variable=ctrl1 depth=1
53         CONTROL(ctrl, ctrl0, ctrl1);
54         BLOCK0(din0, dout0, ctrl0);
55         BLOCK1(din1, dout1, ctrl1);
56     }
57 };
```





# Waveform



## Performance Estimates

### Timing

#### Summary

| Clock  | Target  | Estimated | Uncertainty |
|--------|---------|-----------|-------------|
| ap_clk | 5.00 ns | 4.375 ns  | 0.62 ns     |

### Latency

#### Summary

| Latency (cycles) | Latency (absolute) | Interval (cycles) |          |     |     |      |
|------------------|--------------------|-------------------|----------|-----|-----|------|
| min              | max                | min               | max      | min | max | Type |
| 258              | 258                | 1.290 us          | 1.290 us | 258 | 258 | none |

## Utilization Estimates

### Summary

| Name            | BRAM_18K | DSP48E         | FF        | LUT   | URAM |
|-----------------|----------|----------------|-----------|-------|------|
| DSP             | -        | -              | -         | -     | -    |
| Expression      | -        | -              | 0         | 6     | -    |
| FIFO            | 0        | -              | 10        | 40    | -    |
| Instance        | -        | -              | 2072      | 15659 | -    |
| Memory          | -        | -              | -         | -     | -    |
| Multiplexer     | -        | -              | -         | 1311  | -    |
| Register        | -        | -              | 261       | -     | -    |
| Total           | 0        | 0              | 234317016 | 0     | 0    |
| Available       | 280      | 22010640053200 | 0         | 0     | 0    |
| Utilization (%) | 0        | 0              | 2         | 31    | 0    |

## Interface

### Summary

| RTL Ports        | Dir | Bits | Protocol   | Source Object | C Type       |
|------------------|-----|------|------------|---------------|--------------|
| ap_clk           | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_rst           | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_start         | in  | 1    | ap_ctrl_hs | top           | return value |
| ap_done          | out | 1    | ap_ctrl_hs | top           | return value |
| ap_idle          | out | 1    | ap_ctrl_hs | top           | return value |
| ap_ready         | out | 1    | ap_ctrl_hs | top           | return value |
| din0_V_V_dout    | in  | 4    | ap_fifo    | din0_V_V      | pointer      |
| din0_V_V_empty_n | in  | 1    | ap_fifo    | din0_V_V      | pointer      |
| din0_V_V_read    | out | 1    | ap_fifo    | din0_V_V      | pointer      |
| din1_V_V_dout    | in  | 4    | ap_fifo    | din1_V_V      | pointer      |
| din1_V_V_empty_n | in  | 1    | ap_fifo    | din1_V_V      | pointer      |
| din1_V_V_read    | out | 1    | ap_fifo    | din1_V_V      | pointer      |
| dout0_V_V_din    | out | 4    | ap_fifo    | dout0_V_V     | pointer      |
| dout0_V_V_full_n | in  | 1    | ap_fifo    | dout0_V_V     | pointer      |
| dout0_V_V_write  | out | 1    | ap_fifo    | dout0_V_V     | pointer      |
| dout1_V_V_din    | out | 4    | ap_fifo    | dout1_V_V     | pointer      |
| dout1_V_V_full_n | in  | 1    | ap_fifo    | dout1_V_V     | pointer      |
| dout1_V_V_write  | out | 1    | ap_fifo    | dout1_V_V     | pointer      |
| ctrl_V_V_dout    | in  | 8    | ap_fifo    | ctrl_V_V      | pointer      |
| ctrl_V_V_empty_n | in  | 1    | ap_fifo    | ctrl_V_V      | pointer      |
| ctrl_V_V_read    | out | 1    | ap_fifo    | ctrl_V_V      | pointer      |

# Outline

- Arrays Shared Between Blocks
- Blocks with Common Interface Control Variables
- Reconvergence: Balancing the Latency Between Blocks

# Balancing the Latency Between Blocks

## Example 136 Multi-Block Design with Reconvergence

```
1 #include "ac_int.h"
2 #include "ac_channel.h"
3
4 typedef ac_int<4, false> uint4;
5
6 class recon_multadd_diff {
7
8
9     ac_channel<uint4> b1_in;
10    ac_channel<uint4> b2_in;
11    ac_channel<uint4> b3_in_0;
12    ac_channel<uint4> b3_in_1;
13
14 #pragma hls_design
15 void BLOCK0 (ac_channel<uint4> &din0,
16             ac_channel<uint4> &din1,
17             ac_channel<uint4> &dout0,
18             ac_channel<uint4> &dout1) {
19
20     uint4 tmp0, tmp1;
21     tmp0 = din0.read();
22     tmp1 = din1.read();
23     dout0.write(tmp0 + tmp1);
24     dout1.write(tmp0 - tmp1);
25 }
26
27 #pragma hls_design
28 void BLOCK1 (ac_channel<uint4> &din,
29             ac_channel<uint4> &dout) {
30
31     uint4 tmp;
32     tmp = din.read() * 13;
33     dout.write(tmp);
34 }
```

```
33     #pragma hls_design
34     void BLOCK2 (ac_channel<uint4> &din1,
35                  ac_channel<uint4> &dout) {
36
37         uint4 tmp;
38         tmp = din1.read() + 111;
39         dout.write(tmp);
40     }
41
42     #pragma hls_design
43     void BLOCK3 (ac_channel<uint4> &din0,
44                  ac_channel<uint4> &din1,
45                  ac_channel<uint4> &dout) {
46
47         uint4 tmp;
48         tmp = din0.read() - din1.read();
49         dout.write(tmp);
50     }
51
52     public:
53     recon_multadd_diff () {}
54
55     #pragma hls_design interface
56     void run (ac_channel<uint4> &din0,
57               ac_channel<uint4> &din1,
58               ac_channel<uint4> &dout) {
59         BLOCK0(din0,din1,b1_in,b2_in);
60         BLOCK1(b1_in,b3_in_0);
61         BLOCK2(b2_in,b3_in_1);
62         BLOCK3(b3_in_0,b3_in_1,dout);
63     }
64 }
```



# Balancing the Latency Between Blocks

Two paths from block 0 to block 3  
The latency might be different.



# Solution1: Automatic Pipeline Flushing

```
12 #pragma hls_design
13     void BLOCK0(hls::stream<uint4>& din0,
14                 hls::stream<uint4>& din1,
15                 hls::stream<uint4>& dout0,
16                 hls::stream<uint4>& dout1) {
17 #pragma HLS PIPELINE II = 1 enable_flush
18     uint4 tmp0, tmp1;
19     tmp0 = din0.read();
20     tmp1 = din1.read();
21     dout0.write(tmp0 + tmp1);
22     dout1.write(tmp0 - tmp1);
23 }
24
25 #pragma hls_design
26     void BLOCK1(hls::stream<uint4>& din, hls::stream<uint4>& dout) {
27 #pragma HLS PIPELINE II = 1
28     uint4 tmp;
29     tmp = din.read() * 13;
30     dout.write(tmp);
31 }
```

```
33 #pragma hls_design
34     void BLOCK2(hls::stream<uint4>& din1, hls::stream<uint4>& dout) {
35 #pragma HLS PIPELINE II = 1 enable_flush
36     uint4 tmp;
37     tmp = din1.read() + 111;
38     dout.write(tmp);
39 }
40
41 #pragma hls_design
42     void BLOCK3(hls::stream<uint4>& din0, hls::stream<uint4>& din1,
43                 hls::stream<uint4>& dout) {
44 #pragma HLS PIPELINE II = 1 enable_flush
45     uint4 tmp;
46     tmp = din0.read() - din1.read();
47     dout.write(tmp);
```

# Solution1: Automatic Pipeline Flushing



# Latency

## Synthesis Report for 'BLOCK1'

### General Information

Date: Mon Apr 12 23:28:31 2021  
Version: 2019.2 (Build 2704478 on Wed Nov 06 22:10:23 MST)  
Project: feedback\_reconverge\_flush  
Solution: solution1  
Product: zynq  
Target device: xc7z020-clg484-1

### Performance Estimates

- + Timing
- Latency
- Summary

| Latency (cycles) |     | Latency (absolute) |           | Interval (cycles) |     |          |
|------------------|-----|--------------------|-----------|-------------------|-----|----------|
| min              | max | min                | max       | min               | max | Type     |
| 2                | 2   | 10.000 ns          | 10.000 ns | 1                 | 1   | function |

## Synthesis Report for 'BLOCK2'

### General Information

Date: Mon Apr 12 23:28:31 2021  
Version: 2019.2 (Build 2704478 on Wed Nov 06 22:10:23 MST)  
Project: feedback\_reconverge\_flush  
Solution: solution1  
Product: zynq  
Target device: xc7z020-clg484-1

### Performance Estimates

- + Timing
- Latency
- Summary

| Latency (cycles) |     | Latency (absolute) |          | Interval (cycles) |     |          |
|------------------|-----|--------------------|----------|-------------------|-----|----------|
| min              | max | min                | max      | min               | max | Type     |
| 1                | 1   | 5.000 ns           | 5.000 ns | 1                 | 1   | function |

# Solution2: Manually Setting FIFO Depths

```
52 #pragma hls_design interface
53     void run(hls::stream<uint4>& din0, hls::stream<uint4>& din1, hls::stream<uint4>& dout) {
54 #pragma HLS STREAM variable=b1_in depth=1
55 #pragma HLS STREAM variable=b2_in depth=1
56 #pragma HLS STREAM variable=b3_in_0 depth=1
57 #pragma HLS STREAM variable=b3_in_1 depth=2
58         BLOCK0(din0, din1, b1_in, b2_in);
59         BLOCK1(b1_in, b3_in_0);
60         BLOCK2(b2_in, b3_in_1);
61         BLOCK3(b3_in_0, b3_in_1, dout);
62     }
63 };
64
65 void top(hls::stream<uint4>& din0, hls::stream<uint4>& din1, hls::stream<uint4>& dout);
```

# Solution2: Manually Setting FIFO Depths



**Illustration 131: Manually Setting FIFO Depths**

# Solution2: Manually Setting FIFO Depths



# Solution2: Manually Setting FIFO Depths

```
52 #pragma hls_design interface
53     void run(hls::stream<uint4>& din0, hls::stream<uint4>& din1, hls::stream<uint4>& dout) {
54 #pragma HLS STREAM variable=b1_in depth=1
55 #pragma HLS STREAM variable=b2_in depth=2
56 #pragma HLS STREAM variable=b3_in_0 depth=1
57 #pragma HLS STREAM variable=b3_in_1 depth=1
58         BLOCK0(din0, din1, b1_in, b2_in);
59         BLOCK1(b1_in, b3_in_0);
60         BLOCK2(b2_in, b3_in_1);
61         BLOCK3(b3_in_0, b3_in_1, dout);
62     }
63 };
64
65 void top(hls::stream<uint4>& din0, hls::stream<uint4>& din1, hls::stream<uint4>& dout);
```

# Solution2: Manually Setting FIFO Depths



**Illustration 131: Manually Setting FIFO Depths**

# Solution2: Manually Setting FIFO Depths



# Solution2: Manually Setting FIFO Depths



# Github

- [https://github.com/s950449/HLS\\_LabA\\_Team11](https://github.com/s950449/HLS_LabA_Team11)

Thank you for listening!