

### Q13. Booth's Multiplication algorithm for signed multiplication.

Rule is to simplify multiplication process by

→ inspecting  $(Q_0, Q_{-1})$  bits at the same time

→ if bits are (01) we add and shift

→ if bits are (10) we subtract and shift

→  $Q_{-1}$  is assumed to be 0.

This process reduces number of additions/ subtractions.



flowchart for booth's multiplier algorithm.



From the flow chart the state transitions are defined as follows:



module BOOTH(ldata, ldq, ldm, clra, clrq, clrf, sfta, sftq, addsub,  
 decr, ldcnt, data-in, clk, qm1, eq-z);  
 input ldata, ldq, ldm, clra, clrq, clrf, sfta, sftq, addsub, clk;  
 output [15:0] data-in;  
 wire [5:0] A, M, Q, Z;  
 wire [4:0] count;  
 assign eqz = ~count;

Shift reg SR(A[5], A[5], clk, ldata, clra, sfta);  
 Shift reg SR(Q, data-in, A[0], clk, ldq, clrq, sftq);  
 diff QM1(Q[0]), QM1, CLK, clrf;  
 PIFO MR(data-in, M, CLK, ldm);  
 ALU AS(Z, A, M, addsub);  
 counter CNT(count, decr, ldcnt, CLK);  
endmodule

```
module PIP0 (data_out, data_in, clk, load);
    input [15:0] data_in;
    output [15:0] data_out;
    input clk, load;
    always @ (posedge clk)
        if (load) data_out <= data_in;
endmodule
```

```
module dff (d, q, clk, clr);
    input d, clk, clr;
    output reg q;
    always @ (posedge clk)
        if (!clr) q <= 0;
        else q <= d;
endmodule
```

```
shiftreg (data_out, data_in, s_in, clk, ld, clr, sft);
    input s_in, clk, ld, clr, sft;
    input [15:0] data_in;
    output reg [15:0] data_out;
    always @ (posedge clk);
        begin
            if (clr) data_out <= 0;
            else if (ld) data_out <= data_in;
            else if (sft)
                data_out <= {s_in, data_out[15:1]};
        end
endmodule
```

```
module ALU (out, in1, in2, addsub);
    input [15:0] in1, in2;
    input addsub;
    output reg [15:0] out;
    always @ (*) begin
        if (addsub == 0)
            out = in1 - in2;
        else
            out = in1 + in2;
    end
endmodule
```

```
module counter(data_out, decr, ldcnt, clk);
    input decr, clk;
    output [4:0] data_out;
    always @ (posedge clk)
        begin
            if (ldcnt)
                data_out <= 5'b10000;
            else if (decr)
                data_out <= data_out - 1;
        end
endmodule;
```

module controller (ldA, clrA, sftA, ldQ, clrQ, sftQ, ldM,  
clrff, addsub, start, decr, ldcnt, done  
clk, q0, qm1);

input clk, q0, qm1, start;

output reg ldA, clrA, sftA, ldQ, clrQ, sftQ,

ldM, clrff, addsub, decr, ldcnt, done;

reg [2:0] state;

parameter S0 = 3'b000, S1 = 3'b001, S2 = 3'b010,  
S3 = 3'b011, S4 = 3'b100; S5 = 3'b101, S6 = 3'b110;

always @ (posedge clk)

begin

case (state)

S0: if (start) state <= S1;

S1: if state <= S2,

S2: #2 if (q20, qm1) = 2'b01) state <= S3;

else if (q20, qm1) = 2'b10) state <= S4;

else state <= S5;

S3: state <= S5;

S4: state <= S5;

S5: #2 if ((q0, qm1) = 2'b01 & !eq2) state <= S3;

else if ((q0, qm1) = 2'b10 & !eq2) state <= S4;

else if (eq2)

state <= S6;

S6: state <= S6;

default: state <= S0;

endcase

end.

- always @ (state)

begin

case (state)

S0: begin

clrA=0; ldA=0; sftA=0; clrQ=0; sftQ=0;

ldQ=0; ldM=0; clrf=0; done=0

end.

S1: begin

clrA=1; clrf=1; ldcnt=1; ldM=1

end

S2: begin

clrA=0; clrf=0; ldcnt=0; ldM=0;

ldQ=1;

end

S3: begin

ldA=1; addsub=1; ldQ=0; sftA=0;

sftQ=0; decr=0;

end

S4: begin

ldA=1; addsub=0; ldQ=0; sftA=0;

sftQ=0; decr=0;

end

S5: begin

sftA=1; sftQ=1; ldA=0; ldQ=0; decr=1

end

S6: done=1;

default: begin clrA=0; sftA=0; ldQ=0; sftQ=0 end.

endcase

end

endmodule

## Modelling register banks



an example for smaller register banks.

```
module register_banks_v1 (rdData1, rdData2, wrData, s1, s2,
    dr, write, clk);
```

```
    input clk, write;
```

```
    input [31:0] s1, s2; dr;
```

```
    input [31:0] wrData;
```

```
    output reg [31:0] rdData1, rdData2;
```

```
    reg [31:0] R0, R1, R2, R3;
```

```
    always @(*)
```

```
        begin
```

```
            case (s1)
```

```
                0: rdData1 = R0;
```

```
                1: rdData1 = R1;
```

```
                2: rdData1 = R2;
```

```
                3: rdData1 = R3;
```

```
            enddefault: rdData1 = 32'hxxxxxxxx;
```

```
        endcase
```

```
    end
```

```

always@(*)
begin
    case (SR2);
        0: rdData2 = R0;
        1: rdData2 = R1;
        2: rdData2 = R2;
        3: rdData2 = R3;
        default: rdData2 = 32'hxxxxxx;
    endcase
end
always@(posedge Clk)
begin
    if (write)
        case (dr)
            0: R0 <= wrData;
            1: R1 <= wrData;
            2: R2 <= wrData;
            3: R3 <= wrData;
        default:
            end case
    end
endmodule

```

This can be implemented with assign statements and conditional operators. Model during simulation: Register vs. Register bank V3 is a simple assign statement for scaled version of 4x32 to 32x32. Implement during projects.

```
module reg bank_v4 (rdData1, rdData2, wrData, sr1, sr2, dr,
                     write, reset, clk);
    input clk, write, reset;
    input [4:0] sr1, sr2, dr;
    input [31:0] wrData;
    output [31:0] rdData1, rdData2;
    integer k;
```

```
reg [31:0] regfile [0:31];
```

//check for [31:0] as well.

```
assign rdData1 = regfile[sr1];
```

```
assign rdData2 = regfile[sr2];
```

```
always @ (posedge clk) :
```

```
begin
```

```
if (reset) begin
```

```
for (k=0; k<32; k=k+1) begin
```

```
    regfile[k] <= 0;
```

```
end
```

```
end
```

```
else begin
```

```
if (write)
```

```
    regfile[dr] <= wrData;
```

```
end
```

```
end.
```

```
endmodule.
```

```

module regfile-test;
reg [4:0] sr1, sr2, dr;
reg [31:0] wrData;
reg write, reset, clk;
wire [31:0] rdData1, rdData2;
$integer k;
regbanks_v4 reg (rdData1, rdData2, wrData, sr1, sr2, dr, write,
reset, clk);
initial clk = 0;
always #5 clk = !clk;
initial begin
    $dumpfile ("regfile.vcd")
    $dumpvars (0, regfile-test);
    #1 reset = 1; write = 0;
    #15 reset = 0;
end
initial begin
    #7
    for (k=0; k<32; k=k+1)
        begin
            dr = k; wrData = 10*k; write = 1;
            #10 write = 0;
        end
    #20
    for (k=0; k<32; k=k+2);
        begin
            sr1 = k; sr2 = k+1;
            #15
            $display ("Reg[%.2d] = %.d",

```

```

    reg (Y,2d') = Y, d", sr1, rdData1, sr2, rdData2);
end
#2000 $finish;
end
endmodule

```

### Pipelining:

Overlapping of computational stages.

Requires latches to store data from previous stage



$L_1 \dots L_k$  are latches to store each stage ops

$S_1 \dots S_k$  are combinational circuits for computation

linear pipeline

non-linear pipeline

Reservation table

→ X-axis time steps

→ Y-axis stages

four stage linear pipeline reservation table

|       | 1 | 2 | 3 | 4 |
|-------|---|---|---|---|
| $S_1$ | X |   |   |   |
| $S_2$ |   | X |   |   |
| $S_3$ |   |   | X |   |
| $S_4$ |   |   |   | X |



Non linear reservation table is complex structure due to feedback and feedforward combinations.

Speed up & efficiency:

$T$  = clock period of pipeline

$t_i^*$  = time delay of  $S_i$ , the stage

$d_L$  = latch delay

maximum stage delay  $T_m = \max\{t_i^*\}$

$$\Rightarrow T_p = T_m + d_L$$

pipeline frequency  $f = \frac{1}{T_p}$

Total time to process  $N$  dataset is

$$T_k = ((k-1) + N) T$$

since  $k-1$  is for filling  $k-1$  slots.

and  $N$  stage outputs for all slots.

for a non-pipelined processor:

$$T_1 = N \cdot k \cdot T$$

Since  $N$  dataset and  $k$  timeslots.

$$\text{Speed up} = \frac{T_k}{T_1} = \frac{k(N+1)T}{[(k-1)+N]T} = S_k.$$

as  $N \rightarrow \infty$   $T$  for  $S_k$  is  $k$

i.e.  $N \rightarrow \infty$   $S_k = k$  speed  $\approx$  total latency

$$\text{efficiency } E_k = \frac{S_k}{k} = \frac{N}{k(k-1)} \quad [\text{Performance}]$$

$$\text{throughput } \frac{N}{T_k} = \frac{N}{[k(k-1)]T} \quad [\text{no. of operations per unit time}]$$

Clock skew: max. delay difference between arrival of clocks.

Jitter: maximum delay difference between arrival of clock signal at same latch.

Logic delay: Maximum delay of slowest stage of pipeline

Setup time: time taken for clock to become stable at the input of latch before it can be captured

d) Simple pipelining example with 3 stages.

Given A, B, C, D are n-bit inputs

a) S1 :  $x_1 = A + B$ ;  $x_2 = C - D$ ;

b) S2 :  $x_3 = x_1 + x_2$ ;

c) S3 :  $F = x_3 * D$ ;



```

module pipe_ex (F,A,B,C,D, clk);
parameter N=10;
input [N-1:0] A,B,C,D;
input clk;
output [N-1:0] F;
reg [N-1:0] L12_X, L12_X2, L23_D, L23_X3, L23_D,
L34_F;
assign F = L34_F;
always@(posedge clk) begin
    L12_X<= #4 A+B; //Stage 1
    L12_X2<= #4 C-D;
    L12_D<= D;
    L23_X3<= #4 L12_X1+L12_X2; //Stage 2
    L23_D <= L12_D;
    L34_F <= #6 L23_X3 * L23_D; //Stage 3
end
endmodule

```

\*\*\* stages can be divided into separate procedural blocks \*\*\*

```

module pipe_test;
parameter N=10;
wire [N-1:0] F;
reg [N-1:0] A,B,C,D;
reg clk;
pipeex p (.F(F), .A(A), .B(B), .C(C), .D(D), .clk(clk));
initial
    clk=0;
    always #10 clk=~clk;

```

initial begin

#5 A=10; B=12; C=6; D=3;

#10 A=10; B=10; C=5; D=3;

#20 A=20; B=11; C=1; D=4;

#20 A=15; B=10; C=8; D=2;

#20 A=8; B=15; C=5; D=0;

#20 A=30; B=1; C=2; D=4;

end

initial begin

\$dumpfile ("pipeline.vcd");

\$dumpvars(0, pipe-test);

} monitor(\$time, "F[1:d]", F);

\$300 finish;

end

endmodule

In this example due to common clock for all latches there are chances of race condition. This can be avoided using ~~Master Slave Flip Flops~~

Consider following example for subpart of MP implementation



## Stepwise Information

Inputs: 3 register addresses ( $rs_1, rs_2, rd$ ), an ALU function and memory address ( $addr$ )

Stage 1: Read two 16-bit numbers from  $rs_1, rs_2$  and store in A & B (Temp Regs)



Stage 2: perform ALU on A, B by func and store in Z (Temp Regs)

Stage 3: Write the value of Z in register specified by rd.

Stage 4: Write value of Z in memory location addr.

## Assumptions for design

Register bank with 16 16-bit registers

Memory is organised as  $256 \times 16$

8-bits are required to specify memory address  
each location contains 16-bit data

ALU function (func) is selected by four bit variable

0000: ADD    0001: SUB    0010: MUL

0011: SELA    0100: SELB    0101: AND

0110: OR    0111: XOR    1000: NEGA

1011: NEGAB    1100: SRA    1011: SRA

ADD:  $A+B$     SELB:  $Z=B$     NEGA:  $Z=\sim A$

SUB:  $A-B$     AND:  $A \cdot B$     NEGAB:  $Z=\sim B$

MUL:  $A \cdot B$     OR:  $A \vee B$     SRA: Shift Right  $\downarrow$

SELA:  $Z=A$     XOR:  $A \oplus B$     SLA: Shift Right  $\uparrow$



pipeline diagram for above details:



\* Applying two clocks non-overlapping and alternate ffs both are activated. While designing in Verilog try with the Master-Slave ffs instead of two-phase clocking.

module pipe\_ex2(zout, rs1, rs2, rd, func, addr, clk1, clk2);

input [3:0] rs1, rs2, rd, func;

input [7:0] addr;

input write, clk1, clk2;

output [15:0] zout;

reg [15:0] L12A, L12B, L23Z, L34Z;

reg [3:0] L12rd, L23rd, L12func;

reg [7:0] L12addr, L23addr, L34addr;

reg [15:0] regbank[0:15];

reg [5:0] mem[0:255];

assign zout = L34.Z;

// Stage 1 if Pipeline

always @ (posedge clk1) //clk1  
begin

L12A <= #2 regbank[rs1];

L12B <= #2 regbank[rs2];

L12rd <= #2 rd;

L12rd <= #2 func;

L12addr <= #2 addr;

end.

//stage 2 of pipeline

always @ (posedge clk2) //clk2.  
begin

case(func)

0: L23Z <= #2 L12A + L12B;

1: L23Z <= #2 L12A - L12B;

2: L23Z <= #2 L12A \* L12B;

3: L23Z <= #2 L12A;

4: L23Z <= #2 L12B;

5: L23Z <= #2 L12A & L12B;

6: L23Z <= #2 L12A | L12B;

7: L23Z <= #2 L12A ^ L12B;

8: L23Z <= #2 ~L12A;

9: L23Z <= #2 ~L12B;

10: L23Z <= #2 L12A >> 1;

11: L23Z <= #2 L12A << 1;

default: L23Z <= #2 16'bXXXX;

endcase

L23rd <= #2 L12rd;

L23addr <= #2 L12addr;

end

//Stage 3 of pipelining

always @ (posedge clk1);

begin

logbank [L23rd] <= #2 L232;

L34z <= #2 L232;

L34addr <= #2 L23addr;

end

//Stage 4 of pipeline

always @ (negedge clk2);

begin

mem [L34addr] <= #2 L34z;

end

endmodule

/Testbench

module pipetest;

wire [15:0] z;

reg [350] r1, r2, rd, func;

reg clk1, clk2;

reg [7:0] addr;

integer k;

pipe\_en2 #(zout(z), .r1(r1), .r2(r2), .clk1(clk1), .clk2(clk2), .rd(rd), .func(func), .addr(addr));

initial

begin

clk1 = 0; clk2 = 0;

repeat (20)

begin

#5 clk1 = 1; #5 clk1 = 0;

```
#5 CLK2 = 1; #5 CLK2 = 0;  
end  
initial begin  
    for (k=0; k<16; k=k+1)  
        Pipe.regbank[k] = k;
```

```
initial begin  
    #5 r51=3; r52=5; rd=10; func=0; addr=125;  
    #20 r51=3; r52=8; rd=12; func=2; addr=126;  
    #20 r51=10; r52=5; rd=14; func=1; addr=128;  
    #20 r51= 7; r52=3; rd=13; func=11; addr=127;  
    #20 r51=10; r52=5; rd=15; func=1; addr=129;  
    #20 r51=12; r52=13; rd=16; func=0; addr=130;
```

```
#60 for (k=125; k<131; k=k+1)  
    $display ("Mem[%d]=%d", k, Pipe.mem[k]);
```

end.

```
initial begin  
    $dumpfile("pipe2.vcd");  
    $dumpvars(0, pipe);  
    $monitor($time, "F=%d", Z);  
end #300 $finish;  
end  
endmodule
```