.. _biquad_source:

biquad Source File
==================

.. code-block:: verilog
   :linenos:

   // Biquad IIR filter
   //
   // y(t) = u(t)*b0 + u(t-1)*b1 + u(t-2)*b2 + y(t-1)*(-a1) + y(t-2)*(-a2)
   //
   // Minimizes latency by computing all but u(t)*b0 at end of previous sample.
   //
   // Minimizes resource consumption by using internal
   // DSP registers for between-sample storage.
   //
   // Writing a coefficient holds the filter in reset until address 7 is written.
   // This allows usage of a consistent set of coefficients.
   //
   module biquad #(
       parameter DATA_WIDTH        = 28,     // Bits per sample, per lane
       parameter DATA_COUNT        = 1,      // Number of lanes packed side by side in TDATA
       parameter COEFFICIENT_WIDTH = 25,     // Bits per coefficient
       parameter DEBUG             = "false" // Value given to every mark_debug attribute
   ) (
       // Coefficient write port (sysClk domain).
       // Addresses 0-4 write a coefficient and hold the filter in reset,
       // address 7 releases the reset, addresses 5 and 6 are ignored.
       input                         sysClk,
       input                         sysCoefficientStrobe,
       input                   [2:0] sysCoefficientAddress,
       input [COEFFICIENT_WIDTH-1:0] sysCoefficientValue,

       // Sample streams (dataClk domain).
       // S_* is the input stream, M_* is the output stream.  A sample is
       // accepted when S_TVALID and S_TREADY are both high; a result is
       // retired when M_TVALID and M_TREADY are both high.
       input                                                         dataClk,
       (*mark_debug=DEBUG*) input      [(DATA_COUNT*DATA_WIDTH)-1:0] S_TDATA,
       (*mark_debug=DEBUG*) input                                    S_TVALID,
       (*mark_debug=DEBUG*) output reg                               S_TREADY,
       (*mark_debug=DEBUG*) output reg [(DATA_COUNT*DATA_WIDTH)-1:0] M_TDATA,
       (*mark_debug=DEBUG*) output reg                               M_TVALID,
       (*mark_debug=DEBUG*) input                                    M_TREADY
   );

   // The accumulator is MAC_WIDEN bits wider than a single product so that
   // the sum of the five products has headroom before it is clipped.
   localparam MAC_WIDEN = 4;
   localparam MAC_WIDTH = DATA_WIDTH + COEFFICIENT_WIDTH + MAC_WIDEN;

   // Coefficient dual-port RAM 0:b0, 1:b1, 2:b2, 3:-a2, 4:-a1
   // Coefficient range [-2,2) -- i.e. two bits to the left of the binary point
   reg [COEFFICIENT_WIDTH-1:0] coefficientRAM [0:4], coefficientRAMq;
   // Starts asserted, so the filter stays in reset until address 7 is written.
   reg sysReset = 1;
   always @(posedge sysClk) begin
       if (sysCoefficientStrobe) begin
           if (sysCoefficientAddress <= 4) begin
               // Any coefficient write (re)asserts the reset.
               coefficientRAM[sysCoefficientAddress] <= sysCoefficientValue;
               sysReset <= 1;
           end else if (sysCoefficientAddress == 7) begin
               // The value written to address 7 is ignored; only the
               // strobe matters.
               sysReset <= 0;
           end
       end
   end

   // I/O history
   (*mark_debug=DEBUG*) reg [(DATA_COUNT*DATA_WIDTH)-1:0] u, uOld = 0, yOld = 0;

   // MAC parameter input multiplexer
   // Selects the data operand that is paired with coefficientRAMq.
   // Every state not listed (0, 5, 6, 7) selects M_TDATA.
   reg [2:0] state = 0;
   wire [(DATA_COUNT*DATA_WIDTH)-1:0] parameterMux = (state == 1) ? u :
                                                     (state == 2) ? u :
                                                     (state == 3) ? uOld :
                                                     (state == 4) ? yOld : M_TDATA;

   // Move sysReset to our clock domain
   wire reset;
   reg_tech_cdc reset_cdc(.I(sysReset), .C(dataClk), .O(reset));

   // Computation state machine
   // enMAC drives the MAC clock enable, ldMAC drives its 'sload' input.
   reg enMAC = 0, ldMAC = 0;

   // Registered coefficient read, addressed by the state number.  The
   // value therefore reaches the MAC one clock after 'state' selects it,
   // i.e. during state N the MAC sees coefficientRAM[N-1].
   always @(posedge dataClk) begin
       coefficientRAMq <= coefficientRAM[state];
   end

   always @(posedge dataClk) begin
       if (reset) begin
           state <= 0;
           u <= 0;
           uOld <= 0;
           yOld <= 0;
           S_TREADY <= 0;
           M_TVALID <= 0;
           // Return the MAC controls to their power-up values as well, so
           // that a reset arriving in the middle of a computation does not
           // leave enMAC or ldMAC asserted once the reset is released.
           // While reset is asserted the MAC is still enabled and loaded
           // through the 'reset ||' terms on its ce and sload ports.
           enMAC <= 0;
           ldMAC <= 0;
       end
       else begin
           // In the per-state notes below, "for next cycle" marks a value
           // that contributes to the result of the following sample
           // (see the file header: everything except u(t)*b0 is computed
           // at the end of the previous sample).
           case (state)
           0: begin // Idle: wait for an input sample
               if (S_TVALID && S_TREADY) begin
                   u <= S_TDATA;
                   S_TREADY <= 0;
                   enMAC <= 1;
                   state <= 1;
               end
               else begin
                   S_TREADY <= 1;
               end
           end
           1: begin // MAC inputs: u(t), b0
                    // Multiplier inputs: y(t-1), -a1
                    // Accumulator input: y(t-2)*-a2
                    // Clip input: u(t-2)*b2 + u(t-1)*b1
                    // M_TDATA: u(t-1)*b1
               state <= 2;
           end
           2: begin // MAC inputs: u(t-1), b1 for next cycle
                    // Multiplier inputs: u(t), b0
                    // Accumulator input: y(t-1)*-a1
                    // Clip input: y(t-2)*-a2 + u(t-2)*b2 + u(t-1)*b1
                    // M_TDATA: u(t-2)*b2 + u(t-1)*b1
               ldMAC <= 1;
               state <= 3;
           end
           3: begin // MAC inputs: u(t-2), b2 for next cycle
                    // Multiplier inputs: u(t-1), b1 for next cycle
                    // Accumulator input: u(t)*b0
                    // Clip input: y(t-1)*-a1 + y(t-2)*-a2 + u(t-2)*b2 + u(t-1)*b1
                    // M_TDATA: y(t-2)*-a2 + u(t-2)*b2 + u(t-1)*b1
                    // sload = 1
               ldMAC <= 0;
               uOld <= u;
               state <= 4;
           end
           4: begin // MAC inputs: y(t-2), -a2 for next cycle
                    // Multiplier inputs: u(t-2), b2 for next cycle
                    // Accumulator input: u(t-1)*b1 for next cycle
                    // Clip input: u(t)*b0+y(t-1)*-a1+y(t-2)*-a2+u(t-2)*b2+u(t-1)*b1
                    // M_TDATA: y(t-1)*-a1+y(t-2)*-a2+u(t-2)*b2+u(t-1)*b1
                    // sload_reg = 1
               M_TVALID <= 1;
               state <= 5;
           end
           5: begin // MAC inputs: y(t-1), -a1 for next cycle
                    // Multiplier inputs: y(t-2), -a2 for next cycle
                    // Accumulator input: u(t-2)*b2 for next cycle
                    // Clip input: u(t-1)*b1 for next cycle
                    // M_TDATA: u(t)*b0+y(t-1)*-a1+y(t-2)*-a2+u(t-2)*b2+u(t-1)*b1
                    // M_TVALID = 1
               // The MAC is enabled for the first clock of this state only;
               // after that it holds its contents until the next sample.
               enMAC <= 0;
               yOld <= M_TDATA;
               if (M_TREADY) begin
                   M_TVALID <= 0;
                   S_TREADY <= 1;
                   state <= 0;
               end
           end
           default: begin // States 6 and 7 are unused: recover to idle
               enMAC <= 0;
               ldMAC <= 0;
               S_TREADY <= 0;
               M_TVALID <= 0;
               state <= 0;
           end
           endcase
       end
   end

   ///////////////////////////////////////////////////////////////////////////////
   // Per-lane computation
   // All lanes share the state machine and the coefficient; each lane has
   // its own MAC, clipper and DATA_WIDTH-bit slice of the data buses.
   genvar i;
   generate
   for (i = 0 ; i < DATA_COUNT ; i = i + 1) begin
       // Instantiate multiply-accumulate module
       // Module doesn't provide a reset port so fake one by enabling
       // the module in 'load' mode with the data operand forced to 0.
       wire [MAC_WIDTH-1:0] accum_out;
       macc # (.SIZEA(DATA_WIDTH),
               .SIZEB(COEFFICIENT_WIDTH),
               .SIZEOUT(MAC_WIDTH))
         macc_i (
           .clk(dataClk),
           .ce(reset || enMAC),
           .sload(reset || ldMAC),
           .a(reset ? {DATA_WIDTH{1'b0}} : parameterMux[i*DATA_WIDTH+:DATA_WIDTH]),
           .b(coefficientRAMq),
           .accum_out(accum_out));

       // Clip accumulated result
       // The '-2' on the input width and input bit selection accounts
       // for the fact that the coefficient range is [-2,2).
       wire [DATA_WIDTH-1:0] accum_out_clipped;
       reduceWidth #(.IWIDTH(MAC_WIDTH-(COEFFICIENT_WIDTH-2)),
                     .OWIDTH(DATA_WIDTH))
         clipMAC (.I(accum_out[MAC_WIDTH-1:COEFFICIENT_WIDTH-2]),
                  .O(accum_out_clipped));
       // M_TDATA follows the clipped accumulator on every clock; it is
       // not gated by the state machine or by reset.
       always @(posedge dataClk) begin
           M_TDATA[i*DATA_WIDTH+:DATA_WIDTH] <= accum_out_clipped;
       end
   end
   endgenerate
   endmodule

   ///////////////////////////////////////////////////////////////////////////////
   // Multiply-accumulate unit
   // Template from Vivado
   module macc #(
       parameter SIZEA   = 25,  // Width of operand a
                 SIZEB   = 28,  // Width of operand b
                 SIZEOUT = 55   // Width of the accumulator and of accum_out
   ) (
       input clk,
       input ce,                               // Clock enable for every register below
       input sload,                            // Restart the accumulation (see below)
       input signed    [SIZEA-1:0] a,
       input signed    [SIZEB-1:0] b,
       output signed [SIZEOUT-1:0] accum_out
   );

   // Pipeline registers:
   //   stage 1: a_reg, b_reg, sload_reg  (registered inputs)
   //   stage 2: mult_reg                 (registered product)
   //   stage 3: adder_out                (registered accumulator)
   // There is no reset; all registers change only when 'ce' is high.
   reg signed       [SIZEA-1:0] a_reg;
   reg signed       [SIZEB-1:0] b_reg;
   reg                          sload_reg;
   reg signed [SIZEA+SIZEB-1:0] mult_reg;
   reg signed     [SIZEOUT-1:0] adder_out, old_result;

   // Accumulator feedback term (purely combinational, so it is written
   // with blocking assignments and an implicit sensitivity list).
   always @*
   begin
       if (sload_reg)
           // 'sload' is active and opens the accumulation loop: the
           // feedback is forced to zero, so the accumulator takes just
           // the next multiplier output.
           old_result = 0;
       else
           // Normal operation: keep adding to the running total.
           old_result = adder_out;
   end

   always @(posedge clk)
       if (ce)
       begin
           a_reg     <= a;
           b_reg     <= b;
           mult_reg  <= a_reg * b_reg;
           sload_reg <= sload;
           // Store accumulation result into a register
           adder_out <= old_result + mult_reg;
       end

   // Output accumulation result
   assign accum_out = adder_out;

   endmodule