.. _xy_pi_clip_source:

xy_pi_clip Source File
======================

.. code-block:: verilog
   :linenos:

   `timescale 1ns / 1ns

   // Proportional-Integral gain for multiplexed X-Y data stream,
   // with programmable clip levels.  Timing plan shown below.

   // Spartan-6: 159 LUTs, 1 DSP48A1
   //  (not counting ~36 LUTs needed to generate coeff and lim)
   // XXX this is the critical timing path, streamline and/or pipeline some more
   // XXX proportional and integral gain terms need very different scaling?

   // Serious pipelining internally.
   // At any one point, the data flow sequence is:
   //   X integral     high-side clip (new data from multiplier summed with previous X integral term)
   //   Y integral     high-side clip (new data from multiplier summed with previous Y integral term)
   //   X proportional high-side clip (new data from multiplier summed with previous X integral term)
   //   Y proportional high-side clip (new data from multiplier summed with previous Y integral term)
   //   X integral      low-side clip (recirculated data from high-side clip)
   //   Y integral      low-side clip (recirculated data from high-side clip)
   //   X proportional  low-side clip (recirculated data from high-side clip)
   //   Y proportional  low-side clip (recirculated data from high-side clip)

   //        in_xy  coeff   lim
   //  sync  xerr
   //  .     yerr   x_int
   //  .     .      y_int
   //  .     .      x_prop
   //  .     .      y_prop
   //  .     .      .       x_hi
   //  .     .      .       y_hi
   //  .     .      .       x_hi
   //  sync  .      .       y_hi
   //  .     .      .       x_lo
   //  .     .      .       y_lo
   //  .     .      .       x_lo
   //  .     .      .       y_lo
   //  .     .      .       .      o_sync  out_x
   //  .     .      .       .      .       out_y
   // xy_pi_clip: time-multiplexed PI gain stage with programmable clipping.
   // One multiplier and one compare/clip stage are shared by four terms per
   // 8-cycle frame, processed in the order X integral, Y integral,
   // X proportional, Y proportional.  Every term goes through the clip stage
   // twice: once freshly computed and once recirculated (see clip_recirc),
   // with lim_hi choosing which side is clipped on each pass.
   // Optional feed-forward values are added to the multiplier result before
   // it is saturated and accumulated.
   module xy_pi_clip #(
   	parameter ff_dshift = 0 // Deferred ff_ddrive downshift
   ) (
   	input clk,  // timespec 6.8 ns
   	input sync,  // high for the first of the xy pair
   	input signed [17:0] in_xy,
   	output signed [17:0] out_xy,
   	output o_sync,
   	// 8-way muxed configuration
   	input signed [17:0] coeff,
   	input signed [17:0] lim,
   	// feed-forward inputs
   	input ffd_en,
   	input signed [17:0] ff_ddrive, // FF drive (derivative) to be accumulated in I term
   	input signed [17:0] ff_dphase, // FF phase (derivative); currently unused
   	input ffp_en,
   	input signed [17:0] ff_drive, // FF drive added to P term
   	input signed [17:0] ff_phase, // FF phase
   	// Output clipped, four bits are vs. {x_hi, y_hi, x_lo, y_lo}
   	// NOTE(review): tracing lim_hi and the strobes used in the clipped
   	// assignment at the bottom of this module suggests that list is LSB
   	// first, i.e. clipped[0]=x_hi, clipped[1]=y_hi, clipped[2]=x_lo,
   	// clipped[3]=y_lo.  Confirm against the consumer of this port.
   	output [3:0] clipped
   );

   // sync comes in one out of every eight cycles
   // build a one-hot encoding of the various phases out of a simple shift register
   // stb[k] is sync delayed by k+1 clock cycles.  Because the register is
   // 15 bits long and sync repeats every 8 cycles, stb[k] and stb[k+8] are
   // high in the same cycle once the pipeline is full.
   reg [14:0] stb=0;
   always @(posedge clk) stb <= {stb[13:0],sync};

   // SAT(x,old,new): saturate signed x, whose sign bit is x[old], to new+1 bits.
   // If bits old..new are all zeros or all ones the value already fits and
   // x[new:0] is passed through; otherwise the result is the most positive
   // (x[old]=0) or most negative (x[old]=1) value representable in new+1 bits.
   `define SAT(x,old,new) ((~|x[old:new] | &x[old:new]) ? x[new:0] : {x[old],{new{~x[old]}}})

   // Delayed copy of in_xy, used below to feed the multiplier a second time
   // for the proportional terms.  reg_delay is defined outside this file;
   // presumably len=2 means a two-cycle delay -- TODO confirm.
   wire signed [17:0] in_xy1;
   reg_delay #(.dw(18), .len(2))
   	pi_match(.clk(clk), .reset(1'b0), .gate(1'b1), .din(in_xy), .dout(in_xy1));

   // FF ddrive to be sampled during 'integral' cycles only. Phase tie-in still incomplete
   // Pipelined to ease timing
   // mr_scale: multiplier result after the P-vs-I scaling shift
   // mr_ff:    mr_scale plus the selected feed-forward term (one extra bit for the sum)
   // ff_mp:    feed-forward term selected for the current phase
   reg signed [41:0] mr_scale=0;
   reg signed [42:0] mr_ff=0;
   reg signed [18+12-1:0] ff_mp=0;
   wire signed [18+12-1:0] ff_ddrive_l, ff_drive_l, ff_phase_l; // Worst-case bit-sizing

   // Perform deferred down-shifting of ff_ddrive here by combining with up-shifting by 12
   // required by the accumulator construction below.
   // Net effect: ff_ddrive is scaled by 2^(12-ff_dshift); an arithmetic right
   // shift is used when ff_dshift exceeds 12, a left shift otherwise.
   generate if (ff_dshift > 12) begin: g_dshift
   	assign ff_ddrive_l = ff_ddrive >>> (ff_dshift-12);
   end else begin: g_ushift
   	assign ff_ddrive_l = ff_ddrive <<< (12-ff_dshift);
   end endgenerate
   // The non-derivative feed-forward inputs always get the plain 12-bit up-shift.
   assign ff_drive_l = ff_drive <<< 12;
   assign ff_phase_l = ff_phase <<< 12;

   // Select the feed-forward term for each of the four multiplier phases and
   // add it to the scaled multiplier output.
   always @(posedge clk) begin
   	// Default assignment; every case branch below (including default)
   	// assigns ff_mp again, so this line never determines the result.
   	ff_mp <= 0;
   	// stb[4:1] is one-hot for four consecutive cycles per frame, one per term.
   	// ffd_en gates the derivative term, ffp_en gates both P-side terms.
   	// The dphase slot is hard-wired to zero (ff_dphase is not used).
   	case(stb[4:1])
   		4'b0001: ff_mp <= ffd_en ? ff_ddrive_l : 0; // ddrive I
   		4'b0010: ff_mp <= 0;           // dphase I
   		4'b0100: ff_mp <= ffp_en ? ff_drive_l : 0;  // drive P
   		4'b1000: ff_mp <= ffp_en ? ff_phase_l : 0;  // phase P
   		default: ff_mp <= 0;
   	endcase
   	// Avoid 3-way add by pre-computing mr_scale + ff_mp
   	mr_ff <= mr_scale + ff_mp;  // outputs on stb 3, 4, 5, 6
   end

   // Saturate the 43-bit sum down to 30 bits before it enters the accumulator.
   wire signed [29:0] mr_sat = `SAT(mr_ff,42,29);
   `undef SAT

   // mr:          raw 18x18 product of coeff and val
   // lim1:        registered lim, aligned to the accumulator format
   // accum1..6:   accumulator/clip pipeline; accum4 feeds back into accum1,
   //              accum5 and accum6 are two further delays of accum4
   // val:         multiplier data operand
   // clip_recirc: when high, accum1 reloads accum4 instead of taking new data
   // p_term*:     marks proportional-term cycles (and two delayed copies)
   // lim_hi:      when high, the current limit is treated as an upper bound
   // cmp:         registered result of (accum1 upper bits < lim)
   reg signed [35:0] mr=0;
   reg signed [30:0] lim1=0;
   reg signed [30:0] accum1=0, accum2=0, accum3=0, accum4=0, accum5=0, accum6=0;
   reg signed [17:0] val=0;
   reg clip_recirc=0, p_term=0, p_term1=0, p_term2=0, lim_hi=0, cmp=0;
   // sat1 is the "replace with limit" decision for the value now in accum2:
   //   lim_hi=1: clip when the value is NOT below lim (high-side clip)
   //   lim_hi=0: clip when the value IS below lim (low-side clip)
   wire sat1 = cmp ^ lim_hi;
   // Top 19 bits of accum1, i.e. accum1 with the 12 fractional bits dropped,
   // so it can be compared directly with the 18-bit signed lim.
   wire signed [18:0] accum1_upper = accum1[30:12];
   always @(posedge clk) begin
   	// Phase decode.  Each flag is registered, so it is active one cycle
   	// after the stb bits named here.
   	clip_recirc <= stb[6]|stb[7]|stb[0]|stb[1];
   	p_term <= stb[2]|stb[3]|stb[6]|stb[7];
   	p_term1 <= p_term;
   	p_term2 <= p_term1;
   	lim_hi <= stb[6]|stb[7]|stb[8]|stb[9];
   	// First two cycles of a frame take in_xy directly; afterwards the
   	// delayed copy in_xy1 is used so the same pair is multiplied again.
   	val <= (sync|stb[0]) ? in_xy : in_xy1;  // outputs on stb 0, 1, 2, 3
   	mr <= coeff * val;  // outputs on stb 1, 2, 3, 4
   	// Proportional products are scaled up by 2^6 relative to integral ones.
   	mr_scale <= p_term ? (mr <<< 6) : mr;  // this step determines K_P vs. K_I scaling
   	// New data: add the saturated product to accum4 for integral terms,
   	// or to accum6 (accum4 from two cycles earlier) for proportional
   	// terms.  Recirculation: pass accum4 through unchanged so it goes
   	// through the compare/clip stage a second time.
   	accum1 <= clip_recirc ? accum4 : (mr_sat + (p_term2 ? accum6 : accum4));
   	// accum2, cmp and lim1 are all registered from the same cycle's
   	// accum1 and lim, so they stay aligned for the clip decision below.
   	accum2 <= accum1;
   	cmp <= accum1_upper < lim;
   	// Sign-extend lim by one bit and append 12 zero bits to match the
   	// 31-bit accumulator format.
   	lim1 <= {lim[17],lim,12'b0};
   	// Clip stage: substitute the limit when sat1 says the bound is violated.
   	accum3 <= sat1 ? lim1 : accum2;
   	accum4 <= accum3;
   	accum5 <= accum4;
   	accum6 <= accum5;
   end

   // Output is the 18 bits of accum3 just above the 12 fractional bits.
   wire signed [17:0] out_show = accum3[29:12];
   wire signed [18:0] acc_show = accum1[30:12];  // debug only, match cmp expression
   // out_xy carries accum3 during the stb[4] and stb[5] cycles and is forced
   // to zero in all other cycles; o_sync flags the first cycle of that pair.
   assign out_xy = (stb[4]|stb[5]) ? out_show : 0;
   assign o_sync = stb[4];
   // Each clipped bit reports sat1 during two strobe cycles per frame.
   // NOTE(review): following lim_hi and the accum pipeline, the bits appear
   // to be, MSB to LSB, {y_lo, x_lo, y_hi, x_hi}, and with lim_hi as coded
   // the integral terms are low-side clipped on the first pass and high-side
   // clipped on recirculation (the reverse for proportional terms).  That
   // ordering differs from the leading comments in this file; verify the
   // lim sequencing supplied by the parent module before relying on either.
   assign clipped = {4{sat1}} & {stb[14]|stb[12], stb[13]|stb[11], stb[10]|stb[8], stb[9]|stb[7]};

   endmodule