Attention

This documentation is a work in progress. Expect to see errors and unfinished things.

banyan Source File

  1`timescale 1ns / 1ns
  2
  3// permute np ports, each dw bits wide, using rl levels of 2-way mux
  4//  np must be 2**rl
  5//
  6// Networking equipment with similar topology is known as a Banyan switch,
  7// so I'll use that name.  A non-folded Banyan switch actually has two
  8// switches of this type back-to-back; I can leave off the second half
  9// because in my application the output channels are fungible.
 10//
 11// The intended use case is to route 8 x ADC data to 8 x data sinks,
 12// in any of the following configurations:
 13//  8 parallel  :   1 combination    x  1 time state
 14//  4 parallel  :  70 combinations   x  2 time states
 15//  2 parallel  :  28 combinations   x  4 time states
 16//  1 at a time :   8 possibilities  x  8 time states
 17// for 1*1 + 70*2 + 28*4 + 8*8 = 317 useful configurations
 18//
 19// Represent the channel selection configuration with an 8-bit mask of
 20// which channels to look at.  Only 1+70+28+8 = 107 of 256 possible masks
 21// are valid, since it's invalid to have 0, 3, 5, 6, or 7 bits set.
 22// Configuration also includes 3 bits of time_state.
 23//
 24// time_state and mask_in are processed at full bandwidth.
 25// There are rl+1 cycles of latency from a change of these
 26// control inputs to the new data_out permutation appearing.
 27// There are only rl cycles of latency to mask_out.
 28//
 29// The implementation is done with recursive instantiation of itself.
 30// The capability of doing recursion in Verilog is explicitly acknowledged
 31// and blessed by the Verilog-2005 standard, and it works without problem
 32// for me in Icarus Verilog, Verilator, Xilinx XST, and Xilinx Vivado.
 33//
 34// An N=8 dw=16 instantiation in Kintex speed grade 2 seems capable of
 35// running at ridiculous clock rates (over 550 MHz?), consuming on the
 36// order of 430 LUT.  The data muxes alone ought to use 384 LUT outputs.
 37// Even a lowly Spartan-6 should manage 160 MHz, using about 226 LUT
 38// (437 LUT outputs).
 39//
 40// Used successfully on hardware since 2016.
 41
 42module banyan #(
 43     parameter dw=16,  // data width
 44     parameter np=8,   // number of ports, must be a power of 2
 45     parameter rl=3    // number of routing layers == log_2(np)
 46) (
 47     input clk,
 48     input [rl-1:0] time_state,
 49     input [np-1:0] mask_in,
 50     input [dw*np-1:0] data_in,
 51     output [np-1:0] mask_out,   // for DPRAM write-enable
 52     output [dw*np-1:0] data_out
 53);
 54
 55localparam M = np/2;   // number of swap-boxes
 56wire two_or_more;  // use balance mode, when more than one bit of mask is set
 57two_set #(.dw(np)) two_set(.d(mask_in), .two(two_or_more));
 58
 59wire [M-1:0] mask_upper = mask_in[2*M-1:M];
 60wire [M-1:0] mask_lower = mask_in[  M-1:0];
 61wire any_lower = |mask_lower;
 62
 63// The below statement creates a Ripple-Carry chain of xors with the initial
 64// bit being fed from the right (a 1'b0). The in and out refer to the circuit
 65// verilator lint_save
 66// verilator lint_off UNOPTFLAT
 67wire [M:0] imbalance_in;
 68wire [M-1:0] imbalance_out = imbalance_in ^ mask_upper ^ mask_lower;
 69assign imbalance_in = {imbalance_out,1'b0};
 70// verilator lint_restore
 71
 72// Priority set sources to Lower sinks when in Balance.
 73// If currently NOT imbalanced, and lower is 0 and upper 1, Flip and route to Lower
 74// If currently     imbalanced, and lower is 1 and upper 0, Flip and route to Upper
 75// You can at most flip M/2 channels
 76wire [M-1:0] flip_bal = imbalance_in & ~mask_upper &  mask_lower
 77                     | ~imbalance_in &  mask_upper & ~mask_lower;
 78
 79// If only single source; Alternate routing to different sink based on time
 80wire [M-1:0] flip_deal = { M {~any_lower ^ time_state[rl-1]}};
 81wire [M-1:0] flip_ctl = two_or_more ? flip_bal : flip_deal;
 82
 83reg [M-1:0] out_mask_upper=0, out_mask_lower=0, data_flip=0;
 84reg [rl-1:0] recurse_time_state=0;  // upper bit will be ignored
 85always @(posedge clk) begin
 86     out_mask_upper <= flip_ctl & mask_lower | ~flip_ctl & mask_upper;
 87     out_mask_lower <= flip_ctl & mask_upper | ~flip_ctl & mask_lower;
 88     recurse_time_state <= time_state;
 89     data_flip <= flip_ctl;  // pipeline
 90end
 91
 92// Data goes through one cycle after control
 93reg [dw*np-1:0] stage=0;
 94genvar jx;
 95generate for (jx=0; jx<M; jx=jx+1) begin: ss
 96always @(posedge clk) begin
 97     stage[(jx+0+1)*dw-1:(jx+0)*dw] <= data_flip[jx] ? data_in[(jx+M+1)*dw-1:(jx+M)*dw] : data_in[(jx+0+1)*dw-1:(jx+0)*dw];
 98     stage[(jx+M+1)*dw-1:(jx+M)*dw] <= data_flip[jx] ? data_in[(jx+0+1)*dw-1:(jx+0)*dw] : data_in[(jx+M+1)*dw-1:(jx+M)*dw];
 99end
100end endgenerate
101
102// Recursive step
103generate if (rl > 1) begin: halvsies
104     banyan #(.rl(rl-1), .dw(dw), .np(np/2)) top(.clk(clk), .time_state(recurse_time_state[rl-2:0]), .mask_in(out_mask_upper),
105             .data_in(stage[dw*np-1:dw*np/2]), .data_out(data_out[dw*np-1:dw*np/2]), .mask_out(mask_out[2*M-1:M]));
106     banyan #(.rl(rl-1), .dw(dw), .np(np/2)) bot(.clk(clk), .time_state(recurse_time_state[rl-2:0]), .mask_in(out_mask_lower),
107             .data_in(stage[dw*np/2-1:    0]), .data_out(data_out[dw*np/2-1:    0]), .mask_out(mask_out[  M-1:0]));
108end else begin: passthrough
109     // base of recursion
110     assign data_out = stage;
111     assign mask_out = {out_mask_upper,out_mask_lower};
112end endgenerate
113endmodule
114
115// =======================================
116// Totally combinatorial count of number of bits set in a word
117// Uses recursion
118module two_set #(
119     parameter dw=8
120) (
121     input [dw-1:0] d,
122     output one,  // or more bits of d are set
123     output two   // or more bits of d are set
124);
125wire one_lower, one_upper;
126wire two_lower, two_upper;
127generate if (dw > 1) begin: split
128     two_set #(.dw(   dw/2)) lower(.d(d[dw/2-1  :0]), .one(one_lower), .two(two_lower));
129     two_set #(.dw(dw-dw/2)) upper(.d(d[dw-1 :dw/2]), .one(one_upper), .two(two_upper));
130     assign one = one_lower | one_upper;
131     assign two = two_lower | two_upper | (one_lower & one_upper);
132end else begin: passthrough
133     // single bit case is trivial
134     assign one = d[0];
135     assign two = 0;
136end endgenerate
137endmodule