Attention
This documentation is a work in progress. Expect to see errors and unfinished things.
banyan Source File
1`timescale 1ns / 1ns
2
3// permute np ports, each dw bits wide, using rl levels of 2-way mux
4// np must be 2**rl
5//
6// Networking equipment with similar topology is known as a Banyan switch,
7// so I'll use that name. A non-folded Banyan switch actually has two
8// switches of this type back-to-back; I can leave off the second half
9// because in my application the output channels are fungible.
10//
11// The intended use case is to route 8 x ADC data to 8 x data sinks,
12// in any of the following configurations:
13// 8 parallel : 1 combination x 1 time state
14// 4 parallel : 70 combinations x 2 time states
15// 2 parallel : 28 combinations x 4 time states
16// 1 at a time : 8 possibilities x 8 time states
17// for 1*1 + 70*2 + 28*4 + 8*8 = 317 useful configurations
18//
19// Represent the channel selection configuration with an 8-bit mask of
20// which channels to look at. Only 1+70+28+8 = 107 of 256 possible masks
21// are valid, since it's invalid to have 0, 3, 5, 6, or 7 bits set.
22// Configuration also includes 3 bits of time_state.
23//
24// time_state and mask_in are processed at full bandwidth.
25// There are rl+1 cycles of latency from a change of these
26// control inputs to the new data_out permutation appearing.
27// There are only rl cycles of latency to mask_out.
28//
29// The implementation is done with recursive instantiation of itself.
30// The capability of doing recursion in Verilog is explicitly acknowledged
31// and blessed by the Verilog-2005 standard, and it works without problem
32// for me in Icarus Verilog, Verilator, Xilinx XST, and Xilinx Vivado.
33//
34// An N=8 dw=16 instantiation in Kintex speed grade 2 seems capable of
35// running at ridiculous clock rates (over 550 MHz?), consuming on the
36// order of 430 LUT. The data muxes alone ought to use 384 LUT outputs.
37// Even a lowly Spartan-6 should manage 160 MHz, using about 226 LUT
38// (437 LUT outputs).
39//
40// Used successfully on hardware since 2016.
41
42module banyan #(
43 parameter dw=16, // data width
44 parameter np=8, // number of ports, must be a power of 2
45 parameter rl=3 // number of routing layers == log_2(np)
46) (
47 input clk,
48 input [rl-1:0] time_state,
49 input [np-1:0] mask_in,
50 input [dw*np-1:0] data_in,
51 output [np-1:0] mask_out, // for DPRAM write-enable
52 output [dw*np-1:0] data_out
53);
54
55localparam M = np/2; // number of swap-boxes
56wire two_or_more; // use balance mode, when more than one bit of mask is set
57two_set #(.dw(np)) two_set(.d(mask_in), .two(two_or_more));
58
59wire [M-1:0] mask_upper = mask_in[2*M-1:M];
60wire [M-1:0] mask_lower = mask_in[ M-1:0];
61wire any_lower = |mask_lower;
62
63// The below statement creates a Ripple-Carry chain of xors with the initial
64// bit being fed from the right (a 1'b0). The in and out refer to the circuit
65// verilator lint_save
66// verilator lint_off UNOPTFLAT
67wire [M:0] imbalance_in;
68wire [M-1:0] imbalance_out = imbalance_in ^ mask_upper ^ mask_lower;
69assign imbalance_in = {imbalance_out,1'b0};
70// verilator lint_restore
71
72// Priority set sources to Lower sinks when in Balance.
73// If currently NOT imbalanced, and lower is 0 and upper 1, Flip and route to Lower
74// If currently imbalanced, and lower is 1 and upper 0, Flip and route to Upper
75// You can at most flip M/2 channels
76wire [M-1:0] flip_bal = imbalance_in & ~mask_upper & mask_lower
77 | ~imbalance_in & mask_upper & ~mask_lower;
78
79// If only single source; Alternate routing to different sink based on time
80wire [M-1:0] flip_deal = { M {~any_lower ^ time_state[rl-1]}};
81wire [M-1:0] flip_ctl = two_or_more ? flip_bal : flip_deal;
82
83reg [M-1:0] out_mask_upper=0, out_mask_lower=0, data_flip=0;
84reg [rl-1:0] recurse_time_state=0; // upper bit will be ignored
85always @(posedge clk) begin
86 out_mask_upper <= flip_ctl & mask_lower | ~flip_ctl & mask_upper;
87 out_mask_lower <= flip_ctl & mask_upper | ~flip_ctl & mask_lower;
88 recurse_time_state <= time_state;
89 data_flip <= flip_ctl; // pipeline
90end
91
92// Data goes through one cycle after control
93reg [dw*np-1:0] stage=0;
94genvar jx;
95generate for (jx=0; jx<M; jx=jx+1) begin: ss
96always @(posedge clk) begin
97 stage[(jx+0+1)*dw-1:(jx+0)*dw] <= data_flip[jx] ? data_in[(jx+M+1)*dw-1:(jx+M)*dw] : data_in[(jx+0+1)*dw-1:(jx+0)*dw];
98 stage[(jx+M+1)*dw-1:(jx+M)*dw] <= data_flip[jx] ? data_in[(jx+0+1)*dw-1:(jx+0)*dw] : data_in[(jx+M+1)*dw-1:(jx+M)*dw];
99end
100end endgenerate
101
102// Recursive step
103generate if (rl > 1) begin: halvsies
104 banyan #(.rl(rl-1), .dw(dw), .np(np/2)) top(.clk(clk), .time_state(recurse_time_state[rl-2:0]), .mask_in(out_mask_upper),
105 .data_in(stage[dw*np-1:dw*np/2]), .data_out(data_out[dw*np-1:dw*np/2]), .mask_out(mask_out[2*M-1:M]));
106 banyan #(.rl(rl-1), .dw(dw), .np(np/2)) bot(.clk(clk), .time_state(recurse_time_state[rl-2:0]), .mask_in(out_mask_lower),
107 .data_in(stage[dw*np/2-1: 0]), .data_out(data_out[dw*np/2-1: 0]), .mask_out(mask_out[ M-1:0]));
108end else begin: passthrough
109 // base of recursion
110 assign data_out = stage;
111 assign mask_out = {out_mask_upper,out_mask_lower};
112end endgenerate
113endmodule
114
115// =======================================
116// Totally combinatorial count of number of bits set in a word
117// Uses recursion
118module two_set #(
119 parameter dw=8
120) (
121 input [dw-1:0] d,
122 output one, // or more bits of d are set
123 output two // or more bits of d are set
124);
125wire one_lower, one_upper;
126wire two_lower, two_upper;
127generate if (dw > 1) begin: split
128 two_set #(.dw( dw/2)) lower(.d(d[dw/2-1 :0]), .one(one_lower), .two(two_lower));
129 two_set #(.dw(dw-dw/2)) upper(.d(d[dw-1 :dw/2]), .one(one_upper), .two(two_upper));
130 assign one = one_lower | one_upper;
131 assign two = two_lower | two_upper | (one_lower & one_upper);
132end else begin: passthrough
133 // single bit case is trivial
134 assign one = d[0];
135 assign two = 0;
136end endgenerate
137endmodule