newStep.v
This commit is contained in:
16
RTL/PROCESSOR/README.md
Normal file
16
RTL/PROCESSOR/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
# FemtoRV processor collection
|
||||
|
||||
FemtoRV is a collection of small and understandable RISC-V processors.
|
||||
|
||||
See this table to choose the most suitable one for your project!
|
||||
|
||||
File name | ISA | Special capabilities
|
||||
------------------------- | -------------- | --------
|
||||
femtorv32_quark.v | RV32I | The smallest core in this collection, perfect for tiny FPGAs. For size reasons, it shifts only one bit per clock cycle.
|
||||
femtorv32_quark_bicycle.v | RV32I | The simplest and fastest - in terms of cycles/instruction - core in this collection. Basically Quark with a barrel shifter and additional multiplexers. Recommended if you can afford a few more LUTs and just need a vanilla RV32I.
|
||||
femtorv32_tachyon.v | RV32I | Quark with execute cycle split in two in order to achieve a higher maximum clock frequency, but at the expense of more cycles per instruction.
|
||||
femtorv32_electron.v | RV32IM | Featuring barrel shifter, multiplication and division instructions.
|
||||
femtorv32_intermissum.v | RV32IM + IRQ | Full interrupt support along with CSR registers.
|
||||
femtorv32_gracilis.v | RV32IMC + IRQ | Adds compressed instructions support, which saves both RAM usage and memory fetch cycles. Recommended as a general-purpose processor.
|
||||
femtorv32_individua.v | RV32IMAC + IRQ | Adds atomic instructions support. Not really necessary in single-processor designs, but probably useful if you have tricky interrupt handlers.
|
||||
femtorv32_petitbateau.v | RV32IMFC + IRQ | Floating point!
|
||||
7
RTL/PROCESSOR/TESTDRIVE/README.md
Normal file
7
RTL/PROCESSOR/TESTDRIVE/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
This directory contains several versions of femtorv32, that I'm using
|
||||
for testing different features and their influence on timings:
|
||||
- testdrive_RV32IM: tachyon core (with two execute cycles) with M extension
|
||||
- testdrive_RV32IM_simF: M extension, F decoder and simulated FPU (works only with Verilator)
|
||||
- testdrive_RV32IMF: M and F extensions
|
||||
|
||||
I recommend using the other cores instead.
|
||||
479
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IM.v
Normal file
479
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IM.v
Normal file
@@ -0,0 +1,479 @@
|
||||
/******************************************************************************/
|
||||
// Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz
|
||||
// TestDrive: morphing tachyon into a RV32IMF core, trying to
|
||||
// preserve maxfreq at each step.
|
||||
// Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz
|
||||
// Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz
|
||||
// Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz
|
||||
|
||||
//
|
||||
/******************************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32im"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
|
||||
|
||||
|
||||
// Flip a 32 bit word. Used by the shifter (a single shifter for
|
||||
// left and right shifts, saves silicon!)
|
||||
// Bit-reverses a 32-bit word: bit i of the result is bit (31 - i) of x.
// The loop bounds are constant, so this unrolls to pure wiring.
function [31:0] flip32;
   input [31:0] x;
   integer bit_idx;
   begin
      for (bit_idx = 0; bit_idx < 32; bit_idx = bit_idx + 1)
         flip32[bit_idx] = x[31 - bit_idx];
   end
endfunction
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The destination register
|
||||
wire [4:0] rdId = instr[11:7];
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *) reg [7:0] funct3Is;
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
reg isLoad;
|
||||
reg isALUimm;
|
||||
reg isAUIPC;
|
||||
reg isStore;
|
||||
reg isALUreg;
|
||||
reg isLUI;
|
||||
reg isBranch;
|
||||
reg isJALR;
|
||||
reg isJAL;
|
||||
reg isSYSTEM;
|
||||
|
||||
reg [31:0] Uimm;
|
||||
reg [31:0] Iimm;
|
||||
reg [31:0] Simm;
|
||||
reg [31:0] Bimm;
|
||||
reg [31:0] Jimm;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit] & !mem_rbusy) begin
|
||||
isLoad <= (mem_rdata[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
|
||||
isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
isAUIPC <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
isStore <= (mem_rdata[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
|
||||
isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
isLUI <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
|
||||
isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
|
||||
isJALR <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
isJAL <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
|
||||
funct3Is <= 8'b00000001 << mem_rdata[14:12];
|
||||
|
||||
Uimm <= { mem_rdata[31], mem_rdata[30:12], {12{1'b0}}};
|
||||
Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]};
|
||||
Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]};
|
||||
Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
|
||||
Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
|
||||
end
|
||||
end
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] registerFile [31:0];
|
||||
|
||||
// Register-file write port: on each rising clock edge where writeBack is
// asserted, writeBackData is stored into registerFile[rdId].
// Register index 0 is never written.
always @(posedge clk) begin
   if (writeBack && (rdId != 0)) begin
      registerFile[rdId] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except divisions (DIV/REM), which take several cycles.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = flip32(shifter);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
// Operand signedness for the high-word multiplies (RISC-V "M" extension):
//   MULH   : rs1 signed,   rs2 signed
//   MULHSU : rs1 signed,   rs2 unsigned
//   MULHU  : rs1 unsigned, rs2 unsigned
// sign1/sign2 become the 33rd (sign-extension) bit of aluIn1/aluIn2 in the
// 33x33 signed multiply below, so each must be 0 whenever its operand is to
// be treated as unsigned. aluIn1 carries rs1 and aluIn2 carries rs2 here.
wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
wire sign2 = aluIn2[31] &  isMULH;
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] alu_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
// funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
|
||||
// 4->DIV 5->DIVU 6->REM 7->REMU
|
||||
|
||||
wire [31:0] alu_mul = funct3Is[0] ? multiply[31: 0] // 0:MUL
|
||||
: multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
|
||||
|
||||
wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend)
|
||||
: (div_sign ? -quotient : quotient);
|
||||
|
||||
|
||||
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
|
||||
reg [31:0] aluOut;
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = instr[14];
|
||||
|
||||
always @(posedge clk) begin
|
||||
aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg div_sign;
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [32:0] quotient_msk;
|
||||
|
||||
// Multi-cycle divider for DIV/DIVU/REM/REMU: shift-and-subtract on operand
// magnitudes, producing one quotient bit per clock. alu_div re-applies the
// sign (div_sign) and selects either 'quotient' or the remainder, which is
// what is left in 'dividend'.
always @(posedge clk) begin
|
||||
// Load phase. aluWr is raised for every ALU instruction, so the operand
// registers are reloaded each time; only quotient_msk[32] decides whether
// the ALU reports busy (aluBusy is the OR-reduction of quotient_msk).
if (aluWr) begin
|
||||
// instr[12] (funct3 bit 0) is 0 for the signed variants (DIV, REM):
// in that case work on absolute values.
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
|
||||
// The divisor starts shifted 31 bits to the left of the dividend.
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
|
||||
quotient <= 0;
|
||||
// Arms the busy countdown only for an M-extension divide/remainder.
quotient_msk[32] <= isALUreg & funcM & isDivide;
|
||||
// The result must be negated when the operation is signed and:
//  - REM (instr[13] = 1): the dividend is negative;
//  - DIV (instr[13] = 0): operand signs differ and the divisor is non-zero.
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
|
||||
(aluIn1[31] ^ aluIn2[31]) & |aluIn2);
|
||||
// Iteration phase: runs on every clock where aluWr is low.
end else begin
|
||||
divisor <= divisor >> 1;
|
||||
// One-hot countdown: the bit set at load time moves right each cycle and
// aluBusy drops once it has been shifted out.
quotient_msk <= quotient_msk >> 1;
|
||||
// If the shifted divisor fits in the running remainder, subtract it and
// shift a 1 into the quotient; otherwise shift in a 0.
if(divisor <= {31'b0, dividend}) begin
|
||||
quotient <= {quotient[30:0],1'b1};
|
||||
dividend <= dividend - divisor[31:0];
|
||||
end else begin
|
||||
quotient <= {quotient[30:0],1'b0};
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate_ =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
reg predicate;
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
reg [ADDR_WIDTH-1:0] PCplusImm;
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
reg [ADDR_WIDTH-1:0] loadstore_addr;
|
||||
|
||||
assign mem_addr = {ADDR_PAD,
|
||||
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
PC : loadstore_addr
|
||||
};
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] writeBackData =
|
||||
/* verilator lint_off WIDTH */
|
||||
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
|
||||
/* verilator lint_on WIDTH */
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0); // Load
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
|
||||
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
// Byte-lane write mask for stores, chosen from the access size and the low
// bits of loadstore_addr:
//   halfword -> upper (1100) or lower (0011) pair, picked by loadstore_addr[1]
//   byte     -> a single lane, picked by loadstore_addr[1:0]
//   word     -> all four lanes
// mem_halfwordAccess and mem_byteAccess are never true together, so the
// order in which they are tested does not matter.
wire [3:0] STORE_wmask =
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])         :
                        4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE1_bit = 2;
|
||||
localparam EXECUTE2_bit = 3;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 4;
|
||||
localparam NB_STATES = 5;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE1 = 1 << EXECUTE1_bit;
|
||||
localparam EXECUTE2 = 1 << EXECUTE2_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
wire writeBack = ~(isBranch | isStore ) &
|
||||
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]);
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts computation (divisions) in the ALU.
|
||||
assign aluWr = state[EXECUTE1_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
`ifdef NRV_IS_IO_ADDR
|
||||
wire needToWait = isLoad |
|
||||
isStore & `NRV_IS_IO_ADDR(mem_addr) |
|
||||
aluBusy;
|
||||
`else
|
||||
wire needToWait = isLoad | isStore | aluBusy;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
|
||||
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
|
||||
end else
|
||||
|
||||
// See note [1] at the end of this file.
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
|
||||
state[WAIT_INSTR_bit]: begin
|
||||
if(!mem_rbusy) begin // may be high when executing from SPI flash
|
||||
rs1 <= registerFile[mem_rdata[19:15]];
|
||||
rs2 <= registerFile[mem_rdata[24:20]];
|
||||
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
|
||||
state <= EXECUTE1; // also the declaration of instr).
|
||||
end
|
||||
end
|
||||
|
||||
state[EXECUTE1_bit]: begin
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to:
|
||||
// PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
predicate <= predicate_;
|
||||
state <= EXECUTE2;
|
||||
end
|
||||
|
||||
state[EXECUTE2_bit]: begin
|
||||
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
PCplus4;
|
||||
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
|
||||
end
|
||||
|
||||
state[WAIT_ALU_OR_MEM_bit]: begin
|
||||
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
|
||||
end
|
||||
|
||||
default: begin // FETCH_INSTR
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
|
||||
endcase
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Cycle counter
|
||||
/***************************************************************************/
|
||||
|
||||
`ifdef NRV_COUNTER_WIDTH
|
||||
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
|
||||
`else
|
||||
reg [31:0] cycles;
|
||||
`endif
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements,
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a both more compact (fewer LUTs) and faster state machine.
|
||||
|
||||
1162
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IMF.v
Normal file
1162
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IMF.v
Normal file
File diff suppressed because it is too large
Load Diff
689
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IM_simF.v
Normal file
689
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IM_simF.v
Normal file
@@ -0,0 +1,689 @@
|
||||
/******************************************************************************/
|
||||
// Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz
|
||||
// TestDrive: morphing tachyon into a RV32IMF core, trying to
|
||||
// preserve maxfreq at each step.
|
||||
// Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz
|
||||
// Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz
|
||||
// Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz
|
||||
// Step 3: RV32F decod only valid. fmax: 100-105 MHz exp. fmax: 105 MHz
|
||||
|
||||
//
|
||||
/******************************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32imaf"
|
||||
`define NRV_ABI "ilp32f"
|
||||
|
||||
//`define NRV_ARCH "rv32im"
|
||||
//`define NRV_ABI "ilp32"
|
||||
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
|
||||
// Check condition and display message in simulation
|
||||
`ifdef BENCH
|
||||
`define ASSERT(cond,msg) if(!(cond)) $display msg
|
||||
`define ASSERT_NOT_REACHED(msg) $display msg
|
||||
`else
|
||||
`define ASSERT(cond,msg)
|
||||
`define ASSERT_NOT_REACHED(msg)
|
||||
`endif
|
||||
|
||||
// FPU Normalization needs to detect the position of the first bit set
|
||||
// in the A_frac register. It is easier to count the number of leading
|
||||
// zeroes (CLZ for Count Leading Zeroes), as follows. See:
|
||||
// https://electronics.stackexchange.com/questions/196914/verilog-synthesize-high-speed-leading-zero-count
|
||||
// Recursive leading-zero counter.
// Each level checks whether the upper half of 'in' is all zeroes; that flag
// becomes the top bit of the count, and the remaining bits come from the same
// circuit applied to whichever half still has to be scanned.
// Note: for an all-zero input the result is W_IN-1 (all output bits set),
// because 'out' only has $clog2(W_IN) bits.
module CLZ #(
   parameter W_IN  = 64,            // input width: must be a power of 2, >= 2
   parameter W_OUT = $clog2(W_IN)   // output width
) (
   input  wire [W_IN-1:0]  in,      // word to scan
   output wire [W_OUT-1:0] out      // leading-zero count of 'in'
);
   generate
      if (W_IN != 2) begin
         // Split the word in two halves.
         wire [W_IN/2-1:0] upper = in[W_IN-1 : W_IN/2];
         wire [W_IN/2-1:0] lower = in[W_IN/2-1 : 0];
         wire upper_is_zero = ~(|upper);
         wire [W_OUT-2:0] sub_count;
         // Keep scanning in the lower half only when the upper one is empty.
         CLZ #(
            .W_IN(W_IN/2)
         ) inner (
            .in(upper_is_zero ? lower : upper),
            .out(sub_count)
         );
         assign out = {upper_is_zero, sub_count};
      end else begin
         // Two-bit base case: one leading zero iff the top bit is clear.
         assign out = !in[1];
      end
   endgenerate
endmodule
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
|
||||
|
||||
|
||||
// Flip a 32 bit word. Used by the shifter (a single shifter for
|
||||
// left and right shifts, saves silicon!)
|
||||
function [31:0] flip32;
|
||||
input [31:0] x;
|
||||
flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
|
||||
x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
|
||||
x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
|
||||
x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
|
||||
endfunction
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *) reg [7:0] funct3Is;
|
||||
|
||||
// Instruction decoder and immediate decoder
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
|
||||
reg isLoad, isALUimm, isAUIPC, isStore, isALUreg, isLUI,
|
||||
isBranch, isJALR, isJAL, isSYSTEM, isFPU;
|
||||
|
||||
reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm;
|
||||
reg rdIsNZ; // Asserted if dest. register is non-zero (writeback)
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit]) begin
|
||||
isLoad <= (mem_rdata[6:3] == 4'b0000); // rd <- mem[rs1+Iimm]
|
||||
isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
isAUIPC <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
isStore <= (mem_rdata[6:3] == 4'b0100); // mem[rs1+Simm] <- rs2
|
||||
isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
isLUI <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
|
||||
isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1OPrs2) PC<-PC+Bimm
|
||||
isJALR <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
isJAL <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
|
||||
isFPU <= (mem_rdata[6:5] == 2'b10); // all FPU except FLW/FSW
|
||||
funct3Is <= 8'b00000001 << mem_rdata[14:12];
|
||||
|
||||
Uimm <= { mem_rdata[31], mem_rdata[30:12], {12{1'b0}}};
|
||||
Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]};
|
||||
Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]};
|
||||
Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
|
||||
Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
|
||||
|
||||
rdIsNZ <= |mem_rdata[11:7];
|
||||
end
|
||||
end
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] rs3; // this one is used by the FMA instructions.
|
||||
|
||||
reg [31:0] registerFile [0:63]; // 0..31: integer registers
|
||||
// 32..63: floating-point registers
|
||||
|
||||
/***************************************************************************/
|
||||
// The FPU
|
||||
/***************************************************************************/
|
||||
|
||||
// instruction decoder
|
||||
|
||||
reg isFMADD, isFMSUB, isFNMSUB, isFNMADD, isFADD, isFSUB, isFMUL, isFDIV,
|
||||
isFSQRT, isFSGNJ, isFSGNJN, isFSGNJX, isFMIN, isFMAX, isFEQ, isFLT,
|
||||
isFLE, isFCLASS, isFCVTWS, isFCVTWUS, isFCVTSW, isFCVTSWU, isFMVXW,
|
||||
isFMVWX;
|
||||
|
||||
reg rdIsFP; // Asserted if destination register is a FP register.
|
||||
|
||||
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
|
||||
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[30:28] = 3'b101
|
||||
// FMV.W.X : instr[6:2] = 5'b10100 and instr[30:28] = 3'b111
|
||||
// (two versions of the signal, one for regular instruction decode,
|
||||
// the other one for compressed instructions).
|
||||
wire rs1IsFP = (mem_rdata[6:5] == 2'b10 ) &&
|
||||
!((mem_rdata[4:2] == 3'b100) && (
|
||||
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
|
||||
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
|
||||
// (two versions of the signal, one for regular instruction decode,
|
||||
// the other one for compressed instructions).
|
||||
wire rs2IsFP = (mem_rdata[6:5] == 2'b10) || (mem_rdata[6:2]==5'b01001);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit]) begin
|
||||
isFMADD <= (mem_rdata[4:2] == 3'b000);
|
||||
isFMSUB <= (mem_rdata[4:2] == 3'b001);
|
||||
isFNMSUB <= (mem_rdata[4:2] == 3'b010);
|
||||
isFNMADD <= (mem_rdata[4:2] == 3'b011);
|
||||
|
||||
isFADD <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00000));
|
||||
isFSUB <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00001));
|
||||
isFMUL <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00010));
|
||||
isFDIV <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00011));
|
||||
isFSQRT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b01011));
|
||||
|
||||
isFSGNJ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b00));
|
||||
isFSGNJN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b01));
|
||||
isFSGNJX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b10));
|
||||
|
||||
isFMIN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && !mem_rdata[12]);
|
||||
isFMAX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && mem_rdata[12]);
|
||||
|
||||
isFEQ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b10));
|
||||
isFLT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b01));
|
||||
isFLE <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b00));
|
||||
|
||||
isFCLASS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && mem_rdata[12]);
|
||||
|
||||
isFCVTWS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && !mem_rdata[20]);
|
||||
isFCVTWUS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && mem_rdata[20]);
|
||||
|
||||
isFCVTSW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && !mem_rdata[20]);
|
||||
isFCVTSWU <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && mem_rdata[20]);
|
||||
|
||||
isFMVXW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && !mem_rdata[12]);
|
||||
isFMVWX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11110));
|
||||
|
||||
rdIsFP <= (mem_rdata[6:2] == 5'b00001) || // FLW
|
||||
(mem_rdata[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
|
||||
(mem_rdata[6:4] == 3'b101 && (
|
||||
(mem_rdata[31] == 1'b0) || // R-Type FPU
|
||||
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
end
|
||||
end
|
||||
|
||||
reg [31:0] fpuOut;
|
||||
`define FPU_OUT fpuOut
|
||||
wire fpuBusy = 0;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit]) begin
|
||||
// Fetch registers as soon as instruction is ready.
|
||||
rs1 <= registerFile[{rs1IsFP,mem_rdata[19:15]}];
|
||||
rs2 <= registerFile[{rs2IsFP,mem_rdata[24:20]}];
|
||||
rs3 <= registerFile[{1'b1, mem_rdata[31:27]}];
|
||||
end else if(state[EXECUTE2_bit] & isFPU) begin
|
||||
`ifdef VERILATOR
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
isFMADD : `FPU_OUT <= $c32("FMADD(",rs1,",",rs2,",",rs3,")");
|
||||
isFMSUB : `FPU_OUT <= $c32("FMSUB(",rs1,",",rs2,",",rs3,")");
|
||||
isFNMSUB : `FPU_OUT <= $c32("FNMSUB(",rs1,",",rs2,",",rs3,")");
|
||||
isFNMADD : `FPU_OUT <= $c32("FNMADD(",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFMUL : `FPU_OUT <= $c32("FMUL(",rs1,",",rs2,")");
|
||||
isFADD : `FPU_OUT <= $c32("FADD(",rs1,",",rs2,")");
|
||||
isFSUB : `FPU_OUT <= $c32("FSUB(",rs1,",",rs2,")");
|
||||
|
||||
isFDIV : `FPU_OUT <= $c32("FDIV(",rs1,",",rs2,")");
|
||||
isFSQRT : `FPU_OUT <= $c32("FSQRT(",rs1,")");
|
||||
|
||||
|
||||
isFSGNJ : `FPU_OUT <= $c32("FSGNJ(",rs1,",",rs2,")");
|
||||
isFSGNJN : `FPU_OUT <= $c32("FSGNJN(",rs1,",",rs2,")");
|
||||
isFSGNJX : `FPU_OUT <= $c32("FSGNJX(",rs1,",",rs2,")");
|
||||
|
||||
isFMIN : `FPU_OUT <= $c32("FMIN(",rs1,",",rs2,")");
|
||||
isFMAX : `FPU_OUT <= $c32("FMAX(",rs1,",",rs2,")");
|
||||
|
||||
isFEQ : `FPU_OUT <= $c32("FEQ(",rs1,",",rs2,")");
|
||||
isFLE : `FPU_OUT <= $c32("FLE(",rs1,",",rs2,")");
|
||||
isFLT : `FPU_OUT <= $c32("FLT(",rs1,",",rs2,")");
|
||||
|
||||
isFCLASS : `FPU_OUT <= $c32("FCLASS(",rs1,")") ;
|
||||
|
||||
isFCVTWS : `FPU_OUT <= $c32("FCVTWS(",rs1,")");
|
||||
isFCVTWUS: `FPU_OUT <= $c32("FCVTWUS(",rs1,")");
|
||||
|
||||
isFCVTSW : `FPU_OUT <= $c32("FCVTSW(",rs1,")");
|
||||
isFCVTSWU: `FPU_OUT <= $c32("FCVTSWU(",rs1,")");
|
||||
|
||||
isFMVXW: `FPU_OUT <= rs1;
|
||||
isFMVWX: `FPU_OUT <= rs1;
|
||||
endcase
|
||||
`endif
|
||||
|
||||
// register write-back
|
||||
end else if(
|
||||
!(isBranch | isStore) & (rdIsFP | rdIsNZ) &
|
||||
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit])
|
||||
) begin
|
||||
registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
`ifdef VERILATOR
|
||||
// When doing simulations, compare the result of all operations with
|
||||
// what's computed on the host CPU.
|
||||
|
||||
reg [31:0] z;
|
||||
reg [31:0] rs1_bkp;
|
||||
reg [31:0] rs2_bkp;
|
||||
reg [31:0] rs3_bkp;
|
||||
|
||||
always @(posedge clk) begin
|
||||
// Some micro-coded instructions (FDIV/FSQRT) use rs1, rs2 and
|
||||
// rs3 as temporary registers, so we need to save them to be able
|
||||
// to recompute the operation on the host CPU.
|
||||
if(isFPU && state[EXECUTE2_bit]) begin
|
||||
rs1_bkp <= rs1;
|
||||
rs2_bkp <= rs2;
|
||||
rs3_bkp <= rs3;
|
||||
end
|
||||
|
||||
if(
|
||||
isFPU && state[WAIT_ALU_OR_MEM_bit] // && fpmi_PC == 0
|
||||
) begin
|
||||
case(1'b1)
|
||||
isFMUL: z <= $c32("CHECK_FMUL(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFADD: z <= $c32("CHECK_FADD(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFSUB: z <= $c32("CHECK_FSUB(",fpuOut,",",rs1,",",rs2,")");
|
||||
|
||||
// my FDIV and FSQRT are not IEEE754 compliant !
|
||||
// (checks commented-out for now)
|
||||
// Note: checks use rs1_bkp and rs2_bkp because
|
||||
// FDIV and FSQRT overwrite rs1 and rs2
|
||||
//
|
||||
//isFDIV:
|
||||
// z<=$c32("CHECK_FDIV(",fpuOut,",",rs1_bkp,",",rs2_bkp,")");
|
||||
//isFSQRT:
|
||||
// z<=$c32("CHECK_FSQRT(",fpuOut,",",rs1_bkp,")");
|
||||
|
||||
|
||||
isFMADD :
|
||||
z<=$c32("CHECK_FMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFMSUB :
|
||||
z<=$c32("CHECK_FMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFNMSUB:
|
||||
z<=$c32("CHECK_FNMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFNMADD:
|
||||
z<=$c32("CHECK_FNMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFEQ: z <= $c32("CHECK_FEQ(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFLT: z <= $c32("CHECK_FLT(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFLE: z <= $c32("CHECK_FLE(",fpuOut,",",rs1,",",rs2,")");
|
||||
|
||||
isFCVTWS : z <= $c32("CHECK_FCVTWS(",fpuOut,",",rs1,")");
|
||||
isFCVTWUS: z <= $c32("CHECK_FCVTWUS(",fpuOut,",",rs1,")");
|
||||
|
||||
isFCVTSW : z <= $c32("CHECK_FCVTSW(",fpuOut,",",rs1,")");
|
||||
isFCVTSWU: z <= $c32("CHECK_FCVTSWU(",fpuOut,",",rs1,")");
|
||||
|
||||
isFMIN: z <= $c32("CHECK_FMIN(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFMAX: z <= $c32("CHECK_FMAX(",fpuOut,",",rs1,",",rs2,")");
|
||||
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except DIV
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = flip32(shifter);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
wire sign1 = aluIn1[31] & isMULH;
|
||||
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] alu_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
// funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
|
||||
// 4->DIV 5->DIVU 6->REM 7->REMU
|
||||
|
||||
wire [31:0] alu_mul = funct3Is[0]
|
||||
? multiply[31: 0] // 0:MUL
|
||||
: multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
|
||||
|
||||
wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend)
|
||||
: (div_sign ? -quotient : quotient);
|
||||
|
||||
|
||||
wire aluBusy = |quotient_msk; // ALU is busy if division in progress.
|
||||
reg [31:0] aluOut;
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = instr[14];
|
||||
|
||||
always @(posedge clk) begin
|
||||
aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg div_sign;
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [32:0] quotient_msk;
|
||||
|
||||
// Iterative divider behind DIV/DIVU/REM/REMU. A clock edge with aluWr set
// loads the operands; every other clock edge performs one
// shift-and-subtract step.
always @(posedge clk) begin
   if (aluWr) begin
      // instr[12] is 0 for the signed variants: the divider then works on
      // absolute values and div_sign records whether the result has to be
      // negated. With instr[13] set the sign follows the first operand,
      // otherwise it is the XOR of both operand signs, forced to 0 when
      // the second operand is zero.
      div_sign <= ~instr[12] &
                  (instr[13] ? aluIn1[31]
                             : ((aluIn1[31] ^ aluIn2[31]) & (|aluIn2)));
      dividend <= (~instr[12] & aluIn1[31]) ? -aluIn1 : aluIn1;
      divisor  <= {((~instr[12] & aluIn2[31]) ? -aluIn2 : aluIn2), 31'b0};
      quotient <= 0;
      // Only the top mask bit is written here; the shift in the other
      // branch moves it down one position per cycle.
      quotient_msk[32] <= isALUreg & funcM & isDivide;
   end else begin
      quotient_msk <= quotient_msk >> 1;
      divisor      <= divisor >> 1;
      // The new quotient bit is 1 exactly when the aligned divisor fits
      // into the running remainder, in which case it is also subtracted.
      quotient     <= {quotient[30:0], (divisor <= {31'b0, dividend})};
      if (divisor <= {31'b0, dividend}) begin
         dividend <= dividend - divisor[31:0];
      end
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate_ =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
reg predicate;
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
reg [ADDR_WIDTH-1:0] PCplusImm;
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
reg [ADDR_WIDTH-1:0] loadstore_addr;
|
||||
|
||||
assign mem_addr = {ADDR_PAD,
|
||||
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
PC : loadstore_addr
|
||||
};
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] writeBackData =
|
||||
/* verilator lint_off WIDTH */
|
||||
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
|
||||
/* verilator lint_on WIDTH */
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isFPU ? fpuOut : 32'b0) | // FPU
|
||||
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0); // Load
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word (=instr[13:12])
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
// - instr[2] is set for FLW and FSW.
|
||||
wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00);
|
||||
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
// Byte-lane write mask for stores:
//  - byte access:     one-hot lane selected by loadstore_addr[1:0]
//  - halfword access: upper or lower pair, selected by loadstore_addr[1]
//  - otherwise:       all four lanes
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])          :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011)   :
                        4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE1_bit = 2;
|
||||
localparam EXECUTE2_bit = 3;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 4;
|
||||
localparam NB_STATES = 5;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE1 = 1 << EXECUTE1_bit;
|
||||
localparam EXECUTE2 = 1 << EXECUTE2_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts computation (division) in the ALU.
|
||||
assign aluWr = state[EXECUTE1_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
`ifdef NRV_IS_IO_ADDR
|
||||
wire needToWait = isLoad |
|
||||
isStore & `NRV_IS_IO_ADDR(mem_addr) |
|
||||
aluBusy | isFPU;
|
||||
`else
|
||||
wire needToWait = isLoad | isStore | aluBusy | isFPU;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
|
||||
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
|
||||
end else
|
||||
|
||||
// See note [1] at the end of this file.
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
|
||||
// Wait for the instruction word, then latch it and move on to EXECUTE1.
state[WAIT_INSTR_bit]: begin
|
||||
if(!mem_rbusy) begin // may be high when executing from SPI flash
|
||||
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
|
||||
state <= EXECUTE1; // also the declaration of instr).
|
||||
end
|
||||
end
|
||||
|
||||
state[EXECUTE1_bit]: begin
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to:
|
||||
// PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
predicate <= predicate_;
|
||||
state <= EXECUTE2;
|
||||
end
|
||||
|
||||
state[EXECUTE2_bit]: begin
|
||||
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
PCplus4;
|
||||
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
|
||||
end
|
||||
|
||||
state[WAIT_ALU_OR_MEM_bit]: begin
|
||||
if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
|
||||
state <= FETCH_INSTR;
|
||||
end
|
||||
end
|
||||
|
||||
default: begin // FETCH_INSTR
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
|
||||
endcase
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Cycle counter
|
||||
/***************************************************************************/
|
||||
|
||||
`ifdef NRV_COUNTER_WIDTH
|
||||
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
|
||||
`else
|
||||
reg [31:0] cycles;
|
||||
`endif
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
452
RTL/PROCESSOR/femtorv32_electron.v
Normal file
452
RTL/PROCESSOR/femtorv32_electron.v
Normal file
@@ -0,0 +1,452 @@
|
||||
/*******************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// This version: The "electron", with RV32IM support.
|
||||
// A single VERILOG file, compact & understandable code.
|
||||
//
|
||||
// Instruction set: RV32IM
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/*******************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32im"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The destination register
|
||||
wire [4:0] rdId = instr[11:7];
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *)
|
||||
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
|
||||
|
||||
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
|
||||
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
|
||||
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
|
||||
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
|
||||
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
|
||||
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
|
||||
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
|
||||
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
|
||||
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
|
||||
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
|
||||
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] registerFile [31:0];
|
||||
|
||||
// Register-file write port: stores writeBackData into entry rdId while the
// writeBack strobe is active. Entry 0 is never written through this port.
always @(posedge clk) begin
   if (writeBack && (rdId != 0)) begin
      registerFile[rdId] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except division.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe, starts dividing.
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
// Bit-reversed copy of aluIn1: bit i carries aluIn1[31-i].
wire [31:0] aluIn1_reversed;
genvar shin_i;
generate
   for (shin_i = 0; shin_i < 32; shin_i = shin_i + 1) begin : shifter_in_flip
      assign aluIn1_reversed[shin_i] = aluIn1[31 - shin_i];
   end
endgenerate

// Input of the right shifter: the reversed operand when funct3 == 1,
// the plain operand otherwise.
wire [31:0] shifter_in = funct3Is[1] ? aluIn1_reversed : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
// Left-shift result: the right shifter's output with its bit order
// reversed (bit i carries shifter[31-i]).
wire [31:0] leftshift;
genvar lsh_i;
generate
   for (lsh_i = 0; lsh_i < 32; lsh_i = lsh_i + 1) begin : leftshift_flip
      assign leftshift[lsh_i] = shifter[31 - lsh_i];
   end
endgenerate
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
|
||||
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
wire sign1 = aluIn1[31] & isMULH;
|
||||
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] aluOut_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
wire [31:0] aluOut_muldiv =
|
||||
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
|
||||
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
|
||||
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
|
||||
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
|
||||
|
||||
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [31:0] quotient_msk;
|
||||
|
||||
wire divstep_do = divisor <= {31'b0, dividend};
|
||||
|
||||
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
|
||||
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
|
||||
|
||||
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
|
||||
(aluIn1[31] != aluIn2[31]) & |aluIn2);
|
||||
|
||||
// Divider state registers. A division instruction in the execute state
// (isDivide & aluWr) loads the operands and arms the quotient mask; on
// every other clock edge one division step is applied (dividendN and
// quotientN hold the stepped values, divisor and mask shift right).
always @(posedge clk) begin
   if (aluWr & isDivide) begin
      quotient     <= 32'b0;
      quotient_msk <= 32'h80000000;
      // instr[12] is 0 for the signed variants: divide absolute values.
      dividend     <= (~instr[12] & aluIn1[31]) ? -aluIn1 : aluIn1;
      divisor      <= {((~instr[12] & aluIn2[31]) ? -aluIn2 : aluIn2), 31'b0};
   end else begin
      quotient_msk <= quotient_msk >> 1;
      quotient     <= quotientN;
      divisor      <= divisor >> 1;
      dividend     <= dividendN;
   end
end
|
||||
|
||||
reg [31:0] divResult;
|
||||
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
// A separate adder to compute the destination of load/store.
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
// internal address registers and cycles counter may have less than
|
||||
// 32 bits, so we deactivate width test for mem_addr and writeBackData
|
||||
|
||||
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
PC : loadstore_addr;
|
||||
|
||||
/***************************************************************************/
|
||||
// Counter.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [63:0] cycles; // Cycle counter
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
wire sel_cyclesh = (instr[31:20] == 12'hC80);
|
||||
wire [31:0] CSR_read = sel_cyclesh ? cycles[63:32] : cycles[31:0];
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] writeBackData =
|
||||
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0) ; // Load
|
||||
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
|
||||
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
// Byte-lane write mask for stores:
//  - byte access:     one-hot lane selected by loadstore_addr[1:0]
//  - halfword access: upper or lower pair, selected by loadstore_addr[1]
//  - otherwise:       all four lanes
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])          :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011)   :
                        4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE_bit = 2;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 3;
|
||||
localparam NB_STATES = 4;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE = 1 << EXECUTE_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
wire writeBack = ~(isBranch | isStore ) &
|
||||
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts multi-cycle computation (division) in the ALU.
|
||||
assign aluWr = state[EXECUTE_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
|
||||
wire needToWait = isLoad | isStore | isDivide;
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PC_new =
|
||||
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
PCplus4;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
|
||||
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
|
||||
end else
|
||||
|
||||
// See note [1] at the end of this file.
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
|
||||
state[WAIT_INSTR_bit]: begin
|
||||
if(!mem_rbusy) begin // may be high when executing from SPI flash
|
||||
rs1 <= registerFile[mem_rdata[19:15]];
|
||||
rs2 <= registerFile[mem_rdata[24:20]];
|
||||
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
|
||||
state <= EXECUTE; // also the declaration of instr).
|
||||
end
|
||||
end
|
||||
|
||||
state[EXECUTE_bit]: begin
|
||||
PC <= PC_new;
|
||||
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
|
||||
end
|
||||
|
||||
state[WAIT_ALU_OR_MEM_bit]: begin
|
||||
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
|
||||
end
|
||||
|
||||
default: begin // FETCH_INSTR
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
|
||||
endcase
|
||||
end
|
||||
|
||||
`ifdef BENCH
|
||||
initial begin
|
||||
cycles = 0;
|
||||
registerFile[0] = 0;
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements.
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
|
||||
674
RTL/PROCESSOR/femtorv32_gracilis.v
Normal file
674
RTL/PROCESSOR/femtorv32_gracilis.v
Normal file
@@ -0,0 +1,674 @@
|
||||
/******************************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// This version: The "Gracilis", with full interrupt and
|
||||
// RVC compressed instructions support.
|
||||
// A single VERILOG file, compact & understandable code.
|
||||
//
|
||||
// Instruction set: RV32IMC + CSR + MRET
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/******************************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32imac"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
`define NRV_INTERRUPTS
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input interrupt_request,
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The destination register
|
||||
wire [4:0] rdId = instr[11:7];
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *)
|
||||
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
|
||||
|
||||
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
|
||||
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
|
||||
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
|
||||
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
|
||||
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
|
||||
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
|
||||
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
|
||||
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
|
||||
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
|
||||
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
|
||||
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] registerFile [31:0];
|
||||
|
||||
// Register-file write port.
// On each rising clock edge, writeBackData is stored into register rdId
// when the state machine asserts writeBack. Register index 0 is never
// written here.
always @(posedge clk) begin
   if (writeBack && (rdId != 5'd0)) begin
      registerFile[rdId] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except divisions.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe, starts dividing.
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ?
|
||||
{aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
|
||||
aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
|
||||
aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
|
||||
aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
|
||||
aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
|
||||
aluIn1[30], aluIn1[31]} : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = {
|
||||
shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
|
||||
shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
|
||||
shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
|
||||
shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
|
||||
shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
|
||||
shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
|
||||
shifter[30], shifter[31]};
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = isALUreg & funcM & instr[14];
|
||||
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
wire sign1 = aluIn1[31] & isMULH;
|
||||
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] aluOut_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
wire [31:0] aluOut_muldiv =
|
||||
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
|
||||
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
|
||||
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
|
||||
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
|
||||
|
||||
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [31:0] quotient_msk;
|
||||
|
||||
wire divstep_do = (divisor <= {31'b0, dividend});
|
||||
|
||||
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
|
||||
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
|
||||
|
||||
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
|
||||
(aluIn1[31] != aluIn2[31]) & |aluIn2);
|
||||
|
||||
// Sequential divider datapath (one quotient bit examined per clock).
//
// Start (isDivide & aluWr):
//   - dividend and divisor are loaded with the operands; when instr[12]
//     is 0 (DIV/REM) a negative operand is negated first, when instr[12]
//     is 1 (DIVU/REMU) the operands are taken as they are.
//   - the divisor is placed in the upper 32 bits of the 63-bit register,
//     i.e. shifted left by 31.
//   - quotient is cleared and quotient_msk gets its top bit set.
//
// Every other cycle:
//   - dividend/quotient take the combinatorial next values
//     (dividendN/quotientN), divisor and quotient_msk move right by one.
//     aluBusy is the OR-reduction of quotient_msk, so it drops once the
//     mask bit has been shifted out.
always @(posedge clk) begin
   if (isDivide & aluWr) begin
      dividend     <= (~instr[12] & aluIn1[31]) ? -aluIn1 : aluIn1;
      divisor      <= {((~instr[12] & aluIn2[31]) ? -aluIn2 : aluIn2), 31'b0};
      quotient     <= 32'b0;
      quotient_msk <= 32'h8000_0000;
   end else begin
      dividend     <= dividendN;
      divisor      <= {1'b0, divisor[62:1]};
      quotient     <= quotientN;
      quotient_msk <= {1'b0, quotient_msk[31:1]};
   end
end
|
||||
|
||||
reg [31:0] divResult;
|
||||
always @(posedge clk) begin
|
||||
divResult <= instr[13] ? dividendN : quotientN;
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
wire [ADDR_WIDTH-1:0] PCinc = long_instr ? PCplus4 : PCplus2;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
|
||||
: {PC [ADDR_WIDTH-1:2], 2'b00}
|
||||
: loadstore_addr ;
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// Interrupt logic, CSR registers and opcodes.
|
||||
/***************************************************************************/
|
||||
|
||||
// Remember interrupt requests as they are not checked for every cycle
|
||||
reg interrupt_request_sticky;
|
||||
|
||||
// Interrupt enable and lock logic
|
||||
wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
|
||||
|
||||
// Processor accepts interrupts in EXECUTE state.
|
||||
wire interrupt_accepted = interrupt & state[EXECUTE_bit];
|
||||
|
||||
// If current interrupt is accepted, there already might be the next one,
|
||||
// which should not be missed:
|
||||
always @(posedge clk) begin
|
||||
interrupt_request_sticky <=
|
||||
interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
|
||||
end
|
||||
|
||||
// Decoder for mret opcode
|
||||
wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
|
||||
|
||||
// CSRs:
|
||||
reg [ADDR_WIDTH-1:0] mepc; // The saved program counter.
|
||||
reg [ADDR_WIDTH-1:0] mtvec; // The address of the interrupt handler.
|
||||
reg mstatus; // Interrupt enable
|
||||
reg mcause; // Interrupt cause (and lock)
|
||||
reg [63:0] cycles; // Cycle counter
|
||||
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
wire sel_mstatus = (instr[31:20] == 12'h300);
|
||||
wire sel_mtvec = (instr[31:20] == 12'h305);
|
||||
wire sel_mepc = (instr[31:20] == 12'h341);
|
||||
wire sel_mcause = (instr[31:20] == 12'h342);
|
||||
wire sel_cycles = (instr[31:20] == 12'hC00);
|
||||
wire sel_cyclesh = (instr[31:20] == 12'hC80);
|
||||
|
||||
// Read CSRs
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] CSR_read =
|
||||
(sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
|
||||
(sel_mtvec ? mtvec : 32'b0) |
|
||||
(sel_mepc ? mepc : 32'b0) |
|
||||
(sel_mcause ? {mcause, 31'b0} : 32'b0) |
|
||||
(sel_cycles ? cycles[31:0] : 32'b0) |
|
||||
(sel_cyclesh ? cycles[63:32] : 32'b0) ;
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
// Write CSRs: 5 bit unsigned immediate or content of RS1
|
||||
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
|
||||
|
||||
wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read :
|
||||
(instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
|
||||
/* (instr[13:12] == 2'b01) ? */ CSR_modifier ;
|
||||
|
||||
// CSR write port.
// Reset (active low) clears the interrupt-enable flag mstatus.
// Otherwise, a SYSTEM instruction with a non-zero funct3 that is in the
// EXECUTE state updates the selected CSR from CSR_write:
//   - mstatus takes bit 3 of the written value,
//   - mtvec takes the low ADDR_WIDTH bits.
// No other CSR is written by this block.
always @(posedge clk) begin
   if (!reset) begin
      mstatus <= 1'b0;
   end else if (isSYSTEM & (instr[14:12] != 3'b000) & state[EXECUTE_bit]) begin
      if (sel_mstatus) mstatus <= CSR_write[3];
      if (sel_mtvec)   mtvec   <= CSR_write[ADDR_WIDTH-1:0];
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] writeBackData =
|
||||
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? PCinc : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0); // Load
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
|
||||
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
wire [3:0] STORE_wmask =
|
||||
mem_byteAccess ?
|
||||
(loadstore_addr[1] ?
|
||||
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
|
||||
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
|
||||
) :
|
||||
mem_halfwordAccess ?
|
||||
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
|
||||
4'b1111;
|
||||
|
||||
/***************************************************************************/
|
||||
// Unaligned fetch mechanism and compressed opcode handling
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:2] cached_addr;
|
||||
reg [31:0] cached_data;
|
||||
|
||||
wire current_cache_hit = cached_addr == PC [ADDR_WIDTH-1:2];
|
||||
wire next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
|
||||
|
||||
wire current_unaligned_long = &cached_mem [17:16] & PC [1];
|
||||
wire next_unaligned_long = &cached_data[17:16] & PC_new[1];
|
||||
|
||||
reg fetch_second_half;
|
||||
reg long_instr;
|
||||
|
||||
wire [31:0] cached_mem = current_cache_hit ? cached_data : mem_rdata;
|
||||
wire [31:0] decomp_input = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
|
||||
: cached_mem;
|
||||
wire [31:0] decompressed;
|
||||
|
||||
decompressor _decomp ( .c(decomp_input), .d(decompressed) );
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE_bit = 2;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 3;
|
||||
localparam WAIT_ALU_OR_MEM_SKIP_bit = 4;
|
||||
|
||||
localparam NB_STATES = 5;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE = 1 << EXECUTE_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
wire writeBack = ~(isBranch | isStore ) & (
|
||||
state[EXECUTE_bit] |
|
||||
state[WAIT_ALU_OR_MEM_bit] |
|
||||
state[WAIT_ALU_OR_MEM_SKIP_bit]
|
||||
);
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts computation (divide) in the ALU.
|
||||
assign aluWr = state[EXECUTE_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
|
||||
wire needToWait = isLoad | isStore | isDivide;
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PC_new =
|
||||
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
interrupt_return ? mepc :
|
||||
PCinc;
|
||||
|
||||
// Main state machine (one-hot encoded in `state`).
//
// Reset (active low): start in WAIT_ALU_OR_MEM, load PC with RESET_ADDR,
// clear mcause and fetch_second_half, and set cached_addr to all ones so
// that the cache does not match a real fetch address.
//
// WAIT_INSTR           : once memory is no longer busy, refresh the
//                        one-word fetch cache if needed, latch the
//                        (decompressed) instruction and its source
//                        registers, then either fetch the second half of
//                        an unaligned 32-bit instruction or go to EXECUTE.
// EXECUTE              : update PC (or enter the interrupt handler) and
//                        choose the next state depending on whether the
//                        instruction needs to wait and whether the next
//                        instruction is already in the fetch cache.
// WAIT_ALU_OR_MEM      : wait for ALU/memory, then FETCH_INSTR.
// WAIT_ALU_OR_MEM_SKIP : wait for ALU/memory, then go straight to
//                        WAIT_INSTR (no new fetch is issued).
// FETCH_INSTR (default): memory read strobe is active, go to WAIT_INSTR.
always @(posedge clk) begin
   if (!reset) begin
      state             <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
      PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
      mcause            <= 0;
      cached_addr       <= {ADDR_WIDTH-2{1'b1}}; // Needs to be an invalid addr
      fetch_second_half <= 0;
   end else begin

      // See note [1] at the end of this file.
      (* parallel_case *)
      case (1'b1)

        state[WAIT_INSTR_bit]: begin
           if (!mem_rbusy) begin // may be high when executing from SPI flash
              // Update cache: on a miss, or when the word just read is the
              // second half of an unaligned long instruction.
              if (~current_cache_hit | fetch_second_half) begin
                 cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                 cached_data <= mem_rdata;
              end

              // Decode instruction
              rs1        <= registerFile[decompressed[19:15]];
              rs2        <= registerFile[decompressed[24:20]];
              instr      <= decompressed[31:2];
              // Low two bits both set => 32-bit (non-compressed) encoding.
              long_instr <= &decomp_input[1:0];

              // Long opcode, unaligned, first part fetched,
              // happens in non-linear code
              if (current_unaligned_long & ~fetch_second_half) begin
                 fetch_second_half <= 1;
                 state             <= FETCH_INSTR;
              end else begin
                 fetch_second_half <= 0;
                 state             <= EXECUTE;
              end
           end
        end

        state[EXECUTE_bit]: begin
           if (interrupt) begin
              // Enter the handler: remember where to resume and set the
              // mcause lock so that further interrupts are held off.
              PC     <= mtvec;
              mepc   <= PC_new;
              mcause <= 1;
              state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
           end else begin
              PC <= PC_new;
              if (interrupt_return) mcause <= 0;

              // If the next instruction is entirely in the fetch cache,
              // skip the FETCH_INSTR state.
              state <= next_cache_hit & ~next_unaligned_long
                       ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
                       : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);

              // First half is cached, only the second half must be read.
              fetch_second_half <= next_cache_hit & next_unaligned_long;
           end
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if (!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
        end

        state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
           if (!aluBusy & !mem_rbusy & !mem_wbusy) state <= WAIT_INSTR;
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end

      endcase
   end
end
|
||||
|
||||
`ifdef BENCH
|
||||
initial begin
|
||||
cycles = 0;
|
||||
registerFile[0] = 0;
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
// if c[15:0] is a compressed instruction, decompresses it in d
|
||||
// else copies c to d
|
||||
module decompressor(
   input  wire [31:0] c,
   output reg  [31:0] d
);

   // Purely combinatorial RVC expander.
   //   c : instruction word as fetched; only c[15:0] is examined when the
   //       two low bits are not 2'b11.
   //   d : c itself when c[1:0] == 2'b11, otherwise the 32-bit instruction
   //       equivalent to the 16-bit encoding in c[15:0].
   // The c.illegal encoding and encodings without a case item both
   // produce an all-zero word.

   localparam [31:0] ILLEGAL_INSTR = 32'h00000000;
   localparam [31:0] UNKNOWN_INSTR = 32'h00000000;

   // Registers that some compressed encodings use implicitly.
   localparam [4:0] REG_X0 = 5'b00000;
   localparam [4:0] REG_X1 = 5'b00001;
   localparam [4:0] REG_X2 = 5'b00010;

   // 7-bit opcodes of the generated 32-bit instructions.
   localparam [6:0] OPC_LOAD   = 7'b00000_11;
   localparam [6:0] OPC_ALUIMM = 7'b00100_11;
   localparam [6:0] OPC_STORE  = 7'b01000_11;
   localparam [6:0] OPC_ALUREG = 7'b01100_11;
   localparam [6:0] OPC_LUI    = 7'b01101_11;
   localparam [6:0] OPC_BRANCH = 7'b11000_11;
   localparam [6:0] OPC_JALR   = 7'b11001_11;
   localparam [6:0] OPC_JAL    = 7'b11011_11;

   // Register fields.
   // The 3-bit fields are prefixed with 2'b01, i.e. they select x8..x15.
   wire [4:0] creg_lo = {2'b01, c[4:2]}; // 3-bit field in c[4:2]
   wire [4:0] creg_hi = {2'b01, c[9:7]}; // 3-bit field in c[9:7]
   wire [4:0] reg_lo  = c[ 6:2];         // 5-bit field in c[6:2]
   wire [4:0] reg_hi  = c[11:7];         // 5-bit field in c[11:7]

   // Immediate fields, reassembled from their scattered positions.
   wire [4:0]  imm_shamt    = c[6:2];

   wire [11:0] imm_addi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] imm_lwsw     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] imm_lwsp     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] imm_swsp     = {4'b0000, c[8:7], c[12:9], 2'b00};

   wire [11:0] imm_addi16sp = {{ 3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
   wire [11:0] imm_addi     = {{ 7{c[12]}}, c[6:2]};

   /* verilator lint_off UNUSED */
   wire [12:0] imm_branch   = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
   wire [20:0] imm_jump     = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
   wire [31:0] imm_lui      = {{15{c[12]}}, c[6:2], 12'b000000000000};
   /* verilator lint_on UNUSED */

   // Case items are tried in order; several patterns overlap on purpose,
   // the more specific one is listed first.
   always @* begin
      casez (c[15:0])
        // 32-bit encoding: pass through unchanged.
        16'b???___????????_???_11 : d = c;

        /* verilator lint_off CASEOVERLAP */

        // c.illegal
        16'b000___00000000_000_00 : d = ILLEGAL_INSTR;
        // c.addi4spn -> addi rd', x2, imm
        16'b000___????????_???_00 : d = {imm_addi4spn, REG_X2, 3'b000, creg_lo, OPC_ALUIMM};

        /* verilator lint_on CASEOVERLAP */

        // c.lw -> lw rd', imm(rs1')
        16'b010_???_???_??_???_00 : d = {imm_lwsw, creg_hi, 3'b010, creg_lo, OPC_LOAD};
        // c.sw -> sw rs2', imm(rs1')
        16'b110_???_???_??_???_00 : d = {imm_lwsw[11:5], creg_lo, creg_hi, 3'b010, imm_lwsw[4:0], OPC_STORE};

        // c.addi -> addi rd, rd, imm
        16'b000_???_???_??_???_01 : d = {imm_addi, reg_hi, 3'b000, reg_hi, OPC_ALUIMM};
        // c.jal -> jal x1, offset
        16'b001____???????????_01 : d = {imm_jump[20], imm_jump[10:1], imm_jump[11], imm_jump[19:12], REG_X1, OPC_JAL};
        // c.li -> addi rd, x0, imm
        16'b010__?_?????_?????_01 : d = {imm_addi, REG_X0, 3'b000, reg_hi, OPC_ALUIMM};
        // c.addi16sp -> addi x2, x2, imm (rd field is 2)
        16'b011__?_00010_?????_01 : d = {imm_addi16sp, reg_hi, 3'b000, reg_hi, OPC_ALUIMM};
        // c.lui -> lui rd, imm
        16'b011__?_?????_?????_01 : d = {imm_lui[31:12], reg_hi, OPC_LUI};
        // c.srli -> srli rd', rd', shamt
        16'b100_?_00_???_?????_01 : d = {7'b0000000, imm_shamt, creg_hi, 3'b101, creg_hi, OPC_ALUIMM};
        // c.srai -> srai rd', rd', shamt
        16'b100_?_01_???_?????_01 : d = {7'b0100000, imm_shamt, creg_hi, 3'b101, creg_hi, OPC_ALUIMM};
        // c.andi -> andi rd', rd', imm
        16'b100_?_10_???_?????_01 : d = {imm_addi, creg_hi, 3'b111, creg_hi, OPC_ALUIMM};
        // c.sub -> sub rd', rd', rs2'
        16'b100_011_???_00_???_01 : d = {7'b0100000, creg_lo, creg_hi, 3'b000, creg_hi, OPC_ALUREG};
        // c.xor -> xor rd', rd', rs2'
        16'b100_011_???_01_???_01 : d = {7'b0000000, creg_lo, creg_hi, 3'b100, creg_hi, OPC_ALUREG};
        // c.or -> or rd', rd', rs2'
        16'b100_011_???_10_???_01 : d = {7'b0000000, creg_lo, creg_hi, 3'b110, creg_hi, OPC_ALUREG};
        // c.and -> and rd', rd', rs2'
        16'b100_011_???_11_???_01 : d = {7'b0000000, creg_lo, creg_hi, 3'b111, creg_hi, OPC_ALUREG};
        // c.j -> jal x0, offset
        16'b101____???????????_01 : d = {imm_jump[20], imm_jump[10:1], imm_jump[11], imm_jump[19:12], REG_X0, OPC_JAL};
        // c.beqz -> beq rs1', x0, offset
        16'b110__???_???_?????_01 : d = {imm_branch[12], imm_branch[10:5], REG_X0, creg_hi, 3'b000, imm_branch[4:1], imm_branch[11], OPC_BRANCH};
        // c.bnez -> bne rs1', x0, offset
        16'b111__???_???_?????_01 : d = {imm_branch[12], imm_branch[10:5], REG_X0, creg_hi, 3'b001, imm_branch[4:1], imm_branch[11], OPC_BRANCH};

        // c.slli -> slli rd, rd, shamt
        16'b000__?_?????_?????_10 : d = {7'b0000000, imm_shamt, reg_hi, 3'b001, reg_hi, OPC_ALUIMM};
        // c.lwsp -> lw rd, imm(x2)
        16'b010__?_?????_?????_10 : d = {imm_lwsp, REG_X2, 3'b010, reg_hi, OPC_LOAD};
        // c.jr -> jalr x0, rs1, 0
        16'b100__0_?????_00000_10 : d = {12'b000000000000, reg_hi, 3'b000, REG_X0, OPC_JALR};
        // c.mv -> add rd, x0, rs2
        16'b100__0_?????_?????_10 : d = {7'b0000000, reg_lo, REG_X0, 3'b000, reg_hi, OPC_ALUREG};
        // c.ebreak is not decoded:
        // 16'b100__1_00000_00000_10 : d = { 25'b00000000_00010000_00000000_0, 7'b11100_11} ; // c.ebreak --> ebreak
        // c.jalr -> jalr x1, rs1, 0
        16'b100__1_?????_00000_10 : d = {12'b000000000000, reg_hi, 3'b000, REG_X1, OPC_JALR};
        // c.add -> add rd, rd, rs2
        16'b100__1_?????_?????_10 : d = {7'b0000000, reg_lo, reg_hi, 3'b000, reg_hi, OPC_ALUREG};
        // c.swsp -> sw rs2, imm(x2)
        16'b110__?_?????_?????_10 : d = {imm_swsp[11:5], reg_lo, REG_X2, 3'b010, imm_swsp[4:0], OPC_STORE};

        // Anything else
        default                   : d = UNKNOWN_INSTR;
      endcase
   end

endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements.
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
730
RTL/PROCESSOR/femtorv32_individua.v
Normal file
730
RTL/PROCESSOR/femtorv32_individua.v
Normal file
@@ -0,0 +1,730 @@
|
||||
/******************************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// This version: The "Individua", with full interrupt, atomic and
|
||||
// RVC compressed instructions support.
|
||||
// A single VERILOG file, compact & understandable code.
|
||||
//
|
||||
// Instruction set: RV32IMAC + CSR + MRET
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/******************************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
// ISA string handed to the firmware toolchain; matches the RV32IMAC
// instruction set announced in the file header.
`define NRV_ARCH "rv32imac"
|
||||
// ABI name handed to the firmware toolchain.
`define NRV_ABI "ilp32"
|
||||
// Optimization flag handed to the firmware toolchain.
`define NRV_OPTIMIZE "-O3"
|
||||
// Defined without a value: flags this core as interrupt-capable.
// NOTE(review): presumably tested with `ifdef by firmware/SoC code - confirm.
`define NRV_INTERRUPTS
|
||||
|
||||
// FemtoRV32 "Individua": multi-cycle RV32IMAC core with CSR/MRET interrupt
// support.
//
// Parameters:
//   RESET_ADDR  value loaded into PC while reset is low (default 0).
//   ADDR_WIDTH  width of PC and of all internal address adders; the upper
//               bits of mem_addr are driven with zeros.
//
// Memory interface: a single 32-bit port shared by instruction fetch and
// data access. Addresses of fetches are word aligned; sub-word loads and
// stores are handled here with byte lanes (mem_wmask) and load muxes.
//
// Behaviour fixed with respect to the previous revision:
//   - MULHSU now multiplies signed rs1 by unsigned rs2 (operands were
//     swapped).
//   - SC.W writes its status (0 = stored, 1 = failed) to rd once, in the
//     EXECUTE cycle, and the status reflects whether the store was really
//     issued (reservation valid AND address match). Before, the status was
//     overwritten with 1 in the following wait state.
module FemtoRV32(
   input          clk,

   output [31:0]  mem_addr,  // address bus
   output [31:0]  mem_wdata, // data to be written
   output [3:0]   mem_wmask, // write mask for the 4 bytes of each word
   input  [31:0]  mem_rdata, // input lines for both data and instr
   output         mem_rstrb, // active to initiate memory read (used by IO)
   input          mem_rbusy, // asserted if memory is busy reading value
   input          mem_wbusy, // asserted if memory is busy writing value

   input          interrupt_request,

   input          reset      // set to 0 to reset the processor
);

   parameter RESET_ADDR = 32'h00000000;
   parameter ADDR_WIDTH = 24;

   /***************************************************************************/
   // Instruction decoding.
   /***************************************************************************/

   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
   // Reference: Table page 104 of:
   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

   // The destination register
   wire [4:0] rdId = instr[11:7];

   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
   // It is used as follows: funct3Is[val] <=> funct3 == val
   (* onehot *)
   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
   /* verilator lint_on UNUSED */

   // Major opcode classes recognized by this core, decoded from instr[6:2]
   // (instr[1:0] is not stored, see the declaration of instr).
   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
   wire isAMO     =  (instr[6:2] == 5'b01011); // atomics: LR/SC/AMOxxx
   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5

   wire isALU = isALUimm | isALUreg;

   /***************************************************************************/
   // The register file.
   /***************************************************************************/

   // rs1/rs2 are latched copies of the source registers, read when the
   // instruction is decoded (WAIT_INSTR state).
   reg [31:0] rs1;
   reg [31:0] rs2;
   reg [31:0] registerFile [31:0];

   // Single write port; writes to x0 are discarded.
   always @(posedge clk) begin
      if (writeBack)
         if (rdId != 0)
            registerFile[rdId] <= writeBackData;
   end

   /***************************************************************************/
   // The ALU. Does operations and tests combinatorially, except divisions.
   /***************************************************************************/

   // First ALU source: rs1, except for atomics where the value just read
   // from memory is combined with rs2.
   wire [31:0] aluIn1 = isAMO ? mem_rdata : rs1;

   // Second ALU source, depends on opcode:
   //    AMO, ALUreg, Branch: rs2
   //    ALUimm, Load, JALR:  Iimm
   wire [31:0] aluIn2 = isAMO | isALUreg | isBranch ? rs2 : Iimm;

   wire aluWr;               // ALU write strobe, starts dividing.

   // The adder is used by both arithmetic instructions and JALR.
   wire [31:0] aluPlus = aluIn1 + aluIn2;

   // Use a single 33 bits subtract to do subtraction and all comparisons
   // (trick borrowed from swapforth/J1)
   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
   wire        LTU = aluMinus[32];
   wire        EQ  = (aluMinus[31:0] == 0);

   /***************************************************************************/

   // Use the same shifter both for left and right shifts by
   // applying bit reversal

   wire [31:0] shifter_in = funct3Is[1] ?
     {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
      aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
      aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
      aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
      aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
      aluIn1[30], aluIn1[31]} : aluIn1;

   /* verilator lint_off WIDTH */
   wire [31:0] shifter =
               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
   /* verilator lint_on WIDTH */

   wire [31:0] leftshift = {
     shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
     shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
     shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
     shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
     shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
     shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
     shifter[30], shifter[31]};

   /***************************************************************************/

   wire funcM    = instr[25];
   wire isDivide = isALUreg & funcM & instr[14];
   wire aluBusy  = |quotient_msk; // ALU is busy if division is in progress.

   // funct3: 1->MULH, 2->MULHSU  3->MULHU
   wire isMULH   = funct3Is[1];
   wire isMULHSU = funct3Is[2];

   // Operand signedness for the upper-half multiplies:
   //   MULH   : rs1 signed,   rs2 signed
   //   MULHSU : rs1 signed,   rs2 unsigned
   //   MULHU  : rs1 unsigned, rs2 unsigned
   // Each operand is widened to 33 bits with the appropriate sign bit so
   // that a single signed multiplier serves all variants.
   wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
   wire sign2 = aluIn2[31] &  isMULH;

   wire signed [32:0] signed1 = {sign1, aluIn1};
   wire signed [32:0] signed2 = {sign2, aluIn2};
   wire signed [63:0] multiply = signed1 * signed2;

   /***************************************************************************/

   // Notes:
   // - instr[30] is 1 for SUB and 0 for ADD
   // - for SUB, need to test also instr[5] to discriminate ADDI:
   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL

   wire [31:0] aluOut_base =
     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
     (funct3Is[1]  ? leftshift                                       : 32'b0) |
     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
     (funct3Is[5]  ? shifter                                         : 32'b0) |
     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;

   wire [31:0] aluOut_muldiv =
     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ;
                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU

   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;

   /***************************************************************************/
   // Implementation of DIV/REM instructions, highly inspired by PicoRV32

   // Restoring division on magnitudes, one quotient bit per clock:
   // quotient_msk walks from bit 31 down to 0 and doubles as the busy flag.
   reg [31:0] dividend;
   reg [62:0] divisor;
   reg [31:0] quotient;
   reg [31:0] quotient_msk;

   wire divstep_do = (divisor <= {31'b0, dividend});

   wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
   wire [31:0] quotientN = divstep_do ? quotient | quotient_msk  : quotient;

   // Sign to apply to the result: instr[12] selects the unsigned variants,
   // instr[13] selects REM (sign of dividend) versus DIV (signs differ and
   // divisor is non-zero).
   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
                    (aluIn1[31] != aluIn2[31]) & |aluIn2);

   always @(posedge clk) begin
      if (isDivide & aluWr) begin
         dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
         divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
         quotient <= 0;
         quotient_msk <= 1 << 31;
      end else begin
         dividend     <= dividendN;
         divisor      <= divisor >> 1;
         quotient     <= quotientN;
         quotient_msk <= quotient_msk >> 1;
      end
   end

   // Registered result: remainder for REM/REMU, quotient for DIV/DIVU.
   reg  [31:0] divResult;
   always @(posedge clk) begin
      divResult <= instr[13] ? dividendN : quotientN;
   end

   /***************************************************************************/
   // The predicate for conditional branches.
   /***************************************************************************/

   wire predicate =
        funct3Is[0] &  EQ  | // BEQ
        funct3Is[1] & !EQ  | // BNE
        funct3Is[4] &  LT  | // BLT
        funct3Is[5] & !LT  | // BGE
        funct3Is[6] &  LTU | // BLTU
        funct3Is[7] & !LTU ; // BGEU

   /***************************************************************************/
   // Special ALU for atomic opcodes
   /***************************************************************************/

   // For atomics aluIn1 is the value read from memory and aluIn2 is rs2;
   // the selected result is what gets written back to memory.
   wire [31:0] amoALU =

     (instr[31:27] == 5'h00 ? aluPlus                     : 32'b0) | // amoadd.w
     (instr[31:27] == 5'h01 ? aluIn2                      : 32'b0) | // amoswap.w
     (instr[31:27] == 5'h04 ? aluIn1 ^ aluIn2             : 32'b0) | // amoxor.w
     (instr[31:27] == 5'h08 ? aluIn1 | aluIn2             : 32'b0) | // amoor.w
     (instr[31:27] == 5'h0C ? aluIn1 & aluIn2             : 32'b0) | // amoand.w
     (instr[31:27] == 5'h10 ? ( LT  ? aluIn1 : aluIn2)    : 32'b0) | // amomin.w
     (instr[31:27] == 5'h14 ? (!LT  ? aluIn1 : aluIn2)    : 32'b0) | // amomax.w
     (instr[31:27] == 5'h18 ? ( LTU ? aluIn1 : aluIn2)    : 32'b0) | // amominu.w
     (instr[31:27] == 5'h1C ? (!LTU ? aluIn1 : aluIn2)    : 32'b0) ; // amomaxu.w

   reg [31:0] amo_wdata;

   wire amo_write = state[WRITE_AMO_bit] | state[WAIT_AMO_bit];

   wire isAMOlr = instr[31:27] == 5'h02; // lr.w
   wire isAMOsc = instr[31:27] == 5'h03; // sc.w

   // LR/SC reservation: address recorded by LR and a flag that is cleared
   // by any later AMO or by a store to that address.
   reg [ADDR_WIDTH-1:0] amo_location;
   reg amo_location_unchanged;

   wire reserved_addr = mem_addr[ADDR_WIDTH-1:0] == amo_location;

   // SC.W performs its store only if the reservation is still valid and the
   // address matches. Only meaningful during EXECUTE, when mem_addr carries
   // rs1 and the flag has not yet been cleared by the SC itself.
   wire sc_success = amo_location_unchanged & reserved_addr;

   /***************************************************************************/
   // Program counter and branch target computation.
   /***************************************************************************/

   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
                             // ignored (not used in RV32I base instr set).

   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
   wire [ADDR_WIDTH-1:0] PCinc = long_instr ? PCplus4 : PCplus2;

   // An adder used to compute branch address, JAL address and AUIPC.
   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                                                       Bimm[ADDR_WIDTH-1:0] );

   // A separate adder to compute the destination of load/store.
   // testing instr[5] is equivalent to testing isStore in this context.
   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);

   // Address bus: word-aligned PC (or PC+4 for the second half of an
   // unaligned 32-bit instruction) while fetching, rs1 for atomics,
   // rs1+imm for loads and stores.
   /* verilator lint_off WIDTH */
   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
                     fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
                                       : {PC     [ADDR_WIDTH-1:2], 2'b00}
                     : isAMO ? rs1[ADDR_WIDTH-1:0] : loadstore_addr;
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // Interrupt logic, CSR registers and opcodes.
   /***************************************************************************/

   // Remember interrupt requests as they are not checked for every cycle
   reg  interrupt_request_sticky;

   // Interrupt enable and lock logic
   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;

   // Processor accepts interrupts in EXECUTE state.
   wire interrupt_accepted = interrupt & state[EXECUTE_bit];

   // If current interrupt is accepted, there already might be the next one,
   // which should not be missed:
   always @(posedge clk) begin
      interrupt_request_sticky <=
         interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
   end

   // Decoder for mret opcode. The funct12 field is deliberately not
   // checked, so any SYSTEM instruction with funct3 == 0 acts as MRET.
   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);

   // CSRs:
   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
   reg                   mstatus; // Interrupt enable
   reg                   mcause;  // Interrupt cause (and lock)
   reg  [63:0]           cycles;  // Cycle counter

   always @(posedge clk) cycles <= cycles + 1;

   wire sel_mstatus = (instr[31:20] == 12'h300);
   wire sel_mtvec   = (instr[31:20] == 12'h305);
   wire sel_mepc    = (instr[31:20] == 12'h341);
   wire sel_mcause  = (instr[31:20] == 12'h342);
   wire sel_cycles  = (instr[31:20] == 12'hC00);
   wire sel_cyclesh = (instr[31:20] == 12'hC80);

   // Read CSRs
   /* verilator lint_off WIDTH */
   wire [31:0] CSR_read =
     (sel_mstatus ?    {28'b0, mstatus, 3'b0}  : 32'b0) |
     (sel_mtvec   ?    mtvec                   : 32'b0) |
     (sel_mepc    ?    mepc                    : 32'b0) |
     (sel_mcause  ?    {mcause, 31'b0}         : 32'b0) |
     (sel_cycles  ?    cycles[31:0]            : 32'b0) |
     (sel_cyclesh ?    cycles[63:32]           : 32'b0) ;
   /* verilator lint_on WIDTH */

   // Write CSRs: 5 bit unsigned immediate or content of RS1
   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;

   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;

   // Only mstatus (bit 3, the interrupt enable) and mtvec are software
   // writable; mepc and mcause are maintained by the state machine.
   always @(posedge clk) begin
      if(!reset) begin
         mstatus <= 0;
      end else begin
         // Execute a CSR opcode
         if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
            if (sel_mstatus) mstatus <= CSR_write[3];
            if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
         end
      end
   end

   /***************************************************************************/
   // The value written back to the register file.
   /***************************************************************************/

   /* verilator lint_off WIDTH */
   wire [31:0] writeBackData  =
      (isSYSTEM                  ? CSR_read             : 32'b0) |  // SYSTEM
      (isLUI                     ? Uimm                 : 32'b0) |  // LUI
      (isALU                     ? aluOut               : 32'b0) |  // ALUreg, ALUimm
      (isAUIPC                   ? PCplusImm            : 32'b0) |  // AUIPC
      (isJALR   | isJAL          ? PCinc                : 32'b0) |  // JAL, JALR
      (isLoad | isAMO & ~isAMOsc ? LOAD_data            : 32'b0) |  // Load, AMO
      (isAMO & isAMOsc           ? {31'b0, ~sc_success} : 32'b0);   // SC: 0=ok
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // LOAD/STORE
   /***************************************************************************/

   // All memory accesses are aligned on 32 bits boundary. For this
   // reason, we need some circuitry that does unaligned halfword
   // and byte load/store, based on:
   // - funct3[1:0]:  00->byte 01->halfword 10->word
   // - mem_addr[1:0]: indicates which byte/halfword is accessed

   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;

   // LOAD, in addition to funct3[1:0], LOAD depends on:
   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion

   wire LOAD_sign =
        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

   wire [31:0] LOAD_data =
         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                          mem_rdata ;

   wire [15:0] LOAD_halfword =
               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

   wire  [7:0] LOAD_byte =
               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

   // STORE
   // rs2 is replicated onto the byte lanes selected by the low address
   // bits; during the write phase of an atomic, amo_wdata is driven instead.

   assign mem_wdata[ 7: 0] = amo_write ? amo_wdata[ 7: 0] : rs2[7:0];
   assign mem_wdata[15: 8] = amo_write ? amo_wdata[15: 8] : loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
   assign mem_wdata[23:16] = amo_write ? amo_wdata[23:16] : loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
   assign mem_wdata[31:24] = amo_write ? amo_wdata[31:24] : loadstore_addr[0] ? rs2[7:0]  :
                                                            loadstore_addr[1] ? rs2[15:8] : rs2[31:24];

   // The memory write mask:
   //    1111                     if writing a word
   //    0011 or 1100             if writing a halfword
   //                                (depending on loadstore_addr[1])
   //    0001, 0010, 0100 or 1000 if writing a byte
   //                                (depending on loadstore_addr[1:0])

   wire [3:0] STORE_wmask =
              mem_byteAccess      ?
                    (loadstore_addr[1] ?
                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
                    ) :
              mem_halfwordAccess ?
                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
              4'b1111;

   /***************************************************************************/
   // Unaligned fetch mechanism and compressed opcode handling
   /***************************************************************************/

   // One-word fetch cache: the last instruction word read from memory and
   // its word address. It lets the second 16-bit parcel of a word, and the
   // first half of a 32-bit instruction that straddles two words, be
   // reused without another memory read.
   reg [ADDR_WIDTH-1:2] cached_addr;
   reg           [31:0] cached_data;

   wire current_cache_hit = cached_addr == PC    [ADDR_WIDTH-1:2];
   wire    next_cache_hit = cached_addr == PC_new[ADDR_WIDTH-1:2];

   // A 32-bit instruction (low two bits == 11) starting in the upper
   // halfword of a word needs a second fetch for its upper half.
   wire current_unaligned_long = &cached_mem [17:16] & PC    [1];
   wire    next_unaligned_long = &cached_data[17:16] & PC_new[1];

   reg fetch_second_half;
   reg long_instr;

   wire [31:0] cached_mem   = current_cache_hit ? cached_data : mem_rdata;
   wire [31:0] decomp_input = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
                                    : cached_mem;
   wire [31:0] decompressed;

   decompressor _decomp ( .c(decomp_input), .d(decompressed) );

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   localparam FETCH_INSTR_bit     = 0;
   localparam WAIT_INSTR_bit      = 1;
   localparam EXECUTE_bit         = 2;
   localparam WAIT_ALU_OR_MEM_bit = 3;
   localparam WRITE_AMO_bit       = 4;
   localparam WAIT_AMO_bit        = 5;

   localparam NB_STATES           = 6;

   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
   localparam EXECUTE         = 1 << EXECUTE_bit;
   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
   localparam WRITE_AMO       = 1 << WRITE_AMO_bit;
   localparam WAIT_AMO        = 1 << WAIT_AMO_bit;

   reg SkipFetch; // Skip fetch state later

   (* onehot *)
   reg [NB_STATES-1:0] state;

   // The signals (internal and external) that are determined
   // combinatorially from state and other signals.

   // register write-back enable.
   // SC.W is excluded from the wait-state write: its status is only valid
   // in EXECUTE, because the reservation flag is cleared at the end of
   // that cycle.
   wire writeBack = ~(isBranch | isStore ) & (
            state[EXECUTE_bit] |
            state[WAIT_ALU_OR_MEM_bit] & ~(isAMO & isAMOsc)
   );

   // The memory-read signal.
   assign mem_rstrb = state[EXECUTE_bit] & (isLoad | isAMO & ~isAMOsc) | state[FETCH_INSTR_bit];

   // The mask for memory-write.
   assign mem_wmask = {4{state[EXECUTE_bit] & (isStore | isAMO & isAMOsc & sc_success) | state[WRITE_AMO_bit]}} & STORE_wmask;

   // aluWr starts computation (divide) in the ALU.
   assign aluWr = state[EXECUTE_bit] & isALU;

   wire jumpToPCplusImm = isJAL | (isBranch & predicate);

   wire needToWait = isLoad | isStore | isDivide | isAMO;

   wire [ADDR_WIDTH-1:0] PC_new =
           isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
           jumpToPCplusImm  ? PCplusImm :
           interrupt_return ? mepc :
                              PCinc;

   always @(posedge clk) begin
      if(!reset) begin
         state                  <= WAIT_ALU_OR_MEM; //Just waiting for !mem_wbusy
         PC                     <= RESET_ADDR[ADDR_WIDTH-1:0];
         mcause                 <= 0;
         cached_addr            <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
         fetch_second_half      <= 0;
         SkipFetch              <= 0;
         amo_location           <= 0;
         amo_location_unchanged <= 0;
      end else begin

         // See note [1] at the end of this file.
         (* parallel_case *)
         case(1'b1)

            state[WAIT_INSTR_bit]: begin
               if(!mem_rbusy) begin // may be high when executing from SPI flash
                  // Update cache
                  if (~current_cache_hit | fetch_second_half) begin
                     cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                     cached_data <= mem_rdata;
                  end

                  // Decode instruction
                  rs1        <= registerFile[decompressed[19:15]];
                  rs2        <= registerFile[decompressed[24:20]];
                  instr      <= decompressed[31:2];
                  long_instr <= &decomp_input[1:0];

                  // Long opcode, unaligned, first part fetched,
                  // happens in non-linear code
                  if (current_unaligned_long & ~fetch_second_half) begin
                     fetch_second_half <= 1;
                     state <= FETCH_INSTR;
                  end else begin
                     fetch_second_half <= 0;
                     state <= EXECUTE;
                  end
               end
            end

            state[EXECUTE_bit]: begin
               if (interrupt) begin
                  // Enter the handler; the current instruction still
                  // completes, so mepc holds the address that follows it.
                  PC        <= mtvec;
                  mepc      <= PC_new;
                  mcause    <= 1;
                  state     <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
                  SkipFetch <= 0;
               end else begin
                  PC <= PC_new;
                  if (interrupt_return) mcause <= 0;

                  state <= needToWait ? WAIT_ALU_OR_MEM :
                           next_cache_hit & ~next_unaligned_long ? WAIT_INSTR :
                           FETCH_INSTR;
                  SkipFetch <= next_cache_hit & ~next_unaligned_long;

                  fetch_second_half <= next_cache_hit & next_unaligned_long;
               end

               // Watching a reserved memory location
               if (isAMO & isAMOlr) begin
                  amo_location           <= rs1[ADDR_WIDTH-1:0];
                  amo_location_unchanged <= 1;
               end else
               if (isAMO | (isStore & reserved_addr)) begin
                  amo_location_unchanged <= 0;
               end
            end

            state[WAIT_ALU_OR_MEM_bit]: begin
               if(!aluBusy & !mem_rbusy & !mem_wbusy) begin
                  // Latch the value to store for read-modify-write atomics.
                  amo_wdata <= amoALU;
                  state <= isAMO & ~isAMOlr & ~isAMOsc ? WRITE_AMO  :
                           SkipFetch                   ? WAIT_INSTR :
                                                         FETCH_INSTR ;
               end
            end

            state[WRITE_AMO_bit]: begin
               state <= WAIT_AMO;
            end

            state[WAIT_AMO_bit]: begin
               if(!mem_wbusy) state <= SkipFetch ? WAIT_INSTR : FETCH_INSTR;
            end

            default: begin // FETCH_INSTR
               state <= WAIT_INSTR;
            end
         endcase
      end
   end

`ifdef BENCH
   initial begin
      cycles = 0;
      registerFile[0] = 0;
   end
`endif

endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
// if c[15:0] is a compressed instruction, decompresses it in d
|
||||
// else copies c to d
|
||||
// RVC expander (purely combinational).
//   c : 32 bits of instruction stream; c[15:0] is the parcel being decoded.
//   d : c unchanged when c[1:0] == 2'b11 (already a 32-bit instruction),
//       otherwise the 32-bit instruction equivalent to the compressed one
//       held in c[15:0]. The all-zero parcel and unmatched encodings both
//       expand to 32'h00000000.
module decompressor(
   input  wire [31:0] c,
   output reg  [31:0] d
);

   // Results for the reserved all-zero parcel and for unmatched encodings
   localparam [31:0] ILLEGAL_INSTR = 32'h00000000;
   localparam [31:0] UNKNOWN_INSTR = 32'h00000000;

   // Major opcodes of the generated 32-bit instructions
   localparam [6:0] OPC_LOAD   = 7'b00000_11;
   localparam [6:0] OPC_OPIMM  = 7'b00100_11;
   localparam [6:0] OPC_STORE  = 7'b01000_11;
   localparam [6:0] OPC_OP     = 7'b01100_11;
   localparam [6:0] OPC_LUI    = 7'b01101_11;
   localparam [6:0] OPC_BRANCH = 7'b11000_11;
   localparam [6:0] OPC_JALR   = 7'b11001_11;
   localparam [6:0] OPC_JAL    = 7'b11011_11;

   // Registers that compressed forms reference implicitly
   localparam [4:0] REG_ZERO = 5'b00000;
   localparam [4:0] REG_RA   = 5'b00001;
   localparam [4:0] REG_SP   = 5'b00010;

   // Register fields. The 3-bit "prime" fields address x8..x15.
   wire [4:0] reg3_lo = {2'b01, c[4:2]};  // rd' / rs2'
   wire [4:0] reg3_hi = {2'b01, c[9:7]};  // rd' / rs1'
   wire [4:0] reg5_lo = c[ 6:2];          // rs2
   wire [4:0] reg5_hi = c[11:7];          // rd / rs1

   // Immediates, unscrambled from the compressed layouts
   wire  [4:0] shamt        = c[6:2];

   wire [11:0] imm_addi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] imm_lwsw     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] imm_lwsp     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] imm_swsp     = {4'b0000, c[8:7], c[12:9], 2'b00};

   wire [11:0] imm_addi16sp = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
   wire [11:0] imm_ci       = {{7{c[12]}}, c[6:2]};

   /* verilator lint_off UNUSED */
   wire [12:0] imm_b   = {{5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
   wire [20:0] imm_j   = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
   wire [31:0] imm_lui = {{15{c[12]}}, c[6:2], 12'b000000000000};
   /* verilator lint_on UNUSED */

   // J-type immediate already arranged as instruction bits [31:12]
   wire [19:0] j_field = {imm_j[20], imm_j[10:1], imm_j[11], imm_j[19:12]};

   // Patterns are laid out as funct3 _ c[12] _ c[11:7] _ c[6:2] _ c[1:0].
   // Order matters: where patterns overlap, the first match wins.
   always @(*) begin
      casez (c[15:0])

         // ---- already a 32-bit instruction: pass through ----
         16'b???_?_?????_?????_11 : d = c;

         // ---- quadrant 0 ----
         /* verilator lint_off CASEOVERLAP */
         16'b000_0_00000_00000_00 :  // c.illegal
            d = ILLEGAL_INSTR;
         16'b000_?_?????_?????_00 :  // c.addi4spn --> addi rd', x2, nzuimm[9:2]
            d = {imm_addi4spn, REG_SP, 3'b000, reg3_lo, OPC_OPIMM};
         /* verilator lint_on CASEOVERLAP */

         16'b010_?_?????_?????_00 :  // c.lw --> lw rd', offset[6:2](rs1')
            d = {imm_lwsw, reg3_hi, 3'b010, reg3_lo, OPC_LOAD};
         16'b110_?_?????_?????_00 :  // c.sw --> sw rs2', offset[6:2](rs1')
            d = {imm_lwsw[11:5], reg3_lo, reg3_hi, 3'b010, imm_lwsw[4:0], OPC_STORE};

         // ---- quadrant 1 ----
         16'b000_?_?????_?????_01 :  // c.addi --> addi rd, rd, nzimm[5:0]
            d = {imm_ci, reg5_hi, 3'b000, reg5_hi, OPC_OPIMM};
         16'b001_?_?????_?????_01 :  // c.jal --> jal x1, offset[11:1]
            d = {j_field, REG_RA, OPC_JAL};
         16'b010_?_?????_?????_01 :  // c.li --> addi rd, x0, imm[5:0]
            d = {imm_ci, REG_ZERO, 3'b000, reg5_hi, OPC_OPIMM};
         16'b011_?_00010_?????_01 :  // c.addi16sp --> addi x2, x2, nzimm[9:4]
            d = {imm_addi16sp, reg5_hi, 3'b000, reg5_hi, OPC_OPIMM};
         16'b011_?_?????_?????_01 :  // c.lui --> lui rd, nzuimm[17:12]
            d = {imm_lui[31:12], reg5_hi, OPC_LUI};
         16'b100_?_00???_?????_01 :  // c.srli --> srli rd', rd', shamt[5:0]
            d = {7'b0000000, shamt, reg3_hi, 3'b101, reg3_hi, OPC_OPIMM};
         16'b100_?_01???_?????_01 :  // c.srai --> srai rd', rd', shamt[5:0]
            d = {7'b0100000, shamt, reg3_hi, 3'b101, reg3_hi, OPC_OPIMM};
         16'b100_?_10???_?????_01 :  // c.andi --> andi rd', rd', imm[5:0]
            d = {imm_ci, reg3_hi, 3'b111, reg3_hi, OPC_OPIMM};
         16'b100_0_11???_00???_01 :  // c.sub --> sub rd', rd', rs2'
            d = {7'b0100000, reg3_lo, reg3_hi, 3'b000, reg3_hi, OPC_OP};
         16'b100_0_11???_01???_01 :  // c.xor --> xor rd', rd', rs2'
            d = {7'b0000000, reg3_lo, reg3_hi, 3'b100, reg3_hi, OPC_OP};
         16'b100_0_11???_10???_01 :  // c.or --> or rd', rd', rs2'
            d = {7'b0000000, reg3_lo, reg3_hi, 3'b110, reg3_hi, OPC_OP};
         16'b100_0_11???_11???_01 :  // c.and --> and rd', rd', rs2'
            d = {7'b0000000, reg3_lo, reg3_hi, 3'b111, reg3_hi, OPC_OP};
         16'b101_?_?????_?????_01 :  // c.j --> jal x0, offset[11:1]
            d = {j_field, REG_ZERO, OPC_JAL};
         16'b110_?_?????_?????_01 :  // c.beqz --> beq rs1', x0, offset[8:1]
            d = {imm_b[12], imm_b[10:5], REG_ZERO, reg3_hi, 3'b000, imm_b[4:1], imm_b[11], OPC_BRANCH};
         16'b111_?_?????_?????_01 :  // c.bnez --> bne rs1', x0, offset[8:1]
            d = {imm_b[12], imm_b[10:5], REG_ZERO, reg3_hi, 3'b001, imm_b[4:1], imm_b[11], OPC_BRANCH};

         // ---- quadrant 2 ----
         16'b000_?_?????_?????_10 :  // c.slli --> slli rd, rd, shamt[5:0]
            d = {7'b0000000, shamt, reg5_hi, 3'b001, reg5_hi, OPC_OPIMM};
         16'b010_?_?????_?????_10 :  // c.lwsp --> lw rd, offset[7:2](x2)
            d = {imm_lwsp, REG_SP, 3'b010, reg5_hi, OPC_LOAD};
         16'b100_0_?????_00000_10 :  // c.jr --> jalr x0, rs1, 0
            d = {12'b000000000000, reg5_hi, 3'b000, REG_ZERO, OPC_JALR};
         16'b100_0_?????_?????_10 :  // c.mv --> add rd, x0, rs2
            d = {7'b0000000, reg5_lo, REG_ZERO, 3'b000, reg5_hi, OPC_OP};
      // 16'b100_1_00000_00000_10 :  // c.ebreak --> ebreak (disabled)
      //    d = {25'b00000000_00010000_00000000_0, 7'b11100_11};
         16'b100_1_?????_00000_10 :  // c.jalr --> jalr x1, rs1, 0
            d = {12'b000000000000, reg5_hi, 3'b000, REG_RA, OPC_JALR};
         16'b100_1_?????_?????_10 :  // c.add --> add rd, rd, rs2
            d = {7'b0000000, reg5_lo, reg5_hi, 3'b000, reg5_hi, OPC_OP};
         16'b110_?_?????_?????_10 :  // c.swsp --> sw rs2, offset[7:2](x2)
            d = {imm_swsp[11:5], reg5_lo, REG_SP, 3'b010, imm_swsp[4:0], OPC_STORE};

         // ---- anything else ----
         default : d = UNKNOWN_INSTR;
      endcase
   end

endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements.
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
523
RTL/PROCESSOR/femtorv32_intermissum.v
Normal file
523
RTL/PROCESSOR/femtorv32_intermissum.v
Normal file
@@ -0,0 +1,523 @@
|
||||
/*******************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// This version: The "Intermissum", with full interrupt support.
|
||||
// A single VERILOG file, compact & understandable code.
|
||||
//
|
||||
// Instruction set: RV32IM + CSR + MRET
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/*******************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32im"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
`define NRV_INTERRUPTS
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input interrupt_request,
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The destination register
|
||||
wire [4:0] rdId = instr[11:7];
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *)
|
||||
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
|
||||
|
||||
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
|
||||
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
|
||||
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
|
||||
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
|
||||
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
|
||||
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
|
||||
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
|
||||
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
|
||||
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
|
||||
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
|
||||
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] registerFile [31:0];
|
||||
|
||||
// Register-file write port.
// On each rising clock edge, when writeBack is asserted, writeBackData is
// stored into registerFile[rdId]. Writes with rdId == 0 are dropped, so
// register 0 is never modified by an instruction.
always @(posedge clk) begin
|
||||
if (writeBack)
|
||||
if (rdId != 0)
|
||||
registerFile[rdId] <= writeBackData;
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except divisions.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe, starts dividing.
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ?
|
||||
{aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
|
||||
aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
|
||||
aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
|
||||
aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
|
||||
aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
|
||||
aluIn1[30], aluIn1[31]} : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = {
|
||||
shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
|
||||
shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
|
||||
shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
|
||||
shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
|
||||
shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
|
||||
shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
|
||||
shifter[30], shifter[31]};
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
|
||||
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
wire sign1 = aluIn1[31] & isMULH;
|
||||
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] aluOut_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
wire [31:0] aluOut_muldiv =
|
||||
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
|
||||
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
|
||||
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
|
||||
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
|
||||
|
||||
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [31:0] quotient_msk;
|
||||
|
||||
wire divstep_do = divisor <= {31'b0, dividend};
|
||||
|
||||
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
|
||||
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
|
||||
|
||||
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
|
||||
(aluIn1[31] != aluIn2[31]) & |aluIn2);
|
||||
|
||||
// Sequential divider: produces one quotient bit per clock.
//
// Start (isDivide & aluWr):
//   - instr[12] == 0 selects the signed variants (DIV/REM); for those, a
//     negative operand is replaced by its negation, so the iteration itself
//     always works on magnitudes. The sign is re-applied on the result side
//     (see div_sign and aluOut_muldiv).
//   - the divisor is loaded into the upper bits (shifted left by 31) and
//     quotient_msk starts with only bit 31 set.
//
// Step (every other cycle, including when no division is in progress):
//   - dividendN / quotientN hold the result of the conditional subtract
//     (divstep_do), and are committed here;
//   - divisor and quotient_msk both move one bit to the right.
//
// aluBusy is the OR-reduction of quotient_msk, so the ALU reports busy until
// the single mask bit has been shifted out.
always @(posedge clk) begin
|
||||
if (isDivide & aluWr) begin
|
||||
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
|
||||
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
|
||||
quotient <= 0;
|
||||
quotient_msk <= 1 << 31;
|
||||
end else begin
|
||||
dividend <= dividendN;
|
||||
divisor <= divisor >> 1;
|
||||
quotient <= quotientN;
|
||||
quotient_msk <= quotient_msk >> 1;
|
||||
end
|
||||
end
|
||||
|
||||
// Registered (unsigned) divider output, refreshed on every clock edge.
// instr[13] selects the remainder (dividendN) for REM/REMU, otherwise the
// quotient (quotientN) for DIV/DIVU. The sign correction is applied where
// divResult is consumed, in aluOut_muldiv.
reg [31:0] divResult;
|
||||
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
PC : loadstore_addr ;
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// Interrupt logic, CSR registers and opcodes.
|
||||
/***************************************************************************/
|
||||
|
||||
// Interrupt logic:
|
||||
|
||||
// Remember interrupt requests as they are not checked for every cycle
|
||||
reg interrupt_request_sticky;
|
||||
// Interrupt enable and lock logic
|
||||
wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
|
||||
// Processor accepts interrupts in EXECUTE state.
|
||||
wire interrupt_accepted = interrupt & state[EXECUTE_bit];
|
||||
|
||||
// If current interrupt is accepted, there already might be the next one,
|
||||
// which should not be missed:
|
||||
// Sticky interrupt latch: set whenever interrupt_request is high, and kept
// set until the interrupt is accepted. Because interrupt_request is OR-ed in
// unconditionally, a request that arrives in the same cycle as an acceptance
// leaves the latch set for the next interrupt.
always @(posedge clk) begin
|
||||
interrupt_request_sticky <=
|
||||
interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
|
||||
end
|
||||
|
||||
// Decoder for mret opcode
|
||||
wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
|
||||
|
||||
// CSRs:
|
||||
reg [ADDR_WIDTH-1:0] mepc; // The saved program counter.
|
||||
reg [ADDR_WIDTH-1:0] mtvec; // The address of the interrupt handler.
|
||||
reg mstatus; // Interrupt enable
|
||||
reg mcause; // Interrupt cause (and lock)
|
||||
reg [63:0] cycles; // Cycle counter
|
||||
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
wire sel_mstatus = (instr[31:20] == 12'h300);
|
||||
wire sel_mtvec = (instr[31:20] == 12'h305);
|
||||
wire sel_mepc = (instr[31:20] == 12'h341);
|
||||
wire sel_mcause = (instr[31:20] == 12'h342);
|
||||
wire sel_cycles = (instr[31:20] == 12'hC00);
|
||||
wire sel_cyclesh = (instr[31:20] == 12'hC80);
|
||||
|
||||
// Read CSRs:
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] CSR_read =
|
||||
(sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
|
||||
(sel_mtvec ? mtvec : 32'b0) |
|
||||
(sel_mepc ? mepc : 32'b0) |
|
||||
(sel_mcause ? {mcause, 31'b0} : 32'b0) |
|
||||
(sel_cycles ? cycles[31:0] : 32'b0) |
|
||||
(sel_cyclesh ? cycles[63:32] : 32'b0) ;
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
// Write CSRs: 5 bit unsigned immediate or content of RS1
|
||||
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
|
||||
|
||||
wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read :
|
||||
(instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
|
||||
/* (instr[13:12] == 2'b01) ? */ CSR_modifier ;
|
||||
|
||||
// CSR write logic.
// - Synchronous reset (reset is active low): clears mstatus, i.e. interrupts
//   are disabled after reset. mtvec is not reset here.
// - A SYSTEM instruction with funct3 != 0 (a CSR access) updates the selected
//   CSR during the EXECUTE state with CSR_write.
// Only mstatus and mtvec are written in this block; mepc and mcause are
// driven by the state machine, and cycles by its own counter.
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
mstatus <= 0;
|
||||
end else begin
|
||||
// Execute a CSR opcode
|
||||
if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
|
||||
// mstatus is a single bit, taken from bit 3 of the written value
|
||||
// (the same position it is read back at in CSR_read).
|
||||
if (sel_mstatus) mstatus <= CSR_write[3];
|
||||
|
|
||||
// mtvec keeps only the low ADDR_WIDTH bits of the written value.
|
||||
if (sel_mtvec ) mtvec <= CSR_write[ADDR_WIDTH-1:0];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] writeBackData =
|
||||
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0) ; // Load
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
|
||||
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
// Store data lanes. The low byte / halfword of rs2 is replicated into the
// byte lanes selected by the two low address bits; STORE_wmask then decides
// which lanes are actually written.
//   lane 0 (bits  7:0 ): always rs2[7:0]
//   lane 1 (bits 15:8 ): rs2[7:0] when addr[0] is set, else rs2[15:8]
//   lane 2 (bits 23:16): rs2[7:0] when addr[1] is set, else rs2[23:16]
//   lane 3 (bits 31:24): rs2[7:0] when addr[0] is set, else rs2[15:8] when
//                        addr[1] is set, else rs2[31:24]
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
wire [3:0] STORE_wmask =
|
||||
mem_byteAccess ?
|
||||
(loadstore_addr[1] ?
|
||||
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
|
||||
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
|
||||
) :
|
||||
mem_halfwordAccess ?
|
||||
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
|
||||
4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE_bit = 2;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 3;
|
||||
localparam NB_STATES = 4;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE = 1 << EXECUTE_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
wire writeBack = ~(isBranch | isStore ) &
|
||||
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts multi-cycle computation (divisions) in the ALU.
|
||||
assign aluWr = state[EXECUTE_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
|
||||
wire needToWait = isLoad | isStore | isDivide;
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PC_new =
|
||||
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
interrupt_return ? mepc :
|
||||
PCplus4;
|
||||
|
||||
// Main state machine (one-hot state register) together with the updates of
// PC, mepc, mcause, and the latched instruction / source registers.
//
// Sequence for one instruction:
//   FETCH_INSTR -> WAIT_INSTR -> EXECUTE [-> WAIT_ALU_OR_MEM] -> FETCH_INSTR
// WAIT_ALU_OR_MEM is only visited for loads, stores and divisions
// (needToWait).
//
// Reset is synchronous and active low.
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
|
||||
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
|
||||
mcause <= 0;
|
||||
end else
|
||||
|
|
||||
// See note [1] at the end of this file.
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
|
|
||||
// WAIT_INSTR: once the memory is no longer busy, latch the fetched
// instruction and read both source registers directly from the
// register-index fields of the incoming word.
state[WAIT_INSTR_bit]: begin
|
||||
if(!mem_rbusy) begin // may be high when executing from SPI flash
|
||||
rs1 <= registerFile[mem_rdata[19:15]];
|
||||
rs2 <= registerFile[mem_rdata[24:20]];
|
||||
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
|
||||
state <= EXECUTE; // also the declaration of instr).
|
||||
end
|
||||
end
|
||||
|
|
||||
// EXECUTE: update the program counter.
// - If an interrupt is pending and enabled, jump to the handler (mtvec),
//   remember in mepc where the interrupted flow would have continued
//   (PC_new), and set mcause, which locks out further interrupts because
//   'interrupt' is masked by ~mcause. The current instruction is not
//   cancelled: writeBack and the memory strobes are not gated by
//   'interrupt'.
// - Otherwise continue at PC_new; interrupt_return (mret) clears mcause,
//   which re-enables interrupt acceptance.
state[EXECUTE_bit]: begin
|
||||
if (interrupt) begin
|
||||
PC <= mtvec;
|
||||
mepc <= PC_new;
|
||||
mcause <= 1;
|
||||
end else begin
|
||||
PC <= PC_new;
|
||||
if (interrupt_return) mcause <= 0;
|
||||
end
|
||||
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
|
||||
end
|
||||
|
|
||||
// WAIT_ALU_OR_MEM: stay here until the divider is idle and the memory
// reports neither a pending read nor a pending write.
state[WAIT_ALU_OR_MEM_bit]: begin
|
||||
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
|
||||
end
|
||||
|
|
||||
// FETCH_INSTR lasts a single cycle (mem_rstrb is asserted during it).
default: begin // FETCH_INSTR
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
|
|
||||
endcase
|
||||
end
|
||||
|
||||
// Initialisation compiled only when BENCH is defined: gives the cycle
// counter and register 0 a known value of zero.
// NOTE(review): presumably needed so that these do not start undefined in
// simulation, since neither is written by the reset logic -- confirm.
`ifdef BENCH
|
||||
initial begin
|
||||
cycles = 0;
|
||||
registerFile[0] = 0;
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements.
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
|
||||
790
RTL/PROCESSOR/femtorv32_petitbateau.v
Normal file
790
RTL/PROCESSOR/femtorv32_petitbateau.v
Normal file
@@ -0,0 +1,790 @@
|
||||
/******************************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// This version: PetitBateau (make it float), RV32IMFC
|
||||
// Rounding works as follows:
|
||||
// - all subnormals are flushed to zero
|
||||
// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
|
||||
// - FDIV and FSQRT do not have correct rounding
|
||||
//
|
||||
// [TODO] add FPU CSR (and instret for perf stat)
|
||||
// [TODO] FSW/FLW unaligned (does not seem to occur, but the norm requires it)
|
||||
// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
|
||||
// [TODO] support IEEE754 denormals
|
||||
// [TODO] NaNs propagation and infinity
|
||||
// [TODO] support all IEEE754 rounding modes
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/******************************************************************************/
|
||||
|
||||
`include "petitbateau.v"
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
// Note: atomic instructions not supported, but 'a' is set in
|
||||
// compiler flag, because there is no toolchain/libs for
|
||||
// rv32imfc / imf in most risc-V compiler distributions.
|
||||
|
||||
`define NRV_ARCH "rv32imafc"
|
||||
`define NRV_ABI "ilp32f"
|
||||
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
`define NRV_INTERRUPTS
|
||||
|
||||
// Check condition and display message in simulation
|
||||
`ifdef BENCH
|
||||
`define ASSERT(cond,msg) if(!(cond)) $display msg
|
||||
`define ASSERT_NOT_REACHED(msg) $display msg
|
||||
`else
|
||||
`define ASSERT(cond,msg)
|
||||
`define ASSERT_NOT_REACHED(msg)
|
||||
`endif
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input interrupt_request,
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
// Flip a 32 bit word. Used by the shifter (a single shifter for
|
||||
// left and right shifts, saves silicon!)
|
||||
function [31:0] flip32;
|
||||
input [31:0] x;
|
||||
flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
|
||||
x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
|
||||
x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
|
||||
x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
|
||||
endfunction
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
wire [2:0] funct3 = instr[14:12];
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *) wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
|
||||
|
||||
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
|
||||
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
|
||||
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
|
||||
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
|
||||
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
|
||||
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
|
||||
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
wire isLoad = (instr[6:3] == 4'b0000 ); // rd <-mem[rs1+Iimm] (bit 2:FLW)
|
||||
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
wire isStore = (instr[6:3] == 4'b0100 ); // mem[rs1+Simm]<-rs2 (bit 2:FSW)
|
||||
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
|
||||
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
|
||||
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
|
||||
wire isFPU = (instr[6:5] == 2'b10); // all FPU instr except FLW/FSW
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] rs3; // this one is used by the FMA instructions.
|
||||
|
||||
reg [31:0] registerFile [63:0]; // 0..31: integer registers
|
||||
// 32..63: floating-point registers
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except divisions.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe, starts dividing.
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = flip32(shifter);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = isALUreg & funcM & instr[14];
|
||||
wire aluBusy = |div_cnt; // ALU is busy if division is in progress.
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
wire sign1 = aluIn1[31] & isMULH;
|
||||
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] aluOut_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
reg [31:0] aluOut_mul;
|
||||
always @(posedge clk) begin
|
||||
aluOut_mul <= funct3Is[0] ? multiply[31:0] : multiply[63:32];
|
||||
end
|
||||
|
||||
reg [31:0] aluOut_div;
|
||||
always @(posedge clk) begin
|
||||
(* parallel_case, full_case *)
|
||||
case(1'b1)
|
||||
instr[13] & div_sign: aluOut_div <= -dividend;
|
||||
instr[13] & !div_sign: aluOut_div <= dividend;
|
||||
!instr[13] & div_sign: aluOut_div <= -quotient;
|
||||
!instr[13] & !div_sign: aluOut_div <= quotient;
|
||||
endcase
|
||||
end
|
||||
|
||||
reg [31:0] aluOut;
|
||||
always @(*) begin
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
isALUreg & funcM & instr[14]: aluOut = aluOut_div;
|
||||
isALUreg & funcM & !instr[14]: aluOut = aluOut_mul;
|
||||
default: aluOut = aluOut_base;
|
||||
endcase
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
// Divider state (restoring shift-and-subtract, one quotient bit per cycle).
reg [31:0] dividend;  // running remainder
reg [62:0] divisor;   // divisor, pre-shifted left by 31 and shifted right each step
reg [31:0] quotient;
reg [5:0]  div_cnt;   // countdown of remaining cycles
reg        div_sign;  // 1 when the final result must be negated

// True when the shifted divisor still fits into the running remainder.
wire div_step_fits = (divisor <= {31'b0, dividend});

always @(posedge clk) begin
   if (aluWr) begin
      // Load operands. instr[12] = 0 enables the sign handling below;
      // instr[13] makes the result sign follow aluIn1 alone.
      div_sign <= ~instr[12] &
                  (instr[13] ? aluIn1[31]
                             : ((aluIn1[31] != aluIn2[31]) & (|aluIn2)));
      dividend <= (~instr[12] & aluIn1[31]) ? -aluIn1 : aluIn1;
      divisor  <= {((~instr[12] & aluIn2[31]) ? -aluIn2 : aluIn2), 31'b0};
      quotient <= 0;
      div_cnt  <= isDivide ? 33 : 0; // one additional cycle for aluOut_div
   end else if (aluBusy) begin
      div_cnt <= div_cnt - 1;
   end

   // One division step. These assignments come last, so they take
   // precedence over the loads above should both happen in the same cycle.
   if (|div_cnt[5:1]) begin
      divisor  <= divisor >> 1;
      quotient <= {quotient[30:0], div_step_fits};
      if (div_step_fits) dividend <= dividend - divisor[31:0];
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
|
||||
// Branch-taken condition, selected by the one-hot funct3.
wire predicate = |{ funct3Is[0] &  EQ,    // BEQ
                    funct3Is[1] & !EQ,    // BNE
                    funct3Is[4] &  LT,    // BLT
                    funct3Is[5] & !LT,    // BGE
                    funct3Is[6] &  LTU,   // BLTU
                    funct3Is[7] & !LTU }; // BGEU
|
||||
|
||||
/***************************************************************************/
|
||||
// Registers read-write
|
||||
/***************************************************************************/
|
||||
|
||||
// Single always block for all register-file traffic; the top bit of the
// 6-bit index selects the floating-point half of the bank.
always @(posedge clk) begin
   if (state[WAIT_INSTR_bit]) begin
      // Read the operands straight from the raw (uncompressed) instruction.
      rs1 <= registerFile[{raw_rs1IsFP, raw_instr[19:15]}];
      rs2 <= registerFile[{raw_rs2IsFP, raw_instr[24:20]}];
      rs3 <= registerFile[{1'b1,        raw_instr[31:27]}];
   end else if (state[DECOMPRESS_GETREGS_bit]) begin
      // Compressed instruction: read again from the decompressed instr.
      // rs3 is left alone, there is no compressed FMA.
      rs1 <= registerFile[{decomp_rs1IsFP, instr[19:15]}];
      rs2 <= registerFile[{decomp_rs2IsFP, instr[24:20]}];
   end else if (writeBack & !fpuBusy & (rdIsFP | (|instr[11:7]))) begin
      // Write back, except to integer register 0.
      registerFile[{rdIsFP, instr[11:7]}] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The FPU
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] fpuOut;   // FPU result
wire        fpuBusy;  // FPU busy flag

PetitBateau FPU(
   .clk  (clk),
   .wr   (state[EXECUTE_bit] & isFPU), // strobe in EXECUTE for FPU instructions
   .instr(instr[31:2]),
   .rs1  (rs1),
   .rs2  (rs2),
   .rs3  (rs3),
   .out  (fpuOut),
   .busy (fpuBusy)
);
|
||||
|
||||
// There is a single register bank, registers 0..31 are the integer
|
||||
// registers, and 32..63 are the floating point registers, hence
|
||||
// bit 5 of rs1,rs2,rd index is set to 0 for an integer register
|
||||
// and 1 for a fp register.
|
||||
|
||||
// asserted if the destination register is a floating-point register
|
||||
wire rdIsFP =
     (instr[6:2] == 5'b00001)                  // FLW
   | (instr[6:4] == 3'b100)                    // F{N}MADD, F{N}MSUB
   | ( (instr[6:4] == 3'b101) &
       ( ~instr[31]                            // R-Type FPU
       | (instr[31:28] == 4'b1101)             // FCVT.S.W{U}
       | (instr[31:28] == 4'b1111) ) );        // FMV.W.X
|
||||
|
||||
// rs1 is a FP register when instr[6:5] = 2'b10, unless instr[4:2] = 3'b100
// and the top nibble instr[31:28] is one of:
//    4'b1101 : FCVT.S.W{U}
//    4'b1111 : FMV.W.X
// Two copies of the signal: one decoded from the raw fetched word, one from
// the latched (possibly decompressed) instruction.
wire raw_rs1IsFP =
      (raw_instr[6:5] == 2'b10) &
      ( (raw_instr[4:2] != 3'b100) |
        ( (raw_instr[31:28] != 4'b1101) &      // not FCVT.S.W{U}
          (raw_instr[31:28] != 4'b1111) ) );   // not FMV.W.X

wire decomp_rs1IsFP =
      (instr[6:5] == 2'b10) &
      ( (instr[4:2] != 3'b100) |
        ( (instr[31:28] != 4'b1101) &          // not FCVT.S.W{U}
          (instr[31:28] != 4'b1111) ) );       // not FMV.W.X
|
||||
|
||||
// rs2 is a FP register when instr[6:5] = 2'b10, or for FSW
// (instr[6:2] = 5'b01001). Raw and decompressed variants, as for rs1.
wire raw_rs2IsFP    = (raw_instr[6:2] == 5'b01001) | (raw_instr[6:5] == 2'b10);
wire decomp_rs2IsFP = (instr[6:2]     == 5'b01001) | (instr[6:5]     == 2'b10);
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg  [ADDR_WIDTH-1:0] PC;    // Program counter.
reg  [31:2]           instr; // Latched instruction; bits 1:0 are not stored.

// Address of the following instruction: +4 after a 32-bit instruction,
// +2 after a compressed one.
wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;

// Shared adder for branch targets (PC+Bimm), JAL (PC+Jimm) and AUIPC
// (PC+Uimm). instr[3] and instr[4] stand in for isJAL and isAUIPC here.
wire [ADDR_WIDTH-1:0] PCplusImm_offset =
      instr[3] ? Jimm[ADDR_WIDTH-1:0] :
      instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                 Bimm[ADDR_WIDTH-1:0] ;
wire [ADDR_WIDTH-1:0] PCplusImm = PC + PCplusImm_offset;

// Separate adder for load/store addresses. In this context instr[5]
// distinguishes stores (Simm) from loads (Iimm).
wire [ADDR_WIDTH-1:0] loadstore_addr =
      rs1[ADDR_WIDTH-1:0] +
      (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
// While fetching, drive the word-aligned address of PC (or of PC+4 when the
// second half of an unaligned 32-bit instruction is needed); otherwise drive
// the load/store address.
assign mem_addr =
   (state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit])
      ? {(fetch_second_half ? PCplus4[ADDR_WIDTH-1:2]
                            : PC     [ADDR_WIDTH-1:2]), 2'b00}
      : loadstore_addr;
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// Interrupt logic, CSR registers and opcodes.
|
||||
/***************************************************************************/
|
||||
|
||||
// Remember interrupt requests as they are not checked for every cycle
|
||||
// Latched copy of interrupt_request: requests are only acted upon in the
// EXECUTE state, so they must be remembered until then.
reg interrupt_request_sticky;

// A pending request fires only if enabled (mstatus) and if no handler is
// already running (mcause doubles as the lock).
wire interrupt = interrupt_request_sticky & mstatus & ~mcause;

// Interrupts are taken in the EXECUTE state.
wire interrupt_accepted = interrupt & state[EXECUTE_bit];

// A new request always sets the flag, even in the cycle the previous one is
// accepted, so back-to-back requests are not lost.
always @(posedge clk) begin
   if (interrupt_request)       interrupt_request_sticky <= 1'b1;
   else if (interrupt_accepted) interrupt_request_sticky <= 1'b0;
end

// mret decoder: any SYSTEM instruction with funct3 == 0 is treated as mret.
wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
|
||||
|
||||
// CSRs:
|
||||
// CSR storage.
reg [ADDR_WIDTH-1:0] mepc;    // Saved program counter.
reg [ADDR_WIDTH-1:0] mtvec;   // Interrupt handler address.
reg                  mstatus; // Interrupt enable.
reg                  mcause;  // Interrupt cause (and lock).
reg [63:0]           cycles;  // Free-running cycle counter.

always @(posedge clk) begin
   cycles <= cycles + 1;
end

// CSR address decode (instr[31:20] holds the CSR number).
wire sel_mstatus = (instr[31:20] == 12'h300);
wire sel_mtvec   = (instr[31:20] == 12'h305);
wire sel_mepc    = (instr[31:20] == 12'h341);
wire sel_mcause  = (instr[31:20] == 12'h342);
wire sel_cycles  = (instr[31:20] == 12'hC00);
wire sel_cyclesh = (instr[31:20] == 12'hC80);
|
||||
|
||||
// Read CSRs
|
||||
/* verilator lint_off WIDTH */
|
||||
// CSR read mux. The sel_* strobes compare instr[31:20] against distinct
// constants, so at most one is active and a priority chain gives the same
// result as ORing masked values. Reads as 0 when no known CSR is selected.
wire [31:0] CSR_read =
   sel_mstatus ? {28'b0, mstatus, 3'b0} :  // enable flag in bit 3
   sel_mtvec   ? mtvec                  :
   sel_mepc    ? mepc                   :
   sel_mcause  ? {mcause, 31'b0}        :  // flag in bit 31
   sel_cycles  ? cycles[31:0]           :
   sel_cyclesh ? cycles[63:32]          :
                 32'b0;
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
// Write CSRs: 5 bit unsigned immediate or content of RS1
|
||||
// CSR write operand: zero-extended 5-bit immediate (instr[14] = 1) or rs1.
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;

// New CSR value, selected by instr[13:12]:
//   2'b10 -> set bits, 2'b11 -> clear bits, anything else -> plain write.
wire [31:0] CSR_write =
   instr[13] ? (instr[12] ? (~CSR_modifier & CSR_read)
                          : ( CSR_modifier | CSR_read))
             : CSR_modifier;
|
||||
|
||||
// Writable CSRs handled here: mstatus (cleared on reset) and mtvec.
always @(posedge clk) begin
   if (!reset) begin
      mstatus <= 0;
   end else if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
      // CSR instruction (funct3 != 0) in its EXECUTE cycle.
      if (sel_mstatus) mstatus <= CSR_write[3];
      if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
// Value written to the destination register: each source is masked by its
// instruction-class flag and the masked values are ORed together.
wire [31:0] writeBackData =
     ({32{isSYSTEM      }} & CSR_read )    // SYSTEM
   | ({32{isLUI         }} & Uimm     )    // LUI
   | ({32{isALU         }} & aluOut   )    // ALUreg, ALUimm
   | ({32{isFPU         }} & fpuOut   )    // FPU
   | ({32{isAUIPC       }} & PCplusImm)    // AUIPC
   | ({32{isJALR | isJAL}} & PCinc    )    // JAL, JALR
   | ({32{isLoad        }} & LOAD_data);   // Load
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
// TODO: support unaligned accesses for FLW and FSW
|
||||
|
||||
// instr[2] is set for FLW and FSW. instr[13:12] = func3[1:0]
|
||||
// Access width, from instr[13:12]. instr[2] (set for FLW/FSW) forces a
// word access.
wire mem_byteAccess     = !instr[2] && (instr[13:12] == 2'b00);
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);

// Pick the addressed halfword, then the addressed byte within it.
wire [15:0] LOAD_halfword =
   loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

wire  [7:0] LOAD_byte =
   loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

// Extension bit: the loaded value's MSB when instr[14] = 0 (sign
// extension), 0 when instr[14] = 1 (zero extension).
wire LOAD_sign =
   !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

wire [31:0] LOAD_data =
   mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
   mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                        mem_rdata ;
|
||||
|
||||
// STORE
|
||||
// Store data: rs2 bytes are replicated onto the lanes a byte or halfword
// store may target; the write mask selects the lanes actually written.
assign mem_wdata = {
   loadstore_addr[0] ? rs2[7:0]  :
   loadstore_addr[1] ? rs2[15:8] : rs2[31:24],      // lane 3
   loadstore_addr[1] ? rs2[7:0]  : rs2[23:16],      // lane 2
   loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8],      // lane 1
   rs2[7:0]                                         // lane 0
};
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
// Byte-lane write mask:
//   byte     -> one-hot lane chosen by loadstore_addr[1:0]
//   halfword -> upper or lower pair, chosen by loadstore_addr[1]
//   word     -> all four lanes
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])       :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
                        4'b1111;
|
||||
|
||||
/***************************************************************************/
|
||||
// Unaligned fetch mechanism and compressed opcode handling
|
||||
/***************************************************************************/
|
||||
|
||||
// One-word instruction cache: the last fetched word and its word address.
reg [ADDR_WIDTH-1:2] cached_addr;
reg [31:0]           cached_data;

reg fetch_second_half; // fetching the upper half of an unaligned long instr
reg long_instr;        // current instruction is 32 bits wide

// Does the cached word cover the current / the next PC ?
wire current_cache_hit = (cached_addr == PC    [ADDR_WIDTH-1:2]);
wire next_cache_hit    = (cached_addr == PC_new[ADDR_WIDTH-1:2]);

// Word containing the current PC: from the cache on a hit, from memory else.
wire [31:0] cached_mem = current_cache_hit ? cached_data : mem_rdata;

// A 32-bit instruction (low two bits = 2'b11) that starts in the upper
// halfword of a word straddles two words.
wire current_unaligned_long = (cached_mem [17:16] == 2'b11) & PC    [1];
wire next_unaligned_long    = (cached_data[17:16] == 2'b11) & PC_new[1];

// Instruction bits at PC. When PC is halfword-aligned only, the low half
// comes from the cached word and the high half from the word just read.
wire [31:0] raw_instr = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
                              : cached_mem;

// Expansion of the low halfword, used when it is a compressed instruction.
wire [31:0] decompressed;
decompressor _decomp ( .c(raw_instr[15:0]), .d(decompressed) );
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
// One-hot state encoding: bit index of each state ...
localparam FETCH_INSTR_bit          = 0;
localparam WAIT_INSTR_bit           = 1;
localparam DECOMPRESS_GETREGS_bit   = 2;
localparam EXECUTE_bit              = 3;
localparam WAIT_ALU_OR_MEM_bit      = 4;
localparam WAIT_ALU_OR_MEM_SKIP_bit = 5;

localparam NB_STATES                = 6;

// ... and the matching one-hot values.
localparam FETCH_INSTR          = (1 << FETCH_INSTR_bit);
localparam WAIT_INSTR           = (1 << WAIT_INSTR_bit);
localparam DECOMPRESS_GETREGS   = (1 << DECOMPRESS_GETREGS_bit);
localparam EXECUTE              = (1 << EXECUTE_bit);
localparam WAIT_ALU_OR_MEM      = (1 << WAIT_ALU_OR_MEM_bit);
localparam WAIT_ALU_OR_MEM_SKIP = (1 << WAIT_ALU_OR_MEM_SKIP_bit);

(* onehot *)
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
// Register write-back enable: in EXECUTE or either wait state, for every
// instruction except branches and stores, and only while the FPU is idle.
wire writeBack = ~isBranch & ~isStore & ~fpuBusy &
                 (|{ state[EXECUTE_bit],
                     state[WAIT_ALU_OR_MEM_bit],
                     state[WAIT_ALU_OR_MEM_SKIP_bit] });

// Memory read strobe: instruction fetch, or a load in EXECUTE.
assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE_bit] & isLoad);

// Memory write mask, non-zero only for a store in EXECUTE.
assign mem_wmask = (state[EXECUTE_bit] & isStore) ? STORE_wmask : 4'b0000;

// ALU write strobe: asserted in EXECUTE for ALU instructions.
assign aluWr = state[EXECUTE_bit] & isALU;

// JAL always jumps to PC+imm, a branch only when its predicate holds.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
|
||||
// Stores need a wait state only in IO space when NRV_IS_IO_ADDR is defined,
// always otherwise.
`ifdef NRV_IS_IO_ADDR
wire needToWait_store = isStore & `NRV_IS_IO_ADDR(mem_addr);
`else
wire needToWait_store = isStore;
`endif

// Instructions that go through a wait state after EXECUTE: loads, (some)
// stores, every register-register instruction with funcM set, and FPU ops.
wire needToWait = isLoad             |
                  needToWait_store   |
                  (isALUreg & funcM) |
                  isFPU;
|
||||
|
||||
// Next program counter, in priority order:
//   JALR             -> ALU sum with bit 0 cleared
//   JAL/taken branch -> PC + immediate
//   interrupt return -> saved mepc
//   anything else    -> next sequential instruction
wire [ADDR_WIDTH-1:0] PC_new =
   isJALR           ? {aluPlus[ADDR_WIDTH-1:1], 1'b0} :
   jumpToPCplusImm  ? PCplusImm                       :
   interrupt_return ? mepc                            :
                      PCinc;
|
||||
|
||||
// Main sequencer: fetch -> (decompress) -> execute -> (wait), one-hot encoded.
always @(posedge clk) begin
   if(!reset) begin
      state             <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
      PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
      mcause            <= 0;
      cached_addr       <= {ADDR_WIDTH-2{1'b1}}; // Needs to be an invalid addr
      fetch_second_half <= 0;
   end else begin

      // See note [1] at the end of this file.
      (* parallel_case *)
      case(1'b1)

        state[WAIT_INSTR_bit]: begin
           if(!mem_rbusy) begin // may be high when executing from SPI flash
              // Update cache
              if (~current_cache_hit | fetch_second_half) begin
                 cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                 cached_data <= mem_rdata;
              end // (a stray ';' after this 'end' was removed)

              // Decode instruction.
              // Source registers are read in this same state by the
              // register-file always block.
              instr      <= &raw_instr[1:0] ? raw_instr[31:2]
                                            : decompressed[31:2];
              long_instr <= &raw_instr[1:0];

              // Long opcode, unaligned, first part fetched,
              // happens in non-linear code
              if (current_unaligned_long & ~fetch_second_half) begin
                 fetch_second_half <= 1;
                 state             <= FETCH_INSTR;
              end else begin
                 fetch_second_half <= 0;
                 state <= &raw_instr[1:0] ? EXECUTE : DECOMPRESS_GETREGS;
              end
           end
        end

        state[DECOMPRESS_GETREGS_bit]: begin
           // Source registers of the decompressed instruction are read by
           // the register-file always block during this state.
           state <= EXECUTE;
        end

        state[EXECUTE_bit]: begin
           if (interrupt) begin
              // Enter the handler; the address that would have been
              // executed next is saved in mepc.
              PC     <= mtvec;
              mepc   <= PC_new;
              mcause <= 1;
              state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
           end else begin
              // Unaligned load/store not implemented yet
              // (the norm supposes that FLW and FSW can handle them)
              `ASSERT(
                !((isLoad|isStore) && instr[2] && |loadstore_addr[1:0]),
                ("PC=%x UNALIGNED FLW/FSW",PC)
              );

              PC <= PC_new;
              if (interrupt_return) mcause <= 0;

              // Skip the memory fetch when the next instruction is fully
              // contained in the cached word.
              state <= next_cache_hit & ~next_unaligned_long
                       ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
                       : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);

              fetch_second_half <= next_cache_hit & next_unaligned_long;
           end
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
              state <= FETCH_INSTR;
           end
        end

        state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
              state <= WAIT_INSTR;
           end
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end
      endcase
   end
end
|
||||
|
||||
`ifdef BENCH
// Simulation only: start with a known register 0 and cycle counter.
initial begin
   registerFile[0] = 0;
   cycles          = 0;
end
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
// Combinatorial expander for 16-bit RISC-V compressed instructions.
//   c : compressed instruction
//   d : equivalent 32-bit instruction (all zeros for the all-zero halfword,
//       X for encodings not listed in the table below)
module decompressor(
   input  wire [15:0] c,
   output reg  [31:0] d
);

   // Expansion of illegal / unknown opcodes.
   localparam illegal = 32'h0;
   localparam unknown = 32'h0;

   // 32-bit major opcodes emitted by the expander.
   localparam [6:0] OP_LOAD     = 7'b00000_11;
   localparam [6:0] OP_LOAD_FP  = 7'b00001_11;
   localparam [6:0] OP_ALUIMM   = 7'b00100_11;
   localparam [6:0] OP_STORE    = 7'b01000_11;
   localparam [6:0] OP_STORE_FP = 7'b01001_11;
   localparam [6:0] OP_ALUREG   = 7'b01100_11;
   localparam [6:0] OP_LUI      = 7'b01101_11;
   localparam [6:0] OP_BRANCH   = 7'b11000_11;
   localparam [6:0] OP_JALR     = 7'b11001_11;
   localparam [6:0] OP_JAL      = 7'b11011_11;

   // Fixed registers.
   localparam x0 = 5'b00000;
   localparam x1 = 5'b00001;
   localparam x2 = 5'b00010;

   // Register fields: 3-bit fields address x8..x15, 5-bit fields any register.
   wire [4:0] rcl = {2'b01, c[4:2]}; // compressed, low field
   wire [4:0] rch = {2'b01, c[9:7]}; // compressed, high field
   wire [4:0] rwl = c[ 6:2];         // wide, low field
   wire [4:0] rwh = c[11:7];         // wide, high field

   // Immediates, unscrambled from the compressed encodings.
   wire  [4:0] shiftImm    = c[6:2];

   wire [11:0] addi4spnImm = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] lwswImm     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] lwspImm     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] swspImm     = {4'b0000, c[8:7], c[12:9], 2'b00};

   wire [11:0] addi16spImm = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
   wire [11:0] addImm      = {{7{c[12]}}, c[6:2]};

   /* verilator lint_off UNUSED */
   wire [12:0] bImm   = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
   wire [20:0] jalImm = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
   wire [31:0] luiImm = {{15{c[12]}}, c[6:2], 12'b000000000000};
   /* verilator lint_on UNUSED */

   // Patterns are {funct3, 11 payload bits, quadrant}. Where two patterns
   // overlap, the more specific one is listed first (casez takes the first
   // match). Long opcodes (quadrant 11) are not handled here.
   always @* begin
      casez (c)

        // ---------------- quadrant 00 ----------------
        /* verilator lint_off CASEOVERLAP */
        16'b000_00000000000_00 : d = illegal;                                                        // c.illegal
        16'b000_???????????_00 : d = {addi4spnImm, x2, 3'b000, rcl, OP_ALUIMM};                      // c.addi4spn -> addi rd', x2, nzuimm
        /* verilator lint_on CASEOVERLAP */
        16'b010_???????????_00 : d = {lwswImm, rch, 3'b010, rcl, OP_LOAD};                           // c.lw  -> lw  rd', off(rs1')
        16'b011_???????????_00 : d = {lwswImm, rch, 3'b010, rcl, OP_LOAD_FP};                        // c.flw -> flw rd', off(rs1')
        16'b110_???????????_00 : d = {lwswImm[11:5], rcl, rch, 3'b010, lwswImm[4:0], OP_STORE};      // c.sw  -> sw  rs2', off(rs1')
        16'b111_???????????_00 : d = {lwswImm[11:5], rcl, rch, 3'b010, lwswImm[4:0], OP_STORE_FP};   // c.fsw -> fsw rs2', off(rs1')

        // ---------------- quadrant 01 ----------------
        16'b000_???????????_01 : d = {addImm, rwh, 3'b000, rwh, OP_ALUIMM};                          // c.addi -> addi rd, rd, nzimm
        16'b001_???????????_01 : d = {jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12], x1, OP_JAL}; // c.jal -> jal x1, off
        16'b010_???????????_01 : d = {addImm, x0, 3'b000, rwh, OP_ALUIMM};                           // c.li   -> addi rd, x0, imm
        16'b011_?_00010_?????_01 : d = {addi16spImm, rwh, 3'b000, rwh, OP_ALUIMM};                   // c.addi16sp -> addi x2, x2, nzimm
        16'b011_???????????_01 : d = {luiImm[31:12], rwh, OP_LUI};                                   // c.lui  -> lui rd, nzuimm
        16'b100_?_00_???_?????_01 : d = {7'b0000000, shiftImm, rch, 3'b101, rch, OP_ALUIMM};         // c.srli -> srli rd', rd', shamt
        16'b100_?_01_???_?????_01 : d = {7'b0100000, shiftImm, rch, 3'b101, rch, OP_ALUIMM};         // c.srai -> srai rd', rd', shamt
        16'b100_?_10_???_?????_01 : d = {addImm, rch, 3'b111, rch, OP_ALUIMM};                       // c.andi -> andi rd', rd', imm
        16'b100_011_???_00_???_01 : d = {7'b0100000, rcl, rch, 3'b000, rch, OP_ALUREG};              // c.sub  -> sub rd', rd', rs2'
        16'b100_011_???_01_???_01 : d = {7'b0000000, rcl, rch, 3'b100, rch, OP_ALUREG};              // c.xor  -> xor rd', rd', rs2'
        16'b100_011_???_10_???_01 : d = {7'b0000000, rcl, rch, 3'b110, rch, OP_ALUREG};              // c.or   -> or  rd', rd', rs2'
        16'b100_011_???_11_???_01 : d = {7'b0000000, rcl, rch, 3'b111, rch, OP_ALUREG};              // c.and  -> and rd', rd', rs2'
        16'b101_???????????_01 : d = {jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12], x0, OP_JAL}; // c.j -> jal x0, off
        16'b110_???????????_01 : d = {bImm[12], bImm[10:5], x0, rch, 3'b000, bImm[4:1], bImm[11], OP_BRANCH}; // c.beqz -> beq rs1', x0, off
        16'b111_???????????_01 : d = {bImm[12], bImm[10:5], x0, rch, 3'b001, bImm[4:1], bImm[11], OP_BRANCH}; // c.bnez -> bne rs1', x0, off

        // ---------------- quadrant 10 ----------------
        16'b000_???????????_10 : d = {7'b0000000, shiftImm, rwh, 3'b001, rwh, OP_ALUIMM};            // c.slli  -> slli rd, rd, shamt
        16'b010_???????????_10 : d = {lwspImm, x2, 3'b010, rwh, OP_LOAD};                            // c.lwsp  -> lw  rd, off(x2)
        16'b011_???????????_10 : d = {lwspImm, x2, 3'b010, rwh, OP_LOAD_FP};                         // c.flwsp -> flw rd, off(x2)
        16'b100_0_?????_00000_10 : d = {12'b000000000000, rwh, 3'b000, x0, OP_JALR};                 // c.jr    -> jalr x0, rs1, 0
        16'b100_0_?????_?????_10 : d = {7'b0000000, rwl, x0, 3'b000, rwh, OP_ALUREG};                // c.mv    -> add rd, x0, rs2
     // 16'b100_1_00000_00000_10 : d = {25'b00000000_00010000_00000000_0, 7'b11100_11};              // c.ebreak -> ebreak
        16'b100_1_?????_00000_10 : d = {12'b000000000000, rwh, 3'b000, x1, OP_JALR};                 // c.jalr  -> jalr x1, rs1, 0
        16'b100_1_?????_?????_10 : d = {7'b0000000, rwl, rwh, 3'b000, rwh, OP_ALUREG};               // c.add   -> add rd, rd, rs2
        16'b110_???????????_10 : d = {swspImm[11:5], rwl, x2, 3'b010, swspImm[4:0], OP_STORE};       // c.swsp  -> sw  rs2, off(x2)
        16'b111_???????????_10 : d = {swspImm[11:5], rwl, x2, 3'b010, swspImm[4:0], OP_STORE_FP};    // c.fswsp -> fsw rs2, off(x2)

        // Anything else: don't care (leaves the synthesizer free).
        default : d = 32'bx;
      endcase
   end

endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
420
RTL/PROCESSOR/femtorv32_quark.v
Normal file
420
RTL/PROCESSOR/femtorv32_quark.v
Normal file
@@ -0,0 +1,420 @@
|
||||
/*******************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
// This version: The "Quark", the most elementary version of FemtoRV32.
|
||||
// A single VERILOG file, compact & understandable code.
|
||||
// (200 lines of code, 400 lines counting comments)
|
||||
//
|
||||
// Instruction set: RV32I + RDCYCLES
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Macros:
|
||||
// optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
|
||||
// evaluate to 1 if addr is in mapped IO space,
|
||||
// evaluate to 0 otherwise
|
||||
// (additional wait states are used when in IO space).
|
||||
// If left undefined, wait states are always used.
|
||||
//
|
||||
// NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
|
||||
// by the ticks counter. If not defined, a 32-bit counter is generated.
|
||||
// (reducing its width may be useful for space-constrained designs).
|
||||
//
|
||||
// NRV_TWOLEVEL_SHIFTER may be defined to make shift operations faster
|
||||
// (uses a two-level shifter inspired by picorv32).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/*******************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32i"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-Os"
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The destination register
|
||||
// Destination register index.
wire [4:0] rdId = instr[11:7];

// funct3 decoded to one-hot: funct3Is[k] is set iff instr[14:12] == k.
(* onehot *)
wire [7:0] funct3Is = 8'b0000_0001 << instr[14:12];
|
||||
|
||||
// The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
|
||||
wire [31:0] Uimm = {instr[31:12], 12'b0};
wire [31:0] Iimm = {{20{instr[31]}}, instr[31:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
wire [31:0] Simm = {{20{instr[31]}}, instr[31:25], instr[11:7]};
wire [31:0] Bimm = {{19{instr[31]}}, instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [31:0] Jimm = {{11{instr[31]}}, instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
/* verilator lint_on UNUSED */
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
// Instruction class, decoded from instr[6:2] (bits 1:0 are not stored).
wire isLoad   = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC  = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore  = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI    = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR   = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- cycles

// JAL is recognized from bit 3 alone instead of the full
// (instr[6:2] == 5'b11011) compare.
wire isJAL    = instr[3];                 // rd <- PC+4; PC<-PC+Jimm

wire isALU = isALUreg | isALUimm;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
// Latched source operands.
reg [31:0] rs1;
reg [31:0] rs2;

(* no_rw_check *)
reg [31:0] registerFile [31:0];

// Write-back port; writes to register 0 are discarded.
always @(posedge clk) begin
   if (writeBack && (rdId != 0)) begin
      registerFile[rdId] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except shifts.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
// ALU operands: the first is always rs1; the second is rs2 for
// register-register ALU ops and branches, Iimm otherwise.
wire [31:0] aluIn1 = rs1;
wire [31:0] aluIn2 = (isALUreg | isBranch) ? rs2 : Iimm;

// Shifter state: working register and remaining shift amount.
reg  [31:0] aluReg;
reg  [4:0]  aluShamt;

wire aluBusy = (aluShamt != 0); // busy while a shift is in progress
wire aluWr;                     // write strobe, starts shifting

// Adder, shared by arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;

// One 33-bit subtraction (aluIn1 - aluIn2, as aluIn1 + ~aluIn2 + 1) feeds
// SUB and every comparison (trick borrowed from swapforth/J1).
wire [32:0] aluMinus = {1'b0, aluIn1} + {1'b1, ~aluIn2} + 33'b1;
wire LTU = aluMinus[32];
wire LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire EQ  = ~|aluMinus[31:0];
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
// Shifts (funct3 1 and 5) are taken from the sequential shifter register.
wire funct3IsShift = funct3Is[1] | funct3Is[5];

// ALU result: one masked candidate per (one-hot) funct3 value, ORed together.
wire [31:0] aluOut =
     ({32{funct3Is[0]}} & ((instr[30] & instr[5]) ? aluMinus[31:0]   // SUB
                                                   : aluPlus))       // ADD, ADDI
   | ({32{funct3Is[2]}} & {31'b0, LT})                               // SLT
   | ({32{funct3Is[3]}} & {31'b0, LTU})                              // SLTU
   | ({32{funct3Is[4]}} & (aluIn1 ^ aluIn2))                         // XOR
   | ({32{funct3Is[6]}} & (aluIn1 | aluIn2))                         // OR
   | ({32{funct3Is[7]}} & (aluIn1 & aluIn2))                         // AND
   | ({32{funct3IsShift}} & aluReg);                                 // SLL, SRL, SRA
|
||||
|
||||
// Sequential shifter: loads on aluWr, then shifts aluReg by one position
// per clock (or by four when NRV_TWOLEVEL_SHIFTER is defined and at least
// four positions remain) until aluShamt reaches zero.
always @(posedge clk) begin
   if (aluWr) begin
      if (funct3IsShift) begin // SLL, SRA, SRL
         aluReg   <= aluIn1;
         aluShamt <= aluIn2[4:0];
      end
   end

`ifdef NRV_TWOLEVEL_SHIFTER
   else if (|aluShamt[4:2]) begin // Shift by 4
      aluShamt <= aluShamt - 4;
      aluReg   <= funct3Is[1] ? (aluReg << 4)
                              : {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
   end else
`endif
   // Single-bit step:
   //   funct3 = 1              -> SLL, shift left, fill with 0
   //   funct3 = 5,  instr[30]  -> SRA, shift right, replicate the sign bit
   //   funct3 = 5, !instr[30]  -> SRL, shift right, fill with 0
   if (|aluShamt) begin
      aluShamt <= aluShamt - 1;
      aluReg   <= funct3Is[1] ? (aluReg << 1)                             // SLL
                              : {instr[30] & aluReg[31], aluReg[31:1]};   // SRA, SRL
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
// Branch condition. The funct3 one-hot vector is masked with the comparison
// result that belongs to each encoding, and the masked bits are OR-reduced:
//   bit 0: BEQ   bit 1: BNE   bit 4: BLT   bit 5: BGE   bit 6: BLTU   bit 7: BGEU
// funct3 = 2 and funct3 = 3 select no comparison, so their mask bits are 0.
wire predicate = |(funct3Is & {~LTU, LTU, ~LT, LT, 2'b00, ~EQ, EQ});
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
// Program counter, ADDR_WIDTH bits wide.
reg [ADDR_WIDTH-1:0] PC;

// Latched instruction word. Bits 1:0 are not stored because they are not
// used in the RV32I base instruction set.
reg [31:2] instr;

// Address of the instruction that follows the current one.
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;

// One adder computes every PC-relative value:
//   branch -> PC+Bimm    AUIPC -> PC+Uimm    JAL -> PC+Jimm
// Testing instr[3] and instr[4] is equivalent to
//   PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
wire [ADDR_WIDTH-1:0] PCplusImm =
   PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0]
                   : ( instr[4] ? Uimm[ADDR_WIDTH-1:0]
                                : Bimm[ADDR_WIDTH-1:0] ) );

// A second adder computes the load/store address. In this context testing
// instr[5] is equivalent to testing isStore.
wire [ADDR_WIDTH-1:0] loadstore_addr =
   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]) + rs1[ADDR_WIDTH-1:0];
|
||||
|
||||
/* verilator lint_off WIDTH */
// The internal address registers and the cycle counter may be narrower than
// 32 bits, so the width check is switched off for mem_addr and writeBackData.

// Address bus: PC during FETCH_INSTR and WAIT_INSTR, the load/store address
// in every other state.
assign mem_addr = (state[FETCH_INSTR_bit] | state[WAIT_INSTR_bit]) ? PC
                                                                   : loadstore_addr;

/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/

// Each source is gated by its instruction-class flag; the gated values are
// ORed together.
wire [31:0] writeBackData =
   ({32{isSYSTEM}}       & cycles)    |  // SYSTEM
   ({32{isLUI}}          & Uimm)      |  // LUI
   ({32{isALU}}          & aluOut)    |  // ALUreg, ALUimm
   ({32{isAUIPC}}        & PCplusImm) |  // AUIPC
   ({32{isJAL | isJALR}} & PCplus4)   |  // JAL, JALR
   ({32{isLoad}}         & LOAD_data);   // Load

/* verilator lint_on WIDTH */
|
||||
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
// Access size, decoded from funct3[1:0] (instr[13:12]):
//   00 -> byte, 01 -> halfword, anything else is handled as a word.
wire mem_byteAccess     = ~(instr[13] | instr[12]);
wire mem_halfwordAccess = ~instr[13] & instr[12];

// LOAD path: pick the addressed halfword of the word read from memory,
// then the addressed byte of that halfword.
wire [15:0] LOAD_halfword =
   loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

wire [7:0] LOAD_byte =
   loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

// funct3[2] (instr[14]): 0 -> sign extension, 1 -> no sign extension.
// The extension bit is the top bit of the selected byte or halfword.
wire LOAD_sign =
   ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

// Value handed to the register file for a load.
wire [31:0] LOAD_data =
   mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
   mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                        mem_rdata;
|
||||
|
||||
// STORE
|
||||
|
||||
// STORE path: rs2 is replicated onto the byte lanes so that the byte or
// halfword being written sits on the lanes selected by loadstore_addr[1:0].
// The write mask below decides which lanes are actually written.
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15:8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]
                        : (loadstore_addr[1] ? rs2[15:8] : rs2[31:24]);

// Memory write mask:
//   word     -> 1111
//   halfword -> 0011 or 1100, selected by loadstore_addr[1]
//   byte     -> 0001, 0010, 0100 or 1000, selected by loadstore_addr[1:0]
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])          :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011)   :
                        4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
// Bit position of each state inside the one-hot state register.
localparam FETCH_INSTR_bit     = 0,
           WAIT_INSTR_bit      = 1,
           EXECUTE_bit         = 2,
           WAIT_ALU_OR_MEM_bit = 3;

// Number of states, i.e. the width of the state register.
localparam NB_STATES = 4;

// One-hot state values, derived from the bit positions above.
localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit,
           WAIT_INSTR      = 1 << WAIT_INSTR_bit,
           EXECUTE         = 1 << EXECUTE_bit,
           WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;

(* onehot *)
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
// Register write-back enable: active in EXECUTE and WAIT_ALU_OR_MEM for
// every instruction that is neither a branch nor a store.
wire writeBack = ~isBranch & ~isStore &
                 (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);

// Memory read strobe: raised for instruction fetch, and in EXECUTE for loads.
assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE_bit] & isLoad);

// Memory write mask: STORE_wmask while a store is in EXECUTE, zero otherwise.
assign mem_wmask = (state[EXECUTE_bit] & isStore) ? STORE_wmask : 4'b0000;

// aluWr starts computation (shifts) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;

// PC takes PCplusImm for JAL and for branches whose predicate holds.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);

// Instructions that go through WAIT_ALU_OR_MEM after EXECUTE: loads, shifts,
// and stores. When NRV_IS_IO_ADDR is defined, only stores for which it
// evaluates to 1 on mem_addr wait.
`ifdef NRV_IS_IO_ADDR
wire needToWait = isLoad
                | (isStore & `NRV_IS_IO_ADDR(mem_addr))
                | (isALU & funct3IsShift);
`else
wire needToWait = isLoad | isStore | (isALU & funct3IsShift);
`endif
|
||||
|
||||
// The state machine.
//   WAIT_INSTR      : once memory is no longer busy reading, latch the
//                     instruction and read rs1/rs2, then go to EXECUTE.
//   EXECUTE         : update PC, then either wait or fetch the next instruction.
//   WAIT_ALU_OR_MEM : stay until the ALU and the memory are all idle.
//   FETCH_INSTR     : (default item) go to WAIT_INSTR.
always @(posedge clk) begin
   if (!reset) begin
      // reset is tested for 0 here. Execution resumes through
      // WAIT_ALU_OR_MEM, which also waits for !mem_wbusy.
      PC    <= RESET_ADDR[ADDR_WIDTH-1:0];
      state <= WAIT_ALU_OR_MEM;
   end else

   // See note [1] at the end of this file.
   (* parallel_case *)
   case (1'b1)

      state[WAIT_INSTR_bit]: begin
         if (!mem_rbusy) begin // may be high when executing from SPI flash
            instr <= mem_rdata[31:2]; // bits 0 and 1 are ignored (see the
                                      // declaration of instr)
            rs1   <= registerFile[mem_rdata[19:15]];
            rs2   <= registerFile[mem_rdata[24:20]];
            state <= EXECUTE;
         end
      end

      state[EXECUTE_bit]: begin
         if (isJALR)
            PC <= {aluPlus[ADDR_WIDTH-1:1], 1'b0};
         else if (jumpToPCplusImm)
            PC <= PCplusImm;
         else
            PC <= PCplus4;

         if (needToWait)
            state <= WAIT_ALU_OR_MEM;
         else
            state <= FETCH_INSTR;
      end

      state[WAIT_ALU_OR_MEM_bit]: begin
         if (!(aluBusy | mem_rbusy | mem_wbusy)) state <= FETCH_INSTR;
      end

      default: begin // FETCH_INSTR
         state <= WAIT_INSTR;
      end

   endcase
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Cycle counter
|
||||
/***************************************************************************/
|
||||
|
||||
// Cycle counter, incremented on every rising clock edge. It is
// NRV_COUNTER_WIDTH bits wide when that macro is defined, 32 bits otherwise.
`ifndef NRV_COUNTER_WIDTH
reg [31:0] cycles;
`else
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`endif

always @(posedge clk) begin
   cycles <= cycles + 1;
end

// Initial values, only compiled in when BENCH is defined.
`ifdef BENCH
initial begin
   registerFile[0] = 0;
   aluShamt        = 0;
   cycles          = 0;
end
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements,
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
|
||||
409
RTL/PROCESSOR/femtorv32_quark_bicycle.v
Normal file
409
RTL/PROCESSOR/femtorv32_quark_bicycle.v
Normal file
@@ -0,0 +1,409 @@
|
||||
/*******************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
// This version: The "Quark bicycle": Quark with a barrel shifter and additional multiplexers.
|
||||
// A single VERILOG file, compact & understandable code.
|
||||
// (200 lines of code, 400 lines counting comments)
|
||||
//
|
||||
// Instruction set: RV32I + RDCYCLES
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Macros:
|
||||
// optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
|
||||
// evaluate to 1 if addr is in mapped IO space,
|
||||
// evaluate to 0 otherwise
|
||||
// (additional wait states are used when in IO space).
|
||||
// If left undefined, wait states are always used.
|
||||
//
|
||||
// NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
|
||||
// by the ticks counter. If not defined, a 32-bits counter is generated.
|
||||
// (reducing its width may be useful for space-constrained designs).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/*******************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32i"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-Os"
|
||||
|
||||
module FemtoRV32(
   input         clk,

   output [31:0] mem_addr,  // address bus
   output [31:0] mem_wdata, // data to be written
   output [3:0]  mem_wmask, // write mask, one bit per byte of the word
   input  [31:0] mem_rdata, // read data, used for both instructions and loads
   output        mem_rstrb, // raised to initiate a memory read (used by IO)
   input         mem_rbusy, // asserted while memory is busy reading
   input         mem_wbusy, // asserted while memory is busy writing

   input         reset      // set to 0 to reset the processor
);

   parameter RESET_ADDR = 32'h00000000; // value loaded into PC on reset
   parameter ADDR_WIDTH = 24;           // width of PC and of the address adders

   /***************************************************************************/
   // Instruction decoding.
   /***************************************************************************/

   // Field positions and immediate formats follow the RISC-V spec v2.2
   // (table page 104 and Fig. 2.4 p. 12):
   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

   // Destination register index.
   wire [4:0] rdId = instr[11:7];

   // funct3 in one-hot form: funct3Is[val] <=> funct3 == val
   // (decoding it this way reduces LUT count).
   (* onehot *)
   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

   // The five immediate formats.
   wire [31:0] Uimm = {instr[31:12], 12'b0};
   wire [31:0] Iimm = {{20{instr[31]}}, instr[31:20]};
   /* verilator lint_off UNUSED */ // MSBs of S/B/J immediates are not used by the address adders.
   wire [31:0] Simm = {{20{instr[31]}}, instr[31:25], instr[11:7]};
   wire [31:0] Bimm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
   wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12], instr[20], instr[30:21], 1'b0};
   /* verilator lint_on UNUSED */

   // Instruction classes, decoded from instr[6:2].
   wire [4:0] opcode = instr[6:2];

   wire isLoad   = (opcode == 5'b00000); // rd <- mem[rs1+Iimm]
   wire isALUimm = (opcode == 5'b00100); // rd <- rs1 OP Iimm
   wire isAUIPC  = (opcode == 5'b00101); // rd <- PC + Uimm
   wire isStore  = (opcode == 5'b01000); // mem[rs1+Simm] <- rs2
   wire isALUreg = (opcode == 5'b01100); // rd <- rs1 OP rs2
   wire isLUI    = (opcode == 5'b01101); // rd <- Uimm
   wire isBranch = (opcode == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
   wire isJALR   = (opcode == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
   wire isJAL    = (opcode == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
   wire isSYSTEM = (opcode == 5'b11100); // rd <- cycles

   wire isALU = isALUimm | isALUreg;

   /***************************************************************************/
   // The register file.
   /***************************************************************************/

   reg [31:0] rs1;                  // first source operand, latched in WAIT_INSTR
   reg [31:0] rs2;                  // second source operand, latched in WAIT_INSTR
   reg [31:0] registerFile [31:0];

   // Write port. Register 0 is never written.
   always @(posedge clk) begin
      if (writeBack && (rdId != 0)) begin
         registerFile[rdId] <= writeBackData;
      end
   end

   /***************************************************************************/
   // The ALU. All operations, shifts included, are combinatorial.
   /***************************************************************************/

   // First ALU source, always rs1.
   wire [31:0] aluIn1 = rs1;

   // Second ALU source: rs2 for ALUreg and Branch, Iimm otherwise
   // (ALUimm, Load, JALR).
   wire [31:0] aluIn2 = (isALUreg | isBranch) ? rs2 : Iimm;

   // One adder, shared by the arithmetic instructions and by JALR.
   wire [31:0] aluPlus = aluIn1 + aluIn2;

   // A single 33-bit subtraction (aluIn1 + ~aluIn2 + 1) provides the
   // difference and all comparisons (trick borrowed from swapforth/J1).
   wire [32:0] aluMinus = {1'b0, aluIn1} + {1'b1, ~aluIn2} + 33'b1;
   wire        LTU      = aluMinus[32];
   wire        LT       = (aluIn1[31] == aluIn2[31]) ? aluMinus[32] : aluIn1[31];
   wire        EQ       = ~|aluMinus[31:0];

   /***************************************************************************/

   // Reverses the bit order of a 32-bit word: result[i] = x[31-i].
   function [31:0] flip32;
      input [31:0] x;
      integer i;
      begin
         for (i = 0; i < 32; i = i + 1)
            flip32[i] = x[31-i];
      end
   endfunction

   // A single right shifter serves both directions: for a left shift
   // (funct3 = 001) the operand is bit-reversed before the shift and the
   // result is bit-reversed again afterwards.
   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;

   // Bit shifted in from the top: the operand's sign when instr[30] is set, else 0.
   wire shift_fill = instr[30] & aluIn1[31];

   /* verilator lint_off WIDTH */
   wire [31:0] shifter =
      $signed({shift_fill, shifter_in}) >>> aluIn2[4:0];
   /* verilator lint_on WIDTH */

   wire [31:0] leftshift = flip32(shifter);

   /***************************************************************************/

   // Decoding notes:
   // - instr[30] is 1 for SUB and 0 for ADD.
   // - For SUB, instr[5] must be tested as well: it is 1 for ADD/SUB and 0
   //   for ADDI, whose immediate (Iimm) overlaps bit 30.
   // - instr[30] is 1 for SRA (sign extension) and 0 for SRL.

   // ALU result: every candidate is gated by its funct3 one-hot bit and the
   // gated values are ORed together.
   wire [31:0] aluOut =
      ({32{funct3Is[0]}} & ((instr[30] & instr[5]) ? aluMinus[31:0] : aluPlus)) |
      ({32{funct3Is[1]}} & leftshift)                                           |
      ({32{funct3Is[2]}} & {31'b0, LT})                                         |
      ({32{funct3Is[3]}} & {31'b0, LTU})                                        |
      ({32{funct3Is[4]}} & (aluIn1 ^ aluIn2))                                   |
      ({32{funct3Is[5]}} & shifter)                                             |
      ({32{funct3Is[6]}} & (aluIn1 | aluIn2))                                   |
      ({32{funct3Is[7]}} & (aluIn1 & aluIn2));

   /***************************************************************************/
   // The predicate for conditional branches.
   /***************************************************************************/

   // funct3 one-hot bits masked with the matching comparison result:
   //   bit 0: BEQ   bit 1: BNE   bit 4: BLT   bit 5: BGE   bit 6: BLTU   bit 7: BGEU
   wire predicate = |(funct3Is & {~LTU, LTU, ~LT, LT, 2'b00, ~EQ, EQ});

   /***************************************************************************/
   // Program counter and branch target computation.
   /***************************************************************************/

   reg [ADDR_WIDTH-1:0] PC;    // The program counter.
   reg [31:2]           instr; // Latched instruction; bits 0 and 1 are not
                               // stored (not used in RV32I base instr set).

   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;

   // One adder for branch address, JAL address and AUIPC:
   //   branch -> PC+Bimm    AUIPC -> PC+Uimm    JAL -> PC+Jimm
   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
   wire [ADDR_WIDTH-1:0] PCplusImm =
      PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0]
                      : ( instr[4] ? Uimm[ADDR_WIDTH-1:0]
                                   : Bimm[ADDR_WIDTH-1:0] ) );

   // A separate adder for the load/store address. In this context testing
   // instr[5] is equivalent to testing isStore.
   wire [ADDR_WIDTH-1:0] loadstore_addr =
      (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]) + rs1[ADDR_WIDTH-1:0];

   /* verilator lint_off WIDTH */
   // Internal address registers and the cycle counter may be narrower than
   // 32 bits, so the width check is switched off for mem_addr and writeBackData.

   // The value PC takes at the end of EXECUTE.
   wire [ADDR_WIDTH-1:0] PC_new =
      isJALR          ? {aluPlus[ADDR_WIDTH-1:1], 1'b0} :
      jumpToPCplusImm ? PCplusImm                       :
                        PCplus4;

   // Address bus:
   //   FETCH_INSTR, WAIT_INSTR              -> PC
   //   EXECUTE, neither load nor store      -> PC_new (the next instruction)
   //   otherwise                            -> load/store address
   assign mem_addr =
      (state[FETCH_INSTR_bit] | state[WAIT_INSTR_bit]) ? PC     :
      (state[EXECUTE_bit] & ~(isLoad | isStore))       ? PC_new :
                                                         loadstore_addr;

   /***************************************************************************/
   // The value written back to the register file.
   /***************************************************************************/

   wire [31:0] writeBackData =
      ({32{isSYSTEM}}       & cycles)    |  // SYSTEM
      ({32{isLUI}}          & Uimm)      |  // LUI
      ({32{isALU}}          & aluOut)    |  // ALUreg, ALUimm
      ({32{isAUIPC}}        & PCplusImm) |  // AUIPC
      ({32{isJAL | isJALR}} & PCplus4)   |  // JAL, JALR
      ({32{isLoad}}         & LOAD_data);   // Load

   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // LOAD/STORE
   /***************************************************************************/

   // All memory accesses are aligned on a 32-bit boundary, so byte and
   // halfword accesses are realigned here, based on:
   // - funct3[1:0]:  00->byte 01->halfword 10->word
   // - loadstore_addr[1:0]: which byte/halfword of the word is accessed

   wire mem_byteAccess     = ~(instr[13] | instr[12]); // funct3[1:0] == 2'b00
   wire mem_halfwordAccess = ~instr[13] & instr[12];   // funct3[1:0] == 2'b01

   // LOAD: select the addressed halfword, then the addressed byte.
   wire [15:0] LOAD_halfword =
      loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

   wire [7:0] LOAD_byte =
      loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

   // funct3[2] (instr[14]): 0 -> sign extension, 1 -> no sign extension.
   wire LOAD_sign =
      ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

   wire [31:0] LOAD_data =
      mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
      mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                           mem_rdata;

   // STORE: rs2 is replicated onto the byte lanes selected by
   // loadstore_addr[1:0]; the write mask decides which lanes are written.
   assign mem_wdata = {
      (loadstore_addr[0] ? rs2[7:0] :
       loadstore_addr[1] ? rs2[15:8] : rs2[31:24]), // [31:24]
      (loadstore_addr[1] ? rs2[7:0] : rs2[23:16]),  // [23:16]
      (loadstore_addr[0] ? rs2[7:0] : rs2[15:8]),   // [15: 8]
      rs2[7:0]                                      // [ 7: 0]
   };

   // The memory write mask:
   //   word     -> 1111
   //   halfword -> 0011 or 1100, selected by loadstore_addr[1]
   //   byte     -> 0001, 0010, 0100 or 1000, selected by loadstore_addr[1:0]
   wire [3:0] STORE_wmask =
      mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])        :
      mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
                           4'b1111;

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   // Bit position of each state inside the one-hot state register.
   localparam FETCH_INSTR_bit     = 0,
              WAIT_INSTR_bit      = 1,
              EXECUTE_bit         = 2,
              WAIT_ALU_OR_MEM_bit = 3;

   localparam NB_STATES = 4;

   // One-hot state values.
   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit,
              WAIT_INSTR      = 1 << WAIT_INSTR_bit,
              EXECUTE         = 1 << EXECUTE_bit,
              WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;

   (* onehot *)
   reg [NB_STATES-1:0] state;

   // Signals (internal and external) derived combinatorially from state
   // and other signals.

   // Register write-back enable: active in EXECUTE and WAIT_ALU_OR_MEM for
   // every instruction that is neither a branch nor a store.
   wire writeBack = ~isBranch & ~isStore &
                    (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);

   // Memory read strobe: raised in FETCH_INSTR, and in EXECUTE for every
   // instruction except stores.
   assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE_bit] & ~isStore);

   // Memory write mask: STORE_wmask while a store is in EXECUTE, zero otherwise.
   assign mem_wmask = (state[EXECUTE_bit] & isStore) ? STORE_wmask : 4'b0000;

   // PC takes PCplusImm for JAL and for branches whose predicate holds.
   wire jumpToPCplusImm = isJAL | (isBranch & predicate);

   // Instructions that go through WAIT_ALU_OR_MEM after EXECUTE: loads and
   // stores. When NRV_IS_IO_ADDR is defined, only stores for which it
   // evaluates to 1 on mem_addr wait.
`ifdef NRV_IS_IO_ADDR
   wire needToWait = isLoad | (isStore & `NRV_IS_IO_ADDR(mem_addr));
`else
   wire needToWait = isLoad | isStore;
`endif

   always @(posedge clk) begin
      if (!reset) begin
         // reset is tested for 0 here. Execution resumes through
         // WAIT_ALU_OR_MEM, which also waits for !mem_wbusy.
         PC    <= RESET_ADDR[ADDR_WIDTH-1:0];
         state <= WAIT_ALU_OR_MEM;
      end else

      // See note [1] at the end of this file.
      (* parallel_case *)
      case (1'b1)

         state[WAIT_INSTR_bit]: begin
            if (!mem_rbusy) begin // may be high when executing from SPI flash
               instr <= mem_rdata[31:2]; // bits 0 and 1 are ignored (see the
                                         // declaration of instr)
               rs1   <= registerFile[mem_rdata[19:15]];
               rs2   <= registerFile[mem_rdata[24:20]];
               state <= EXECUTE;
            end
         end

         state[EXECUTE_bit]: begin
            PC <= PC_new;
            // Without a wait, the next state is WAIT_INSTR rather than
            // FETCH_INSTR: mem_addr already carries PC_new and mem_rstrb
            // is raised during this EXECUTE cycle.
            if (needToWait)
               state <= WAIT_ALU_OR_MEM;
            else
               state <= WAIT_INSTR;
         end

         state[WAIT_ALU_OR_MEM_bit]: begin
            if (!(mem_rbusy | mem_wbusy)) state <= FETCH_INSTR;
         end

         default: begin // FETCH_INSTR
            state <= WAIT_INSTR;
         end

      endcase
   end

   /***************************************************************************/
   // Cycle counter
   /***************************************************************************/

   // Incremented on every rising clock edge. NRV_COUNTER_WIDTH bits wide
   // when that macro is defined, 32 bits otherwise.
`ifndef NRV_COUNTER_WIDTH
   reg [31:0] cycles;
`else
   reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`endif

   always @(posedge clk) begin
      cycles <= cycles + 1;
   end

   // Initial values, only compiled in when BENCH is defined.
`ifdef BENCH
   initial begin
      registerFile[0] = 0;
      cycles          = 0;
   end
`endif

endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements,
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
|
||||
421
RTL/PROCESSOR/femtorv32_tachyon.v
Normal file
421
RTL/PROCESSOR/femtorv32_tachyon.v
Normal file
@@ -0,0 +1,421 @@
|
||||
/*******************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
// This version: The "Tachyon". It works like the "Quark", with the
|
||||
// difference that EXECUTE is split into two steps. This allows
|
||||
// higher maxfreq.
|
||||
//
|
||||
// Instruction set: RV32I + RDCYCLES
|
||||
//
|
||||
// Parameters:
|
||||
// Reset address can be defined using RESET_ADDR (default is 0).
|
||||
//
|
||||
// The ADDR_WIDTH parameter lets you define the width of the internal
|
||||
// address bus (and address computation logic).
|
||||
//
|
||||
// Macros:
|
||||
// optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
|
||||
// evaluate to 1 if addr is in mapped IO space,
|
||||
// evaluate to 0 otherwise
|
||||
// (additional wait states are used when in IO space).
|
||||
// If left undefined, wait states are always used.
|
||||
//
|
||||
// NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
|
||||
// by the ticks counter. If not defined, a 32-bits counter is generated.
|
||||
// (reducing its width may be useful for space-constrained designs).
|
||||
//
|
||||
// NRV_TWOLEVEL_SHIFTER may be defined to make shift operations faster
|
||||
// (uses a two-level shifter inspired by picorv32).
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/*******************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32i"
|
||||
`define NRV_ABI "ilp32"
|
||||
`define NRV_OPTIMIZE "-Os"
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The destination register
|
||||
// Destination register index.
wire [4:0] rdId = instr[11:7];

// funct3 in one-hot form: funct3Is[val] <=> funct3 == val
// (decoding it this way reduces LUT count).
(* onehot *)
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

// The five immediate formats, see the RISC-V reference linked above,
// Fig. 2.4 p. 12.
wire [31:0] Uimm = {instr[31:12], 12'b0};
wire [31:0] Iimm = {{20{instr[31]}}, instr[31:20]};
/* verilator lint_off UNUSED */ // MSBs of S/B/J immediates are not used by the address adders.
wire [31:0] Simm = {{20{instr[31]}}, instr[31:25], instr[11:7]};
wire [31:0] Bimm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12], instr[20], instr[30:21], 1'b0};
/* verilator lint_on UNUSED */

// The ten instruction classes, recognized from instr[6:2].
wire isLoad   = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC  = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore  = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI    = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR   = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isJAL    = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- cycles

// Either form of ALU instruction (immediate or register operand).
wire isALU = isALUreg | isALUimm;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
// Source operand registers and the 32 x 32-bit register file.
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] registerFile [31:0];

// Write port: stores writeBackData into register rdId whenever writeBack
// is set. Register 0 is never written.
always @(posedge clk) begin
   if (writeBack && (rdId != 0)) begin
      registerFile[rdId] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except shifts.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
reg [31:0] aluReg; // The internal register of the ALU, used by shift.
|
||||
reg [4:0] aluShamt; // Current shift amount.
|
||||
|
||||
wire aluBusy = |aluShamt; // ALU is busy if shift amount is non-zero.
|
||||
wire aluWr; // ALU write strobe, starts shifting.
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] aluOut = aluReg;
|
||||
|
||||
wire funct3IsShift = funct3Is[1] | funct3Is[5];
|
||||
|
||||
// Sequential part of the ALU (aluReg / aluShamt).
//
// - When aluWr is asserted: aluShamt is loaded with the shift amount
//   (aluIn2[4:0]) for shift operations and with 0 otherwise; aluReg is
//   loaded with the value to be shifted (aluIn1) for shift operations, or
//   with the combinatorial result selected by the one-hot funct3Is.
// - While aluShamt is non-zero: aluReg is shifted and aluShamt decremented,
//   one position per clock. If NRV_TWOLEVEL_SHIFTER is defined, it shifts
//   four positions per clock as long as at least four positions remain.
//
// Note: without NRV_TWOLEVEL_SHIFTER the final "if (|aluShamt)" is NOT
// chained to the aluWr branch by an else; with it defined, the three
// branches form a single if / else-if / else-if chain.
always @(posedge clk) begin
   if(aluWr) begin
      aluShamt <= funct3IsShift ? aluIn2[4:0] : 5'b0;
      aluReg <=
         (funct3IsShift ? aluIn1                                             : 32'b0) |
         (funct3Is[0]   ? ((instr[30] & instr[5]) ? aluMinus[31:0] : aluPlus) : 32'b0) |
         (funct3Is[2]   ? {31'b0, LT}                                        : 32'b0) |
         (funct3Is[3]   ? {31'b0, LTU}                                       : 32'b0) |
         (funct3Is[4]   ? aluIn1 ^ aluIn2                                    : 32'b0) |
         (funct3Is[6]   ? aluIn1 | aluIn2                                    : 32'b0) |
         (funct3Is[7]   ? aluIn1 & aluIn2                                    : 32'b0) ;
   end

`ifdef NRV_TWOLEVEL_SHIFTER
   // Shift by 4 whenever aluShamt >= 4. All three upper bits must be
   // tested: looking only at aluShamt[3:2] misses amounts 16..19, which
   // would then start with single-bit steps and waste cycles.
   else if(|aluShamt[4:2]) begin
      aluShamt <= aluShamt - 4;
      aluReg <= funct3Is[1] ? aluReg << 4 :
                {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
   end else
`endif
   // Compact form of:
   // funct3=001              -> SLL (aluReg <= aluReg << 1)
   // funct3=101 &  instr[30] -> SRA (aluReg <= {aluReg[31], aluReg[31:1]})
   // funct3=101 & !instr[30] -> SRL (aluReg <= {1'b0,       aluReg[31:1]})
   if (|aluShamt) begin
      aluShamt <= aluShamt - 1;
      aluReg <= funct3Is[1] ? aluReg << 1 :                  // SLL
                {instr[30] & aluReg[31], aluReg[31:1]};      // SRA,SRL
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate_ =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
reg predicate;
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
reg [ADDR_WIDTH-1:0] PCplusImm;
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
reg [ADDR_WIDTH-1:0] loadstore_addr;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
// internal address registers and cycles counter may have less than
|
||||
// 32 bits, so we deactivate width test for mem_addr and writeBackData
|
||||
|
||||
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
PC : loadstore_addr ;
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] writeBackData =
|
||||
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0) ; // Load
|
||||
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
|
||||
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign extension 1->no sign extension
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
// Byte-enable mask for stores, chosen from the access size and the two
// low bits of loadstore_addr:
//   byte     : one enable bit, at position loadstore_addr[1:0]
//   halfword : 4'b1100 if loadstore_addr[1] is set, else 4'b0011
//   word     : 4'b1111
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])                     :
   mem_halfwordAccess ? {{2{loadstore_addr[1]}}, {2{~loadstore_addr[1]}}}    :
                        4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE1_bit = 2;
|
||||
localparam EXECUTE2_bit = 3;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 4;
|
||||
localparam NB_STATES = 5;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE1 = 1 << EXECUTE1_bit;
|
||||
localparam EXECUTE2 = 1 << EXECUTE2_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
wire writeBack = ~(isBranch | isStore ) &
|
||||
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]);
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts computation (shifts) in the ALU.
|
||||
assign aluWr = state[EXECUTE1_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
`ifdef NRV_IS_IO_ADDR
|
||||
wire needToWait = isLoad |
|
||||
isStore & `NRV_IS_IO_ADDR(mem_addr) |
|
||||
aluBusy;
|
||||
`else
|
||||
wire needToWait = isLoad | isStore | aluBusy;
|
||||
`endif
|
||||
|
||||
// Main state machine (one-hot state; see note [1] at the end of this file
// for the case(1'b1) idiom).
//
//   reset (active low) : state <- WAIT_ALU_OR_MEM, PC <- RESET_ADDR
//   FETCH_INSTR        : -> WAIT_INSTR
//   WAIT_INSTR         : once mem_rbusy is low, latch the instruction and
//                        read rs1/rs2 from the register file -> EXECUTE1
//   EXECUTE1           : register PCplusImm, loadstore_addr and predicate
//                        -> EXECUTE2
//   EXECUTE2           : update PC -> WAIT_ALU_OR_MEM if needToWait,
//                        FETCH_INSTR otherwise
//   WAIT_ALU_OR_MEM    : -> FETCH_INSTR once aluBusy, mem_rbusy and
//                        mem_wbusy are all low
always @(posedge clk) begin
   if (!reset) begin
      PC    <= RESET_ADDR[ADDR_WIDTH-1:0];
      state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
   end else begin

      (* parallel_case *)
      case (1'b1)

        state[WAIT_INSTR_bit]: begin
           // mem_rbusy may be high when executing from SPI flash
           if (!mem_rbusy) begin
              state <= EXECUTE1;
              // Bits 0 and 1 are ignored (see the declaration of instr).
              instr <= mem_rdata[31:2];
              rs1   <= registerFile[mem_rdata[19:15]];
              rs2   <= registerFile[mem_rdata[24:20]];
           end
        end

        state[EXECUTE1_bit]: begin
           state     <= EXECUTE2;
           predicate <= predicate_;

           // branch->PC+Bimm  AUIPC->PC+Uimm  JAL->PC+Jimm
           // Equivalent to:
           //   PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
           PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
                               instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                                          Bimm[ADDR_WIDTH-1:0] );

           // Testing instr[5] is equivalent to testing isStore in this
           // context.
           loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
                (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
        end

        state[EXECUTE2_bit]: begin
           // Next PC: JALR target (bit 0 cleared), taken branch / JAL
           // target, or the next sequential instruction.
           if (isJALR) begin
              PC <= {aluPlus[ADDR_WIDTH-1:1], 1'b0};
           end else if (jumpToPCplusImm) begin
              PC <= PCplusImm;
           end else begin
              PC <= PCplus4;
           end

           if (needToWait) begin
              state <= WAIT_ALU_OR_MEM;
           end else begin
              state <= FETCH_INSTR;
           end
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if (!(aluBusy | mem_rbusy | mem_wbusy)) begin
              state <= FETCH_INSTR;
           end
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end

      endcase
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Cycle counter
|
||||
/***************************************************************************/
|
||||
|
||||
`ifdef NRV_COUNTER_WIDTH
|
||||
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
|
||||
`else
|
||||
reg [31:0] cycles;
|
||||
`endif
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
// Notes:
|
||||
//
|
||||
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
|
||||
// It is just a cleaner way of writing a series of cascaded if() statements,
|
||||
// To understand it, think about the case statement *in general* as follows:
|
||||
// case (expr)
|
||||
// val_1: statement_1
|
||||
// val_2: statement_2
|
||||
// ... val_n: statement_n
|
||||
// endcase
|
||||
// The first statement_i such that expr == val_i is executed.
|
||||
// Now if expr is 1'b1:
|
||||
// case (1'b1)
|
||||
// cond_1: statement_1
|
||||
// cond_2: statement_2
|
||||
// ... cond_n: statement_n
|
||||
// endcase
|
||||
// It is *exactly the same thing*, the first statement_i such that
|
||||
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
|
||||
// in other words, such that cond_i is true)
|
||||
// More on this:
|
||||
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
|
||||
//
|
||||
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
|
||||
// It uses a larger number of bits (one bit per state), but often results in
|
||||
// a state machine that is both more compact (fewer LUTs) and faster.
|
||||
|
||||
782
RTL/PROCESSOR/femtorv32_testdrive.v
Normal file
782
RTL/PROCESSOR/femtorv32_testdrive.v
Normal file
@@ -0,0 +1,782 @@
|
||||
/******************************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// This version: PetitBateau (make it float), RV32IMFC
|
||||
// Rounding works as follows:
|
||||
// - all subnormals are flushed to zero
|
||||
// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
|
||||
// - FDIV and FSQRT do not have correct rounding
|
||||
//
|
||||
// [TODO] add FPU CSR (and instret for perf stat)
|
||||
// [TODO] FSW/FLW unaligned (does not seem to occur, but the norm requires it)
|
||||
// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
|
||||
// [TODO] support IEEE754 denormals
|
||||
// [TODO] NaNs propagation and infinity
|
||||
// [TODO] support all IEEE754 rounding modes
|
||||
//
|
||||
// Bruno Levy, Matthias Koch, 2020-2021
|
||||
/******************************************************************************/
|
||||
|
||||
`include "petitbateau.v"
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
// Note: atomic instructions not supported, but 'a' is set in
|
||||
// compiler flag, because there is no toolchain/libs for
|
||||
// rv32imfc / imf in most RISC-V compiler distributions.
|
||||
|
||||
`define NRV_ARCH "rv32imafc"
|
||||
`define NRV_ABI "ilp32f"
|
||||
|
||||
`define NRV_OPTIMIZE "-O0"
|
||||
`define NRV_INTERRUPTS
|
||||
|
||||
// Check condition and display message in simulation
|
||||
`ifdef BENCH
|
||||
`define ASSERT(cond,msg) if(!(cond)) $display msg
|
||||
`define ASSERT_NOT_REACHED(msg) $display msg
|
||||
`else
|
||||
`define ASSERT(cond,msg)
|
||||
`define ASSERT_NOT_REACHED(msg)
|
||||
`endif
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input interrupt_request,
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
// Flip a 32 bit word. Used by the shifter (a single shifter for
|
||||
// left and right shifts, saves silicon!)
|
||||
// Returns x with its bit order reversed: bit i of the result is bit
// (31 - i) of x.
function [31:0] flip32;
   input [31:0] x;
   integer      i;
   begin
      for (i = 0; i < 32; i = i + 1) begin
         flip32[i] = x[31 - i];
      end
   end
endfunction
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
wire [2:0] funct3 = instr[14:12];
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *) wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
|
||||
|
||||
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
|
||||
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
|
||||
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
|
||||
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
|
||||
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
|
||||
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
|
||||
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
wire isLoad = (instr[6:3] == 4'b0000 ); // rd <-mem[rs1+Iimm] (bit 2:FLW)
|
||||
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
wire isStore = (instr[6:3] == 4'b0100 ); // mem[rs1+Simm]<-rs2 (bit 2:FSW)
|
||||
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
|
||||
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
|
||||
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
|
||||
wire isFPU = (instr[6:5] == 2'b10); // all FPU instr except FLW/FSW
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] rs3; // this one is used by the FMA instructions.
|
||||
|
||||
reg [31:0] registerFile [63:0]; // 0..31: integer registers
|
||||
// 32..63: floating-point registers
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except divisions.
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe, starts dividing.
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = flip32(shifter);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = isALUreg & funcM & instr[14];
|
||||
wire aluBusy = |div_cnt; // ALU is busy if division is in progress.
|
||||
|
||||
// Multiplier shared by MUL, MULH, MULHSU and MULHU.
// funct3: 1->MULH, 2->MULHSU 3->MULHU
wire isMULH   = funct3Is[1];
wire isMULHSU = funct3Is[2];

// Operand signedness (aluIn1 carries rs1, aluIn2 carries rs2 for ALUreg):
//   MULH   : signed   rs1 x signed   rs2
//   MULHSU : signed   rs1 x unsigned rs2
//   MULHU  : unsigned rs1 x unsigned rs2
// Hence rs1 is sign-extended for MULH and MULHSU, and rs2 for MULH only.
wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
wire sign2 = aluIn2[31] &  isMULH;

// Both operands are widened to 33 bits with the sign bit chosen above, so
// that one signed multiplier covers all the signed/unsigned combinations.
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};

wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] aluOut_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
reg [31:0] aluOut_mul;
|
||||
always @(posedge clk) begin
|
||||
aluOut_mul <= funct3Is[0] ? multiply[31:0] : multiply[63:32];
|
||||
end
|
||||
|
||||
reg [31:0] aluOut_div;
|
||||
// Registered result of the divider.
// instr[13] selects dividend over quotient as the value to output, and
// div_sign requests that the selected value be negated.
always @(posedge clk) begin
   if (instr[13]) begin
      aluOut_div <= div_sign ? -dividend : dividend;
   end else begin
      aluOut_div <= div_sign ? -quotient : quotient;
   end
end
|
||||
|
||||
reg [31:0] aluOut;
|
||||
// ALU output multiplexer.
// For ALUreg instructions with funcM set, instr[14] selects the divider
// result (aluOut_div) over the multiplier result (aluOut_mul). Every other
// instruction gets the base ALU result (aluOut_base).
always @(*) begin
   if (isALUreg & funcM) begin
      aluOut = instr[14] ? aluOut_div : aluOut_mul;
   end else begin
      aluOut = aluOut_base;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [5:0] div_cnt;
|
||||
reg div_sign;
|
||||
|
||||
// Iterative divider.
//
// Start (aluWr asserted): the operands are loaded, made non-negative when
// instr[12] is 0; the sign to apply to the result is stored in div_sign;
// div_cnt is armed with 33 for a divide instruction and 0 otherwise.
//
// Iteration (div_cnt[5:1] non-zero): divisor is shifted right by one, a new
// quotient bit is shifted in, and divisor is subtracted from dividend when
// it fits.
//
// The iteration step is written after the start step on purpose: both use
// non-blocking assignments to the same registers, so the later ones win if
// both conditions hold in the same cycle.
always @(posedge clk) begin
   if (aluWr) begin
      quotient <= 0;
      div_cnt  <= isDivide ? 33 : 0; // one additional cycle for aluOut_div
      dividend <= (~instr[12] & aluIn1[31]) ? -aluIn1 : aluIn1;
      divisor  <= {((~instr[12] & aluIn2[31]) ? -aluIn2 : aluIn2), 31'b0};
      div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
                                ((aluIn1[31] != aluIn2[31]) & |aluIn2));
   end else if (aluBusy) begin
      div_cnt <= div_cnt - 1;
   end

   if (|div_cnt[5:1]) begin
      divisor <= divisor >> 1;
      if (divisor <= {31'b0, dividend}) begin
         dividend <= dividend - divisor[31:0];
         quotient <= {quotient[30:0], 1'b1};
      end else begin
         quotient <= {quotient[30:0], 1'b0};
      end
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
|
||||
wire predicate = funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
/***************************************************************************/
|
||||
// Registers read-write
|
||||
/***************************************************************************/
|
||||
|
||||
// Register file access (a single bank: the extra MSB of the index selects
// the floating-point half). Priority order:
//   1. WAIT_INSTR state: read rs1, rs2, rs3 using the fields of raw_instr.
//   2. DECOMPRESS_GETREGS state: read rs1, rs2 using the fields of instr.
//   3. Otherwise: write writeBackData to the destination register when
//      writeBack is asserted and the FPU is not busy. Integer register 0
//      (rdIsFP low and instr[11:7] all zero) is never written.
always @(posedge clk) begin
   if (state[WAIT_INSTR_bit]) begin
      // Fetch registers as soon as instruction is ready.
      rs1 <= registerFile[{raw_rs1IsFP, raw_instr[19:15]}];
      rs2 <= registerFile[{raw_rs2IsFP, raw_instr[24:20]}];
      rs3 <= registerFile[{1'b1,        raw_instr[31:27]}];
   end else if (state[DECOMPRESS_GETREGS_bit]) begin
      // For compressed instructions, fetch registers once decompressed.
      // No need to fetch rs3 here, there is no compressed FMA.
      rs1 <= registerFile[{decomp_rs1IsFP, instr[19:15]}];
      rs2 <= registerFile[{decomp_rs2IsFP, instr[24:20]}];
   end else if ((writeBack & !fpuBusy) && (rdIsFP || |instr[11:7])) begin
      registerFile[{rdIsFP, instr[11:7]}] <= writeBackData;
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The FPU
|
||||
/***************************************************************************/
|
||||
|
||||
wire fpuBusy;
|
||||
wire [31:0] fpuOut;
|
||||
PetitBateau FPU(
|
||||
.clk(clk),
|
||||
.wr(state[EXECUTE_bit] & isFPU),
|
||||
.instr(instr[31:2]),
|
||||
.rs1(rs1),
|
||||
.rs2(rs2),
|
||||
.rs3(rs3),
|
||||
.busy(fpuBusy),
|
||||
.out(fpuOut)
|
||||
);
|
||||
|
||||
// There is a single register bank, registers 0..31 are the integer
|
||||
// registers, and 32..63 are the floating point registers, hence
|
||||
// bit 5 of rs1,rs2,rd index is set to 0 for an integer register
|
||||
// and 1 for a fp register.
|
||||
|
||||
// asserted if the destination register is a floating-point register
|
||||
wire rdIsFP = (instr[6:2] == 5'b00001) || // FLW
|
||||
(instr[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
|
||||
(instr[6:4] == 3'b101 && (
|
||||
(instr[31] == 1'b0) || // R-Type FPU
|
||||
(instr[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(instr[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
|
||||
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
|
||||
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[31:28] = 4'b1101
|
||||
// FMV.W.X : instr[6:2] = 5'b10100 and instr[31:28] = 4'b1111
|
||||
// (two versions of the signal, one for regular instruction decode,
|
||||
// the other one for compressed instructions).
|
||||
wire raw_rs1IsFP = (raw_instr[6:5] == 2'b10 ) &&
|
||||
!((raw_instr[4:2] == 3'b100) && (
|
||||
(raw_instr[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(raw_instr[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
|
||||
wire decomp_rs1IsFP = (instr[6:5] == 2'b10 ) &&
|
||||
!((instr[4:2] == 3'b100) && (
|
||||
(instr[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(instr[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
|
||||
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
|
||||
// (two versions of the signal, one for regular instruction decode,
|
||||
// the other one for compressed instructions).
|
||||
wire raw_rs2IsFP = (raw_instr[6:5] == 2'b10) || (raw_instr[6:2]==5'b01001);
|
||||
wire decomp_rs2IsFP = (instr[6:5] == 2'b10) || (instr[6:2]==5'b01001);
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
wire [ADDR_WIDTH-1:0] PCinc = long_instr ? PCplus4 : PCplus2;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
assign mem_addr = {ADDR_PAD,
|
||||
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
|
||||
: {PC [ADDR_WIDTH-1:2], 2'b00}
|
||||
: loadstore_addr
|
||||
};
|
||||
|
||||
/***************************************************************************/
|
||||
// Interrupt logic, CSR registers and opcodes.
|
||||
/***************************************************************************/
|
||||
|
||||
// Remember interrupt requests as they are not checked for every cycle
|
||||
reg interrupt_request_sticky;
|
||||
|
||||
// Interrupt enable and lock logic
|
||||
wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
|
||||
|
||||
// Processor accepts interrupts in EXECUTE state.
|
||||
wire interrupt_accepted = interrupt & state[EXECUTE_bit];
|
||||
|
||||
// If current interrupt is accepted, there already might be the next one,
|
||||
// which should not be missed:
|
||||
// Sticky interrupt request flag.
// A new request always sets the flag, even in the cycle where the current
// interrupt is accepted, so that it is not missed. Without a new request,
// the flag is cleared when the interrupt is accepted and held otherwise.
always @(posedge clk) begin
   if (interrupt_request) begin
      interrupt_request_sticky <= 1'b1;
   end else if (interrupt_accepted) begin
      interrupt_request_sticky <= 1'b0;
   end
end
|
||||
|
||||
// Decoder for mret opcode
|
||||
wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
|
||||
|
||||
// CSRs:
|
||||
reg [ADDR_WIDTH-1:0] mepc; // The saved program counter.
|
||||
reg [ADDR_WIDTH-1:0] mtvec; // The address of the interrupt handler.
|
||||
reg mstatus; // Interrupt enable
|
||||
reg mcause; // Interrupt cause (and lock)
|
||||
reg [63:0] cycles; // Cycle counter
|
||||
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
wire sel_mstatus = (instr[31:20] == 12'h300);
|
||||
wire sel_mtvec = (instr[31:20] == 12'h305);
|
||||
wire sel_mepc = (instr[31:20] == 12'h341);
|
||||
wire sel_mcause = (instr[31:20] == 12'h342);
|
||||
wire sel_cycles = (instr[31:20] == 12'hC00);
|
||||
wire sel_cyclesh = (instr[31:20] == 12'hC80);
|
||||
|
||||
// Read CSRs
|
||||
wire [31:0] CSR_read =
|
||||
(sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
|
||||
(sel_mtvec ? {ADDR_PAD, mtvec} : 32'b0) |
|
||||
(sel_mepc ? {ADDR_PAD, mepc } : 32'b0) |
|
||||
(sel_mcause ? {mcause, 31'b0} : 32'b0) |
|
||||
(sel_cycles ? cycles[31:0] : 32'b0) |
|
||||
(sel_cyclesh ? cycles[63:32] : 32'b0) ;
|
||||
|
||||
|
||||
// Write CSRs: 5 bit unsigned immediate or content of RS1
|
||||
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
|
||||
|
||||
wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read :
|
||||
(instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
|
||||
/* (instr[13:12] == 2'b01) ? */ CSR_modifier ;
|
||||
|
||||
// CSR writes handled here: mstatus and mtvec.
// - Reset (active low) clears mstatus; mtvec is not reset by this block.
// - In the EXECUTE state, a SYSTEM instruction with a non-zero funct3
//   (instr[14:12]) writes bit 3 of CSR_write to mstatus, or the low
//   ADDR_WIDTH bits of CSR_write to mtvec, according to the selected CSR.
always @(posedge clk) begin
   if (!reset) begin
      mstatus <= 0;
   end else if (isSYSTEM & (|instr[14:12]) & state[EXECUTE_bit]) begin
      // Execute a CSR opcode
      if (sel_mstatus) mstatus <= CSR_write[3];
      if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
   end
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] writeBackData =
|
||||
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isFPU ? fpuOut : 32'b0) | // FPU
|
||||
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? {ADDR_PAD,PCinc } : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0); // Load
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
|
||||
// TODO: support unaligned accesses for FLW and FSW
|
||||
|
||||
// instr[2] is set for FLW and FSW. instr[13:12] = func3[1:0]
|
||||
wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00);
|
||||
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
wire [3:0] STORE_wmask =
|
||||
mem_byteAccess ?
|
||||
(loadstore_addr[1] ?
|
||||
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
|
||||
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
|
||||
) :
|
||||
mem_halfwordAccess ?
|
||||
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
|
||||
4'b1111;
|
||||
|
||||
/***************************************************************************/
|
||||
// Unaligned fetch mechanism and compressed opcode handling
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:2] cached_addr;
|
||||
reg [31:0] cached_data;
|
||||
|
||||
wire current_cache_hit = cached_addr == PC [ADDR_WIDTH-1:2];
|
||||
wire next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
|
||||
|
||||
wire current_unaligned_long = &cached_mem [17:16] & PC [1];
|
||||
wire next_unaligned_long = &cached_data[17:16] & PC_new[1];
|
||||
|
||||
reg fetch_second_half;
|
||||
reg long_instr;
|
||||
|
||||
wire [31:0] cached_mem = current_cache_hit ? cached_data : mem_rdata;
|
||||
wire [31:0] raw_instr = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
|
||||
: cached_mem;
|
||||
wire [31:0] decompressed;
|
||||
decompressor _decomp ( .c(raw_instr[15:0]), .d(decompressed) );
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam DECOMPRESS_GETREGS_bit = 2;
|
||||
localparam EXECUTE_bit = 3;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 4;
|
||||
localparam WAIT_ALU_OR_MEM_SKIP_bit = 5;
|
||||
|
||||
localparam NB_STATES = 6;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam DECOMPRESS_GETREGS = 1 << DECOMPRESS_GETREGS_bit;
|
||||
localparam EXECUTE = 1 << EXECUTE_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// register write-back enable.
|
||||
wire writeBack = ~(isBranch | isStore ) & !fpuBusy & (
|
||||
state[EXECUTE_bit] |
|
||||
state[WAIT_ALU_OR_MEM_bit] |
|
||||
state[WAIT_ALU_OR_MEM_SKIP_bit]
|
||||
);
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts computation (divide) in the ALU.
|
||||
assign aluWr = state[EXECUTE_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
|
||||
wire needToWait = isLoad |
|
||||
(isStore & `NRV_IS_IO_ADDR(mem_addr)) |
|
||||
isALUreg & funcM /* isDivide */ |
|
||||
isFPU;
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PC_new =
|
||||
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
interrupt_return ? mepc :
|
||||
PCinc;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
state <= WAIT_ALU_OR_MEM; //Just waiting for !mem_wbusy
|
||||
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
|
||||
mcause <= 0;
|
||||
cached_addr <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
|
||||
fetch_second_half <= 0;
|
||||
end else begin
|
||||
|
||||
// See note [1] at the end of this file.
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
|
||||
state[WAIT_INSTR_bit]: begin
|
||||
if(!mem_rbusy) begin // may be high when executing from SPI flash
|
||||
// Update cache
|
||||
if (~current_cache_hit | fetch_second_half) begin
|
||||
cached_addr <= mem_addr[ADDR_WIDTH-1:2];
|
||||
cached_data <= mem_rdata;
|
||||
end;
|
||||
|
||||
// Decode instruction
|
||||
// Registers are fetched at the same time, in the
|
||||
// FPU's always block.
|
||||
instr <= &raw_instr[1:0] ? raw_instr[31:2]
|
||||
: decompressed[31:2];
|
||||
long_instr <= &raw_instr[1:0];
|
||||
|
||||
// Long opcode, unaligned, first part fetched,
|
||||
// happens in non-linear code
|
||||
if (current_unaligned_long & ~fetch_second_half) begin
|
||||
fetch_second_half <= 1;
|
||||
state <= FETCH_INSTR;
|
||||
end else begin
|
||||
fetch_second_half <= 0;
|
||||
state <= &raw_instr[1:0] ? EXECUTE : DECOMPRESS_GETREGS;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
state[DECOMPRESS_GETREGS_bit]: begin
|
||||
// All the registers are fetched in FPU's always block.
|
||||
state <= EXECUTE;
|
||||
end
|
||||
|
||||
state[EXECUTE_bit]: begin
|
||||
if (interrupt) begin
|
||||
PC <= mtvec;
|
||||
mepc <= PC_new;
|
||||
mcause <= 1;
|
||||
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
|
||||
end else begin
|
||||
// Unaligned load/store not implemented yet
|
||||
// (the norm supposes that FLW and FSW can handle them)
|
||||
`ASSERT(
|
||||
!((isLoad|isStore) && instr[2] && |loadstore_addr[1:0]),
|
||||
("PC=%x UNALIGNED FLW/FSW",PC)
|
||||
);
|
||||
|
||||
PC <= PC_new;
|
||||
if (interrupt_return) mcause <= 0;
|
||||
|
||||
state <= next_cache_hit & ~next_unaligned_long
|
||||
? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
|
||||
: (needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR);
|
||||
|
||||
fetch_second_half <= next_cache_hit & next_unaligned_long;
|
||||
end
|
||||
end
|
||||
|
||||
state[WAIT_ALU_OR_MEM_bit]: begin
|
||||
if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
|
||||
state <= FETCH_INSTR;
|
||||
end
|
||||
end
|
||||
|
||||
state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
|
||||
if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
end
|
||||
|
||||
default: begin // FETCH_INSTR
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef BENCH
|
||||
initial begin
|
||||
cycles = 0;
|
||||
registerFile[0] = 0;
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
module decompressor(
   input  wire [15:0] c,
   output reg  [31:0] d
);

   // Purely combinational expander for 16-bit compressed (RVC) opcodes:
   //   c : the 16-bit compressed instruction
   //   d : the 32-bit instruction it stands for
   // Encodings whose two LSBs are 11 (uncompressed opcodes) have no case
   // item and fall in the don't-care default branch.

   // Expansion used for illegal / unknown encodings.
   localparam ILLEGAL_INSTR = 32'h0;
   localparam UNKNOWN_INSTR = 32'h0; // kept for the alternative default below

   // 32-bit major opcodes (instruction bits [6:0]).
   localparam [6:0] OPC_LOAD     = 7'b00000_11;
   localparam [6:0] OPC_LOAD_FP  = 7'b00001_11;
   localparam [6:0] OPC_OP_IMM   = 7'b00100_11;
   localparam [6:0] OPC_STORE    = 7'b01000_11;
   localparam [6:0] OPC_STORE_FP = 7'b01001_11;
   localparam [6:0] OPC_OP       = 7'b01100_11;
   localparam [6:0] OPC_LUI      = 7'b01101_11;
   localparam [6:0] OPC_BRANCH   = 7'b11000_11;
   localparam [6:0] OPC_JALR     = 7'b11001_11;
   localparam [6:0] OPC_JAL      = 7'b11011_11;

   // Registers that are implicit in some compressed encodings.
   localparam [4:0] REG_ZERO = 5'd0; // x0
   localparam [4:0] REG_RA   = 5'd1; // x1
   localparam [4:0] REG_SP   = 5'd2; // x2

   // ---- Register fields --------------------------------------------------
   // 3-bit fields are prefixed with 01, i.e. they select x8..x15.
   wire [4:0] reg3_lo = {2'b01, c[4:2]};
   wire [4:0] reg3_hi = {2'b01, c[9:7]};
   // Full 5-bit fields.
   wire [4:0] reg5_lo = c[6:2];
   wire [4:0] reg5_hi = c[11:7];

   // ---- Immediate fields -------------------------------------------------
   wire [4:0]  imm_shamt    = c[6:2];

   // Zero-extended immediates.
   wire [11:0] imm_addi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] imm_lwsw     = {5'd0, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] imm_lwsp     = {4'd0, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] imm_swsp     = {4'd0, c[8:7], c[12:9], 2'b00};

   // Immediates sign-extended from c[12].
   wire [11:0] imm_addi16sp = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'd0};
   wire [11:0] imm_ci       = {{7{c[12]}}, c[6:2]};

   // Not every bit of the three immediates below is consumed.
   /* verilator lint_off UNUSED */
   wire [12:0] imm_b   = {{5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
   wire [20:0] imm_j   = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
   wire [31:0] imm_lui = {{15{c[12]}}, c[6:2], 12'd0};
   /* verilator lint_on UNUSED */

   // Jump / branch immediates rearranged into the bit order used by the
   // 32-bit JAL (bits [31:12]) and BRANCH (bits [31:25] and [11:7]) formats.
   wire [19:0] jal_imm20 = {imm_j[20], imm_j[10:1], imm_j[11], imm_j[19:12]};
   wire [6:0]  br_imm_hi = {imm_b[12], imm_b[10:5]};
   wire [4:0]  br_imm_lo = {imm_b[4:1], imm_b[11]};

   // ---- Decoder ----------------------------------------------------------
   // casez takes the first matching item, so where two patterns overlap the
   // more specific one must stay first (c.illegal / c.addi4spn,
   // c.addi16sp / c.lui, c.jr / c.mv, c.jalr / c.add).
   always @(*) begin
      casez (c)
         // ---------------- Quadrant 0 (c[1:0] == 00) ----------------
         /* verilator lint_off CASEOVERLAP */
         16'b000___00000000_000_00 : d = ILLEGAL_INSTR;                                                        // c.illegal
         16'b000___????????_???_00 : d = {imm_addi4spn, REG_SP, 3'b000, reg3_lo, OPC_OP_IMM};                  // c.addi4spn: addi rd', x2, nzuimm
         /* verilator lint_on CASEOVERLAP */
         16'b010_???_???_??_???_00 : d = {imm_lwsw, reg3_hi, 3'b010, reg3_lo, OPC_LOAD};                       // c.lw : lw  rd',  off(rs1')
         16'b011_???_???_??_???_00 : d = {imm_lwsw, reg3_hi, 3'b010, reg3_lo, OPC_LOAD_FP};                    // c.flw: flw rd',  off(rs1')
         16'b110_???_???_??_???_00 : d = {imm_lwsw[11:5], reg3_lo, reg3_hi, 3'b010, imm_lwsw[4:0], OPC_STORE};    // c.sw : sw  rs2', off(rs1')
         16'b111_???_???_??_???_00 : d = {imm_lwsw[11:5], reg3_lo, reg3_hi, 3'b010, imm_lwsw[4:0], OPC_STORE_FP}; // c.fsw: fsw rs2', off(rs1')

         // ---------------- Quadrant 1 (c[1:0] == 01) ----------------
         16'b000_???_???_??_???_01 : d = {imm_ci, reg5_hi, 3'b000, reg5_hi, OPC_OP_IMM};                       // c.addi    : addi rd, rd, imm
         16'b001____???????????_01 : d = {jal_imm20, REG_RA, OPC_JAL};                                         // c.jal     : jal x1, off
         16'b010__?_?????_?????_01 : d = {imm_ci, REG_ZERO, 3'b000, reg5_hi, OPC_OP_IMM};                      // c.li      : addi rd, x0, imm
         16'b011__?_00010_?????_01 : d = {imm_addi16sp, reg5_hi, 3'b000, reg5_hi, OPC_OP_IMM};                 // c.addi16sp: addi x2, x2, imm
         16'b011__?_?????_?????_01 : d = {imm_lui[31:12], reg5_hi, OPC_LUI};                                   // c.lui     : lui rd, imm
         16'b100_?_00_???_?????_01 : d = {7'b0000000, imm_shamt, reg3_hi, 3'b101, reg3_hi, OPC_OP_IMM};        // c.srli    : srli rd', rd', shamt
         16'b100_?_01_???_?????_01 : d = {7'b0100000, imm_shamt, reg3_hi, 3'b101, reg3_hi, OPC_OP_IMM};        // c.srai    : srai rd', rd', shamt
         16'b100_?_10_???_?????_01 : d = {imm_ci, reg3_hi, 3'b111, reg3_hi, OPC_OP_IMM};                       // c.andi    : andi rd', rd', imm
         16'b100_011_???_00_???_01 : d = {7'b0100000, reg3_lo, reg3_hi, 3'b000, reg3_hi, OPC_OP};              // c.sub     : sub rd', rd', rs2'
         16'b100_011_???_01_???_01 : d = {7'b0000000, reg3_lo, reg3_hi, 3'b100, reg3_hi, OPC_OP};              // c.xor     : xor rd', rd', rs2'
         16'b100_011_???_10_???_01 : d = {7'b0000000, reg3_lo, reg3_hi, 3'b110, reg3_hi, OPC_OP};              // c.or      : or  rd', rd', rs2'
         16'b100_011_???_11_???_01 : d = {7'b0000000, reg3_lo, reg3_hi, 3'b111, reg3_hi, OPC_OP};              // c.and     : and rd', rd', rs2'
         16'b101____???????????_01 : d = {jal_imm20, REG_ZERO, OPC_JAL};                                       // c.j       : jal x0, off
         16'b110__???_???_?????_01 : d = {br_imm_hi, REG_ZERO, reg3_hi, 3'b000, br_imm_lo, OPC_BRANCH};        // c.beqz    : beq rs1', x0, off
         16'b111__???_???_?????_01 : d = {br_imm_hi, REG_ZERO, reg3_hi, 3'b001, br_imm_lo, OPC_BRANCH};        // c.bnez    : bne rs1', x0, off

         // ---------------- Quadrant 2 (c[1:0] == 10) ----------------
         16'b000__?_?????_?????_10 : d = {7'b0000000, imm_shamt, reg5_hi, 3'b001, reg5_hi, OPC_OP_IMM};        // c.slli : slli rd, rd, shamt
         16'b010__?_?????_?????_10 : d = {imm_lwsp, REG_SP, 3'b010, reg5_hi, OPC_LOAD};                        // c.lwsp : lw  rd, off(x2)
         16'b011__?_?????_?????_10 : d = {imm_lwsp, REG_SP, 3'b010, reg5_hi, OPC_LOAD_FP};                     // c.flwsp: flw rd, off(x2)
         16'b100__0_?????_00000_10 : d = {12'd0, reg5_hi, 3'b000, REG_ZERO, OPC_JALR};                         // c.jr   : jalr x0, rs1, 0
         16'b100__0_?????_?????_10 : d = {7'b0000000, reg5_lo, REG_ZERO, 3'b000, reg5_hi, OPC_OP};             // c.mv   : add rd, x0, rs2
         // c.ebreak (16'b100__1_00000_00000_10) has no item of its own; that
         // encoding is matched by the c.jalr pattern right below.
         16'b100__1_?????_00000_10 : d = {12'd0, reg5_hi, 3'b000, REG_RA, OPC_JALR};                           // c.jalr : jalr x1, rs1, 0
         16'b100__1_?????_?????_10 : d = {7'b0000000, reg5_lo, reg5_hi, 3'b000, reg5_hi, OPC_OP};              // c.add  : add rd, rd, rs2
         16'b110__?_?????_?????_10 : d = {imm_swsp[11:5], reg5_lo, REG_SP, 3'b010, imm_swsp[4:0], OPC_STORE};    // c.swsp : sw  rs2, off(x2)
         16'b111__?_?????_?????_10 : d = {imm_swsp[11:5], reg5_lo, REG_SP, 3'b010, imm_swsp[4:0], OPC_STORE_FP}; // c.fswsp: fsw rs2, off(x2)

         // Anything else: all-X output.
         // (alternative: d = UNKNOWN_INSTR)
         default                   : d = {32{1'bx}};
      endcase
   end

endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
856
RTL/PROCESSOR/petitbateau.v
Normal file
856
RTL/PROCESSOR/petitbateau.v
Normal file
@@ -0,0 +1,856 @@
|
||||
/******************************************************************************/
|
||||
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
|
||||
//
|
||||
// PetitBateau (make it float): a simple single-precision RISC-V FPU
|
||||
// Mission statement: achieve a good area/performance ratio, by
|
||||
// implementing a full-precision FMA (48 bits), and micro-programmed
|
||||
// Newton-Raphson for FDIV and FSQRT (that reuse the FMA).
|
||||
//
|
||||
// Rounding works as follows:
|
||||
// - all subnormals are flushed to zero
|
||||
// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
|
||||
// - FDIV and FSQRT do not have correct rounding
|
||||
// if PRECISE_DIV is set (default), then FDIV rounding is validated in
|
||||
// tinyraytracer test. Complete proof remains to be done
|
||||
//
|
||||
// [TODO] add FPU CSR (and instret for perf stat)
|
||||
// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
|
||||
// [TODO] support IEEE754 denormals
|
||||
// [TODO] NaNs propagation and infinity
|
||||
// [TODO] support all IEEE754 rounding modes
|
||||
//
|
||||
// Bruno Levy, 2021
|
||||
/******************************************************************************/
|
||||
|
||||
// TODO: instead of mux between A,B,C and FMA, make FMA always compute
|
||||
// A*B+C and mux rs1,rs2,rs3,1.0,0.0 to A,B,C based on instr (mux
|
||||
// will be more complicated but will probably reduce overall
|
||||
// critical path) ?
|
||||
// TODO: there are too many different paths between the internal registers,
|
||||
// maybe micro-instructions could be redesigned with this in mind.
|
||||
// A could be the MSBs of X, avoiding all MV_A_X instructions.
|
||||
// TODO: the necessity to copy rs1 in E without flushing denormals for
|
||||
// the int-to-fp instructions is inelegant.
|
||||
|
||||
// Include guard for LiteX
|
||||
`ifndef PETITBATEAU_INCLUDED
|
||||
`define PETITBATEAU_INCLUDED
|
||||
|
||||
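// ASSERT(cond,msg): when BENCH is defined (simulation), $display msg if cond
// is false. ASSERT_NOT_REACHED(msg): $display msg unconditionally.
// Without BENCH both expand to nothing. msg is pasted right after $display,
// so it must carry its own parentheses, e.g. ("PC=%x ...",PC).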
|
||||
`ifdef BENCH
|
||||
`define ASSERT(cond,msg) if(!(cond)) $display msg
|
||||
`define ASSERT_NOT_REACHED(msg) $display msg
|
||||
`else
|
||||
`define ASSERT(cond,msg)
|
||||
`define ASSERT_NOT_REACHED(msg)
|
||||
`endif
|
||||
|
||||
module PetitBateau(
|
||||
input clk,
|
||||
input wr, // write strobe, starts computation
|
||||
input [31:2] instr, // current riscv instruction
|
||||
|
||||
// operands
|
||||
input [31:0] rs1,
|
||||
input [31:0] rs2,
|
||||
input [31:0] rs3,
|
||||
|
||||
// outputs
|
||||
output busy,
|
||||
output [31:0] out
|
||||
);
|
||||
|
||||
// Set to 1 for higher-precision FDIV (costs 30 additional cycles per FDIV)
|
||||
parameter PRECISE_DIV = 1;
|
||||
|
||||
|
||||
// Uncomment the line below to emulate all FPU instructions in Verilator
|
||||
// (useful to test instruction decoder and implementations of micro-instr
|
||||
// in C++). See SIM/FPU_funcs.{h,cpp}
|
||||
//`define FPU_EMUL
|
||||
|
||||
// Two high-resolution registers for the FMA, that computes X+Y
|
||||
// Register X has the accumulator / shifters / leading zero counter
|
||||
// Normalized if first bit set is bit 47
|
||||
// Represented number is +/- frac * 2^(exp-127-47)
|
||||
|
||||
reg X_sign; reg signed [8:0] X_exp; reg signed [49:0] X_frac;
|
||||
reg Y_sign; reg signed [8:0] Y_exp; reg signed [49:0] Y_frac;
|
||||
|
||||
// FPU output = 32 MSBs of X register (see below)
|
||||
// A macro to easily write to it (`X <= ...),
|
||||
// used when FPU output is an integer.
|
||||
`define X {X_sign, X_exp[7:0], X_frac[46:24]}
|
||||
assign out = `X;
|
||||
|
||||
// Five single-precision floating-point registers for internal use.
|
||||
// A,B,C are wired to the FMA that computes either A*B+C or A+B
|
||||
// D,E are temporaries used by FDIV and FSQRT
|
||||
// Following IEEE754, represented number is +/- frac * 2^(exp-127-23)
|
||||
// (127: bias 23: position of first bit set for normalized numbers)
|
||||
reg A_sign; reg [7:0] A_exp; reg [23:0] A_frac;
|
||||
reg B_sign; reg [7:0] B_exp; reg [23:0] B_frac;
|
||||
reg C_sign; reg [7:0] C_exp; reg [23:0] C_frac;
|
||||
reg D_sign; reg [7:0] D_exp; reg [23:0] D_frac;
|
||||
reg E_sign; reg [7:0] E_exp; reg [23:0] E_frac;
|
||||
|
||||
/*************************************************************************/
|
||||
|
||||
// Load a 32-bit value in RD
|
||||
// RD: one of A,B,C,D,E
|
||||
// VAL: a 32-bit value
|
||||
`define FP_LD32(RD,VAL) \
|
||||
{RD``_sign, RD``_exp, RD``_frac[22:0]} <= VAL; RD``_frac[23] <= 1'b1
|
||||
|
||||
// Load floating point value in RD by sign, exponent, fraction
|
||||
// RD: one of A,B,C,D,E
|
||||
// sign: 1'b1 (-) or 1'b0 (+)
|
||||
// exp: 8-bits, biased exponent
|
||||
// frac: 24-bit fraction
|
||||
`define FP_LD(RD,sign,eexp,frac) \
|
||||
{RD``_sign, RD``_exp, RD``_frac} <= {sign,eexp,frac}
|
||||
|
||||
// RD <= RS
|
||||
// RD,RS: one of A,B,C,D,E
|
||||
`define FP_MV(RD,RS) \
|
||||
{RD``_sign, RD``_exp, RD``_frac} <= {RS``_sign, RS``_exp, RS``_frac}
|
||||
|
||||
/** FPU micro-instructions and ROM ****************************************/
|
||||
|
||||
|
||||
localparam FPMI_READY = 0;
|
||||
localparam FPMI_LOAD_XY = 1; // X <- A; Y <- B
|
||||
localparam FPMI_LOAD_XY_MUL = 2; // X <- norm(A*B); Y <- C
|
||||
localparam FPMI_ADD_SWAP = 3; // if |X|>|Y| swap(X,Y);
|
||||
// if sign(X) != sign(Y) X <- -X
|
||||
localparam FPMI_ADD_SHIFT = 4; // shift X to match Y exponent
|
||||
localparam FPMI_ADD_ADD = 5; // X <- X + Y
|
||||
localparam FPMI_ADD_NORM = 6; // X <- norm(X) (after ADD_ADD)
|
||||
|
||||
localparam FPMI_CMP = 7; // X <- test X,Y (FEQ,FLE,FLT)
|
||||
|
||||
localparam FPMI_MV_A_X = 8; // A <- X
|
||||
localparam FPMI_MV_B_D = 9; // B <- D
|
||||
localparam FPMI_MV_B_NH_D = 10; // B <- -0.5*|D|
|
||||
localparam FPMI_MV_B_E = 11; // B <- E
|
||||
localparam FPMI_MV_C_A = 12; // C <- A
|
||||
localparam FPMI_MV_E_X = 13; // E <- X
|
||||
|
||||
localparam FPMI_FRCP_PROLOG = 14; // init reciprocal (1/x)
|
||||
localparam FPMI_FRCP_ITER1 = 15; // iteration for reciprocal
|
||||
localparam FPMI_FRCP_ITER2 = 16; // iteration for reciprocal
|
||||
localparam FPMI_FRCP_EPILOG = 17; // epilog for reciprocal
|
||||
localparam FPMI_FDIV_EPILOG = 18; // epilog for fdiv IEEE-754 rounding
|
||||
|
||||
localparam FPMI_FRSQRT_PROLOG = 19; // init recipr sqr root (1/sqrt(x))
|
||||
|
||||
localparam FPMI_FP_TO_INT = 20; // fpuOut <- fpoint_to_int(A)
|
||||
localparam FPMI_INT_TO_FP = 21; // X <- int_to_fpoint(X)
|
||||
localparam FPMI_MIN_MAX = 22; // fpuOut <- min/max(X,Y)
|
||||
|
||||
localparam FPMI_LOAD_Y_ROUND = 23; // Y <- round to nearest
|
||||
|
||||
localparam FPMI_NB = 24;
|
||||
|
||||
// Instruction exit flag (if set in current micro-instr, exit microprogram)
|
||||
localparam FPMI_EXIT_FLAG_bit = 1+$clog2(FPMI_NB);
|
||||
localparam FPMI_EXIT_FLAG = 1 << FPMI_EXIT_FLAG_bit;
|
||||
|
||||
reg [6:0] fpmi_PC; // current micro-instruction pointer
|
||||
reg [1+$clog2(FPMI_NB):0] fpmi_instr; // current micro-instruction
|
||||
|
||||
// current micro-instruction as 1-hot: fpmi_instr == NNN <=> fpmi_is[NNN]
|
||||
(* onehot *)
|
||||
wire [FPMI_NB-1:0] fpmi_is = 1 << fpmi_instr[$clog2(FPMI_NB):0];
|
||||
initial fpmi_PC = 0;
|
||||
assign busy = !fpmi_is[FPMI_READY];
|
||||
|
||||
// Generate one micro-instruction in ROM (stored at index I, which is then incremented)
|
||||
task fpmi_gen; input [6:0] instr; begin
|
||||
fpmi_ROM[I] = instr;
|
||||
I = I + 1;
|
||||
end endtask
|
||||
|
||||
// Generate a FMA sequence in ROM.
|
||||
// Use fpmi_gen_fma(0) in the middle of a micro-program
|
||||
// Use fpmi_gen_fma(FPMI_EXIT_FLAG) if last instruction of micro-program
|
||||
task fpmi_gen_fma; input [6:0] flags; begin
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- norm(A*B), Y <- C
|
||||
fpmi_gen(FPMI_ADD_SWAP); // if(|X| > |Y|) swap(X,Y) (and sgn)
|
||||
fpmi_gen(FPMI_ADD_SHIFT); // shift X according to Y exp
|
||||
fpmi_gen(FPMI_ADD_ADD); // X <- X + Y
|
||||
fpmi_gen(FPMI_ADD_NORM | flags); // X <- normalize(X)
|
||||
end endtask
|
||||
|
||||
integer I; // current ROM location in initialization
|
||||
integer iter; // iteration variable for generate Newton-Raphson (FDIV,FSQRT)
|
||||
localparam FPMI_ROM_SIZE=82 + (12 + 18)*PRECISE_DIV;
|
||||
reg [1+$clog2(FPMI_NB):0] fpmi_ROM[0:FPMI_ROM_SIZE-1];
|
||||
|
||||
// Microprograms start addresses
|
||||
// Programmatically determined when generating the ROM ('initial' block below)
|
||||
integer FPMPROG_CMP, FPMPROG_ADD, FPMPROG_MUL, FPMPROG_MADD, FPMPROG_DIV;
|
||||
integer FPMPROG_FP_TO_INT, FPMPROG_INT_TO_FP, FPMPROG_SQRT, FPMPROG_MIN_MAX;
|
||||
|
||||
// Start the definition of a microprogram (determines start address)
|
||||
`define FPMPROG_BEGIN(prg) prg = I
|
||||
|
||||
// Ends the definition of a microprogram (displays stats in Verilator)
|
||||
`ifdef BENCH
|
||||
`define FPMPROG_END(prg) \
|
||||
$display("# %3d microinstructions used by %d:%s",I-prg,prg,`"prg`")
|
||||
`else
|
||||
`define FPMPROG_END(prg)
|
||||
`endif
|
||||
|
||||
/******************** Generate microprograms in ROM **********************/
|
||||
initial begin
|
||||
|
||||
`ifdef BENCH
|
||||
$display("# Generating FPMI ROM...");
|
||||
`endif
|
||||
I = 0;
|
||||
fpmi_gen(FPMI_READY | FPMI_EXIT_FLAG);
|
||||
|
||||
// ******************** FLT, FLE, FEQ *********************************
|
||||
`FPMPROG_BEGIN(FPMPROG_CMP);
|
||||
fpmi_gen(FPMI_LOAD_XY); // X <- A, Y <- B
|
||||
fpmi_gen(FPMI_CMP | FPMI_EXIT_FLAG); // X <- compare(X,Y)
|
||||
`FPMPROG_END(FPMPROG_CMP);
|
||||
|
||||
// ******************** FADD, FSUB ************************************
|
||||
`FPMPROG_BEGIN(FPMPROG_ADD);
|
||||
fpmi_gen(FPMI_LOAD_XY); // X <- A, Y <- B
|
||||
fpmi_gen(FPMI_ADD_SWAP); // if(|X| > |Y|) swap(X,Y) (,sgn)
|
||||
fpmi_gen(FPMI_ADD_SHIFT); // shift X according to Y exp
|
||||
fpmi_gen(FPMI_ADD_ADD); // X <- X + Y
|
||||
fpmi_gen(FPMI_ADD_NORM | FPMI_EXIT_FLAG); // X <- normalize(X)
|
||||
`FPMPROG_END(FPMPROG_ADD);
|
||||
|
||||
// ******************** FMUL ******************************************
|
||||
`FPMPROG_BEGIN(FPMPROG_MUL);
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B
|
||||
`FPMPROG_END(FPMPROG_MUL);
|
||||
|
||||
// ******************** FMADD, FMSUB, FNMADD, FNMSUB ******************
|
||||
`FPMPROG_BEGIN(FPMPROG_MADD);
|
||||
fpmi_gen_fma(FPMI_EXIT_FLAG); // X <- A*B+C (5 cycles)
|
||||
`FPMPROG_END(FPMPROG_MADD);
|
||||
|
||||
// ******************** FDIV ******************************************
|
||||
// https://en.wikipedia.org/wiki/Division_algorithm
|
||||
// https://stackoverflow.com/questions/24792966/
|
||||
// error-using-newton-raphson-iteration-method-for-
|
||||
// floating-point-division
|
||||
//
|
||||
`FPMPROG_BEGIN(FPMPROG_DIV);
|
||||
// D' = denominator (rs2) normalized between [0.5,1] (set exp to 126)
|
||||
fpmi_gen(FPMI_FRCP_PROLOG); // D<-A; E<-B; A<-(-D'); B<-32/17; C<-48/17
|
||||
fpmi_gen_fma(0); // X <- A*B+C (= -D'*32/17 + 48/17)
|
||||
for(iter=0; iter<3; iter=iter+1) begin
|
||||
if(PRECISE_DIV) begin
|
||||
// X <- X + X*(1-D'*X)
|
||||
// (slower more precise iter, but not IEEE754 compliant yet...)
|
||||
fpmi_gen(FPMI_FRCP_ITER1); // A <- -D'; B <- X; C <- 1.0f
|
||||
fpmi_gen_fma(0); // X <- A*B+C (5 cycles)
|
||||
fpmi_gen(FPMI_FRCP_ITER2); // A <- X; C <- B
|
||||
fpmi_gen_fma(0); // X <- A*B+C (5 cycles)
|
||||
end else begin
|
||||
// X <- X * (-X*D' + 2)
|
||||
// (faster but less precise)
|
||||
fpmi_gen(FPMI_FRCP_ITER1); // A <- -D'; B <- X; C <- 2.0f
|
||||
fpmi_gen_fma(0); // X <- A*B+C (5 cycles)
|
||||
fpmi_gen(FPMI_MV_A_X); // A <- X
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
|
||||
end
|
||||
end
|
||||
if(PRECISE_DIV) begin // round X to nearest
|
||||
fpmi_gen(FPMI_LOAD_Y_ROUND);
|
||||
fpmi_gen(FPMI_ADD_ADD);
|
||||
fpmi_gen(FPMI_ADD_NORM);
|
||||
end
|
||||
fpmi_gen(FPMI_FRCP_EPILOG); // A <- (E_sign,frcp_exp,X_frac); B <- D
|
||||
if(PRECISE_DIV) begin // error correction
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B
|
||||
fpmi_gen(FPMI_FDIV_EPILOG); // B <- -E; C <- D; D <- A
|
||||
fpmi_gen(FPMI_MV_A_X);
|
||||
fpmi_gen_fma(0);
|
||||
fpmi_gen(FPMI_MV_C_A);
|
||||
fpmi_gen(FPMI_MV_B_D);
|
||||
fpmi_gen(FPMI_MV_A_X);
|
||||
fpmi_gen_fma(FPMI_EXIT_FLAG);
|
||||
end else begin
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B
|
||||
end
|
||||
`FPMPROG_END(FPMPROG_DIV);
|
||||
|
||||
// ******************** FCVT.W.S, FCVT.WU.S ***************************
|
||||
`FPMPROG_BEGIN(FPMPROG_FP_TO_INT);
|
||||
fpmi_gen(FPMI_LOAD_XY);
|
||||
fpmi_gen(FPMI_FP_TO_INT | FPMI_EXIT_FLAG);
|
||||
`FPMPROG_END(FPMPROG_FP_TO_INT);
|
||||
|
||||
// ******************** FCVT.S.W, FCVT.S.WU ***************************
|
||||
`FPMPROG_BEGIN(FPMPROG_INT_TO_FP); // Compute A+0 (use CLZ plugged on X)
|
||||
fpmi_gen(FPMI_INT_TO_FP); // X <- 0; Y <- A
|
||||
fpmi_gen(FPMI_ADD_ADD); // X <- X + Y
|
||||
fpmi_gen(FPMI_ADD_NORM | FPMI_EXIT_FLAG); // X <- normalize(X)
|
||||
`FPMPROG_END(FPMPROG_INT_TO_FP);
|
||||
|
||||
// ******************** FSQRT *****************************************
|
||||
// Using the fast inverse square root algorithm (known from Quake III Arena):
|
||||
// https://en.wikipedia.org/wiki/Fast_inverse_square_root
|
||||
// http://www.lomont.org/papers/2003/InvSqrt.pdf
|
||||
// TODO: IEEE754-compliant version
|
||||
// See https://t.co/V1SWQ6N6xD?amp=1 (Method of Switching Constants)
|
||||
// See simple effective fast inverse square root with two magic
|
||||
// constants.
|
||||
//
|
||||
`FPMPROG_BEGIN(FPMPROG_SQRT);
|
||||
// D<-rs1; E,A,B<-(doom_magic - (A >> 1)); C<-3/2
|
||||
fpmi_gen(FPMI_FRSQRT_PROLOG);
|
||||
for(iter=0; iter<2; iter=iter+1) begin
|
||||
// X <- X * (3/2 - (0.5*rs1*X*X))
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
|
||||
fpmi_gen(FPMI_MV_A_X); // A <- X
|
||||
fpmi_gen(FPMI_MV_B_NH_D); // B <- -0.5*|D|
|
||||
fpmi_gen_fma(0); // X <- A*B+C
|
||||
fpmi_gen(FPMI_MV_A_X); // A <- X
|
||||
fpmi_gen(FPMI_MV_B_E); // B <- E
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
|
||||
if(iter==0) begin
|
||||
fpmi_gen(FPMI_MV_E_X); // E <- X
|
||||
fpmi_gen(FPMI_MV_A_X); // A <- X
|
||||
fpmi_gen(FPMI_MV_B_E); // B <- E
|
||||
end
|
||||
end // X contains 1/sqrt(rs1), now compute rs1*X to get sqrt(rs1)
|
||||
fpmi_gen(FPMI_MV_A_X); // A <- X
|
||||
fpmi_gen(FPMI_MV_B_D); // B <- D
|
||||
fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B; Y <- C
|
||||
`FPMPROG_END(FPMPROG_SQRT);
|
||||
|
||||
// ******************** FMIN, FMAX ************************************
|
||||
`FPMPROG_BEGIN(FPMPROG_MIN_MAX);
|
||||
fpmi_gen(FPMI_LOAD_XY);
|
||||
fpmi_gen(FPMI_MIN_MAX | FPMI_EXIT_FLAG);
|
||||
`FPMPROG_END(FPMPROG_MIN_MAX);
|
||||
|
||||
`ifdef BENCH
|
||||
$display("# FPMI ROM max address:%d",I-1);
|
||||
$display("# FPMI ROM size :%d",FPMI_ROM_SIZE);
|
||||
`ASSERT(I <= FPMI_ROM_SIZE,("!!!!!!! FPMI ROM SIZE exceeded !!!!!!!"));
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifndef FPU_EMUL
|
||||
|
||||
// determine microprogram to be called based on decoded instruction
|
||||
reg [6:0] fpmprog;
|
||||
always @(*) begin
|
||||
(* parallel_case, full_case *)
|
||||
case(1'b1)
|
||||
isFLT | isFLE | isFEQ : fpmprog = FPMPROG_CMP[6:0];
|
||||
isFADD | isFSUB : fpmprog = FPMPROG_ADD[6:0];
|
||||
isFMUL : fpmprog = FPMPROG_MUL[6:0];
|
||||
isFMADD | isFMSUB | isFNMADD | isFNMSUB : fpmprog = FPMPROG_MADD[6:0];
|
||||
isFDIV : fpmprog = FPMPROG_DIV[6:0];
|
||||
isFSQRT : fpmprog = FPMPROG_SQRT[6:0];
|
||||
isFCVTWS | isFCVTWUS : fpmprog = FPMPROG_FP_TO_INT[6:0];
|
||||
isFCVTSW | isFCVTSWU : fpmprog = FPMPROG_INT_TO_FP[6:0];
|
||||
isFMIN | isFMAX : fpmprog = FPMPROG_MIN_MAX[6:0];
|
||||
default : fpmprog = 0;
|
||||
endcase
|
||||
end
|
||||
|
||||
// next micro-instruction program counter
|
||||
wire [6:0] fpmi_PC_next =
|
||||
wr ? fpmprog :
|
||||
fpmi_instr[FPMI_EXIT_FLAG_bit] ? 0 :
|
||||
fpmi_PC+1 ;
|
||||
always @(posedge clk) begin
|
||||
fpmi_PC <= fpmi_PC_next;
|
||||
fpmi_instr <= fpmi_ROM[fpmi_PC_next];
|
||||
end
|
||||
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(wr) begin
|
||||
// Denormals are flushed to zero
|
||||
`FP_LD(A, rs1[31], rs1[30:23], (|rs1[30:23]?{1'b1,rs1[22:0]}:24'b0));
|
||||
`FP_LD(B, rs2[31], rs2[30:23], (|rs2[30:23]?{1'b1,rs2[22:0]}:24'b0));
|
||||
`FP_LD(C, rs3[31], rs3[30:23], (|rs3[30:23]?{1'b1,rs3[22:0]}:24'b0));
|
||||
|
||||
// Backup rs1 in E without flushing to zero (for int2fp instructions)
|
||||
`FP_LD32(E, rs1);
|
||||
|
||||
// Single-cycle instructions
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
isFSGNJ : `X <= { rs2[31], rs1[30:0]};
|
||||
isFSGNJN : `X <= { !rs2[31], rs1[30:0]};
|
||||
isFSGNJX : `X <= { rs1[31]^rs2[31], rs1[30:0]};
|
||||
isFCLASS : `X <= fclass;
|
||||
isFMVXW | isFMVWX : `X <= rs1;
|
||||
endcase
|
||||
end else if(busy) begin
|
||||
|
||||
// Implementation of the micro-instructions
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
// X <- A ; Y <- B
|
||||
fpmi_is[FPMI_LOAD_XY]: begin
|
||||
X_sign <= A_sign;
|
||||
X_frac <= {2'b0, A_frac, 24'd0};
|
||||
X_exp <= {1'b0, A_exp};
|
||||
Y_sign <= B_sign ^ isFSUB;
|
||||
Y_frac <= {2'b0, B_frac, 24'd0};
|
||||
Y_exp <= {1'b0, B_exp};
|
||||
end
|
||||
|
||||
// X <- (+/-) normalize(A*B); Y <- (+/-)C
|
||||
fpmi_is[FPMI_LOAD_XY_MUL]: begin
|
||||
X_sign <= A_sign ^ B_sign ^ (isFNMSUB | isFNMADD);
|
||||
X_frac <= prod_Z ? 0 :
|
||||
(prod_frac[47] ? prod_frac : {prod_frac[48:0],1'b0});
|
||||
X_exp <= prod_Z ? 0 : prod_exp_norm;
|
||||
Y_sign <= C_sign ^ (isFMSUB | isFNMADD);
|
||||
Y_frac <= {2'b0, C_frac, 24'd0};
|
||||
Y_exp <= {1'b0, C_exp};
|
||||
end
|
||||
|
||||
// if(|X| > |Y|) swap(X,Y)
|
||||
// if X_sign != Y_sign X <- -X
|
||||
// We always *add*, but replace X_frac with -X_frac if the
|
||||
// sign of the operands differ, THEN we shift (signed shift). In
|
||||
// this way, rounding is correct, even when subtracting a
|
||||
// low magnitude number from a large magnitude one.
|
||||
fpmi_is[FPMI_ADD_SWAP]: begin
|
||||
if(fabsY_LT_fabsX) begin
|
||||
X_frac <= (X_sign ^ Y_sign) ? -Y_frac : Y_frac;
|
||||
Y_frac <= X_frac;
|
||||
X_exp <= Y_exp; Y_exp <= X_exp;
|
||||
X_sign <= Y_sign; Y_sign <= X_sign;
|
||||
end else if(X_sign ^ Y_sign) begin
|
||||
X_frac <= -X_frac;
|
||||
end
|
||||
end
|
||||
|
||||
// shift X in order to make it match Y exponent
|
||||
fpmi_is[FPMI_ADD_SHIFT]: begin
|
||||
`ASSERT(!fabsY_LT_fabsX, ("ADD_SHIFT: incorrect order"));
|
||||
X_frac <= X_frac >>> exp_diff; // note the signed shift !
|
||||
X_exp <= Y_exp;
|
||||
end
|
||||
|
||||
// X <- X (+/-) Y
|
||||
fpmi_is[FPMI_ADD_ADD]: begin
|
||||
X_frac <= frac_sum[49:0];
|
||||
X_sign <= Y_sign;
|
||||
// normalization left shamt = 47 - first_bit_set = clz - 16
|
||||
norm_lshamt <= frac_sum_clz - 16;
|
||||
// Exponent of X once normalized = X_exp + first_bit_set - 47
|
||||
// = X_exp + 63 - clz - 47 = X_exp + 16 - clz
|
||||
X_exp_norm <= X_exp + 16 - {3'b000,frac_sum_clz};
|
||||
end
|
||||
|
||||
// X <- normalize(X) (after ADD_ADD -> norm_lshamt and X_exp_norm)
|
||||
fpmi_is[FPMI_ADD_NORM]: begin
|
||||
if(X_exp_norm <= 0 || (X_frac == 0)) begin
|
||||
X_frac <= 0;
|
||||
X_exp <= 0;
|
||||
end else begin
|
||||
X_frac <= X_frac[48] ? (X_frac >> 1) : X_frac << norm_lshamt;
|
||||
X_exp <= X_exp_norm;
|
||||
end
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_LOAD_Y_ROUND]: begin
|
||||
Y_sign <= X_sign;
|
||||
Y_exp <= X_exp;
|
||||
Y_frac <= X_frac[23] ? (1 << 24) : 50'd0;
|
||||
end
|
||||
|
||||
// X <- result of comparison between X and Y
|
||||
fpmi_is[FPMI_CMP]: begin
|
||||
`X <= { 31'b0,
|
||||
isFLT && X_LT_Y ||
|
||||
isFLE && X_LE_Y ||
|
||||
isFEQ && X_EQ_Y
|
||||
};
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_MV_B_D] : `FP_MV(B,D);
|
||||
fpmi_is[FPMI_MV_B_E] : `FP_MV(B,E);
|
||||
fpmi_is[FPMI_MV_A_X] : `FP_LD(A,X_sign,X_exp[7:0],X_frac[47:24]);
|
||||
fpmi_is[FPMI_MV_C_A] : `FP_MV(C,A);
|
||||
fpmi_is[FPMI_MV_E_X] : `FP_LD(E,X_sign,X_exp[7:0],X_frac[47:24]);
|
||||
|
||||
// B <= -|D| / 2.0
|
||||
fpmi_is[FPMI_MV_B_NH_D]:
|
||||
{B_sign, B_exp, B_frac} <= {1'b1,D_exp-8'd1,D_frac};
|
||||
|
||||
fpmi_is[FPMI_FRCP_PROLOG]: begin
|
||||
`FP_MV(D,A);
|
||||
`FP_MV(E,B);
|
||||
// A <= -D', that is, -(B normalized in [0.5,1])
|
||||
`FP_LD(A,1'b1,8'd126, B_frac);
|
||||
`FP_LD32(B, 32'h3FF0F0F1); // 32/17
|
||||
`FP_LD32(C, 32'h4034B4B5); // 48/17
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_FRCP_ITER1]: begin
|
||||
`FP_LD(A,1'b1,8'd126, E_frac); // A <= -D'
|
||||
`FP_LD(B,X_sign,X_exp[7:0],X_frac[47:24]); // B <= X
|
||||
// 1.0 2.0
|
||||
`FP_LD32(C, PRECISE_DIV ? 32'h3f800000 : 32'h40000000);
|
||||
end
|
||||
|
||||
// This one is used only if PRECISE_DIV is set
|
||||
fpmi_is[FPMI_FRCP_ITER2]: begin
|
||||
`FP_LD(A,X_sign,X_exp[7:0],X_frac[47:24]); // A <= X
|
||||
`FP_MV(C,B);
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_FRCP_EPILOG]: begin
|
||||
`FP_LD(A,E_sign,frcp_exp[7:0],X_frac[47:24]);
|
||||
`FP_MV(B,D);
|
||||
end
|
||||
|
||||
// This one is used only if PRECISE_DIV is set
|
||||
fpmi_is[FPMI_FDIV_EPILOG]: begin
|
||||
`FP_LD(B,!E_sign, E_exp, E_frac); // B <= -E
|
||||
`FP_MV(C,D);
|
||||
`FP_MV(D,A);
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_FRSQRT_PROLOG]: begin
|
||||
`FP_LD32(D, rs1);
|
||||
`FP_LD32(E, rsqrt_doom_magic);
|
||||
`FP_LD32(A, rsqrt_doom_magic);
|
||||
`FP_LD32(B, rsqrt_doom_magic);
|
||||
`FP_LD32(C, 32'h3fc00000); // 1.5
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_FP_TO_INT]: begin
|
||||
// TODO: check overflow
|
||||
`X <=
|
||||
(isFCVTWUS | !X_sign) ? X_fcvt_ftoi_shifted
|
||||
: -$signed(X_fcvt_ftoi_shifted);
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_INT_TO_FP]: begin
|
||||
// TODO: rounding
|
||||
// We do a fake addition with zero, to prepare normalization
|
||||
// (uses CLZ plugged on the adder).
|
||||
X_frac <= 0;
|
||||
// 127+23: standard exponent bias
|
||||
// +6 because it is bit 29 of rs1 that overwrites
|
||||
// bit 47 of A_frac, instead of bit 23 (and 29-23 = 6).
|
||||
X_exp <= 127+23+6;
|
||||
Y_frac <=
|
||||
(isFCVTSWU | !E_sign) ? {E_sign, E_exp, E_frac[22:0], 18'd0}
|
||||
: {-$signed({E_sign, E_exp, E_frac[22:0]}), 18'd0};
|
||||
Y_sign <= isFCVTSW & E_sign;
|
||||
end
|
||||
|
||||
fpmi_is[FPMI_MIN_MAX]: begin
|
||||
`X <= (X_LT_Y ^ isFMAX)
|
||||
? {X_sign, X_exp[7:0], X_frac[46:24]}
|
||||
: {Y_sign, Y_exp[7:0], Y_frac[46:24]};
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
// Some circuitry used by the FPU micro-instructions:
|
||||
|
||||
// ******************* Comparisons ******************************************
// Combinatorial comparisons between the X and Y working registers.
// Magnitudes are compared exponent first, then fraction when the exponents
// tie; the signed comparisons then combine the magnitude result with the
// two sign bits. No special-casing of NaN or signed zero is done here.

// Exponent adder: 9-bit signed sum and difference of the two exponents.
// exp_diff[8] is used below as the "Y_exp is smaller than X_exp" flag.
wire signed [8:0] exp_sum;
wire signed [8:0] exp_diff;
assign exp_sum  = Y_exp + X_exp;
assign exp_diff = Y_exp - X_exp;

// Equality of exponents, of fractions, and of both (i.e. |X| == |Y|).
wire expX_EQ_expY;
wire fracX_EQ_fracY;
wire fabsX_EQ_fabsY;
assign expX_EQ_expY   = ~|exp_diff;
assign fracX_EQ_fracY = ~|frac_diff;
assign fabsX_EQ_fabsY = fracX_EQ_fracY && expX_EQ_expY;

// |X| < |Y|: Y has the strictly larger exponent, or the exponents tie and
// frac_diff (= Y_frac - X_frac) is non-zero with its sign bit clear.
wire fabsX_LT_fabsY;
assign fabsX_LT_fabsY =
       !(exp_diff[8] || expX_EQ_expY) ||
       (expX_EQ_expY && !(fracX_EQ_fracY || frac_diff[50]));

// |X| <= |Y|: same as above, but a zero fraction difference is accepted.
wire fabsX_LE_fabsY;
assign fabsX_LE_fabsY =
       !(exp_diff[8] || expX_EQ_expY) ||
       (!frac_diff[50] && expX_EQ_expY);

// |Y| < |X|: exponent difference is negative, or the exponents tie and the
// fraction difference is negative.
wire fabsY_LT_fabsX;
assign fabsY_LT_fabsX = (frac_diff[50] && expX_EQ_expY) || exp_diff[8];

// |Y| <= |X|: as above, also true when exponents and fractions both tie.
wire fabsY_LE_fabsX;
assign fabsY_LE_fabsX =
       exp_diff[8] ||
       (expX_EQ_expY && frac_diff[50]) ||
       (expX_EQ_expY && fracX_EQ_fracY);

// X < Y (signed): negative X against non-negative Y, or same sign with the
// magnitude comparison oriented according to that sign.
wire X_LT_Y;
assign X_LT_Y =
       (X_sign && (!Y_sign || (Y_sign && fabsY_LT_fabsX))) ||
       (!(X_sign || Y_sign) && fabsX_LT_fabsY);

// X <= Y (signed): same structure, using the non-strict magnitude tests.
wire X_LE_Y;
assign X_LE_Y =
       (X_sign && (!Y_sign || (Y_sign && fabsY_LE_fabsX))) ||
       (!(X_sign || Y_sign) && fabsX_LE_fabsY);

// X == Y: same sign bit and same magnitude.
wire X_EQ_Y;
assign X_EQ_Y = (X_sign ~^ Y_sign) && fabsX_EQ_fabsY;
|
||||
|
||||
// ****************** Addition, subtraction *********************************
// 51-bit signed sum and difference of the two fraction registers.
// frac_sum feeds the ADD micro-instruction and the leading-zero counter;
// frac_diff (and its sign bit, frac_diff[50]) feeds the comparisons.
wire signed [50:0] frac_sum;
wire signed [50:0] frac_diff;
assign frac_sum  = Y_frac + X_frac;
assign frac_diff = Y_frac - X_frac;
|
||||
|
||||
// ****************** Product ***********************************************
// Raw product of the A and B fractions.
wire [49:0] prod_frac;
assign prod_frac = A_frac * B_frac; // TODO: check overflows

// Exponent of the product once normalized: sum of both exponents minus the
// bias (127), plus one when the product has bit 47 set.
// (Author's note: with normal operands the first bit set can only be
//  bit 47 or bit 46.)
wire signed [8:0] prod_exp_norm;
assign prod_exp_norm = A_exp + B_exp + {7'b0,prod_frac[47]} - 127;

// Null product / underflow detection (all denormals are flushed to zero):
// asserted when neither bit 47 nor bit 46 of the product is set, or when
// the normalized exponent is not strictly positive.
wire prod_Z;
assign prod_Z = (~|prod_frac[47:46]) || (prod_exp_norm <= 0);
|
||||
|
||||
// ****************** Normalization *****************************************
// Leading-zero count of the adder output (frac_sum).
// Note1: CLZ only works on power-of-two widths, hence the 13 zero bits
//        prepended to the 51-bit frac_sum to reach 64 bits.
// Note2: index of the first bit set = 63 - CLZ.
wire [5:0] frac_sum_clz;
CLZ clz2(
   .in ({13'b0, frac_sum}),
   .out(frac_sum_clz)
);

// Left-shift amount for ADD normalization.
// Written by FPMI_ADD_ADD (frac_sum_clz - 16), consumed by FPMI_ADD_NORM.
reg [5:0] norm_lshamt;

// Exponent of X once normalized = X_exp + first_bit_set - 47
//                               = X_exp + 63 - clz - 47 = X_exp + 16 - clz
// Written by FPMI_ADD_ADD, consumed by FPMI_ADD_NORM.
reg signed [8:0] X_exp_norm;
|
||||
|
||||
// ****************** Reciprocal (1/x), used by FDIV ************************
// Exponent of the reciprocal. The initial value of x is kept in E, so its
// exponent (E_exp, zero-extended) is subtracted from 126 + X_exp.
wire signed [8:0] frcp_exp;
assign frcp_exp = X_exp + 9'd126 - $signed({1'b0, E_exp});

// ****************** Reciprocal square root (1/sqrt(x)) ********************
// Initial guess from the "fast inverse square root" trick:
//   https://en.wikipedia.org/wiki/Fast_inverse_square_root
// i.e. the magic constant minus the A register bit pattern (sign cleared)
// shifted right by one.
wire [31:0] rsqrt_doom_magic;
assign rsqrt_doom_magic = 32'h5f3759df - {1'b0, A_exp, A_frac[22:1]};
|
||||
|
||||
// ****************** Float to Integer conversion ***************************
// Shift to apply to the top 32 bits of X_frac to obtain the integer value:
//   -127-23 : standard exponent bias
//   -6      : it is bit 29 of X that corresponds to bit 47 of X_frac,
//             instead of bit 23 (and 23-29 = -6).
// A negative value (bit 8 set) means "shift right by the opposite amount".
wire signed [8:0] fcvt_ftoi_shift;
wire signed [8:0] neg_fcvt_ftoi_shift;
assign fcvt_ftoi_shift     = A_exp - (9'd127 + 9'd23 + 9'd6);
assign neg_fcvt_ftoi_shift = -fcvt_ftoi_shift;

// Shifted magnitude:
//   - non-negative shift: left shift by the low 5 bits of the shift
//   - negative shift    : right shift by the low 5 bits of the negated
//                         shift, or 0 (underflow) when that amount does
//                         not fit in 5 bits.
wire [31:0] X_fcvt_ftoi_shifted;
assign X_fcvt_ftoi_shifted =
       !fcvt_ftoi_shift[8]
          ? (X_frac[49:18] << fcvt_ftoi_shift[4:0])
          : ((~|neg_fcvt_ftoi_shift[8:5])
                ? (X_frac[49:18] >> neg_fcvt_ftoi_shift[4:0])
                : 32'd0);
|
||||
|
||||
// ******************* Classification ***************************************
// FCLASS result for rs1: a 10-bit one-hot mask in bits [9:0], upper bits 0.

// Field tests on the raw rs1 bit pattern.
wire rs1_exp_Z;    // exponent field is all zeros
wire rs1_exp_255;  // exponent field is all ones
wire rs1_frac_Z;   // fraction field is all zeros
assign rs1_exp_Z   = ~|rs1[30:23];
assign rs1_exp_255 =  &rs1[30:23];
assign rs1_frac_Z  = ~|rs1[22:0];

wire [31:0] fclass;
assign fclass[31:10] = 22'b0;
assign fclass[9] =  rs1_exp_255 &  rs1[22];                   // quiet NaN
assign fclass[8] =  rs1_exp_255 & !rs1[22] & (|rs1[21:0]);    // sig NaN
assign fclass[7] = !rs1[31] &  rs1_exp_255 &  rs1_frac_Z;     // +infinity
assign fclass[6] = !rs1[31] & !rs1_exp_Z   & !rs1_exp_255;    // +normal
assign fclass[5] = !rs1[31] &  rs1_exp_Z   & !rs1_frac_Z;     // +subnormal
assign fclass[4] = !rs1[31] &  rs1_exp_Z   &  rs1_frac_Z;     // +0
assign fclass[3] =  rs1[31] &  rs1_exp_Z   &  rs1_frac_Z;     // -0
assign fclass[2] =  rs1[31] &  rs1_exp_Z   & !rs1_frac_Z;     // -subnormal
assign fclass[1] =  rs1[31] & !rs1_exp_Z   & !rs1_exp_255;    // -normal
assign fclass[0] =  rs1[31] &  rs1_exp_255 &  rs1_frac_Z;     // -infinity
|
||||
|
||||
/************************************************************************/
|
||||
|
||||
// RV32F instruction decoder
|
||||
// See table p133 (RV32G instruction listings)
|
||||
// Notes:
|
||||
// - FLW/FSW handled by LOAD/STORE in femtorv32 (instr[2] set if FLW/FSW)
|
||||
// - For all other F instructions, instr[6:5] == 2'b10
|
||||
// - FMADD/FMSUB/FNMADD/FNMSUB: instr[4] = 1'b0
|
||||
// - For all remaining F instructions, instr[4] = 1'b1
|
||||
// - FMV.X.W and FCLASS have same funct7 (7'b1110000),
|
||||
// (discriminated by instr[12])
|
||||
// - there is a big gotcha in the official doc for RV32F:
|
||||
// the doc says FNMADD computes -rs1*rs2-rs3
|
||||
// (yes, with *minus* rs3)
|
||||
// it should have said FNMADD computes -(rs1*rs2+rs3)
|
||||
// and FNMSUB computes -(rs1*rs2-rs3)
|
||||
// they probably did not put the parentheses because when
|
||||
// you implement it, you change the sign of rs1 and rs3 according
|
||||
// to the operation rather than the sign of the whole result
|
||||
// (here, it is done by the FPMI_LOAD_XY_MUL micro instruction).
|
||||
|
||||
// Decode flags, one per supported RV32F instruction. They are driven
// combinatorially from instr by the always block below.

// Fused multiply-add family
reg isFMADD;
reg isFMSUB;
reg isFNMSUB;
reg isFNMADD;
// Arithmetic
reg isFADD;
reg isFSUB;
reg isFMUL;
reg isFDIV;
reg isFSQRT;
// Sign injection
reg isFSGNJ;
reg isFSGNJN;
reg isFSGNJX;
// Min / max
reg isFMIN;
reg isFMAX;
// Comparisons
reg isFEQ;
reg isFLT;
reg isFLE;
// Classification and conversions
reg isFCLASS;
reg isFCVTWS;
reg isFCVTWUS;
reg isFCVTSW;
reg isFCVTSWU;
// Raw moves between integer and FP views
reg isFMVXW;
reg isFMVWX;

always @(*) begin
   // FMADD/FMSUB/FNMSUB/FNMADD: instr[4] is 0, instr[3:2] selects the
   // variant.
   isFMADD  = (instr[4:2] == 3'd0); // rd <-   rs1*rs2+rs3
   isFMSUB  = (instr[4:2] == 3'd1); // rd <-   rs1*rs2-rs3
   isFNMSUB = (instr[4:2] == 3'd2); // rd <- -(rs1*rs2-rs3)
   isFNMADD = (instr[4:2] == 3'd3); // rd <- -(rs1*rs2+rs3)

   // All remaining instructions have instr[4] set and are selected by
   // instr[31:27], refined where needed by instr[13:12] or instr[20].
   isFADD    = (instr[31:27] == 5'h00) && instr[4];
   isFSUB    = (instr[31:27] == 5'h01) && instr[4];
   isFMUL    = (instr[31:27] == 5'h02) && instr[4];
   isFDIV    = (instr[31:27] == 5'h03) && instr[4];
   isFSQRT   = (instr[31:27] == 5'h0B) && instr[4];

   isFSGNJ   = (instr[31:27] == 5'h04) && (instr[13:12] == 2'd0) && instr[4];
   isFSGNJN  = (instr[31:27] == 5'h04) && (instr[13:12] == 2'd1) && instr[4];
   isFSGNJX  = (instr[31:27] == 5'h04) && (instr[13:12] == 2'd2) && instr[4];

   isFMIN    = (instr[31:27] == 5'h05) && !instr[12] && instr[4];
   isFMAX    = (instr[31:27] == 5'h05) &&  instr[12] && instr[4];

   isFEQ     = (instr[31:27] == 5'h14) && (instr[13:12] == 2'd2) && instr[4];
   isFLT     = (instr[31:27] == 5'h14) && (instr[13:12] == 2'd1) && instr[4];
   isFLE     = (instr[31:27] == 5'h14) && (instr[13:12] == 2'd0) && instr[4];

   // FCLASS and FMV.X.W share instr[31:27]; instr[12] tells them apart.
   isFCLASS  = (instr[31:27] == 5'h1C) &&  instr[12] && instr[4];
   isFMVXW   = (instr[31:27] == 5'h1C) && !instr[12] && instr[4];

   // Conversions: instr[20] set selects the unsigned variant.
   isFCVTWS  = (instr[31:27] == 5'h18) && !instr[20] && instr[4];
   isFCVTWUS = (instr[31:27] == 5'h18) &&  instr[20] && instr[4];

   isFCVTSW  = (instr[31:27] == 5'h1A) && !instr[20] && instr[4];
   isFCVTSWU = (instr[31:27] == 5'h1A) &&  instr[20] && instr[4];

   isFMVWX   = (instr[31:27] == 5'h1E) && instr[4];
end
|
||||
|
||||
`ifdef FPU_EMUL
// Emulation mode: instead of running the micro-program, each instruction
// result is obtained in one clock from a $c32 call whose text is built from
// the operation name and the source registers, and written to `X.
// FPU_EMULn(op) issues the call for an operation taking n source registers.
`define FPU_EMUL1(op) `X <= $c32(op,"(",rs1,")")
`define FPU_EMUL2(op) `X <= $c32(op,"(",rs1,",",rs2,")")
`define FPU_EMUL3(op) `X <= $c32(op,"(",rs1,",",rs2,",",rs3,")")

always @(posedge clk) begin
   if(wr) begin
      // The decode flags are mutually exclusive, so at most one item fires.
      (* parallel_case *)
      case(1'b1)
        // --- one source register --------------------------------------
        isFSQRT  : `FPU_EMUL1("FSQRT");
        isFCLASS : `FPU_EMUL1("FCLASS");
        isFCVTWS : `FPU_EMUL1("FCVTWS");
        isFCVTWUS: `FPU_EMUL1("FCVTWUS");
        isFCVTSW : `FPU_EMUL1("FCVTSW");
        isFCVTSWU: `FPU_EMUL1("FCVTSWU");

        // --- two source registers -------------------------------------
        isFADD   : `FPU_EMUL2("FADD");
        isFSUB   : `FPU_EMUL2("FSUB");
        isFMUL   : `FPU_EMUL2("FMUL");
        isFDIV   : `FPU_EMUL2("FDIV");
        isFMIN   : `FPU_EMUL2("FMIN");
        isFMAX   : `FPU_EMUL2("FMAX");
        isFEQ    : `FPU_EMUL2("FEQ");
        isFLT    : `FPU_EMUL2("FLT");
        isFLE    : `FPU_EMUL2("FLE");
        isFSGNJ  : `FPU_EMUL2("FSGNJ");
        isFSGNJN : `FPU_EMUL2("FSGNJN");
        isFSGNJX : `FPU_EMUL2("FSGNJX");

        // --- three source registers -----------------------------------
        isFMADD  : `FPU_EMUL3("FMADD");
        isFMSUB  : `FPU_EMUL3("FMSUB");
        isFNMADD : `FPU_EMUL3("FNMADD");
        isFNMSUB : `FPU_EMUL3("FNMSUB");

        // --- raw moves: rs1 is copied unchanged -----------------------
        isFMVXW, isFMVWX : `X <= rs1;
      endcase
   end
end
`endif
|
||||
|
||||
/****************************************************************************/
|
||||
// When doing simulations, compare the result of all operations with
|
||||
// what's computed on the host CPU.
|
||||
// Note: my FDIV and FSQRT are not IEEE754 compliant (yet) !
|
||||
// (checks commented-out for now)
|
||||
|
||||
`ifdef NRV_FEMTORV32_PETITBATEAU // makes sure we are in the learn-FPGA fmwk
`ifdef VERILATOR

// FPU_CHECKn(op): issue a $c32 call whose text is "CHECK_" followed by the
// operation name, with the computed result (`X) and the n source registers
// as arguments. The value it returns is stored in z.
`define FPU_CHECK1(op) z <= $c32("CHECK_",op,"(",`X,",",rs1,")")
`define FPU_CHECK2(op) z <= $c32("CHECK_",op,"(",`X,",",rs1,",",rs2,")")
`define FPU_CHECK3(op) z <= $c32("CHECK_",op,"(",`X,",",rs1,",",rs2,",",rs3,")")

reg [31:0] z;   // receives the value returned by the check call
reg active;     // set when an instruction was issued and is not checked yet

always @(posedge clk) begin
   if(active && !busy) begin
      // The pending instruction has finished: run its check once.
      // Clearing 'active' here takes precedence over a simultaneous 'wr'.
      active <= 1'b0;
      case(1'b1)
        // one source register
        // isFSQRT: `FPU_CHECK1("FSQRT"); // yes I know, not IEEE754 yet
        isFCVTWS : `FPU_CHECK1("FCVTWS");
        isFCVTWUS: `FPU_CHECK1("FCVTWUS");
        isFCVTSW : `FPU_CHECK1("FCVTSW");
        isFCVTSWU: `FPU_CHECK1("FCVTSWU");

        // two source registers
        // NOTE(review): the FDIV check is enabled although the comment
        // preceding this section says FDIV is not IEEE754 compliant yet
        // and that its check is commented out -- confirm which is intended.
        isFADD   : `FPU_CHECK2("FADD");
        isFSUB   : `FPU_CHECK2("FSUB");
        isFMUL   : `FPU_CHECK2("FMUL");
        isFDIV   : `FPU_CHECK2("FDIV");
        isFMIN   : `FPU_CHECK2("FMIN");
        isFMAX   : `FPU_CHECK2("FMAX");
        isFEQ    : `FPU_CHECK2("FEQ");
        isFLT    : `FPU_CHECK2("FLT");
        isFLE    : `FPU_CHECK2("FLE");

        // three source registers
        isFMADD  : `FPU_CHECK3("FMADD");
        isFMSUB  : `FPU_CHECK3("FMSUB");
        isFNMADD : `FPU_CHECK3("FNMADD");
        isFNMSUB : `FPU_CHECK3("FNMSUB");
      endcase
   end else if(wr) begin
      // A new instruction was issued: remember that it has to be checked.
      active <= 1'b1;
   end
end

`endif
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
// Count Leading Zeroes.
// FPU normalization needs the position of the first bit set in a fraction;
// it is easier to count the leading zeroes instead. See:
// https://electronics.stackexchange.com/questions/196914/
//    verilog-synthesize-high-speed-leading-zero-count
//
// The count is built recursively, one result bit per level: the top result
// bit tells whether the upper half of the input is empty, and the remaining
// bits are the count of whichever half holds the first bit set.
// Note: an all-zero input gives an all-ones result (W_IN-1), since W_IN
// itself does not fit in W_OUT bits.
//
// TODO: test also Dean Gaudet's algorithm (see Hackers Delights p. 110)
module CLZ #(
   parameter W_IN  = 64,            // must be power of 2, >= 2
   parameter W_OUT = $clog2(W_IN)
) (
   input  wire [W_IN-1:0]  in,
   output wire [W_OUT-1:0] out
);
   generate
      if(W_IN != 2) begin
         // Recursive case: split the input in two halves.
         wire [W_IN/2-1:0] upper = in[W_IN/2 +: W_IN/2];
         wire [W_IN/2-1:0] lower = in[0      +: W_IN/2];
         wire              upper_is_zero = !(|upper);

         // Count within the half that contains the first bit set.
         wire [W_IN/2-1:0] picked = upper_is_zero ? lower : upper;
         wire [W_OUT-2:0]  sub_count;

         CLZ #(
            .W_IN(W_IN/2)
         ) inner(
            .in (picked),
            .out(sub_count)
         );

         assign out = {upper_is_zero, sub_count};
      end else begin
         // Base case (two bits): one leading zero iff the top bit is clear.
         assign out = (in[1] == 1'b0);
      end
   endgenerate
endmodule
|
||||
|
||||
`endif
|
||||
22
RTL/PROCESSOR/utils.v
Normal file
22
RTL/PROCESSOR/utils.v
Normal file
@@ -0,0 +1,22 @@
|
||||
/********************* Utilities, macros for debugging *************/

// `verbose(stmt): expands to stmt when VERBOSE is defined, to nothing
// otherwise.
`ifndef VERBOSE
 `define verbose(stmt)
`else
 `define verbose(stmt) stmt
`endif

// `bench(stmt): expands to stmt only when BENCH is defined and QUIET is not;
// expands to nothing in every other configuration.
// BENCH also defines BENCH_OR_LINT.
`ifndef BENCH
 `define bench(stmt)
`else
 `define BENCH_OR_LINT
 `ifndef QUIET
  `define bench(stmt) stmt
 `else
  `define bench(stmt)
 `endif
`endif

// BENCH_OR_LINT is also defined when the 'verilator' macro is defined.
`ifdef verilator
 `define BENCH_OR_LINT
`endif
|
||||
Reference in New Issue
Block a user