457 lines
16 KiB
Verilog
457 lines
16 KiB
Verilog
// femtorv32, a minimalistic RISC-V RV32I core
|
|
// (minus SYSTEM and FENCE that are not implemented)
|
|
// Bruno Levy, May-June 2020
|
|
//
|
|
// drop-in replacement of femtorv32,
|
|
// does 3 CPIs (cycles per instruction) in linear execution flow
|
|
// (to be compared with 2 CPIs with femtorv32.v),
|
|
// saves 20-50 LUTs
|
|
// in femtosoc.v, replace `include "femtorv32.v"
|
|
// with `include "mini_femtorv32.v"
|
|
//
|
|
// NOTE: the structure of the decoder has changed, *** NEEDS TO BE ADAPTED ***
|
|
|
|
|
|
/*******************************************************************/
|
|
|
|
`include "utils.v" // Utilities, macros for debugging
|
|
`include "register_file.v" // The 31 general-purpose registers
|
|
`include "small_alu.v" // Used on IceStick, RV32I
|
|
`include "large_alu.v" // For larger FPGAs, RV32IM
|
|
`include "branch_predicates.v" // Tests for branch instructions
|
|
`include "decoder.v" // The instruction decoder
|
|
`include "aligned_memory_access.v" // Read/write bytes, hwords and words from memory
|
|
`include "CSR_file.v" // (Optional) Control and Status registers
|
|
|
|
/********************* Nrv processor *******************************/
|
|
|
|
// FemtoRV32 (compact variant): a multi-cycle RV32I / RV32IM core driven by a
// one-hot state machine. The instruction decoder, register file, ALU,
// load/store alignment units, branch predicate and (optional) CSR file are
// instantiated from the included files; this module wires them together and
// sequences them.
//
// Notes:
//  - ADDR_WIDTH is expected to be <= 32 (mem_addr is 32 bits wide and ALU
//    results are part-selected down to ADDR_WIDTH bits).
//  - mem_rbusy and mem_wbusy are declared for interface compatibility but are
//    not sampled anywhere in this module.
module FemtoRV32 #(
   parameter [0:0] RV32M      = 0,  // Set to 1 to support mul/div/rem instructions
   parameter       ADDR_WIDTH = 16  // width of the address bus
) (
   input             clk,

   // Memory interface: using the same protocol as Claire Wolf's picoRV32
   // (WIP: add mem_valid / mem_ready protocol)
   output [31:0]     mem_addr,  // address bus, only ADDR_WIDTH bits are used
   output wire [31:0] mem_wdata, // data to be written
   output wire [3:0] mem_wmask, // write mask for individual bytes (1 means write byte)
   input [31:0]      mem_rdata, // input lines for both data and instr
   output wire       mem_rstrb, // active to initiate memory read
   input wire        mem_rbusy, // asserted if memory is busy reading value (currently unused here)
   input wire        mem_wbusy, // asserted if memory is busy writing value (currently unused here)

   input wire        reset,     // set to 0 to reset the processor
   output wire       error      // 1 if current instruction could not be decoded
);

   // The internal register that stores the current address,
   // directly wired to the address bus.
   reg [ADDR_WIDTH-1:0] addressReg;

   // The program counter (not storing the two LSBs, always aligned)
   reg [ADDR_WIDTH-3:0] PC;

   // Upper bits of mem_addr (above ADDR_WIDTH) are zero-extended.
   assign mem_addr = addressReg;

   reg [31:0] instr;     // Latched instruction.
   reg [31:0] nextInstr; // Prefetched instruction.

   // Next program counter in normal operation: advance one word
   // I do not use the ALU, I create an additional adder for that.
   // (note that the two LSBs are not stored, always aligned).
   wire [ADDR_WIDTH-3:0] PCplus4 = PC + 1;

   /**************************************************************************************************/
   // Instruction decoding.

   // Internal signals, all generated by the decoder from the current instruction.
   wire [4:0]  writeBackRegId; // The register to be written back
   wire        writeBackEn;    // Needs to be asserted for writing back
   wire [3:0]  writeBackSel;   // 0001: ALU  0010: PC+4  0100: RAM  1000: CSR
   wire [4:0]  regId1;         // Register output 1
   wire [4:0]  regId2;         // Register output 2
   wire        aluInSel1;      // 0: register  1: pc
   wire        aluInSel2;      // 0: register  1: imm
   wire        aluSel;         // 0: force aluOp,aluQual to zero (ADD)  1: use aluOp,aluQual from instr field
   wire [2:0]  aluOp;          // one of the 8 operations done by the ALU
   wire        aluQual;        // 'qualifier' used by some operations (+/-, logical/arith shifts)
   wire        aluM;           // asserted if instr is RV32M.
   wire [31:0] imm;            // immediate value decoded from the instruction
   wire        needWaitALU;    // asserted if instruction uses at least one additional phase in ALU
   wire        isLoad;         // asserted for LOAD instructions
   wire        isStore;        // asserted for STORE instructions
   wire        isJump;         // asserted for jump instructions
   wire        isBranch;       // asserted for conditional branch instructions
   wire        decoderError;   // true if instr does not correspond to any known instr

   // The instruction decoder, that reads the current instruction
   // and generates all the signals from it. It is in fact just a
   // big combinatorial function.
   NrvDecoder decoder(
      .instr(instr),
      .writeBackRegId(writeBackRegId),
      .writeBackEn(writeBackEn),
      .writeBackSel(writeBackSel),
      .inRegId1(regId1),
      .inRegId2(regId2),
      .aluInSel1(aluInSel1),
      .aluInSel2(aluInSel2),
      .aluSel(aluSel),
      .aluOp(aluOp),
      .aluQual(aluQual),
      .aluM(aluM),
      .needWaitALU(needWaitALU),
      .isLoad(isLoad),
      .isStore(isStore),
      .isJump(isJump),
      .isBranch(isBranch),
      .imm(imm),
      .error(decoderError)
   );

   /**************************************************************************************************/
   // Maybe not necessary, but I'd rather latch this one,
   // if this one glitches, then it will break everything...
   // It is sampled from decoderError in FETCH_REGS and cleared on reset.
   reg error_latched;
   assign error = error_latched;

   /**************************************************************************************************/
   // The register file. At each cycle, it can read two
   // registers (available at next cycle) and write one.
   wire writeBack;

   reg  [31:0] writeBackData;
   wire [31:0] regOut1;
   wire [31:0] regOut2;
   NrvRegisterFile regs(
      .clk(clk),
      .in(writeBackData),
      .inEn(writeBack),
      .inRegId(writeBackRegId),
      .outRegId1(regId1),
      .outRegId2(regId2),
      .out1(regOut1),
      .out2(regOut2)
   );

   /**************************************************************************************************/
   // The ALU, partly combinatorial, partly state (for shifts).
   wire [31:0] aluOut;
   wire        aluBusy;
   wire        alu_wenable;
   // PC is stored without its two LSBs; they are re-appended here
   // (the result is zero-extended to 32 bits).
   wire [31:0] aluIn1 = aluInSel1 ? {PC, 2'b00} : regOut1;
   wire [31:0] aluIn2 = aluInSel2 ? imm : regOut2;

   // Select the ALU based on RV32M (use large ALU) or plain RV32I (use small ALU)
   // In both cases op/opqual are forced to zero when aluSel is 0.
   generate
      if(RV32M) begin
         NrvLargeALU alu(
            .clk(clk),
            .in1(aluIn1),
            .in2(aluIn2),
            .op(aluOp & {3{aluSel}}),
            .opqual(aluQual & aluSel),
            .opM(aluM),
            .out(aluOut),
            .wr(alu_wenable),
            .busy(aluBusy)
         );
      end else begin
         NrvSmallALU #(
`ifdef NRV_TWOSTAGE_SHIFTER
            .TWOSTAGE_SHIFTER(1)
`else
            .TWOSTAGE_SHIFTER(0)
`endif
         ) alu(
            .clk(clk),
            .in1(aluIn1),
            .in2(aluIn2),
            .op(aluOp & {3{aluSel}}),
            .opqual(aluQual & aluSel),
            .out(aluOut),
            .wr(alu_wenable),
            .busy(aluBusy)
         );
      end
   endgenerate

   /****************************************************************************/

   // Memory only does 32-bit aligned accesses. Internally we have two small
   // circuits (one for LOAD and one for STORE) that shift and adapt data
   // according to data type (byte, halfword, word) and memory alignment (addr[1:0]).
   // In addition, it does sign-expansion (when loading a signed byte to a word for
   // instance).

   // LOAD: a small combinatorial circuit that realigns
   // and sign-expands mem_rdata based
   // on width (aluOp[1:0]), signed/unsigned flag (aluOp[2])
   // and the two LSBs of the address.
   wire [31:0] LOAD_mem_rdata_aligned;
   NrvLoadFromMemory load_from_mem(
      .mem_rdata(mem_rdata),        // Raw data read from mem
      .addr_LSBs(mem_addr[1:0]),    // The two LSBs of the address
      .width(aluOp[1:0]),           // Data width: 00:byte 01:hword 10:word
      .is_unsigned(aluOp[2]),       // signed/unsigned flag
      .data(LOAD_mem_rdata_aligned) // Data ready to be sent to register
   );

   // STORE: a small combinatorial circuit that realigns
   // data to be written based on width and the two LSBs
   // of the address.
   // When a STORE instruction is executed, the data to be stored to
   // mem is available from the second register (regOut2) and the
   // address where to store it is the output of the ALU (aluOut).
   wire mem_wenable;
   NrvStoreToMemory store_to_mem(
      .data(regOut2),          // Data to be sent, out of register
      .addr_LSBs(aluOut[1:0]), // The two LSBs of the address
      .width(aluOp[1:0]),      // Data width: 00:byte 01:hword 10:word
      .mem_wdata(mem_wdata),   // Shifted data to be sent to memory
      .mem_wmask(mem_wmask),   // Write mask for the 4 bytes
      .wr_enable(mem_wenable)  // Write enable ('anded' with write mask)
   );

   /*************************************************************************/
   // Control and status registers

`ifdef NRV_CSR
   wire [31:0] CSR_rdata;
   wire        instr_retired;
   NrvControlStatusRegisterFile CSR(
      .clk(clk),                 // for counting cycles
      .instr_cnt(instr_retired), // for counting retired instructions
      .reset(reset),             // reset all CSRs to default value
      .CSRid(instr[31:20]),      // CSR Id, extracted from instr
      .rdata(CSR_rdata)          // Read CSR value
      // TODO: test for errors (.error)
   );
`endif
   // Note: writing to CSRs not implemented yet

   /*************************************************************************/
   // The value written back to the register file.
   // writeBackSel is treated as one-hot. The explicit default keeps this
   // block purely combinational in simulation as well (no inferred latch when
   // no select bit matches, e.g. writeBackSel[3] with NRV_CSR undefined);
   // 'x' lets synthesis treat that case as a don't-care.

   always @(*) begin
      (* parallel_case, full_case *)
      case(1'b1)
        writeBackSel[0]: writeBackData = aluOut;
        writeBackSel[1]: writeBackData = {PCplus4, 2'b00};
        writeBackSel[2]: writeBackData = LOAD_mem_rdata_aligned;
`ifdef NRV_CSR
        writeBackSel[3]: writeBackData = CSR_rdata;
`endif
        default:         writeBackData = 32'bx;
      endcase
   end

   /*************************************************************************/
   // The predicate for conditional branches.

   wire predOut;
   NrvPredicate pred(
      .in1(regOut1),
      .in2(regOut2),
      .op(aluOp),
      .out(predOut)
   );

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   // The states, using 1-hot encoding (reduces
   // both LUT count and critical path).
   // INITIAL is the only all-zeros encoding.

   localparam INITIAL              = 8'b00000000;
   localparam WAIT_INSTR           = 8'b00000001;
   localparam FETCH_INSTR          = 8'b00000010;
   localparam USE_PREFETCHED_INSTR = 8'b00000100;
   localparam FETCH_REGS           = 8'b00001000;
   localparam EXECUTE              = 8'b00010000;
   localparam WAIT_ALU_OR_DATA     = 8'b00100000;
   localparam LOAD                 = 8'b01000000;
   localparam ERROR                = 8'b10000000;

   // Bit index of each one-hot state inside 'state'.
   localparam WAIT_INSTR_bit           = 0;
   localparam FETCH_INSTR_bit          = 1;
   localparam USE_PREFETCHED_INSTR_bit = 2;
   localparam FETCH_REGS_bit           = 3;
   localparam EXECUTE_bit              = 4;
   localparam WAIT_ALU_OR_DATA_bit     = 5;
   localparam LOAD_bit                 = 6;
   localparam ERROR_bit                = 7;

   reg [7:0] state = INITIAL;

   // the internal signals that are determined combinatorially from
   // state and other signals.

   // The internal signal that enables register write-back
   // (gated by writeBackEn in EXECUTE, unconditional in WAIT_ALU_OR_DATA).
   assign writeBack = (state[EXECUTE_bit] && writeBackEn) || state[WAIT_ALU_OR_DATA_bit];

   // The memory-read signal. It is only needed for IO, hence it is only enabled
   // right before the LOAD state. To allow execution from IO-mapped devices, it
   // will be necessary to also enable it before instruction fetch.
   assign mem_rstrb = (state[EXECUTE_bit] && isLoad);

   // NOTE: memory writes are done during the USE_PREFETCHED_INSTR state,
   // Can't be done during EXECUTE (would be better), because mem_addr
   // (needed) is updated at the end of EXECUTE.
   // See also how load_from_mem and store_to_mem are wired.
   assign mem_wenable = (state[USE_PREFETCHED_INSTR_bit] && isStore);

   // alu_wenable starts computation in the ALU (for functions that
   // require several cycles).
   assign alu_wenable = (state[EXECUTE_bit]);

   // instr_retired is asserted during one cycle for each
   // retired instruction. It is used to update the instruction
   // counter 'instret' in the control and status registers
`ifdef NRV_CSR
   assign instr_retired = state[FETCH_REGS_bit];
`endif

   // And now the state machine

`define show_state(state) `verbose($display("    %s",state))

   always @(posedge clk) begin
      if(!reset) begin
         // Synchronous, active-low reset.
         state         <= INITIAL;
         addressReg    <= 0;
         PC            <= 0;
         error_latched <= 1'b0; // do not leave a stale error flag after reset
      end else
      case(1'b1)
        (state == 0): begin
           `show_state("initial");
           state <= WAIT_INSTR;
        end
        state[WAIT_INSTR_bit]: begin
           `show_state("wait_instr");
           // this state to give enough time to fetch the
           // instruction. Used for jumps and taken branches (and
           // when fetching the first instruction).
           state <= FETCH_INSTR;
        end
        state[FETCH_INSTR_bit]: begin
           `show_state("fetch_instr");
           instr <= mem_rdata;
           // update instr address so that next instr is fetched during
           // decode (and ready if there was no jump or branch)
           addressReg <= {PCplus4, 2'b00};
           state <= FETCH_REGS;
        end
        state[USE_PREFETCHED_INSTR_bit]: begin
           `show_state("use_prefetched_instr");
           // for linear execution flow, the prefetched instr (nextInstr)
           // can be used.
           instr <= nextInstr;
           // update instr address so that next instr is fetched during
           // decode (and ready if there was no jump or branch)
           addressReg <= {PCplus4, 2'b00};
           // In addition, STORE instructions write to memory here.
           // (see NrvStoreToMemory store_to_mem at beginning of file).
           state <= FETCH_REGS;
        end
        state[FETCH_REGS_bit]: begin
           `show_state("fetch_regs");
           // instr was just updated -> input register ids also
           // input registers available at next cycle
           state <= EXECUTE;
           error_latched <= decoderError;
        end
        state[EXECUTE_bit]: begin
           `show_state("execute");

           // input registers are read, aluOut is up to date

           // Looked-ahead instr.
           nextInstr <= mem_rdata;

           // Needed for LOAD,STORE,jump,branch
           // (in other cases it will be ignored)
           // Only the ADDR_WIDTH low bits of the ALU result are kept.
           addressReg <= aluOut[ADDR_WIDTH-1:0];

           if(error_latched) begin
              state <= ERROR;
           end else if(isLoad) begin
              state <= LOAD;
              PC <= PCplus4;
           end else begin
              (* parallel_case, full_case *)
              case(1'b1)
                isJump: begin
                   // PC drops the two LSBs of the target address.
                   PC <= aluOut[ADDR_WIDTH-1:2];
                   state <= WAIT_INSTR;
                end
                isBranch: begin
                   if(predOut) begin
                      // taken branch: prefetched instr is discarded
                      PC <= aluOut[ADDR_WIDTH-1:2];
                      state <= WAIT_INSTR;
                   end else begin
                      PC <= PCplus4;
                      state <= USE_PREFETCHED_INSTR;
                   end
                end
                default: begin // linear execution flow
                   PC <= PCplus4;
                   state <= needWaitALU ? WAIT_ALU_OR_DATA : USE_PREFETCHED_INSTR;
                end
              endcase
           end
        end
        state[LOAD_bit]: begin
           `show_state("load");
           // data address (aluOut) was just updated
           // data ready at next cycle
           // we go to WAIT_ALU_OR_DATA to write back read data
           state <= WAIT_ALU_OR_DATA;
        end
        state[WAIT_ALU_OR_DATA_bit]: begin
           `show_state("wait_alu_or_data");
           // - If ALU is still busy, continue to wait.
           // - register writeback is active
           state <= aluBusy ? WAIT_ALU_OR_DATA : USE_PREFETCHED_INSTR;
        end
        state[ERROR_bit]: begin
           // Sticky: only a reset leaves this state.
           `bench($display("ERROR"));
           state <= ERROR;
        end
        default: begin
           `bench($display("UNKNOWN STATE"));
           state <= ERROR;
        end
      endcase
   end

   /*********************************************************************/

`define show_opcode(opcode) `verbose($display("%x: %s",{PC,2'b00},opcode))

`ifdef BENCH
   // Simulation-only trace: prints the opcode class of the instruction
   // that was just latched, once per instruction (in FETCH_REGS).
   always @(posedge clk) begin
      if(state[FETCH_REGS_bit]) begin
         case(instr[6:0])
           7'b0110111: `show_opcode("LUI");
           7'b0010111: `show_opcode("AUIPC");
           7'b1101111: `show_opcode("JAL");
           7'b1100111: `show_opcode("JALR");
           7'b1100011: `show_opcode("BRANCH");
           7'b0010011: `show_opcode("ALU reg imm");
           7'b0110011: `show_opcode("ALU reg reg");
           7'b0000011: `show_opcode("LOAD");
           7'b0100011: `show_opcode("STORE");
           7'b0001111: `show_opcode("FENCE");
           7'b1110011: `show_opcode("SYSTEM");
         endcase // case (instr[6:0])
      end // if (state[FETCH_REGS_bit])
   end
`endif

endmodule
|