M Makefile => Makefile +8 -2
@@ 25,7 25,7 @@ show: ./waves/$(MODULE).vcd
# These are runtime dependencies, not build time dependencies.
.PRECIOUS: ./programs/bin/%.dat ./programs/bin/%.bin
-run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv ./out
+run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv src/stages/*.sv ./out
verilator --binary --trace \
-GCPU_PROGRAM_PATH="\"./programs/bin/$(PROGRAM).dat\"" \
-GTRACE_FILE_PATH="\"out/program_$(notdir $(basename $<)).vcd\"" \
@@ 40,6 40,9 @@ run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv
src/alu.sv \
src/register_file.sv \
src/program_counter.sv \
+ src/forwarder.sv \
+ src/jumps.sv \
+ src/stages/*.sv \
src/ram.sv \
src/cpu.sv \
src/file_program_memory.sv \
@@ 48,7 51,7 @@ run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv
--top tb_cpu_program
./obj_dir/Vtb_cpu_program_$(notdir $(basename $<))
-./obj_dir/Vtb_%: testbench/tb_%.sv src/*.sv
+./obj_dir/Vtb_%: testbench/tb_%.sv src/*.sv src/stages/*.sv
verilator --binary --trace \
--trace-max-array 512 \
src/cpu_types.sv \
@@ 57,6 60,9 @@ run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv
src/alu.sv \
src/register_file.sv \
src/program_counter.sv \
+ src/forwarder.sv \
+ src/jumps.sv \
+ src/stages/*.sv \
src/ram.sv \
src/cpu.sv \
src/file_program_memory.sv \
M README.md => README.md +41 -0
@@ 2,8 2,49 @@
Available at https://github.com/Rutherther/verilog-riscv-semestral-project
This repository contains RISC-V processor written in SystemVerilog.
+It contains both a single-cycle and a pipelined version.
+The pipelined version uses the classic five-stage RISC pipeline.
## Architecture
+The singlecycle version is located in `src/cpu_singlecycle.sv`.
+The pipelined version is in `src/cpu.sv`.
+
+There are five stages in the pipelined version
+- Fetch (fetches instruction from memory)
+- Decode (decodes the fetched instruction, performs jumps, gets data from forwarder)
+- Execute (alu)
+- Memory access (loads, stores)
+- Writeback (stores data in registers)
+
+Data dependencies are resolved by forwarding whenever possible.
+The forwarding is realized inside of the decode stage,
+which supplies the arguments to the execute stage.
+If the needed data comes from a read from memory, forwarding is not
+enough and the pipeline has to stall. The stalling is implemented using
+ready flags in each of the stages. It is thus possible to easily
+implement a stage that would block for multiple cycles
+instead of producing valid data every cycle.
+For now, all of the stages take one cycle to produce valid data.
+
+The forwarding is done by keeping address and data known in each stage
+inside of the status.data. Outputs of execute, memory access, and
+input of writeback are used for forwarding. It would be possible to
+skip the writeback forwarding if instead the register file outputted data
+to be written instead of the contents of the register until it's actually
+written to. I am afraid this could cause other issues, hence I chose forwarding instead.
+
+All stages have valid and ready flags.
+
+Ready flag is used for stalling. If a stage is not ready, there cannot
+be data going into it, and the pipeline before that has to be stopped,
+including program counter changes. This is used for stalling when waiting for a read
+out of the memory, but could also be used for making the execute stage more complex,
+i.e. making it take multiple cycles instead of one. It should also be possible to implement
+reads that are not aligned, by reading from two consecutive positions in the memory in two cycles.
+
+Valid flag is for "killing" data that cannot be valid.
+It's utilized when stalling - data from decode are not
+valid in that case. When stalling, both valid and ready should be 0.
## Requirements
- make
M flake.nix => flake.nix +1 -1
@@ 19,7 19,7 @@
};
in rec {
devShells.default = pkgs.mkShell {
- name = "pap-processor-singlecycle";
+ name = "riscv-sv-processor-toolchain";
packages = [
# verilog simulation
M src/control_unit.sv => src/control_unit.sv +2 -2
@@ 25,7 25,7 @@ module control_unit(
// going to alu
output [2:0] alu_op,
- output alu_signed,
+ output alu_sign,
output alu_negate,
output alu_add_one,
@@ 106,7 106,7 @@ module control_unit(
assign alu_negate = conditional_jump ? alu_jump_negate :
alu_override ? 0'b0 :
alu_reg_negate;
- assign alu_signed = conditional_jump ? 0'b0 :
+ assign alu_sign = conditional_jump ? 0'b0 :
alu_override ? 0'b0 :
alu_reg_signed;
M src/cpu.sv => src/cpu.sv +90 -129
@@ 11,156 11,130 @@ module cpu(
// ram
output [31:0] memory_address,
input [31:0] memory_out,
- output reg [31:0] memory_write,
+ output [31:0] memory_write,
output [3:0] memory_byte_enable,
- output reg memory_we,
+ output memory_we,
- output ebreak
+ output ebreak
);
parameter WIDTH = 32;
reg [31:0] pc_next;
- wire pc_src;
-
- reg [31:0] alu_1, alu_2;
- wire alu_1_src, alu_2_src;
-
- wire [2:0] alu_op;
- wire alu_add_one, alu_negate, alu_signed;
- wire alu_zero;
- wire [31:0] alu_out;
wire [4:0] reg_a_1, reg_a_2, reg_a_w;
wire [31:0] reg_rd1, reg_rd2;
reg [31:0] reg_write;
- wire [1:0] reg_write_src;
wire reg_we;
- wire [31:0] immediate;
+ reg [2:0] last_non_ready_stage;
+ reg all_stages_ready;
- wire jump_instruction, jump_negate_zero;
- wire jump_taken;
+ wire jump;
+ wire [31:0] jumping_pc_next;
- wire memory_sign_extension;
+ stage_status_t stages_in[1:4];
- memory_mask_t memory_mask;
+ /// verilator doesn't like that data taken from stages_out[i]
+ // are used in stages_out[i + 1]. But that shouldn't really matter
+ // as there is not really a cyclic dependency.
+ // It just seems that verilator is not very good at "separating"
+ // array elements
+/* verilator lint_off UNOPTFLAT */
+ stage_status_t stages_out[0:3];
+/* verilator lint_on UNOPTFLAT */
- function bit[31:0] mem_sext_maybe;
- input [31:0] num;
- input memory_mask_t mask;
- input sext;
- begin
- case(mask)
- MEM_BYTE: return {{(32 - 8){sext & num[7]}}, num[7:0]};
- MEM_HALFWORD: return {{(32 - 16){sext & num[15]}}, num[15:0]};
- MEM_WORD: return num[31:0]; // rv32i, no 64 bit regs, no sign extension needed
- default: return 0;
- endcase
+ assign ebreak = stages_out[ACCESS].instruction.ebreak;
+
+ // stage registers
+  // Pipeline registers: stages_in[i + 1] latches stages_out[i] on the clock edge.
+  // stages_in is declared [1:4] while the loops below count from zero, so the
+  // loop index is always offset by one when addressing stages_in.
+  // Non-blocking assignments are used so that other clocked processes sampling
+  // values derived from stages_in on the same edge see the pre-edge contents.
+  always_ff @(posedge clk) begin
+    if (rst_n == 0) begin
+      for (int i = 0; i < $size(stages_in); i++) begin
+        // clear the destination register so nothing is forwarded after reset
+        stages_in[i + 1].data.address <= 0;
+      end
end
- endfunction
-
- function bit[3:0] mask_to_mask_bytes;
- input memory_mask_t mask;
- begin
- case(mask)
- MEM_BYTE: return 4'b0001;
- MEM_HALFWORD: return 4'b0011;
- MEM_WORD: return 4'b1111;
- default: return 0;
- endcase
+    else begin
+      for (int i = 0; i < $size(stages_in); i++) begin
+        // inputs of the last non-ready stage and of every stage before it are held
+        if (all_stages_ready || i >= last_non_ready_stage) begin
+          stages_in[i + 1] <= stages_out[i];
+        end
+      end
end
- endfunction
-
- assign memory_byte_enable = mask_to_mask_bytes(.mask(memory_mask)) << memory_address[1:0];
- assign memory_write = reg_rd2 << (8*memory_address[1:0]);
- assign memory_address = alu_out;
-
- // alu source 1
- always_comb begin
- case (alu_1_src)
- REG_FILE_RS1 : alu_1 = reg_rd1;
- PC : alu_1 = pc;
- endcase
end
- // alu source 2
+ // find the last non-ready stage. That stage and all stages before it will be stalled
always_comb begin
- case (alu_2_src)
- REG_FILE_RS2 : alu_2 = reg_rd2;
- IMMEDIATE : alu_2 = immediate;
- endcase
+ last_non_ready_stage = 0;
+ all_stages_ready = 1'b1;
+ for (int i = 0; i < $size(stages_out); i++) begin
+ if (!stages_out[i].ready) begin
+ last_non_ready_stage = i[2:0];
+ all_stages_ready = 1'b0;
+ end
+ end
end
- // pc source
- assign jump_taken = jump_instruction && (alu_zero ^ jump_negate_zero);
always_comb begin
- if (ebreak)
+ if (jump)
+ pc_next = jumping_pc_next;
+ else if (all_stages_ready) // assume no jump. If jump, if result will be thrown out
+ pc_next = pc + 4;
+ else // stalling (in any stage, meaning not fetching new instructions)
pc_next = pc;
- else
- case (pc_src)
- PC_PLUS : begin
- if (jump_taken)
- pc_next = pc + immediate;
- else
- pc_next = pc + 4;
- end
- PC_ALU : pc_next = alu_out;
- endcase
end
- // register file write source
- // TODO forwarding pipelined, split to two instead
- always_comb begin
- case (reg_write_src)
- RD_ALU : reg_write = alu_out;
- RD_PC_PLUS : reg_write = pc + 4;
- RD_MEMORY : reg_write = mem_sext_maybe(.num(memory_out >> (8*memory_address[1:0])), .mask(memory_mask), .sext(memory_sign_extension));
- default : ;
- endcase
- end
+ // data for forwarding from the stages
+ // Note: this is a record instead of an array
+ // just because verilator didn't like it as an array
+ // consider switching back to array.
+ forwarding_data_status_t data_in_pipeline;
+ assign data_in_pipeline.execute_out = stages_out[EXECUTE].data;
+ assign data_in_pipeline.access_out = stages_out[ACCESS].data;
+ assign data_in_pipeline.writeback_in = stages_in[WRITEBACK].data;
- control_unit control_unit_inst(
- .instruction(instruction),
-
- .ebreak(ebreak),
-
- .immediate(immediate),
+ fetch fetch_inst(
+ .clk(clk),
+ .pc(pc),
+ .mem_instruction(instruction),
+ .jump(jump),
+ .stage_out(stages_out[FETCH])
+ );
- .alu_op(alu_op),
- .alu_add_one(alu_add_one),
- .alu_negate(alu_negate),
- .alu_signed(alu_signed),
+ decode decode_inst(
+ .clk(clk),
+ .data_in_pipeline(data_in_pipeline),
+ .reg_a_1(reg_a_1),
+ .reg_a_2(reg_a_2),
+ .reg_rd1(reg_rd1),
+ .reg_rd2(reg_rd2),
+ .jump(jump),
+ .pc_next(jumping_pc_next),
+ .stage_in(stages_in[DECODE]),
+ .stage_out(stages_out[DECODE])
+ );
- .memory_mask(memory_mask),
- .memory_sign_extension(memory_sign_extension),
+ execute execute_inst(
+ .clk(clk),
+ .stage_in(stages_in[EXECUTE]),
+ .stage_out(stages_out[EXECUTE])
+ );
+ memory_access memory_access_inst(
+ .clk(clk),
+ .memory_out(memory_out),
+ .memory_byte_enable(memory_byte_enable),
+ .memory_write(memory_write),
.memory_we(memory_we),
-
- .jump_instruction(jump_instruction),
- .jump_negate_zero(jump_negate_zero),
-
- .pc_src(pc_src),
- .alu_src_1(alu_1_src),
- .alu_src_2(alu_2_src),
- .reg_rd_src(reg_write_src),
-
- .reg_rs1(reg_a_1),
- .reg_rs2(reg_a_2),
- .reg_rd(reg_a_w),
- .reg_we(reg_we)
+ .memory_address(memory_address),
+ .stage_in(stages_in[ACCESS]),
+ .stage_out(stages_out[ACCESS])
);
- alu #(.WIDTH(WIDTH)) alu_inst(
- .a(alu_1),
- .b(alu_2),
-
- .out(alu_out),
-
- .op(alu_op),
- .b_add_one(alu_add_one),
- .b_negate(alu_negate),
- .sign(alu_signed),
- .zero_flag(alu_zero)
+ writeback writeback_inst(
+ .clk(clk),
+ .reg_a_write(reg_a_w),
+ .reg_we(reg_we),
+ .reg_write(reg_write),
+ .stage_in(stages_in[WRITEBACK])
);
register_file #(.WIDTH(WIDTH), .ADDRESS_LENGTH(5)) register_file_inst(
@@ 180,17 154,4 @@ module cpu(
.pc(pc[11:0]),
.pc_next(pc_next[11:0])
);
-
- // program_memory program_memory_inst(
- // .addr(pc[11:0]),
- // .instruction(instruction)
- // );
-
- // ram memory_inst(
- // .clk(clk),
- // .a(memory_address),
- // .we(memory_we),
- // .wd(memory_write),
- // .rd(memory_out)
- // );
endmodule
A src/cpu_singlecycle.sv => src/cpu_singlecycle.sv +195 -0
@@ 0,0 1,195 @@
+import cpu_types::*;
+
+// Single-cycle RISC-V CPU core.
+//
+// Every instruction is fetched, decoded, executed and written back within
+// one clock cycle. Program memory and RAM are external and attached through
+// the ports below.
+//
+// Ports:
+//   clk, rst_n          - clock and reset, passed to the program counter
+//   instruction         - instruction word read from program memory at `pc`
+//   pc                  - current program counter (only bits [11:0] are driven
+//                         by the program counter instance)
+//   memory_address      - RAM address, taken from the ALU result
+//   memory_out          - word read from RAM
+//   memory_write        - store data, shifted to the addressed byte lane
+//   memory_byte_enable  - byte lanes selected by the access width and address
+//   memory_we           - RAM write enable from the control unit
+//   ebreak              - set by the control unit; holds the pc while asserted
+//
+// NOTE(review): this module is named `cpu`, the same as the pipelined core,
+// so the two files cannot be compiled into one design together.
+module cpu(
+  input             clk,
+  input             rst_n,
+
+  // program memory
+  input      [31:0] instruction,
+  output reg [31:0] pc,
+
+  // ram
+  output     [31:0] memory_address,
+  input      [31:0] memory_out,
+  output reg [31:0] memory_write,
+  output     [3:0]  memory_byte_enable,
+  output reg        memory_we,
+
+  output            ebreak
+);
+  parameter WIDTH = 32;
+
+  reg [31:0] pc_next;
+  wire pc_src;
+
+  reg [31:0] alu_1, alu_2;
+  wire alu_1_src, alu_2_src;
+
+  wire [2:0] alu_op;
+  wire alu_add_one, alu_negate, alu_signed;
+  wire alu_zero;
+  wire [31:0] alu_out;
+
+  wire [4:0] reg_a_1, reg_a_2, reg_a_w;
+  wire [31:0] reg_rd1, reg_rd2;
+  reg [31:0] reg_write;
+  wire [1:0] reg_write_src;
+  wire reg_we;
+
+  wire [31:0] immediate;
+
+  wire jump_instruction, jump_negate_zero;
+  wire jump_taken;
+
+  wire memory_sign_extension;
+
+  memory_mask_t memory_mask;
+
+  // Narrows `num` to the width selected by `mask`; the upper bits are filled
+  // with the sign bit when `sext` is set, otherwise with zeros.
+  function bit[31:0] mem_sext_maybe;
+    input [31:0] num;
+    input memory_mask_t mask;
+    input sext;
+    begin
+      case(mask)
+        MEM_BYTE: return {{(32 - 8){sext & num[7]}}, num[7:0]};
+        MEM_HALFWORD: return {{(32 - 16){sext & num[15]}}, num[15:0]};
+        MEM_WORD: return num[31:0]; // rv32i, no 64 bit regs, no sign extension needed
+        default: return 0;
+      endcase
+    end
+  endfunction
+
+  // Byte-enable pattern for an access of the given width at byte offset 0.
+  function bit[3:0] mask_to_mask_bytes;
+    input memory_mask_t mask;
+    begin
+      case(mask)
+        MEM_BYTE: return 4'b0001;
+        MEM_HALFWORD: return 4'b0011;
+        MEM_WORD: return 4'b1111;
+        default: return 0;
+      endcase
+    end
+  endfunction
+
+  // the low two address bits select the byte lane inside the 32-bit word
+  assign memory_byte_enable = mask_to_mask_bytes(.mask(memory_mask)) << memory_address[1:0];
+  assign memory_write = reg_rd2 << (8*memory_address[1:0]);
+  assign memory_address = alu_out;
+
+  // alu source 1
+  always_comb begin
+    case (alu_1_src)
+      REG_FILE_RS1 : alu_1 = reg_rd1;
+      PC : alu_1 = pc;
+    endcase
+  end
+
+  // alu source 2
+  always_comb begin
+    case (alu_2_src)
+      REG_FILE_RS2 : alu_2 = reg_rd2;
+      IMMEDIATE : alu_2 = immediate;
+    endcase
+  end
+
+  // pc source
+  assign jump_taken = jump_instruction && (alu_zero ^ jump_negate_zero);
+  always_comb begin
+    if (ebreak)
+      pc_next = pc; // stay on the ebreak instruction
+    else
+      case (pc_src)
+        PC_PLUS : begin
+          if (jump_taken)
+            pc_next = pc + immediate;
+          else
+            pc_next = pc + 4;
+        end
+        PC_ALU : pc_next = alu_out;
+      endcase
+  end
+
+  // register file write source
+  always_comb begin
+    case (reg_write_src)
+      RD_ALU : reg_write = alu_out;
+      RD_PC_PLUS : reg_write = pc + 4;
+      RD_MEMORY : reg_write = mem_sext_maybe(.num(memory_out >> (8*memory_address[1:0])), .mask(memory_mask), .sext(memory_sign_extension));
+      default : ;
+    endcase
+  end
+
+  control_unit control_unit_inst(
+    .instruction(instruction),
+
+    .ebreak(ebreak),
+
+    .immediate(immediate),
+
+    .alu_op(alu_op),
+    .alu_add_one(alu_add_one),
+    .alu_negate(alu_negate),
+    // the control unit port is named alu_sign (renamed from alu_signed)
+    .alu_sign(alu_signed),
+
+    .memory_mask(memory_mask),
+    .memory_sign_extension(memory_sign_extension),
+
+    .memory_we(memory_we),
+
+    .jump_instruction(jump_instruction),
+    .jump_negate_zero(jump_negate_zero),
+
+    .pc_src(pc_src),
+    .alu_src_1(alu_1_src),
+    .alu_src_2(alu_2_src),
+    .reg_rd_src(reg_write_src),
+
+    .reg_rs1(reg_a_1),
+    .reg_rs2(reg_a_2),
+    .reg_rd(reg_a_w),
+    .reg_we(reg_we)
+  );
+
+  alu #(.WIDTH(WIDTH)) alu_inst(
+    .a(alu_1),
+    .b(alu_2),
+
+    .out(alu_out),
+
+    .op(alu_op),
+    .b_add_one(alu_add_one),
+    .b_negate(alu_negate),
+    .sign(alu_signed),
+    .zero_flag(alu_zero)
+  );
+
+  register_file #(.WIDTH(WIDTH), .ADDRESS_LENGTH(5)) register_file_inst(
+    .clk(clk),
+    .a1(reg_a_1),
+    .a2(reg_a_2),
+    .a3(reg_a_w),
+    .we3(reg_we),
+    .wd3(reg_write),
+    .rd1(reg_rd1),
+    .rd2(reg_rd2)
+  );
+
+  program_counter program_counter_inst(
+    .clk(clk),
+    .rst_n(rst_n),
+    .pc(pc[11:0]),
+    .pc_next(pc_next[11:0])
+  );
+
+  // program_memory program_memory_inst(
+  //   .addr(pc[11:0]),
+  //   .instruction(instruction)
+  // );
+
+  // ram memory_inst(
+  //   .clk(clk),
+  //   .a(memory_address),
+  //   .we(memory_we),
+  //   .wd(memory_write),
+  //   .rd(memory_out)
+  // );
+endmodule
M src/cpu_types.sv => src/cpu_types.sv +61 -0
@@ 5,4 5,65 @@ package cpu_types;
typedef enum bit[1:0] { RD_ALU, RD_PC_PLUS, RD_MEMORY } reg_rd_source_t;
typedef enum bit[1:0] { MEM_BYTE, MEM_HALFWORD, MEM_WORD } memory_mask_t;
+
+ typedef struct {
+ bit [31:0] instruction;
+
+ bit [31:0] immediate;
+ bit ebreak;
+
+ alu_1_source_t alu_1_src;
+ alu_2_source_t alu_2_src;
+
+ bit [2:0] alu_op;
+ bit alu_add_one;
+ bit alu_negate;
+ bit alu_sign;
+
+ memory_mask_t memory_mask;
+ bit memory_sign_extension;
+ bit memory_we;
+
+ bit reg_we;
+
+ reg_rd_source_t reg_rd_src;
+
+ } decoded_instruction_t;
+
+ // For pipelining, used in execute, memory, and writeback stages.
+ // The instruction decode stage will check if any tag matches the
+ // address being read from. If yes, it has to be forwarded instead
+ // of getting it from the register. Additionally, if the data
+ // is invalid, stalling will be necessary.
+ typedef struct {
+ bit [4:0] address; // The address the data will be written to
+ bit [31:0] data; // The data to be written to the address
+ bit valid; // Are the data valid? (data will be invalid for memory operations in execute stage)
+ } register_data_status_t;
+
+ typedef struct {
+ register_data_status_t execute_out;
+ register_data_status_t access_out;
+ register_data_status_t writeback_in;
+ } forwarding_data_status_t;
+
+ typedef struct {
+ decoded_instruction_t instruction;
+ register_data_status_t data;
+
+ bit [31:0] pc;
+
+ bit [31:0] reg_rd1;
+ bit [31:0] reg_rd2;
+
+ bit valid;
+ bit ready;
+ // !ready == stall
+ } stage_status_t;
+
+ const int FETCH = 0;
+ const int DECODE = 1;
+ const int EXECUTE = 2;
+ const int ACCESS = 3;
+ const int WRITEBACK = 4;
endpackage
A src/forwarder.sv => src/forwarder.sv +40 -0
@@ 0,0 1,40 @@
+import cpu_types::*;
+
+// Operand forwarder.
+//
+// Selects the value for register `read_address`: either the value read from
+// the register file, or a result that is still travelling through the
+// pipeline and targets the same register.
+//
+// Inputs:
+//   clk                - not used by this purely combinational module
+//   read_address       - register being read; address 0 is never forwarded
+//   register_file_data - value read from the register file
+//   data_in_pipeline   - pending results (execute output, access output,
+//                        writeback input)
+// Outputs:
+//   forwarding - 1 when `data` is taken from the pipeline
+//   stall      - 1 when the matching pipeline entry is not valid yet
+//   data       - the selected value
+module forwarder(
+  input                           clk,
+  input  [4:0]                    read_address,
+  input  [31:0]                   register_file_data,
+  input  forwarding_data_status_t data_in_pipeline,
+
+  // all three outputs are assigned in always_comb, so they must be variables
+  output reg                      forwarding,
+  output reg                      stall,
+  output reg [31:0]               data
+);
+  // if any data in the pipeline match the reading address,
+  // these will be used instead of the register_file_data
+  //
+  // if there are multiple matches, the first one is taken
+  // to get the most recent data
+
+  always_comb begin
+    // defaults: no match, use the register file
+    stall = 0;
+    data = register_file_data;
+    forwarding = 0;
+
+    if (read_address != 0 && data_in_pipeline.execute_out.address == read_address) begin
+      stall = !data_in_pipeline.execute_out.valid;
+      data = data_in_pipeline.execute_out.data;
+      forwarding = 1;
+    end
+    else if (read_address != 0 && data_in_pipeline.access_out.address == read_address) begin
+      stall = !data_in_pipeline.access_out.valid;
+      data = data_in_pipeline.access_out.data;
+      forwarding = 1;
+    end
+    else if (read_address != 0 && data_in_pipeline.writeback_in.address == read_address) begin
+      stall = !data_in_pipeline.writeback_in.valid;
+      data = data_in_pipeline.writeback_in.data;
+      forwarding = 1;
+    end
+  end
+endmodule
A src/jumps.sv => src/jumps.sv +49 -0
@@ 0,0 1,49 @@
+import cpu_types::*;
+
+// Jump and branch resolution.
+//
+// Runs its own ALU on the supplied operands and decides whether the program
+// counter has to be redirected.
+//
+// Inputs:
+//   pc, immediate     - used for the branch target pc + immediate
+//   pc_src            - PC_ALU selects the ALU result as the next pc
+//   jump_instruction  - the instruction is a conditional branch
+//   jump_negate_zero  - inverts the ALU zero flag for the branch condition
+//   alu_*             - operation and operands passed to the ALU
+// Outputs:
+//   pc_next - jump target; X when `jumping` is 0
+//   jumping - 1 when a branch is taken or pc_src is PC_ALU
+module jumps(
+  input  [31:0]      pc,
+  input  [31:0]      immediate,
+  input  pc_source_t pc_src,
+  input              jump_negate_zero,
+  input              jump_instruction,
+
+  input  [2:0]       alu_op,
+  input  [31:0]      alu_a, alu_b,
+  input              alu_sign,
+  input              alu_b_add_one,
+  input              alu_b_negate,
+
+  // assigned in always_comb below, so it must be a variable and not a net
+  output reg [31:0]  pc_next,
+  output             jumping
+);
+  wire [31:0] alu_out;
+  wire alu_zero;
+
+  wire branch_taken;
+
+  assign jumping = branch_taken || pc_src == PC_ALU;
+
+  assign branch_taken = jump_instruction && (alu_zero ^ jump_negate_zero);
+  always_comb begin
+    // don't care unless a jump is actually happening
+    pc_next = 32'bX;
+    case (pc_src)
+      PC_PLUS : begin
+        if (branch_taken)
+          pc_next = pc + immediate;
+      end
+      PC_ALU : pc_next = alu_out;
+    endcase
+  end
+
+  alu #(.WIDTH(32)) alu_inst(
+    .a(alu_a),
+    .b(alu_b),
+    .out(alu_out),
+
+    .op(alu_op),
+    .b_add_one(alu_b_add_one),
+    .b_negate(alu_b_negate),
+    .sign(alu_sign),
+    .zero_flag(alu_zero)
+  );
+endmodule
A src/stages/decode.sv => src/stages/decode.sv +165 -0
@@ 0,0 1,165 @@
+import cpu_types::*;
+
+// Decode stage.
+//
+// Decodes the instruction held in `stage_in` with the control unit, reads the
+// source registers (through the forwarders, so results still in the pipeline
+// are used), resolves jumps and branches, and stalls while a needed operand
+// is not valid yet.
+//
+// Ports:
+//   clk               - passed to the forwarders
+//   data_in_pipeline  - pending results offered for forwarding
+//   reg_a_1, reg_a_2  - register file read addresses
+//   reg_rd1, reg_rd2  - register file read data
+//   jump              - 1 when the pc has to be redirected to `pc_next`
+//   pc_next           - jump target produced by the jumps module
+//   stage_in          - stage input (fetched instruction and its pc)
+//   stage_out         - decoded instruction and operands for the next stage
+module decode(
+  input                           clk,
+
+  input  forwarding_data_status_t data_in_pipeline,
+
+  output [4:0]                    reg_a_1,
+  output [4:0]                    reg_a_2,
+  input  [31:0]                   reg_rd1,
+  input  [31:0]                   reg_rd2,
+
+  output                          jump,
+  output [31:0]                   pc_next,
+
+  input  stage_status_t           stage_in,
+  output stage_status_t           stage_out
+);
+
+  wire [2:0] alu_op;
+  wire alu_add_one;
+  wire alu_negate;
+  wire alu_sign;
+
+  wire [31:0] immediate;
+  wire jump_instruction, jump_negate_zero;
+
+  wire pc_src;
+
+  wire [4:0] reg_rd;
+  wire reg_we;
+
+  alu_1_source_t alu_1_src;
+  alu_2_source_t alu_2_src;
+
+  wire stall_1, stall_2;
+  wire [31:0] forwarded_reg_rd1, forwarded_reg_rd2;
+
+  wire memory_we;
+
+  // declared ahead of every use; driven by the stalling logic at the bottom
+  wire uses_reg_rd1, uses_reg_rd2;
+  wire stalling;
+
+  // while stalling, no destination register is announced (address 0)
+  assign stage_out.data.address = (reg_we && !stalling) ? reg_rd : 0;
+  assign stage_out.data.valid = 0; // the data cannot be valid at this point;
+
+  assign stage_out.pc = stage_in.pc;
+
+  assign stage_out.instruction.reg_we = reg_we;
+
+  assign stage_out.reg_rd1 = forwarded_reg_rd1;
+  assign stage_out.reg_rd2 = forwarded_reg_rd2;
+
+  assign stage_out.instruction.immediate = immediate;
+  assign stage_out.instruction.alu_1_src = alu_1_src;
+  assign stage_out.instruction.alu_2_src = alu_2_src;
+  assign stage_out.instruction.alu_op = alu_op;
+  assign stage_out.instruction.alu_add_one = alu_add_one;
+  assign stage_out.instruction.alu_negate = alu_negate;
+  assign stage_out.instruction.alu_sign = alu_sign;
+  assign stage_out.instruction.memory_we = memory_we;
+
+  control_unit control_unit_inst(
+    .instruction(stage_in.instruction.instruction),
+
+    .ebreak(stage_out.instruction.ebreak),
+
+    .immediate(immediate),
+
+    .alu_op(alu_op),
+    .alu_add_one(alu_add_one),
+    .alu_negate(alu_negate),
+    .alu_sign(alu_sign),
+
+    .memory_mask(stage_out.instruction.memory_mask),
+    .memory_sign_extension(stage_out.instruction.memory_sign_extension),
+
+    .memory_we(memory_we),
+
+    .jump_instruction(jump_instruction),
+    .jump_negate_zero(jump_negate_zero),
+
+    .pc_src(pc_src),
+    .alu_src_1(alu_1_src),
+    .alu_src_2(alu_2_src),
+    .reg_rd_src(stage_out.instruction.reg_rd_src),
+
+    .reg_rs1(reg_a_1),
+    .reg_rs2(reg_a_2),
+    .reg_rd(reg_rd),
+    .reg_we(reg_we)
+  );
+
+  forwarder forwarder_a_inst(
+    .clk(clk),
+    .read_address(reg_a_1),
+    .register_file_data(reg_rd1),
+    .data_in_pipeline(data_in_pipeline),
+    .stall(stall_1),
+    .forwarding(),
+    .data(forwarded_reg_rd1)
+  );
+
+  forwarder forwarder_b_inst(
+    .clk(clk),
+    .read_address(reg_a_2),
+    .register_file_data(reg_rd2),
+    .data_in_pipeline(data_in_pipeline),
+    .stall(stall_2),
+    .forwarding(),
+    .data(forwarded_reg_rd2)
+  );
+
+  // TODO: this is there twice instead of just once
+  // the second is in execute stage. Maybe merge these?
+  // alu source 1
+  reg [31:0] alu_1, alu_2;
+  always_comb begin
+    case (alu_1_src)
+      REG_FILE_RS1 : alu_1 = forwarded_reg_rd1;
+      PC : alu_1 = stage_in.pc;
+    endcase
+  end
+
+  // alu source 2
+  always_comb begin
+    case (alu_2_src)
+      REG_FILE_RS2 : alu_2 = forwarded_reg_rd2;
+      IMMEDIATE : alu_2 = immediate;
+    endcase
+  end
+
+  // jumping logic
+  wire jumps_jumping;
+  jumps jumps_inst(
+    .pc(stage_in.pc),
+    .immediate(immediate),
+    .pc_src(pc_src),
+    .jump_negate_zero(jump_negate_zero),
+    .jump_instruction(jump_instruction),
+
+    .alu_op(alu_op),
+    .alu_a(alu_1),
+    .alu_b(alu_2),
+    .alu_sign(alu_sign),
+    .alu_b_add_one(alu_add_one),
+    .alu_b_negate(alu_negate),
+
+    .pc_next(pc_next),
+    .jumping(jumps_jumping)
+  );
+
+  // a jump is only reported once its operands are available
+  assign jump = !stalling && jumps_jumping;
+
+  // stalling logic
+  // if should use reg_rd1 => wait until stall_1 == 0
+  // if should use reg_rd2 => wait until stall_2 == 0
+  assign uses_reg_rd1 = (alu_1_src == REG_FILE_RS1);
+  assign uses_reg_rd2 = (alu_2_src == REG_FILE_RS2) || memory_we;
+
+  assign stalling = (uses_reg_rd1 && stall_1) || (uses_reg_rd2 && stall_2);
+  assign stage_out.valid = !stalling && stage_in.valid;
+  assign stage_out.ready = !stalling || !stage_in.valid;
+  // if input is not valid, do not care about stalling...
+endmodule
A src/stages/execute.sv => src/stages/execute.sv +51 -0
@@ 0,0 1,51 @@
+import cpu_types::*;
+
+// Execute stage: feeds the operands selected by the decoded instruction into
+// the ALU and publishes the result as the stage's pending register data.
+// The stage is always ready.
+module execute(
+  input                 clk,
+
+  input  stage_status_t stage_in,
+  output stage_status_t stage_out
+);
+  reg  [31:0] operand_a, operand_b;
+  wire [31:0] alu_result;
+
+  // which source the destination register gets its value from
+  wire writes_link   = (stage_in.instruction.reg_rd_src == RD_PC_PLUS);
+  wire waits_for_mem = (stage_in.instruction.reg_rd_src == RD_MEMORY);
+
+  // operand selection for both ALU inputs
+  always_comb begin
+    case (stage_in.instruction.alu_1_src)
+      REG_FILE_RS1 : operand_a = stage_in.reg_rd1;
+      PC           : operand_a = stage_in.pc;
+    endcase
+
+    case (stage_in.instruction.alu_2_src)
+      REG_FILE_RS2 : operand_b = stage_in.reg_rd2;
+      IMMEDIATE    : operand_b = stage_in.instruction.immediate;
+    endcase
+  end
+
+  alu #(.WIDTH(32)) alu_inst(
+    .op(stage_in.instruction.alu_op),
+    .sign(stage_in.instruction.alu_sign),
+    .b_negate(stage_in.instruction.alu_negate),
+    .b_add_one(stage_in.instruction.alu_add_one),
+
+    .a(operand_a),
+    .b(operand_b),
+    .out(alu_result),
+    .zero_flag()
+  );
+
+  // handshake: the stage never stalls
+  assign stage_out.ready = 1'b1;
+  assign stage_out.valid = stage_in.valid;
+
+  // pending register write; a memory load has no valid data yet at this point
+  assign stage_out.data.valid   = stage_in.valid && !waits_for_mem;
+  assign stage_out.data.address = stage_in.valid ? stage_in.data.address : 0;
+  assign stage_out.data.data    = writes_link ? stage_in.pc + 4 : alu_result;
+
+  // everything else is passed through unchanged
+  assign stage_out.reg_rd2     = stage_in.reg_rd2;
+  assign stage_out.reg_rd1     = stage_in.reg_rd1;
+  assign stage_out.pc          = stage_in.pc;
+  assign stage_out.instruction = stage_in.instruction;
+endmodule
A src/stages/fetch.sv => src/stages/fetch.sv +16 -0
@@ 0,0 1,16 @@
+import cpu_types::*;
+
+// Fetch stage: packs the instruction word and its pc into the stage output.
+// The output is marked invalid while `jump` is asserted; the stage is
+// always ready.
+module fetch(
+  input                 clk,
+  input  [31:0]         pc,
+  input  [31:0]         mem_instruction,
+  input                 jump,
+
+  output stage_status_t stage_out
+);
+  // handshake flags
+  assign stage_out.ready = 1'b1;
+  assign stage_out.valid = ~jump;
+
+  // payload
+  assign stage_out.pc                      = pc;
+  assign stage_out.instruction.instruction = mem_instruction;
+endmodule
A src/stages/memory_access.sv => src/stages/memory_access.sv +65 -0
@@ 0,0 1,65 @@
+import cpu_types::*;
+
+// Memory access stage: drives the RAM interface for loads and stores and
+// replaces the pending register data with the loaded value for loads.
+// The stage is always ready.
+module memory_access(
+  input                 clk,
+
+  input  [31:0]         memory_out,
+  output [3:0]          memory_byte_enable,
+  output [31:0]         memory_write,
+  output                memory_we,
+  output [31:0]         memory_address,
+
+  input  stage_status_t stage_in,
+  output stage_status_t stage_out
+);
+
+  // Narrows `num` to the width given by `mask`; the upper bits are filled with
+  // the sign bit when `sext` is set, otherwise with zeros.
+  function bit[31:0] mem_sext_maybe(input [31:0] num, input memory_mask_t mask, input sext);
+    if (mask == MEM_BYTE)
+      return {{24{sext & num[7]}}, num[7:0]};
+    else if (mask == MEM_HALFWORD)
+      return {{16{sext & num[15]}}, num[15:0]};
+    else if (mask == MEM_WORD)
+      return num[31:0]; // rv32i, no 64 bit regs, no sign extension needed
+    else
+      return 0;
+  endfunction
+
+  // Byte-enable pattern for an access of the given width at byte offset 0.
+  function bit[3:0] mask_to_mask_bytes(input memory_mask_t mask);
+    if (mask == MEM_BYTE)
+      return 4'b0001;
+    else if (mask == MEM_HALFWORD)
+      return 4'b0011;
+    else if (mask == MEM_WORD)
+      return 4'b1111;
+    else
+      return 0;
+  endfunction
+
+  wire is_load = (stage_in.instruction.reg_rd_src == RD_MEMORY);
+
+  // RAM interface: the address is the data value carried in from the stage
+  // input, and its low two bits select the byte lane
+  assign memory_address     = stage_in.data.data;
+  assign memory_we          = stage_in.valid ? stage_in.instruction.memory_we : 0;
+  assign memory_write       = stage_in.reg_rd2 << (8*memory_address[1:0]);
+  assign memory_byte_enable = mask_to_mask_bytes(.mask(stage_in.instruction.memory_mask)) << memory_address[1:0];
+
+  // handshake: the stage never stalls
+  assign stage_out.ready = 1;
+  assign stage_out.valid = stage_in.valid;
+
+  // pending register write: loaded value for loads, otherwise passed through
+  assign stage_out.data.address = stage_in.valid ? stage_in.data.address : 0;
+  assign stage_out.data.valid   = stage_in.valid;
+  assign stage_out.data.data    =
+    is_load ?
+      mem_sext_maybe(
+        .num(memory_out >> (8*memory_address[1:0])),
+        .mask(stage_in.instruction.memory_mask),
+        .sext(stage_in.instruction.memory_sign_extension)
+      ) :
+      stage_in.data.data;
+
+  // everything else is passed through unchanged
+  assign stage_out.reg_rd2     = stage_in.reg_rd2;
+  assign stage_out.reg_rd1     = stage_in.reg_rd1;
+  assign stage_out.pc          = stage_in.pc;
+  assign stage_out.instruction = stage_in.instruction;
+endmodule
A src/stages/writeback.sv => src/stages/writeback.sv +16 -0
@@ 0,0 1,16 @@
+import cpu_types::*;
+
+// Writeback stage: turns the pending register data of the stage input into
+// the register file write port signals.
+module writeback(
+
+  input                 clk,
+
+  output [4:0]          reg_a_write,
+  output                reg_we,
+  output [31:0]         reg_write,
+
+  input  stage_status_t stage_in
+);
+  // write only for a valid instruction that writes a register and carries valid data
+  // (stage_in.data.address != 0 is not checked here)
+  assign reg_we = stage_in.instruction.reg_we && stage_in.data.valid && stage_in.valid;
+
+  assign reg_write   = stage_in.data.data;
+  assign reg_a_write = stage_in.data.address;
+endmodule
M testbench/tb_control_unit.sv => testbench/tb_control_unit.sv +2 -2
@@ 16,7 16,7 @@ module tb_control_unit();
alu_2_source_t alu_src_2;
wire [2:0] alu_op;
- wire alu_signed;
+ wire alu_sign;
wire alu_negate;
wire alu_add_one;
@@ 42,7 42,7 @@ module tb_control_unit();
.alu_src_1(alu_src_1),
.alu_src_2(alu_src_2),
.alu_op(alu_op),
- .alu_signed(alu_signed),
+ .alu_sign(alu_sign),
.alu_negate(alu_negate),
.alu_add_one(alu_add_one),
.reg_rs1(reg_rs1),
M tests/comp_list.lst => tests/comp_list.lst +10 -0
@@ 4,6 4,16 @@ src/control_unit.sv
src/alu.sv
src/register_file.sv
src/program_counter.sv
+
+src/forwarder.sv
+src/jumps.sv
+
+src/stages/fetch.sv
+src/stages/decode.sv
+src/stages/execute.sv
+src/stages/memory_access.sv
+src/stages/writeback.sv
+
src/ram.sv
src/cpu.sv
src/file_program_memory.sv