~ruther/verilog-riscv-semestral-project

f8e4e3ed2dc54033786b23aa41cd88ba92eb83e2 — Rutherther 1 year, 3 months ago d4e70aa + 6da6eb9
Merge pull request #1 from Rutherther/feat/pipeline

Implement pipeline
M Makefile => Makefile +8 -2
@@ 25,7 25,7 @@ show: ./waves/$(MODULE).vcd
# These are runtime dependencies, not build time dependencies.
.PRECIOUS: ./programs/bin/%.dat ./programs/bin/%.bin

run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv ./out
run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv src/stages/*.sv ./out
	verilator --binary --trace \
		-GCPU_PROGRAM_PATH="\"./programs/bin/$(PROGRAM).dat\"" \
		-GTRACE_FILE_PATH="\"out/program_$(notdir $(basename $<)).vcd\"" \


@@ 40,6 40,9 @@ run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv 
		src/alu.sv \
		src/register_file.sv \
		src/program_counter.sv \
		src/forwarder.sv \
		src/jumps.sv \
        src/stages/*.sv \
		src/ram.sv \
		src/cpu.sv \
		src/file_program_memory.sv \


@@ 48,7 51,7 @@ run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv 
		--top tb_cpu_program
	./obj_dir/Vtb_cpu_program_$(notdir $(basename $<))

./obj_dir/Vtb_%: testbench/tb_%.sv src/*.sv
./obj_dir/Vtb_%: testbench/tb_%.sv src/*.sv src/stages/*.sv
	verilator --binary --trace \
		--trace-max-array 512 \
		src/cpu_types.sv \


@@ 57,6 60,9 @@ run_program: ./programs/bin/$(PROGRAM).dat testbench/tb_cpu_program.sv src/*.sv 
		src/alu.sv \
		src/register_file.sv \
		src/program_counter.sv \
		src/forwarder.sv \
		src/jumps.sv \
        src/stages/*.sv \
		src/ram.sv \
		src/cpu.sv \
		src/file_program_memory.sv \

M README.md => README.md +41 -0
@@ 2,8 2,49 @@
Available at https://github.com/Rutherther/verilog-riscv-semestral-project

This repository contains a RISC-V processor written in SystemVerilog.
It contains both a single-cycle and a pipelined version.
The pipelined version uses the classic RISC pipeline.

## Architecture
The singlecycle version is located in `src/cpu_singlecycle.sv`.
The pipelined version is in `src/cpu.sv`.

There are five stages in the pipelined version
- Fetch (fetches instruction from memory)
- Decode (decodes the fetched instruction, performs jumps, gets data from forwarder)
- Execute (alu)
- Memory access (loads, stores)
- Writeback (stores data in registers)

Data dependencies are resolved by forwarding whenever possible.
The forwarding is implemented inside of the decode stage,
which supplies the arguments to the execute stage.
If the needed data come from a memory read that has not finished yet,
forwarding is not possible and the pipeline has to stall.
The stalling is implemented using ready flags in each of the stages.
It is thus possible to easily implement a stage that would block
for multiple cycles instead of producing valid data every cycle.
For now, all of the stages take one cycle to produce valid data.

The forwarding is done by keeping address and data known in each stage
inside of the status.data. Outputs of execute, memory access, and
input of writeback are used for forwarding. It would be possible to
skip the writeback forwarding if instead the register file outputted data
to be written instead of the contents of the register until it's actually
written to. I am afraid this could cause other issues, hence I chose forwarding instead.

All stages have valid and ready flags.

The ready flag is used for stalling. If a stage is not ready, no data
can go into it, and the pipeline before it has to be stopped,
including program counter changes. This is used for stalling when waiting for a read
from the memory, but could also be used for making the execute stage more complex,
i.e. making it work for multiple cycles instead of one. It should also be possible to implement
unaligned reads, by reading from two consecutive positions in the memory in two cycles.

Valid flag is for "killing" data that cannot be valid. 
It's utilized when stalling - data from decode are not
valid in that case. When stalling, both valid and ready should be 0.

## Requirements
- make

M flake.nix => flake.nix +1 -1
@@ 19,7 19,7 @@
            };
        in rec {
          devShells.default = pkgs.mkShell {
            name = "pap-processor-singlecycle";
            name = "riscv-sv-processor-toolchain";

            packages = [
              # verilog simulation

M src/control_unit.sv => src/control_unit.sv +2 -2
@@ 25,7 25,7 @@ module control_unit(

  // going to alu
  output [2:0]  alu_op,
  output        alu_signed,
  output        alu_sign,
  output        alu_negate,
  output        alu_add_one,



@@ 106,7 106,7 @@ module control_unit(
  // alu_negate selection: conditional jumps use alu_jump_negate, an ALU
  // override forces 0, everything else uses alu_reg_negate.
  // The constant is written as 1'b0: a sized literal needs a non-zero width,
  // so the previous 0'b0 was not a legal literal.
  assign alu_negate = conditional_jump ? alu_jump_negate :
                      alu_override     ? 1'b0            :
                                         alu_reg_negate;
  assign alu_signed = conditional_jump ? 0'b0 :
  // alu_sign selection: forced to 0 for conditional jumps and for ALU
  // overrides, otherwise taken from alu_reg_signed.
  // The constant is written as 1'b0: a sized literal needs a non-zero width,
  // so the previous 0'b0 was not a legal literal.
  assign alu_sign = conditional_jump ? 1'b0 :
                    alu_override     ? 1'b0 :
                                       alu_reg_signed;


M src/cpu.sv => src/cpu.sv +90 -129
@@ 11,156 11,130 @@ module cpu(
  // ram
  output [31:0]     memory_address,
  input [31:0]      memory_out,
  output reg [31:0] memory_write,
  output [31:0]     memory_write,
  output [3:0]      memory_byte_enable,
  output reg        memory_we,
  output            memory_we,

  output ebreak
  output            ebreak
);
  parameter WIDTH = 32;

  reg [31:0]  pc_next;
  wire        pc_src;

  reg [31:0]  alu_1, alu_2;
  wire        alu_1_src, alu_2_src;

  wire [2:0]  alu_op;
  wire        alu_add_one, alu_negate, alu_signed;
  wire        alu_zero;
  wire [31:0] alu_out;

  wire [4:0]  reg_a_1, reg_a_2, reg_a_w;
  wire [31:0] reg_rd1, reg_rd2;
  reg [31:0]  reg_write;
  wire [1:0]  reg_write_src;
  wire        reg_we;

  wire [31:0] immediate;
  reg [2:0]  last_non_ready_stage;
  reg        all_stages_ready;

  wire        jump_instruction, jump_negate_zero;
  wire        jump_taken;
  wire        jump;
  wire [31:0] jumping_pc_next;

  wire        memory_sign_extension;
  stage_status_t stages_in[1:4];

  memory_mask_t memory_mask;
  /// verilator doesn't like that data taken from stages_out[i]
  // are used in stages_out[i + 1]. But that shouldn't really matter
  // as there is not really a cyclic dependency.
  // It just seems that verilator is not very good at "separating"
  // array elements
/* verilator lint_off UNOPTFLAT */
  stage_status_t stages_out[0:3];
/* verilator lint_on UNOPTFLAT */

  function bit[31:0] mem_sext_maybe;
    input [31:0] num;
    input        memory_mask_t mask;
    input        sext;
    begin
      case(mask)
        MEM_BYTE: return {{(32 - 8){sext & num[7]}}, num[7:0]};
        MEM_HALFWORD: return {{(32 - 16){sext & num[15]}}, num[15:0]};
        MEM_WORD: return num[31:0]; // rv32i, no 64 bit regs, no sign extension needed
        default: return 0;
      endcase
  assign ebreak = stages_out[ACCESS].instruction.ebreak;

  // stage registers
  always_ff @(posedge clk) begin
    if (rst_n == 0) begin
      // stages_in is declared with bounds [1:4], so iterate over its real
      // bounds. Counting from 0 to $size-1 addressed the non-existent
      // element 0 and never cleared the last (WRITEBACK) stage register.
      for (int i = $low(stages_in); i <= $high(stages_in); i++) begin
        stages_in[i].data.address = 0;
      end
    end
  endfunction

  function bit[3:0] mask_to_mask_bytes;
    input memory_mask_t mask;
    begin
      case(mask)
        MEM_BYTE: return 4'b0001;
        MEM_HALFWORD: return 4'b0011;
        MEM_WORD: return 4'b1111;
        default: return 0;
      endcase
    else begin
      for (int i = 0; i < $size(stages_in); i++) begin
        if (all_stages_ready || i >= last_non_ready_stage) begin
          stages_in[i + 1] = stages_out[i];
        end
      end
    end
  endfunction

  assign memory_byte_enable = mask_to_mask_bytes(.mask(memory_mask)) << memory_address[1:0];
  assign memory_write = reg_rd2 << (8*memory_address[1:0]);
  assign memory_address = alu_out;

  // alu source 1
  always_comb begin
    case (alu_1_src)
      REG_FILE_RS1 : alu_1 = reg_rd1;
      PC : alu_1 = pc;
    endcase
  end

  // alu source 2
  // find the last (highest index) non ready stage. Stages before that will be stalled
  always_comb begin
    case (alu_2_src)
      REG_FILE_RS2 : alu_2 = reg_rd2;
      IMMEDIATE : alu_2 = immediate;
    endcase
    last_non_ready_stage = 0;
    all_stages_ready = 1'b1;
    for (int i = 0; i < $size(stages_out); i++) begin
      if (!stages_out[i].ready) begin
        last_non_ready_stage = i[2:0];
        all_stages_ready = 1'b0;
      end
    end
  end

  // pc source
  assign jump_taken = jump_instruction && (alu_zero ^ jump_negate_zero);
  always_comb begin
    if (ebreak)
    if (jump)
      pc_next = jumping_pc_next;
    else if (all_stages_ready) // assume no jump. If jump, if result will be thrown out
      pc_next = pc + 4;
    else // stalling (in any stage, meaning not fetching new instructions)
      pc_next = pc;
    else
      case (pc_src)
        PC_PLUS : begin
          if (jump_taken)
            pc_next = pc + immediate;
          else
            pc_next = pc + 4;
        end
        PC_ALU : pc_next = alu_out;
      endcase
  end

  // register file write source
  // TODO forwarding pipelined, split to two instead
  always_comb begin
    case (reg_write_src)
      RD_ALU : reg_write = alu_out;
      RD_PC_PLUS : reg_write = pc + 4;
      RD_MEMORY : reg_write = mem_sext_maybe(.num(memory_out >> (8*memory_address[1:0])), .mask(memory_mask), .sext(memory_sign_extension));
      default : ;
    endcase
  end
  // data for forwarding from the stages
  // Note: this is a record instead of an array
  // just because verilator didn't like it as an array
  // consider switching back to array.
  forwarding_data_status_t data_in_pipeline;
  assign data_in_pipeline.execute_out = stages_out[EXECUTE].data;
  assign data_in_pipeline.access_out = stages_out[ACCESS].data;
  assign data_in_pipeline.writeback_in = stages_in[WRITEBACK].data;

  control_unit control_unit_inst(
    .instruction(instruction),

    .ebreak(ebreak),

    .immediate(immediate),
  fetch fetch_inst(
    .clk(clk),
    .pc(pc),
    .mem_instruction(instruction),
    .jump(jump),
    .stage_out(stages_out[FETCH])
  );

    .alu_op(alu_op),
    .alu_add_one(alu_add_one),
    .alu_negate(alu_negate),
    .alu_signed(alu_signed),
  decode decode_inst(
    .clk(clk),
    .data_in_pipeline(data_in_pipeline),
    .reg_a_1(reg_a_1),
    .reg_a_2(reg_a_2),
    .reg_rd1(reg_rd1),
    .reg_rd2(reg_rd2),
    .jump(jump),
    .pc_next(jumping_pc_next),
    .stage_in(stages_in[DECODE]),
    .stage_out(stages_out[DECODE])
  );

    .memory_mask(memory_mask),
    .memory_sign_extension(memory_sign_extension),
  execute execute_inst(
    .clk(clk),
    .stage_in(stages_in[EXECUTE]),
    .stage_out(stages_out[EXECUTE])
  );

  memory_access memory_access_inst(
    .clk(clk),
    .memory_out(memory_out),
    .memory_byte_enable(memory_byte_enable),
    .memory_write(memory_write),
    .memory_we(memory_we),

    .jump_instruction(jump_instruction),
    .jump_negate_zero(jump_negate_zero),

    .pc_src(pc_src),
    .alu_src_1(alu_1_src),
    .alu_src_2(alu_2_src),
    .reg_rd_src(reg_write_src),

    .reg_rs1(reg_a_1),
    .reg_rs2(reg_a_2),
    .reg_rd(reg_a_w),
    .reg_we(reg_we)
    .memory_address(memory_address),
    .stage_in(stages_in[ACCESS]),
    .stage_out(stages_out[ACCESS])
  );

  alu #(.WIDTH(WIDTH)) alu_inst(
    .a(alu_1),
    .b(alu_2),

    .out(alu_out),

    .op(alu_op),
    .b_add_one(alu_add_one),
    .b_negate(alu_negate),
    .sign(alu_signed),
    .zero_flag(alu_zero)
  writeback writeback_inst(
    .clk(clk),
    .reg_a_write(reg_a_w),
    .reg_we(reg_we),
    .reg_write(reg_write),
    .stage_in(stages_in[WRITEBACK])
  );

  register_file #(.WIDTH(WIDTH), .ADDRESS_LENGTH(5)) register_file_inst(


@@ 180,17 154,4 @@ module cpu(
    .pc(pc[11:0]),
    .pc_next(pc_next[11:0])
  );

  // program_memory program_memory_inst(
  //   .addr(pc[11:0]),
  //   .instruction(instruction)
  // );

  // ram memory_inst(
  //   .clk(clk),
  //   .a(memory_address),
  //   .we(memory_we),
  //   .wd(memory_write),
  //   .rd(memory_out)
  // );
endmodule

A src/cpu_singlecycle.sv => src/cpu_singlecycle.sv +195 -0
@@ 0,0 1,195 @@
import cpu_types::*;

// Single-cycle RISC-V CPU core.
//
// Every instruction is decoded, executed and written back within one clock
// cycle. Program memory and data RAM live outside of this module and are
// reached through the ports below.
//
// NOTE(review): this module is also called `cpu`, the same name as the
// pipelined core in src/cpu.sv, so only one of the two files can be part of
// a single compilation.
//
// Ports:
//   clk, rst_n          - clock and reset, both handed to the program counter
//   instruction         - instruction word read from program memory at `pc`
//   pc                  - current program counter (only bits [11:0] are
//                         driven by the program counter instance)
//   memory_address      - data memory address (the ALU result)
//   memory_out          - word read from the data memory
//   memory_write        - store data, shifted to the addressed byte lane
//   memory_byte_enable  - byte lanes selected by the access width and offset
//   memory_we           - data memory write enable (from the control unit)
//   ebreak              - set by the control unit; holds the pc when high
module cpu(
  input             clk,
  input             rst_n,

  // program memory
  input [31:0]      instruction,
  output reg [31:0] pc,

  // ram
  output [31:0]     memory_address,
  input [31:0]      memory_out,
  output reg [31:0] memory_write,
  output [3:0]      memory_byte_enable,
  output reg        memory_we,

  output ebreak
);
  parameter WIDTH = 32;

  reg [31:0]  pc_next;
  wire        pc_src;

  reg [31:0]  alu_1, alu_2;
  wire        alu_1_src, alu_2_src;

  wire [2:0]  alu_op;
  wire        alu_add_one, alu_negate, alu_signed;
  wire        alu_zero;
  wire [31:0] alu_out;

  wire [4:0]  reg_a_1, reg_a_2, reg_a_w;
  wire [31:0] reg_rd1, reg_rd2;
  reg [31:0]  reg_write;
  wire [1:0]  reg_write_src;
  wire        reg_we;

  wire [31:0] immediate;

  wire        jump_instruction, jump_negate_zero;
  wire        jump_taken;

  wire        memory_sign_extension;

  memory_mask_t memory_mask;

  // Narrows `num` to the width selected by `mask` and extends it to 32 bits.
  // The upper bits are copies of the value's top bit when `sext` is set,
  // zeros otherwise. Unknown masks yield 0.
  function bit[31:0] mem_sext_maybe;
    input [31:0] num;
    input        memory_mask_t mask;
    input        sext;
    begin
      case(mask)
        MEM_BYTE: return {{(32 - 8){sext & num[7]}}, num[7:0]};
        MEM_HALFWORD: return {{(32 - 16){sext & num[15]}}, num[15:0]};
        MEM_WORD: return num[31:0]; // rv32i, no 64 bit regs, no sign extension needed
        default: return 0;
      endcase
    end
  endfunction

  // Byte-enable pattern of an access of the given width starting at byte 0.
  // Unknown masks enable no bytes.
  function bit[3:0] mask_to_mask_bytes;
    input memory_mask_t mask;
    begin
      case(mask)
        MEM_BYTE: return 4'b0001;
        MEM_HALFWORD: return 4'b0011;
        MEM_WORD: return 4'b1111;
        default: return 0;
      endcase
    end
  endfunction

  // The two lowest address bits select the byte lane inside the 32-bit word:
  // byte enables and store data are shifted up to that lane.
  assign memory_byte_enable = mask_to_mask_bytes(.mask(memory_mask)) << memory_address[1:0];
  assign memory_write = reg_rd2 << (8*memory_address[1:0]);
  assign memory_address = alu_out;

  // alu source 1
  always_comb begin
    case (alu_1_src)
      REG_FILE_RS1 : alu_1 = reg_rd1;
      PC : alu_1 = pc;
    endcase
  end

  // alu source 2
  always_comb begin
    case (alu_2_src)
      REG_FILE_RS2 : alu_2 = reg_rd2;
      IMMEDIATE : alu_2 = immediate;
    endcase
  end

  // pc source
  // A conditional jump is taken when the ALU zero flag, optionally inverted
  // by jump_negate_zero, is set.
  assign jump_taken = jump_instruction && (alu_zero ^ jump_negate_zero);
  always_comb begin
    if (ebreak)
      pc_next = pc; // stay on the ebreak instruction
    else
      case (pc_src)
        PC_PLUS : begin
          if (jump_taken)
            pc_next = pc + immediate;
          else
            pc_next = pc + 4;
        end
        PC_ALU : pc_next = alu_out;
      endcase
  end

  // register file write source
  always_comb begin
    case (reg_write_src)
      RD_ALU : reg_write = alu_out;
      RD_PC_PLUS : reg_write = pc + 4;
      // move the addressed byte lane down to bit 0 before extending
      RD_MEMORY : reg_write = mem_sext_maybe(.num(memory_out >> (8*memory_address[1:0])), .mask(memory_mask), .sext(memory_sign_extension));
      default : ;
    endcase
  end

  control_unit control_unit_inst(
    .instruction(instruction),

    .ebreak(ebreak),

    .immediate(immediate),

    .alu_op(alu_op),
    .alu_add_one(alu_add_one),
    .alu_negate(alu_negate),
    // the control unit port is named alu_sign (it used to be alu_signed)
    .alu_sign(alu_signed),

    .memory_mask(memory_mask),
    .memory_sign_extension(memory_sign_extension),

    .memory_we(memory_we),

    .jump_instruction(jump_instruction),
    .jump_negate_zero(jump_negate_zero),

    .pc_src(pc_src),
    .alu_src_1(alu_1_src),
    .alu_src_2(alu_2_src),
    .reg_rd_src(reg_write_src),

    .reg_rs1(reg_a_1),
    .reg_rs2(reg_a_2),
    .reg_rd(reg_a_w),
    .reg_we(reg_we)
  );

  alu #(.WIDTH(WIDTH)) alu_inst(
    .a(alu_1),
    .b(alu_2),

    .out(alu_out),

    .op(alu_op),
    .b_add_one(alu_add_one),
    .b_negate(alu_negate),
    .sign(alu_signed),
    .zero_flag(alu_zero)
  );

  register_file #(.WIDTH(WIDTH), .ADDRESS_LENGTH(5)) register_file_inst(
    .clk(clk),
    .a1(reg_a_1),
    .a2(reg_a_2),
    .a3(reg_a_w),
    .we3(reg_we),
    .wd3(reg_write),
    .rd1(reg_rd1),
    .rd2(reg_rd2)
  );

  // only the 12 lowest pc bits are connected to the program counter
  program_counter program_counter_inst(
    .clk(clk),
    .rst_n(rst_n),
    .pc(pc[11:0]),
    .pc_next(pc_next[11:0])
  );

  // program_memory program_memory_inst(
  //   .addr(pc[11:0]),
  //   .instruction(instruction)
  // );

  // ram memory_inst(
  //   .clk(clk),
  //   .a(memory_address),
  //   .we(memory_we),
  //   .wd(memory_write),
  //   .rd(memory_out)
  // );
endmodule

M src/cpu_types.sv => src/cpu_types.sv +61 -0
@@ 5,4 5,65 @@ package cpu_types;
  typedef enum bit[1:0] { RD_ALU, RD_PC_PLUS, RD_MEMORY } reg_rd_source_t;

  typedef enum bit[1:0] { MEM_BYTE, MEM_HALFWORD, MEM_WORD } memory_mask_t;

  // Control signals of one instruction as they travel through the pipeline.
  // The decode stage fills most of these from the control unit outputs; the
  // execute and memory access stages pass the whole record on unchanged.
  typedef struct {
    bit [31:0] instruction; // raw instruction word (set by the fetch stage)

    bit [31:0] immediate; // immediate value produced by the control unit
    bit ebreak; // ebreak flag produced by the control unit

    // operand selection for the ALU
    alu_1_source_t alu_1_src;
    alu_2_source_t alu_2_src;

    // ALU operation and its modifiers
    bit [2:0] alu_op;
    bit alu_add_one;
    bit alu_negate;
    bit alu_sign;

    // data memory access
    memory_mask_t memory_mask; // access width (byte / halfword / word)
    bit memory_sign_extension; // extend loaded value with its top bit
    bit memory_we; // write enable (store)

    bit reg_we; // register file write enable

    reg_rd_source_t reg_rd_src; // where the value written to the register comes from

  } decoded_instruction_t;

  // For pipelining, used in the execute, memory, and writeback stages.
  // The instruction decode stage checks whether any address in the pipeline
  // matches the address being read from. If yes, the data have to be forwarded
  // instead of being read from the register file. Additionally, if the data
  // are not valid yet, stalling is necessary.
  typedef struct {
    bit [4:0]  address; // The address the data will be written to
    bit [31:0] data; // The data to be written to the address
    bit        valid; // Are the data valid? (data will be invalid for memory operations in execute stage)
  } register_data_status_t;

  // The three places in the pipeline that data can be forwarded from.
  // The forwarder checks them in this order: execute_out, access_out,
  // writeback_in, and uses the first one whose address matches.
  typedef struct {
    register_data_status_t execute_out; // output of the execute stage
    register_data_status_t access_out; // output of the memory access stage
    register_data_status_t writeback_in; // input of the writeback stage
  } forwarding_data_status_t;

  // Everything that is handed from one pipeline stage to the next one.
  typedef struct {
    decoded_instruction_t instruction; // control signals of the instruction
    register_data_status_t data; // destination register, its data and data validity

    bit [31:0] pc; // program counter of the instruction

    // register operands (the decode stage stores the forwarded values here)
    bit [31:0] reg_rd1;
    bit [31:0] reg_rd2;

    bit valid; // 0 = the contents of this record must be ignored ("killed")
    bit ready; // 0 = the stage producing this record cannot proceed
    // !ready == stall
  } stage_status_t;

  // Indices of the pipeline stages, in pipeline order.
  // src/cpu.sv uses them to index its stages_in / stages_out arrays.
  const int FETCH = 0;
  const int DECODE = 1;
  const int EXECUTE = 2;
  const int ACCESS = 3;
  const int WRITEBACK = 4;
endpackage

A src/forwarder.sv => src/forwarder.sv +40 -0
@@ 0,0 1,40 @@
import cpu_types::*;

// Forwarding unit for one register read port.
//
// Compares `read_address` with the destination addresses of the results
// that are still in the pipeline and, on a match, supplies that result
// instead of the value read from the register file.
//
// Ports:
//   clk                 - not used by the logic below
//   read_address        - register address being read
//   register_file_data  - value the register file returned for that address
//   data_in_pipeline    - results currently in the pipeline
//   forwarding          - 1 when `data` comes from the pipeline
//   stall               - 1 when the matching result is not valid yet
//   data                - value to use for the register read
module forwarder(
  input             clk,
  input [4:0]       read_address,
  input [31:0]      register_file_data,
  input             forwarding_data_status_t data_in_pipeline,

  // `forwarding` is assigned inside always_comb, so it has to be a variable
  // just like `stall` and `data`; a plain `output` would declare a net.
  output reg        forwarding,
  output reg        stall,
  output reg [31:0] data
);
  // if any data in the pipeline match the reading address,
  // these will be used instead of the register_file_data
  //
  // if there are multiple matches, the first one is taken
  // to get the most recent data
  //
  // address 0 is never forwarded

  always_comb begin
    // defaults: no match, use the register file
    stall = 0;
    data = register_file_data;
    forwarding = 0;

    if (read_address != 0 && data_in_pipeline.execute_out.address == read_address) begin
      stall = !data_in_pipeline.execute_out.valid;
      data = data_in_pipeline.execute_out.data;
      forwarding = 1;
    end
    else if (read_address != 0 && data_in_pipeline.access_out.address == read_address) begin
      stall = !data_in_pipeline.access_out.valid;
      data = data_in_pipeline.access_out.data;
      forwarding = 1;
    end
    else if (read_address != 0 && data_in_pipeline.writeback_in.address == read_address) begin
      stall = !data_in_pipeline.writeback_in.valid;
      data = data_in_pipeline.writeback_in.data;
      forwarding = 1;
    end
  end
endmodule

A src/jumps.sv => src/jumps.sv +49 -0
@@ 0,0 1,49 @@
import cpu_types::*;

// Jump / branch resolution.
//
// Contains its own ALU instance, which evaluates the branch condition and
// computes the target for PC_ALU jumps.
//
// Ports:
//   pc, immediate        - address of the instruction and its immediate
//   pc_src               - PC_PLUS (pc relative) or PC_ALU (target from ALU)
//   jump_instruction     - the instruction is a conditional jump
//   jump_negate_zero     - invert the ALU zero flag for the branch condition
//   alu_*                - operation and operands for the internal ALU
//   pc_next              - jump target; X when `jumping` is 0
//   jumping              - 1 when the program counter has to be redirected
module jumps(
  input [31:0]  pc,
  input [31:0]  immediate,
  input         pc_source_t pc_src,
  input         jump_negate_zero,
  input         jump_instruction,

  input [2:0]   alu_op,
  input [31:0]  alu_a, alu_b,
  input         alu_sign,
  input         alu_b_add_one,
  input         alu_b_negate,

  // assigned inside always_comb, so it has to be a variable, not a net
  output reg [31:0] pc_next,
  output        jumping
);
  wire [31:0] alu_out;
  wire        alu_zero;

  wire        branch_taken;

  // redirect on a taken branch or on any PC_ALU instruction
  assign jumping = branch_taken || pc_src == PC_ALU;

  assign branch_taken = jump_instruction && (alu_zero ^ jump_negate_zero);
  always_comb begin
    // don't care unless one of the cases below applies
    pc_next = 32'bX;
    case (pc_src)
      PC_PLUS : begin
        if (branch_taken)
          pc_next = pc + immediate;
      end
      PC_ALU : pc_next = alu_out;
    endcase
  end

  alu #(.WIDTH(32)) alu_inst(
    .a(alu_a),
    .b(alu_b),
    .out(alu_out),

    .op(alu_op),
    .b_add_one(alu_b_add_one),
    .b_negate(alu_b_negate),
    .sign(alu_sign),
    .zero_flag(alu_zero)
  );
endmodule

A src/stages/decode.sv => src/stages/decode.sv +165 -0
@@ 0,0 1,165 @@
import cpu_types::*;

// Decode stage.
//
// Decodes the instruction in `stage_in` with the control unit, reads the
// register operands (replacing them with forwarded values where the
// pipeline holds newer data), resolves jumps and requests a stall when an
// operand that is needed is not valid yet.
//
// Ports:
//   clk               - only handed to the forwarders
//   data_in_pipeline  - results in later stages, used for forwarding
//   reg_a_1, reg_a_2  - register file read addresses
//   reg_rd1, reg_rd2  - register file read data
//   jump              - 1 when the program counter has to be set to pc_next
//   pc_next           - jump target
//   stage_in          - output of the fetch stage (registered)
//   stage_out         - decoded instruction for the execute stage;
//                       stage_out.data.data is not driven by this stage
module decode(
  input         clk,

  input         forwarding_data_status_t data_in_pipeline,

  output [4:0]  reg_a_1,
  output [4:0]  reg_a_2,
  input [31:0]  reg_rd1,
  input [31:0]  reg_rd2,

  output        jump,
  output [31:0] pc_next,

  input         stage_status_t stage_in,
  output        stage_status_t stage_out
);

  wire [2:0] alu_op;
  wire       alu_add_one;
  wire       alu_negate;
  wire       alu_sign;

  wire [31:0] immediate;
  wire        jump_instruction, jump_negate_zero;

  wire        pc_src;

  wire [4:0]  reg_rd;
  wire        reg_we;

  alu_1_source_t alu_1_src;
  alu_2_source_t alu_2_src;

  wire        stall_1, stall_2;
  wire [31:0] forwarded_reg_rd1, forwarded_reg_rd2;

  wire        memory_we;

  // Declared here, before the first use; the values are assigned in the
  // stalling logic at the end of the module.
  wire        uses_reg_rd1, uses_reg_rd2;
  wire        stalling;

  // Destination register of this instruction; 0 when nothing is written
  // or when the instruction does not leave the stage in this cycle.
  assign stage_out.data.address = reg_we && !stalling ? reg_rd : 0;
  assign stage_out.data.valid = 0; // the data cannot be valid at this point;

  assign stage_out.pc = stage_in.pc;

  assign stage_out.instruction.reg_we = reg_we;

  assign stage_out.reg_rd1 = forwarded_reg_rd1;
  assign stage_out.reg_rd2 = forwarded_reg_rd2;

  assign stage_out.instruction.immediate = immediate;
  assign stage_out.instruction.alu_1_src = alu_1_src;
  assign stage_out.instruction.alu_2_src = alu_2_src;
  assign stage_out.instruction.alu_op = alu_op;
  assign stage_out.instruction.alu_add_one = alu_add_one;
  assign stage_out.instruction.alu_negate = alu_negate;
  assign stage_out.instruction.alu_sign = alu_sign;
  assign stage_out.instruction.memory_we = memory_we;

  control_unit control_unit_inst(
    .instruction(stage_in.instruction.instruction),

    .ebreak(stage_out.instruction.ebreak),

    .immediate(immediate),

    .alu_op(alu_op),
    .alu_add_one(alu_add_one),
    .alu_negate(alu_negate),
    .alu_sign(alu_sign),

    .memory_mask(stage_out.instruction.memory_mask),
    .memory_sign_extension(stage_out.instruction.memory_sign_extension),

    .memory_we(memory_we),

    .jump_instruction(jump_instruction),
    .jump_negate_zero(jump_negate_zero),

    .pc_src(pc_src),
    .alu_src_1(alu_1_src),
    .alu_src_2(alu_2_src),
    .reg_rd_src(stage_out.instruction.reg_rd_src),

    .reg_rs1(reg_a_1),
    .reg_rs2(reg_a_2),
    .reg_rd(reg_rd),
    .reg_we(reg_we)
  );

  forwarder forwarder_a_inst(
    .clk(clk),
    .read_address(reg_a_1),
    .register_file_data(reg_rd1),
    .data_in_pipeline(data_in_pipeline),
    .stall(stall_1),
    .forwarding(),
    .data(forwarded_reg_rd1)
  );

  forwarder forwarder_b_inst(
    .clk(clk),
    .read_address(reg_a_2),
    .register_file_data(reg_rd2),
    .data_in_pipeline(data_in_pipeline),
    .stall(stall_2),
    .forwarding(),
    .data(forwarded_reg_rd2)
  );

  // TODO: this is there twice instead of just once
  // the second is in execute stage. Maybe merge these?
  // alu source 1
  reg [31:0] alu_1, alu_2;
  always_comb begin
    case (alu_1_src)
      REG_FILE_RS1 : alu_1 = forwarded_reg_rd1;
      PC : alu_1 = stage_in.pc;
    endcase
  end

  // alu source 2
  always_comb begin
    case (alu_2_src)
      REG_FILE_RS2 : alu_2 = forwarded_reg_rd2;
      IMMEDIATE : alu_2 = immediate;
    endcase
  end

  // jumping logic
  wire jumps_jumping;
  jumps jumps_inst(
    .pc(stage_in.pc),
    .immediate(immediate),
    .pc_src(pc_src),
    .jump_negate_zero(jump_negate_zero),
    .jump_instruction(jump_instruction),

    .alu_op(alu_op),
    .alu_a(alu_1),
    .alu_b(alu_2),
    .alu_sign(alu_sign),
    .alu_b_add_one(alu_add_one),
    .alu_b_negate(alu_negate),

    .pc_next(pc_next),
    .jumping(jumps_jumping)
  );

  // Jump only for an instruction that is valid and not stalled. The fetch
  // stage marks the instruction fetched during a jump as invalid; without
  // the stage_in.valid check such a killed instruction could itself
  // redirect the program counter if it happens to decode as a jump.
  assign jump = stage_in.valid && !stalling && jumps_jumping;

  // stalling logic
  //   if should use reg_rd1 => wait until stall_1 == 0
  //   if should use reg_rd2 => wait until stall_2 == 0
  assign uses_reg_rd1 = (alu_1_src == REG_FILE_RS1);
  // stores read rs2 as the data to write, even if the ALU uses the immediate
  assign uses_reg_rd2 = (alu_2_src == REG_FILE_RS2) || memory_we;

  assign stalling = (uses_reg_rd1 && stall_1) || (uses_reg_rd2 && stall_2);
  assign stage_out.valid = !stalling && stage_in.valid;
  assign stage_out.ready = !stalling || !stage_in.valid;
    // if input is not valid, do not care about stalling...
endmodule

A src/stages/execute.sv => src/stages/execute.sv +51 -0
@@ 0,0 1,51 @@
import cpu_types::*;

// Execute stage: picks the two ALU operands, runs the ALU and publishes the
// result as the (possibly already valid) destination register data.
module execute(
  input clk,

  input stage_status_t stage_in,
  output stage_status_t stage_out
);
  reg [31:0]  operand_a, operand_b;
  wire [31:0] alu_result;

  wire        writes_pc_plus;
  wire        writes_memory_data;

  assign writes_pc_plus = (stage_in.instruction.reg_rd_src == RD_PC_PLUS);
  assign writes_memory_data = (stage_in.instruction.reg_rd_src == RD_MEMORY);

  // operand selection for both ALU inputs
  always_comb begin
    case (stage_in.instruction.alu_1_src)
      REG_FILE_RS1 : operand_a = stage_in.reg_rd1;
      PC : operand_a = stage_in.pc;
    endcase

    case (stage_in.instruction.alu_2_src)
      REG_FILE_RS2 : operand_b = stage_in.reg_rd2;
      IMMEDIATE : operand_b = stage_in.instruction.immediate;
    endcase
  end

  alu #(.WIDTH(32)) alu_inst(
    .a(operand_a),
    .b(operand_b),
    .out(alu_result),

    .op(stage_in.instruction.alu_op),
    .b_add_one(stage_in.instruction.alu_add_one),
    .b_negate(stage_in.instruction.alu_negate),
    .sign(stage_in.instruction.alu_sign),
    .zero_flag()
  );

  // fields that pass through untouched
  assign stage_out.instruction = stage_in.instruction;
  assign stage_out.pc = stage_in.pc;
  assign stage_out.reg_rd1 = stage_in.reg_rd1;
  assign stage_out.reg_rd2 = stage_in.reg_rd2;

  // destination register status: a killed instruction announces address 0,
  // and data that still have to come from the memory are not valid yet
  assign stage_out.data.address = stage_in.valid ? stage_in.data.address : 0;
  assign stage_out.data.data = writes_pc_plus ? stage_in.pc + 4 : alu_result;
  assign stage_out.data.valid = stage_in.valid && !writes_memory_data;

  // this stage never stalls
  assign stage_out.valid = stage_in.valid;
  assign stage_out.ready = 1'b1;
endmodule

A src/stages/fetch.sv => src/stages/fetch.sv +16 -0
@@ 0,0 1,16 @@
import cpu_types::*;

// Fetch stage: packs the current pc and the instruction word read from the
// program memory into the record handed to the decode stage.
module fetch(
  input        clk,
  input [31:0] pc,
  input [31:0] mem_instruction,
  input        jump,

  output       stage_status_t stage_out
);
  // always able to deliver; the fetched instruction is killed when a jump
  // is being performed in this cycle
  assign stage_out.ready = 1'b1;
  assign stage_out.valid = ~jump;

  assign stage_out.pc = pc;
  assign stage_out.instruction.instruction = mem_instruction;
endmodule

A src/stages/memory_access.sv => src/stages/memory_access.sv +65 -0
@@ 0,0 1,65 @@
import cpu_types::*;

// Memory access stage: drives the data memory interface for loads and
// stores and replaces the result with the loaded value for instructions
// whose register write source is the memory.
module memory_access(
  input         clk,

  input [31:0]  memory_out,
  output [3:0]  memory_byte_enable,
  output [31:0] memory_write,
  output        memory_we,
  output [31:0] memory_address,

  input         stage_status_t stage_in,
  output        stage_status_t stage_out
);

  // Cuts `num` down to the width given by `mask` and widens it back to
  // 32 bits, filling with the value's top bit when `sext` is set and with
  // zeros otherwise.
  function bit [31:0] mem_sext_maybe(
    input [31:0] num,
    input        memory_mask_t mask,
    input        sext
  );
    case (mask)
      MEM_BYTE: return {{(32 - 8){sext & num[7]}}, num[7:0]};
      MEM_HALFWORD: return {{(32 - 16){sext & num[15]}}, num[15:0]};
      MEM_WORD: return num[31:0]; // full word, nothing to extend on rv32i
      default: return 0;
    endcase
  endfunction

  // Byte lanes touched by an access of the given width at byte offset 0.
  function bit [3:0] mask_to_mask_bytes(
    input memory_mask_t mask
  );
    case (mask)
      MEM_BYTE: return 4'b0001;
      MEM_HALFWORD: return 4'b0011;
      MEM_WORD: return 4'b1111;
      default: return 0;
    endcase
  endfunction

  wire        is_load;
  wire [31:0] loaded_value;

  // memory interface; the address is the result of the execute stage
  assign memory_address = stage_in.data.data;
  assign memory_we = stage_in.valid && stage_in.instruction.memory_we;
  assign memory_write = stage_in.reg_rd2 << (8*memory_address[1:0]);
  assign memory_byte_enable = mask_to_mask_bytes(.mask(stage_in.instruction.memory_mask)) << memory_address[1:0];

  // value read from the memory, moved down to bit 0 and extended
  assign is_load = (stage_in.instruction.reg_rd_src == RD_MEMORY);
  assign loaded_value = mem_sext_maybe(
    .num(memory_out >> (8*memory_address[1:0])),
    .mask(stage_in.instruction.memory_mask),
    .sext(stage_in.instruction.memory_sign_extension)
  );

  // fields that pass through untouched
  assign stage_out.instruction = stage_in.instruction;
  assign stage_out.pc = stage_in.pc;
  assign stage_out.reg_rd1 = stage_in.reg_rd1;
  assign stage_out.reg_rd2 = stage_in.reg_rd2;

  // destination register status; a killed instruction announces address 0
  assign stage_out.data.valid = stage_in.valid;
  assign stage_out.data.address = stage_in.valid ? stage_in.data.address : 0;
  assign stage_out.data.data = is_load ? loaded_value : stage_in.data.data;

  // this stage never stalls
  assign stage_out.valid = stage_in.valid;
  assign stage_out.ready = 1'b1;
endmodule

A src/stages/writeback.sv => src/stages/writeback.sv +16 -0
@@ 0,0 1,16 @@
import cpu_types::*;

// Writeback stage: turns the record arriving from the memory access stage
// into the write port signals of the register file.
module writeback(

  input         clk,

  output [4:0]  reg_a_write,
  output        reg_we,
  output [31:0] reg_write,

  input         stage_status_t stage_in
);
  wire result_usable;

  // both the instruction and its result have to be valid
  assign result_usable = stage_in.valid && stage_in.data.valid;

  assign reg_write = stage_in.data.data;
  assign reg_a_write = stage_in.data.address;
  assign reg_we = result_usable && stage_in.instruction.reg_we;
endmodule

M testbench/tb_control_unit.sv => testbench/tb_control_unit.sv +2 -2
@@ 16,7 16,7 @@ module tb_control_unit();
  alu_2_source_t alu_src_2;

  wire [2:0]  alu_op;
  wire        alu_signed;
  wire        alu_sign;
  wire        alu_negate;
  wire        alu_add_one;



@@ 42,7 42,7 @@ module tb_control_unit();
    .alu_src_1(alu_src_1),
    .alu_src_2(alu_src_2),
    .alu_op(alu_op),
    .alu_signed(alu_signed),
    .alu_sign(alu_sign),
    .alu_negate(alu_negate),
    .alu_add_one(alu_add_one),
    .reg_rs1(reg_rs1),

M tests/comp_list.lst => tests/comp_list.lst +10 -0
@@ 4,6 4,16 @@ src/control_unit.sv
src/alu.sv
src/register_file.sv
src/program_counter.sv

src/forwarder.sv
src/jumps.sv

src/stages/fetch.sv
src/stages/decode.sv
src/stages/execute.sv
src/stages/memory_access.sv
src/stages/writeback.sv

src/ram.sv
src/cpu.sv
src/file_program_memory.sv

Do not follow this link