import cpu_types::*;

// Top level of the pipelined CPU core.
//
// Instantiates the fetch, decode, execute, memory_access and writeback
// stages together with the register file and the program counter, and holds
// the pipeline registers (stages_in) that sit between consecutive stages.
//
// Ports:
//   clk, rst_n          - clock and active-low reset (sampled on posedge clk)
//   instruction         - instruction word handed to the fetch stage
//   pc                  - program counter, also handed to the fetch stage
//   memory_*            - data memory interface, wired to the memory_access stage
//   ebreak              - mirrors the ebreak flag of the instruction at the
//                         output of the ACCESS stage
//
// NOTE(review): the stage indices FETCH..WRITEBACK come from cpu_types; this
// module assumes they enumerate 0..4 in pipeline order - confirm there.
module cpu(
  input             clk,
  input             rst_n,

  // program memory
  input [31:0]      instruction,
  output reg [31:0] pc,

  // ram
  output [31:0]     memory_address,
  input [31:0]      memory_out,
  output [31:0]     memory_write,
  output [3:0]      memory_byte_enable,
  output            memory_we,

  output            ebreak
);
  // Only forwarded to the register file; the other buses here are fixed at 32 bits.
  parameter WIDTH = 32;

  reg [31:0]  pc_next;

  // register file connections (read ports driven by decode, write port by writeback)
  wire [4:0]  reg_a_1, reg_a_2, reg_a_w;
  wire [31:0] reg_rd1, reg_rd2;
  reg [31:0]  reg_write;
  wire        reg_we;

  // stall bookkeeping, computed combinationally from stages_out[*].ready
  reg [2:0]  last_non_ready_stage;
  reg        all_stages_ready;

  // jump request and target, both produced by the decode stage
  wire        jump;
  wire [31:0] jumping_pc_next;

  // Pipeline registers: stages_in[k] is the registered input of stage k
  // and is loaded from stages_out[k - 1].
  stage_status_t stages_in[1:4];

  // verilator doesn't like that data taken from stages_out[i]
  // are used in stages_out[i + 1]. But that shouldn't really matter
  // as there is not really a cyclic dependency.
  // It just seems that verilator is not very good at "separating"
  // array elements
/* verilator lint_off UNOPTFLAT */
  stage_status_t stages_out[0:3];
/* verilator lint_on UNOPTFLAT */

  assign ebreak = stages_out[ACCESS].instruction.ebreak;

  // stage registers
  // Non-blocking assignments are used so that sub-modules clocked by the same
  // edge always observe the previous register contents.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      // stages_in is indexed 1..4, hence the + 1 (same mapping as the load
      // loop below). Only the data address field is cleared on reset.
      for (int i = 0; i < $size(stages_in); i++) begin
        stages_in[i + 1].data.address <= '0;
      end
    end
    else begin
      for (int i = 0; i < $size(stages_in); i++) begin
        // Registers fed by a stage in front of the last non-ready stage hold
        // their value (stall); the remaining ones keep advancing.
        if (all_stages_ready || i >= last_non_ready_stage) begin
          stages_in[i + 1] <= stages_out[i];
        end
      end
    end
  end

  // Find the last (highest index) non-ready stage. Stages before it are
  // stalled by the stage register logic above.
  always_comb begin
    last_non_ready_stage = 0;
    all_stages_ready = 1'b1;
    for (int i = 0; i < $size(stages_out); i++) begin
      if (!stages_out[i].ready) begin
        // later iterations overwrite earlier ones, so the highest index wins
        last_non_ready_stage = i[2:0];
        all_stages_ready = 1'b0;
      end
    end
  end

  // Next program counter selection; a jump has priority over everything else.
  always_comb begin
    if (jump)
      pc_next = jumping_pc_next;
    else if (all_stages_ready) // no jump requested: continue with the next sequential word
      pc_next = pc + 4;
    else // stalling (in any stage, meaning not fetching new instructions)
      pc_next = pc;
  end

  // data for forwarding from the stages
  // Note: this is a record instead of an array
  // just because verilator didn't like it as an array
  // consider switching back to array.
  forwarding_data_status_t data_in_pipeline;
  assign data_in_pipeline.execute_out = stages_out[EXECUTE].data;
  assign data_in_pipeline.access_out = stages_out[ACCESS].data;
  assign data_in_pipeline.writeback_in = stages_in[WRITEBACK].data;

  fetch fetch_inst(
    .clk(clk),
    .pc(pc),
    .mem_instruction(instruction),
    .jump(jump),
    .stage_out(stages_out[FETCH])
  );

  decode decode_inst(
    .clk(clk),
    .data_in_pipeline(data_in_pipeline),
    .reg_a_1(reg_a_1),
    .reg_a_2(reg_a_2),
    .reg_rd1(reg_rd1),
    .reg_rd2(reg_rd2),
    .jump(jump),
    .pc_next(jumping_pc_next),
    .stage_in(stages_in[DECODE]),
    .stage_out(stages_out[DECODE])
  );

  execute execute_inst(
    .clk(clk),
    .stage_in(stages_in[EXECUTE]),
    .stage_out(stages_out[EXECUTE])
  );

  memory_access memory_access_inst(
    .clk(clk),
    .memory_out(memory_out),
    .memory_byte_enable(memory_byte_enable),
    .memory_write(memory_write),
    .memory_we(memory_we),
    .memory_address(memory_address),
    .stage_in(stages_in[ACCESS]),
    .stage_out(stages_out[ACCESS])
  );

  // writeback is the last stage and therefore has no stage_out
  writeback writeback_inst(
    .clk(clk),
    .reg_a_write(reg_a_w),
    .reg_we(reg_we),
    .reg_write(reg_write),
    .stage_in(stages_in[WRITEBACK])
  );

  register_file #(.WIDTH(WIDTH), .ADDRESS_LENGTH(5)) register_file_inst(
    .clk(clk),
    .a1(reg_a_1),
    .a2(reg_a_2),
    .a3(reg_a_w),
    .we3(reg_we),
    .wd3(reg_write),
    .rd1(reg_rd1),
    .rd2(reg_rd2)
  );

  // Only the low 12 bits of pc / pc_next are connected here.
  // NOTE(review): no driver for pc[31:12] is visible in this module - confirm
  // that this is intended.
  program_counter program_counter_inst(
    .clk(clk),
    .rst_n(rst_n),
    .pc(pc[11:0]),
    .pc_next(pc_next[11:0])
  );
endmodule