-- BUGS: None known.

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

-- Line_Rasterization: walks a 3D line between two endpoints and emits one
-- (x, y, z) pixel per step to the screen buffer.
entity Line_Rasterization is
  port (
    -- clock and reset
    CLK       : in  std_logic;                      -- registers load on the rising edge
    nRST      : in  std_logic;                      -- asynchronous reset, active low

    -- From 3d_projector: line endpoints, one byte per coordinate,
    -- packed MSB first as x1 y1 z1 x2 y2 z2
    DATA      : in  std_logic_vector(47 downto 0);

    -- From screen buffer: handshake input
    SB_BUSY   : in  std_logic;

    -- To screen buffer: pixel packed as x (23..16), y (15..8), z (7..0);
    -- both outputs are driven to 'Z' while the unit waits for an opcode
    RESULT    : out std_logic_vector(23 downto 0);
    SB_ENABLE : out std_logic;                      -- '1' while a pixel is being offered

    -- From controller
    ENABLE    : in  std_logic;                      -- '0' synchronously clears all registers
    BUSY      : out std_logic;                      -- '1' in every state except the idle state
    OPCODE    : in  std_logic_vector(1 downto 0));  -- any value other than "00" starts a line
end Line_Rasterization;

architecture GPU of Line_Rasterization is

-- maximum: select the larger of two signed operands.
--   a, b    : operands, compared with the numeric_std signed ">" operator
--   returns : a when a > b; otherwise b (b is therefore returned on a tie)
function maximum (
  a, b : signed)
  return signed is
begin
  if a > b then
    return a;
  end if;
  return b;
end function maximum;

-- minimum: select the smaller of two signed operands.
--   a, b    : operands, compared with the numeric_std signed "<" operator
--   returns : a when a < b; otherwise b (b is therefore returned on a tie)
function minimum (
  a, b : signed)
  return signed is
begin
  if a < b then
    return a;
  end if;
  return b;
end function minimum;


-- signed_divide: divider for two's-complement 16.16 fixed-point operands.
-- The implementation lives outside this file.  According to the original
-- author's note it pre-shifts the numerator so that an accurate fixed-point
-- quotient is produced without needing 64-bit signals.
component signed_divide is
  port (
    X : in  std_logic_vector(31 downto 0);   -- numerator,   16.16
    Y : in  std_logic_vector(31 downto 0);   -- denominator, 16.16
    Z : out std_logic_vector(31 downto 0);   -- quotient,    16.16
    V : out std_logic);                      -- overflow (or divide by zero)
end component;
  
-- Controller states.  The ...X states step along x, the ...Y states along y.
type state_type is (
  ST_WAIT_FOR_OPCODE,   -- idle until OPCODE /= "00"
  ST_LOOPOVERX,         -- pixel offered to the screen buffer, x-major line
  ST_LOOPOVERY,         -- pixel offered to the screen buffer, y-major line
  ST_WAITFORSENDX,      -- waiting for SB_BUSY = '0', x-major line
  ST_WAITFORSENDY);     -- waiting for SB_BUSY = '0', y-major line

-- current / next controller state
signal state     : state_type;
signal nextstate : state_type;

-- ports of the two signed_divide instances (LINE_DIV and LINE_DIV2)
signal DIV_X,  DIV_Y,  DIV_Z  : std_logic_vector(31 downto 0);
signal DIV_V                  : std_logic;
signal DIV2_X, DIV2_Y, DIV2_Z : std_logic_vector(31 downto 0);
signal DIV2_V                 : std_logic;

-- line endpoints (x1,y1,z1) and (x2,y2,z2) as 16.16 fixed point; each is
-- one byte of the packed endpoint word placed in bits 23..16
signal x1, y1, z1 : signed(31 downto 0);
signal x2, y2, z2 : signed(31 downto 0);

-- registered copy of the DATA input and its next value
signal storedDATA : std_logic_vector(47 downto 0);
signal nextDATA   : std_logic_vector(47 downto 0);

-- registered copy of the RESULT output and its next value
signal storedRESULT : std_logic_vector(23 downto 0);
signal nextRESULT   : std_logic_vector(23 downto 0);

-- slopes (16.16): slopeAperB32 is dA/dB, i.e. the "m" in a = m*b + c
signal slopeyperx32, nextslopeyperx32 : signed(31 downto 0);
signal slopezperx32, nextslopezperx32 : signed(31 downto 0);
signal slopexpery32, nextslopexpery32 : signed(31 downto 0);
signal slopezpery32, nextslopezpery32 : signed(31 downto 0);

-- intercepts (64 bit, 32 fractional bits): AIntforB is the "c" in
-- a = m*b + c, i.e. the value of A where B is zero
signal yIntforX, zIntforX : signed(63 downto 0);
signal xIntforY, zIntforY : signed(63 downto 0);

-- stepping coordinate: advanced by 1.0 per pixel, the other two coordinates
-- are computed from it
signal currX, nextX : signed(31 downto 0);
signal currY, nextY : signed(31 downto 0);

-- computed coordinates (slope * stepping coordinate + intercept + 0.5)
signal X : signed(63 downto 0);
signal Y : signed(63 downto 0);
signal Z : signed(63 downto 0);

begin  -- GPU

  -- Two divider instances run side by side: LINE_DIV produces the in-plane
  -- slope (dy/dx or dx/dy), LINE_DIV2 the depth slope (dz/dx or dz/dy).
  LINE_DIV : signed_divide
    port map (
      X => DIV_X,
      Y => DIV_Y,
      Z => DIV_Z,
      V => DIV_V);

  LINE_DIV2 : signed_divide
    port map (
      X => DIV2_X,
      Y => DIV2_Y,
      Z => DIV2_Z,
      V => DIV2_V);

  -- Unpack the endpoint word into six 16.16 fixed-point values.
  -- nextDATA = MSB<x1 y1 z1 x2 y2 z2>LSB, one byte each; every byte lands in
  -- bits 23..16 with zeros above and below, so all coordinates are
  -- non-negative whole numbers.
  -- first endpoint
  x1 <= signed(x"00" & nextDATA(47 downto 40) & x"0000");
  y1 <= signed(x"00" & nextDATA(39 downto 32) & x"0000");
  z1 <= signed(x"00" & nextDATA(31 downto 24) & x"0000");
  -- second endpoint
  x2 <= signed(x"00" & nextDATA(23 downto 16) & x"0000");
  y2 <= signed(x"00" & nextDATA(15 downto  8) & x"0000");
  z2 <= signed(x"00" & nextDATA( 7 downto  0) & x"0000");

  -- Intercepts, c = a - m*b evaluated at the first endpoint.  The endpoint
  -- value is widened to 64 bits and shifted up by 16 so that its binary
  -- point lines up with the 64-bit slope * coordinate product.
  -- x-major line: y and z as functions of x
  yIntforX <= (x"0000" & y1 & x"0000") - (slopeyperx32 * x1);
  zIntforX <= (x"0000" & z1 & x"0000") - (slopezperx32 * x1);
  -- y-major line: x and z as functions of y
  xIntforY <= (x"0000" & x1 & x"0000") - (slopexpery32 * y1);
  zIntforY <= (x"0000" & z1 & x"0000") - (slopezpery32 * y1);

  -- Register bank of the controller.
  --   nRST = '0'   : asynchronous clear of every register
  --   ENABLE = '0' : the same clear, applied on the rising clock edge
  --   otherwise    : every register loads its next* value on the rising edge
  -- Cleared state: idle, coordinates / DATA / RESULT zero, all slopes 1.0.
  StateReg: process (CLK, nRST)
    constant ZERO_COORD : signed(31 downto 0) := (others => '0');
    constant UNIT_SLOPE : signed(31 downto 0) := x"00010000";  -- 1.0 in 16.16
  begin  -- process StateReg
    if nRST = '0' then
      -- asynchronous reset (active low)
      state        <= ST_WAIT_FOR_OPCODE;
      currX        <= ZERO_COORD;
      currY        <= ZERO_COORD;
      storedDATA   <= (others => '0');
      storedRESULT <= (others => '0');
      slopeyperx32 <= UNIT_SLOPE;
      slopezperx32 <= UNIT_SLOPE;
      slopexpery32 <= UNIT_SLOPE;
      slopezpery32 <= UNIT_SLOPE;

    elsif CLK'event and CLK = '1' then
      -- rising clock edge
      if ENABLE /= '0' then
        -- normal operation: advance to the values chosen by the Logic process
        state        <= nextstate;
        currX        <= nextX;
        currY        <= nextY;
        storedDATA   <= nextDATA;
        storedRESULT <= nextRESULT;
        slopeyperx32 <= nextslopeyperx32;
        slopezperx32 <= nextslopezperx32;
        slopexpery32 <= nextslopexpery32;
        slopezpery32 <= nextslopezpery32;
      else
        -- synchronous reset while the controller holds ENABLE low
        state        <= ST_WAIT_FOR_OPCODE;
        currX        <= ZERO_COORD;
        currY        <= ZERO_COORD;
        storedDATA   <= (others => '0');
        storedRESULT <= (others => '0');
        slopeyperx32 <= UNIT_SLOPE;
        slopezperx32 <= UNIT_SLOPE;
        slopexpery32 <= UNIT_SLOPE;
        slopezpery32 <= UNIT_SLOPE;
      end if;
    end if;
  end process StateReg;


  -- Combinational half of the controller: next-state selection, divider
  -- operand routing and pixel computation.
  --
  -- Sequence for one line:
  --   1. ST_WAIT_FOR_OPCODE: outputs are tri-stated.  When OPCODE /= "00" the
  --      endpoints are taken from DATA, the dominant axis is chosen
  --      (|dy| > |dx| selects y-major, otherwise x-major), both slopes are
  --      read from the dividers and the stepping coordinate starts at the
  --      smaller endpoint value on that axis.
  --   2. ST_WAITFORSENDx: once SB_BUSY = '0' the pixel for the current step is
  --      computed (slope * coordinate + intercept + 0.5) and handed to the
  --      RESULT register.
  --   3. ST_LOOPOVERx: SB_ENABLE is held at '1' until SB_BUSY = '1'; then the
  --      line either ends (stepping coordinate has reached the larger
  --      endpoint value) or the coordinate advances by 1.0 and step 2 repeats.
  --
  -- X, Y and Z are signals that are written and read back inside this
  -- process, which is why they appear in the sensitivity list.
  Logic: process(nRST, state, OPCODE, currX, currY, x1, x2, y1, y2,
                 SB_BUSY, X, Y, Z, DATA, xIntforY, zIntforY,
                 slopexpery32, slopezpery32, yIntforX,
                 slopeyperx32, zIntforX, slopezperx32, storedDATA,
                 storedRESULT, DIV_Z, DIV2_Z, z1, z2)

    constant ONE_PIXEL  : signed(31 downto 0) := x"00010000";          -- 1.0, 16.16
    constant ZERO_SLOPE : signed(31 downto 0) := (others => '0');
    constant ROUND_HALF : signed(63 downto 0) := x"0000000080000000";  -- 0.5, 32 fractional bits

  begin  -- process Logic

    -- defaults: outputs inactive, dividers fed zeros
    SB_ENABLE <= '0';
    BUSY      <= '0';
    RESULT    <= storedRESULT;   -- registered pixel drives the pins
    X         <= (others => '0');
    Y         <= (others => '0');
    Z         <= (others => '0');

    DIV_X  <= (others => '0');
    DIV_Y  <= (others => '0');
    DIV2_X <= (others => '0');
    DIV2_Y <= (others => '0');

    -- defaults: every register holds its value
    nextX      <= currX;
    nextY      <= currY;
    nextstate  <= state;
    nextDATA   <= storedDATA;
    nextRESULT <= storedRESULT;

    nextslopeyperx32 <= slopeyperx32;
    nextslopezperx32 <= slopezperx32;
    nextslopexpery32 <= slopexpery32;
    nextslopezpery32 <= slopezpery32;

    if nRST = '0' then
      -- during reset only the next state differs from the defaults
      nextstate <= ST_WAIT_FOR_OPCODE;
    else

      case state is

        when ST_WAIT_FOR_OPCODE =>
          -- release the screen-buffer outputs while idle
          SB_ENABLE <= 'Z';
          RESULT    <= (others => 'Z');

          if OPCODE /= "00" then
            -- x1..z2 are decoded from nextDATA, so they follow DATA within
            -- this same evaluation
            nextDATA <= DATA;

            -- note: (x, y) are bounded to be non-negative numbers
            if abs(y2 - y1) > abs(x2 - x1) then
              -- y-major line; |dy| > |dx| >= 0, so the denominator is
              -- never zero here
              --   slopexpery32 = (x2 - x1) / (y2 - y1)
              --   slopezpery32 = (z2 - z1) / (y2 - y1)
              DIV_X  <= std_logic_vector(x2 - x1);
              DIV_Y  <= std_logic_vector(y2 - y1);
              DIV2_X <= std_logic_vector(z2 - z1);
              DIV2_Y <= std_logic_vector(y2 - y1);

              nextslopexpery32 <= signed(DIV_Z);
              nextslopezpery32 <= signed(DIV2_Z);

              -- start at the smaller y and step upwards
              nextY     <= minimum(y1, y2);
              nextstate <= ST_WAITFORSENDY;
            else
              -- x-major line
              --   slopeyperx32 = (y2 - y1) / (x2 - x1)
              --   slopezperx32 = (z2 - z1) / (x2 - x1)
              DIV_X  <= std_logic_vector(y2 - y1);
              DIV_Y  <= std_logic_vector(x2 - x1);
              DIV2_X <= std_logic_vector(z2 - z1);
              DIV2_Y <= std_logic_vector(x2 - x1);

              if x2 = x1 then
                -- Both endpoints share x (and, in this branch, y): the
                -- dividers see a zero denominator and their quotient is
                -- not meaningful.  Only one pixel is drawn, so a zero
                -- slope yields exactly (x1, y1, z1).
                nextslopeyperx32 <= ZERO_SLOPE;
                nextslopezperx32 <= ZERO_SLOPE;
              else
                nextslopeyperx32 <= signed(DIV_Z);
                nextslopezperx32 <= signed(DIV2_Z);
              end if;

              -- start at the smaller x and step upwards
              nextX     <= minimum(x1, x2);
              nextstate <= ST_WAITFORSENDX;
            end if;
          end if;

        when ST_LOOPOVERX =>
          BUSY      <= '1';
          SB_ENABLE <= '1';

          -- stay here until the screen buffer reports busy
          if SB_BUSY = '1' then
            -- ">=" (rather than "=") matches the y loop and cannot run
            -- past the end of the line
            if currX >= maximum(x1, x2) then
              nextstate <= ST_WAIT_FOR_OPCODE;
            else
              nextX     <= currX + ONE_PIXEL;
              nextstate <= ST_WAITFORSENDX;
            end if;
          end if;

        when ST_LOOPOVERY =>
          BUSY      <= '1';
          SB_ENABLE <= '1';

          -- stay here until the screen buffer reports busy
          if SB_BUSY = '1' then
            if currY >= maximum(y1, y2) then
              nextstate <= ST_WAIT_FOR_OPCODE;
            else
              nextY     <= currY + ONE_PIXEL;
              nextstate <= ST_WAITFORSENDY;
            end if;
          end if;

        -- Wait until Screen Buffer not busy (SB_ENABLE stays at its '0' default)
        when ST_WAITFORSENDX =>
          BUSY <= '1';

          if SB_BUSY = '0' then
            nextstate <= ST_LOOPOVERX;

            -- evaluate the line at currX; adding 0.5 rounds to nearest
            Y <= (slopeyperx32 * currX) + yIntforX + ROUND_HALF;
            Z <= (slopezperx32 * currX) + zIntforX + ROUND_HALF;

            -- low 8 integer bits of each coordinate, packed x, y, z
            nextRESULT <= std_logic_vector(currX(23 downto 16) &
                                           Y(39 downto 32) &
                                           Z(39 downto 32));
          end if;

        -- Wait until Screen Buffer not busy (SB_ENABLE stays at its '0' default)
        when ST_WAITFORSENDY =>
          BUSY <= '1';

          if SB_BUSY = '0' then
            nextstate <= ST_LOOPOVERY;

            -- evaluate the line at currY; adding 0.5 rounds to nearest
            X <= (slopexpery32 * currY) + xIntforY + ROUND_HALF;
            Z <= (slopezpery32 * currY) + zIntforY + ROUND_HALF;

            -- low 8 integer bits of each coordinate, packed x, y, z
            nextRESULT <= std_logic_vector(X(39 downto 32) &
                                           currY(23 downto 16) &
                                           Z(39 downto 32));
          end if;

        when others => null;
      end case;
    end if;
  end process Logic;

end GPU;
