How do I store a large array of numbers representing a matrix in DLX assembly language?

I'm writing a Haar DWT implementation in assembly language and, as you know, the Haar DWT operates on a matrix that represents an image. I want the program to be able to handle a 64x64, 128x128, or even 256x256 matrix. How do I do that?

I am new to assembly, let alone the DLX microprocessor. I already have the assembly code for the DWT calculation itself, but the biggest matrix I can put in the haar_in array is a 4x4 matrix (16 values). I don't know whether it is a memory limitation or whether the large floating-point calculation is too heavy for the program, but I can't get it to work. I have tried changing the space reserved for haar_out and the values of M and N, but that didn't work either.

    .data

; sqrt: sqrt(2) as a double, the divisor applied to every Haar sum and
; difference.
sqrt:
    .double 1.4142135623730951

; haar_in: input matrix as doubles. The code addresses element (i, j) at
; byte offset (i + j*M)*8 for i < M and j < N, so exactly M*N values must
; be listed here.
haar_in:
    .double 148, 142, 130, 168, 166, 117, 138, 120, 116, 118, 109, 136, 99, 89, 123, 120

; haar_out: output matrix, indexed the same way as haar_in, so it must
; provide at least M*N*8 bytes. 10000 bytes holds 1250 doubles, i.e. at most
; a 35x35 matrix; a 64x64 matrix needs 32768 bytes.
haar_out:
    .space 10000

; M and N are the matrix dimensions used by the loops below; both must agree
; with the number of values in haar_in.
M:
    .word 4
N:
    .word 4
; K: seed for the power-of-two search. The program overwrites it at run
; time, so it has to be 1 again before every fresh run.
K:
    .word 1

    .text

; Register Usage
;   GP:
;     R0 => 0
;     R1 => M
;     R2 => N
;     R3.. => Internally used
;   FP:
;     F0 => 0 (not referenced in this part of the listing)
;     F8 => Square Root of 2
;     F2.. => Internally used
  ld    f8, sqrt(r0)                ; F8 contains square root of 2
  lw    r1, M(r0)                   ; Get value of M in R1
  lw    r2, N(r0)                   ; Get value of N in R2
  addi  r3, r0, 0                   ; Set R3 to 0 as a counter for M
  addi  r4, r0, 0                   ; Set R4 to 0 as a counter for N

  ; Copy the haar_in to the haar_out array, one 8-byte double at a time.
  ; Expects R2 = N and R4 = 0 on entry. R4 is the outer counter j (0..N-1),
  ; R3 the inner counter i (0..M-1). Both counters are temporarily scaled by
  ; 8 to form byte offsets and scaled back before being incremented.
copy_n_haar_out:
  sub   r2, r2, r4                  ; R2 = N - j
  beqz  r2, n_loop_exit             ; Exit N-loop if counter reached N
  lw    r2, N(r0)                   ; Reload N (the test above clobbered R2)
  slli  r4, r4, 3                   ; R4 = j*8 (byte offset of a double)
  lw    r1, M(r0)                   ; Get value of M in R1
  addi  r3, r0, 0                   ; Set R3 to 0 as a counter for M
  copy_m_haar_out:
    sub   r1, r1, r3                ; R1 = M - i
    beqz  r1, m_loop_exit           ; Exit M-loop if counter reached M
    lw    r1, M(r0)                 ; Reload M (the test above clobbered R1)
    slli  r3, r3, 3                 ; R3 = i*8 (byte offset of a double)
    mult  r5, r4, r1                ; Form R5 = (j*8)*M
    add   r5, r5, r3                ;      R5 = (i + j*M)*8
    ld    f2, haar_in(r5)           ; Get the element from the input array
    sd    haar_out(r5), f2          ; Store it at the same offset in haar_out
    srli  r3, r3, 3                 ; Get the original count back
    addi  r3, r3, 1                 ; Increment the count by 1
    j     copy_m_haar_out           ; Loop back
m_loop_exit:
  srli  r4, r4, 3                   ; Get the original count back
  addi  r4, r4, 1                   ; Increment the count by 1
  j     copy_n_haar_out             ; Loop back
n_loop_exit:
  lw    r1, M(r0)                   ; Get value of M in R1
  lw    r2, N(r0)                   ; Get value of N in R2

  ; Determine K, the largest power of 2 such that K <= M.
  ; Starts from the value stored at K and doubles it until doubling again
  ; would exceed M. Expects R1 = M; uses R3 (K), R4 and R5 as scratch.
  lw    r3, K(r0)                   ; Get the initial value of K in R3
k_less_than_m:
  slli  r5, r3, 1                   ; R5 = K*2
  slt   r4, r1, r5                  ; Set R4 if M < K*2
  subi  r4, r4, 1                   ; R4 becomes 0 exactly when it was set
  beqz  r4, k_loop_m_exit           ; Exit loop: doubling K would exceed M
  sub   r4, r1, r3                  ; Check if K == M
  beqz  r4, k_loop_m_exit           ; Exit loop if equal
  slli  r3, r3, 1                   ; K = K*2
  j     k_less_than_m               ; Loop back
k_loop_m_exit:
  sw    K(r0), r3                   ; Store the value in K

  ; Transform all the columns now.
  ; Each pass halves K, then for every j in 0..N-1 and i in 0..K-1 reads the
  ; pair haar_in[2i + j*M], haar_in[2i + 1 + j*M] and writes
  ;   haar_out[i + j*M]     = (a + b) / sqrt(2)
  ;   haar_out[K + i + j*M] = (a - b) / sqrt(2)
  ; Expects R1 = M, R2 = N and F8 = sqrt(2).
  ; NOTE(review): col_transform is placed here so that every pass reloads K
  ; and resets both counters - confirm against the complete listing.
col_transform:
  lw    r6, K(r0)                   ; Get the updated value of K in R6
  addi  r3, r0, 0                   ; Set R3 to 0 as a counter for K
  addi  r4, r0, 0                   ; Set R4 to 0 as a counter for N
  slti  r7, r6, 1                   ; Set R7 if K < 1
  bnez  r7, col_trans_exit          ; Exit loop if R7 is set (col_trans_exit
                                    ; is defined outside this excerpt)
  srli  r6, r6, 1                   ; K = K/2
  sw    K(r0), r6                   ; Store value of K

  ; Perform the actual transform
  col_trans_n:
    sub   r2, r2, r4                  ; R2 = N - j
    beqz  r2, col_trans_n_exit        ; Exit N-loop if counter reached N
    lw    r2, N(r0)                   ; Reload N (the test above clobbered R2)
    slli  r4, r4, 3                   ; R4 = j*8 (byte offset of a double)
    lw    r6, K(r0)                   ; Get value of K in R6
    addi  r3, r0, 0                   ; Set R3 to 0 as a counter for K
    col_trans_k:
      sub   r6, r6, r3                ; R6 = K - i
      beqz  r6, col_trans_k_exit      ; Exit K-loop if counter reached K
      lw    r6, K(r0)                 ; Reload K (the test above clobbered R6)
      slli  r3, r3, 3                 ; R3 = i*8 (byte offset of a double)
      slli  r5, r3, 1                 ; Form R5 = (2*i)*8
      mult  r8, r4, r1                ; Form R8 = (j*M)*8
      add   r9, r5, r8                ;      R9 = (2*i + j*M)*8
      ld    f2, haar_in(r9)           ; Get the value from haar array in F2
      addi  r9, r9, 8                 ; Move to the next index
      ld    f6, haar_in(r9)           ; Get the next value from array in F6
      addd  f4, f2, f6                ; Add the results in F4
      divd  f4, f4, f8                ; F4 = F4/sqrt(2)
      subd  f10, f2, f6               ; Sub the results in F10
      divd  f10, f10, f8              ; F10 = F10/sqrt(2)
      add   r5, r3, r8                ; Form R5 = (i + j*M)*8
      sd    haar_out(r5), f4          ; Store the scaled sum in the out array
      slli  r6, r6, 3                 ; R6 = K*8
      add   r5, r5, r6                ; Form R5 = (K + i + j*M)*8
      sd    haar_out(r5), f10         ; Store the scaled difference
      srli  r6, r6, 3                 ; Get the original value back
      srli  r3, r3, 3                 ; Get the original count back
      addi  r3, r3, 1                 ; Increment the count by 1
      j     col_trans_k               ; Loop back
  col_trans_k_exit:
    srli  r4, r4, 3                   ; Get the original count back
    addi  r4, r4, 1                   ; Increment the count by 1
    j     col_trans_n                 ; Loop back

  ; NOTE(review): R2 is 0 here (left by the N-loop exit test) and nothing in
  ; this excerpt reloads N before the next pass, so the next N-loop test does
  ; not compare against N. Every pass also reads from haar_in again rather
  ; than from the previous pass's output. Confirm both against the complete
  ; program.
col_trans_n_exit:
  j     col_transform

The data definitions at the top of the listing are my .data section. Whenever I calculate a value, I load a double-precision number from haar_in into an FP register, and the result of the calculation is stored in haar_out. Whenever I use a matrix larger than 4x4, I get a timeout, an overflow, or an illegal-number message. Help!


