Fortran low performance with allocatable arrays

293 Views Asked by At

I use Intel Visual Fortran, both IVF2013 and IVF2019. When using allocatable arrays, the program is much slower than the one using static memory allocation. That is to say, if I change from

Method 1: by using fixed array

! Schematic driver: subroutine A is called 1000 times.
do i = 1, 1000
  call A
end do 

! Schematic worker: x is a fixed-size local array of 30 elements.
subroutine A
  real(8) :: x(30)
  do things
end subroutine A

to something like

Method 2: by using allocatable arrays

! Schematic module: provides the array size n to its users.
module module_size_is_defined
  n = 30
end module

! Schematic driver: subroutine A is called 1000 times.
do i = 1, 1000
    call A
end do 

! Schematic worker: x is a local allocatable, allocated with size n on
! every call to A.
subroutine A
  use module_size_is_defined
  real(8), allocatable :: x(:)
  allocate(x(n))
  do things
end subroutine A

The code is much slower. For my code, the static allocation takes 1 minute 30 seconds, while the dynamic allocation takes 2 minutes 30 seconds. I thought this might be because the allocation itself takes too much time, since it is executed inside the loop, so I tried the following two methods:

Method 3: by using the module to allocate the array only once

! Schematic only: the array x is allocated once and shared through the
! module. In compilable Fortran the allocate statement has to live in an
! executable part (for example an init routine called once before the loop),
! not directly in the module's specification part.
module module_x_is_allocated
  n = 30
  allocate(x(n))
end module

! Schematic driver: subroutine A is called 1000 times.
do i = 1, 1000
  call A
end do

! Schematic worker: uses the module array x instead of a local array.
subroutine A
  use module_x_is_allocated
  do things
end subroutine A

Method 4: by using automatic array

! Schematic module: provides the array size n to its users.
module module_size_is_defined
  n = 30
end module


! Schematic driver: subroutine A is called 1000 times.
do i = 1, 1000
  call A
end do

! Schematic worker: x is an automatic array whose size n comes from the
! module, so its storage is created on entry to A.
subroutine A
  use module_size_is_defined
  real(8) :: x(n)
  do things
end subroutine A

Both Method 3 and Method 4 take almost the same time as Method 2, which uses the dynamically allocated array: around 2 minutes 30 seconds each. All cases are compiled with the same optimization settings. I tried IVF 2013 and IVF 2019 and got the same results. I don't know why. For Method 3 in particular, although the allocation is executed only once, it still takes the same time. It seems that a dynamically allocated array is stored somewhere slower than a statically allocated array, and that the allocation itself does not take extra time (since Methods 2 and 3 take the same time).

Are there any ideas or suggestions on how to allocate the arrays more efficiently, to reduce the performance penalty? Thanks.

!========================================================================= Edit 1:

My program is too long to post here, so I tried a few small test programs instead. The results are a little bit strange. I tried three cases:

Method 1: takes 28.98s

! Holds the problem dimension n that the main program and the subroutines
! share; n has no initial value here and is assigned by the main program.
module module_size_is_defined
  implicit none
  integer(kind=4) :: n
end module module_size_is_defined

! Benchmark driver (fixed-size array variant): sets n = 50, calls A 50000
! times with t = 2*i, accumulates each returned 50x50 matrix y into z and
! prints z(1,1).
program main
  use module_size_is_defined
  implicit none

  integer(4) :: i
  real(8) :: y(50,50),z(50,50),t

  n = 50
  ! z is an accumulator, so it must be zeroed before the loop; otherwise
  ! z = z + y reads undefined values and the printed result is meaningless.
  z = 0.0D0
  do i =1,50000
    t=dble(i) * 2.0D0
    call A(y,t)
    z = z + y
  end do
  write(*,*) z(1,1)
end program main
  
! Fills y with the sum over j = 1..200 of
!   matmul( transpose(x) + j**2, x )
! where x is refreshed by getX(x,t,j) before every term.
!   y : output matrix, n x n (n from module_size_is_defined)
!   t : input scalar forwarded to getX
! In this variant x is a fixed-size local array (1 x 50).
! NOTE(review): x is hard-coded to 50 columns while y and getX use n, so the
! shapes only agree when n == 50 (the main program sets n = 50).
subroutine A(y,t)
  use module_size_is_defined
  implicit none
  real(8),intent(out):: y(n,n)
  real(8),intent(in) :: t
  integer(4) :: j
  real(8) :: x(1,50)
  
  ! start the accumulation from zero
  y=0.0D0
  do j = 1, 200
    call getX(x,t,j)
    ! transpose(x) is a column; adding the scalar j**2 shifts every entry,
    ! and the product with the row x is added to y
    y = y + matmul( transpose(x) + dble(j)**2, x )
  end do
endsubroutine A
  
  
! Fills the row vector x with x(1,i) = (i+j) * t**1.5 for i = 1..n.
!   x : output, 1 x n (n from module_size_is_defined)
!   t : input scalar that is raised to the power 1.5
!   j : input integer offset added to the column index
subroutine getX(x,t,j)
  use module_size_is_defined
  implicit none
  real(8),intent(out) :: x(1,n)
  real(8),intent(in) :: t
  integer(4),intent(in) :: j
  integer(4) :: i
  
  do i =1, n
    x(1,i)  = dble(i+j) * t ** (1.5D00) 
  end do
endsubroutine getX

Method 2: takes 30.56s

! Holds the problem dimension n that the main program and the subroutines
! share; n has no initial value here and is assigned by the main program.
module module_size_is_defined
  implicit none
  integer(kind=4) :: n
end module module_size_is_defined

! Benchmark driver (local allocatable variant): sets n = 50, calls A 50000
! times with t = 2*i, accumulates each returned 50x50 matrix y into z and
! prints z(1,1).
program main
  use module_size_is_defined
  implicit none

  integer(4) :: i
  real(8) :: y(50,50),z(50,50),t

  n = 50
  ! z is an accumulator, so it must be zeroed before the loop; otherwise
  ! z = z + y reads undefined values and the printed result is meaningless.
  z = 0.0D0
  do i =1,50000
    t=dble(i) * 2.0D0
    call A(y,t)
    z = z + y
  end do
  write(*,*) z(1,1)
end program main
  
! Fills y with the sum over j = 1..200 of
!   matmul( transpose(x) + j**2, x )
! where x is refreshed by getX(x,t,j) before every term.
!   y : output matrix, n x n (n from module_size_is_defined)
!   t : input scalar forwarded to getX
! In this variant x is a local allocatable that is allocated as 1 x n on
! every call. There is no explicit deallocate; an unsaved local allocatable
! is released automatically when the subroutine returns.
subroutine A(y,t)
  use module_size_is_defined
  implicit none
  real(8),intent(out):: y(n,n)
  real(8),intent(in) :: t
  integer(4) :: j
  real(8),allocatable :: x(:,:)
  allocate(x(1,n))
  
  ! start the accumulation from zero
  y=0.0D0
  do j = 1, 200
    call getX(x,t,j)
    ! transpose(x) is a column; adding the scalar j**2 shifts every entry,
    ! and the product with the row x is added to y
    y = y + matmul( transpose(x) + dble(j)**2, x )
  end do
endsubroutine A
  
  
! Fills the row vector x with x(1,i) = (i+j) * t**1.5 for i = 1..n.
!   x : output, 1 x n (n from module_size_is_defined)
!   t : input scalar that is raised to the power 1.5
!   j : input integer offset added to the column index
subroutine getX(x,t,j)
  use module_size_is_defined
  implicit none
  real(8),intent(out) :: x(1,n)
  real(8),intent(in) :: t
  integer(4),intent(in) :: j
  integer(4) :: i
  
  do i =1, n
    x(1,i)  = dble(i+j) * t ** (1.5D00) 
  end do
endsubroutine getX

Method 3: takes 78.72s

! Holds the problem dimension n that the main program, the array module and
! the subroutines share; n has no initial value here and is assigned by the
! main program.
module module_size_is_defined
  implicit none
  integer(kind=4) :: n
end module module_size_is_defined

! Owns the work array x so that it is allocated once and reused by every
! call that uses this module, instead of being created per call.
module module_array_is_allocated
  use module_size_is_defined
  implicit none
  real(8), allocatable,save :: x(:,:)

  contains
  ! Allocates x as 1 x n, with n taken from module_size_is_defined, so n must
  ! be set before init is called. Safe to call more than once: an existing
  ! allocation is released first, because allocating an already allocated
  ! array is a runtime error.
  subroutine init
    implicit none
    if (allocated(x)) deallocate(x)
    allocate(x(1,n))
  endsubroutine
endmodule module_array_is_allocated

! Benchmark driver (module-allocated array variant): sets n = 50, allocates
! the shared work array once via init, calls A 50000 times with t = 2*i,
! accumulates each returned 50x50 matrix y into z and prints z(1,1).
program main
  use module_size_is_defined
  use module_array_is_allocated
  implicit none

  integer(4) :: i
  real(8) :: y(50,50),z(50,50),t

  n = 50
  ! init reads n, so it has to run after n is set
  call init
  ! z is an accumulator, so it must be zeroed before the loop; otherwise
  ! z = z + y reads undefined values and the printed result is meaningless.
  z = 0.0D0
  do i =1,50000
    t=dble(i) * 2.0D0
    call A(y,t)
    z = z + y
  end do
  write(*,*) z(1,1)
end program main
  
! Fills y with the sum over j = 1..200 of
!   matmul( transpose(x) + j**2, x )
! where x is refreshed by getX(x,t,j) before every term.
!   y : output matrix, n x n (n from module_size_is_defined)
!   t : input scalar forwarded to getX
! In this variant x is not local: it is the allocatable array exported by
! module_array_is_allocated, which must already be allocated (the main
! program calls init once before the loop).
subroutine A(y,t)
  use module_size_is_defined
  use module_array_is_allocated
  implicit none
  real(8),intent(out):: y(n,n)
  real(8),intent(in) :: t
  integer(4) :: j
  
  ! start the accumulation from zero
  y=0.0D0
  do j = 1, 200
    call getX(x,t,j)
    ! transpose(x) is a column; adding the scalar j**2 shifts every entry,
    ! and the product with the row x is added to y
    y = y + matmul( transpose(x) + dble(j)**2, x )
  end do
endsubroutine A
  
  
! Fills the row vector x with x(1,i) = (i+j) * t**1.5 for i = 1..n.
!   x : output, 1 x n (n from module_size_is_defined)
!   t : input scalar that is raised to the power 1.5
!   j : input integer offset added to the column index
subroutine getX(x,t,j)
  use module_size_is_defined
  implicit none
  real(8),intent(out) :: x(1,n)
  real(8),intent(in) :: t
  integer(4),intent(in) :: j
  integer(4) :: i
  
  do i =1, n
    x(1,i)  = dble(i+j) * t ** (1.5D00) 
  end do
endsubroutine getX

Now, with a smaller problem size, Method 1 and Method 2 take almost the same time. Method 3 should be better than Method 2, since it allocates x(1,n) only once, but it is much slower. Yet in my previous program, Method 2 takes almost the same time as Method 3. It is strange.

I compiled on both Windows and Linux, with a release setup and -O2 optimization, using different versions of IVF.

0

There are 0 best solutions below