I use Intel Visual Fortran, both IVF2013 and IVF2019. When using allocatable arrays, the program is much slower than the one using static memory allocation. That is to say, if I change from
Method 1: by using fixed array
do i = 1, 1000
call A
end do
subroutine A
real(8) :: x(30)
do things
end subroutine A
to something like
Method 2: by using allocatable arrays
module module_size_is_defined
n = 30
end module
do i = 1, 1000
call A
end do
subroutine A
use module_size_is_defined
real(8), allocatable :: x(:)
allocate(x(n))
do things
end subroutine A
The code is much slower. For my code, the static allocation takes 1 minutes 30 seconds while the dynamic allocation takes 2 minutes and 30 seconds. Then, I thought is might because that the allocation action was run takes too much time as it is in the loop, then I tried following two methods:
Method 3: by using the module to allocate the array only once
module module_x_is_allocated
n = 30
allocat(x(n))
end module
do i = 1, 1000
call A
end do
subroutine A
use module_x_is_allocated
do things
end subroutine A
Method 4: by using automatic array
module module_size_is_defined
n = 30
end module
do i = 1, 1000
call A
end do
subroutine A
use module_size_is_defined
real(x) :: x(n)
do things
end subroutine A
Both Method 3 and Method 4 take almost the same time of the one using dynamic allocated array Method 2. Both around 2 mins 30s. All cases are compiling with same optimization. I tried IVF 2013 and IVF 2019, and same results. I don't know why. Especially for Method 3, although the allocate is only run once, it still takes the same time. It seems that dynamic allocated array is stored at the place that is slower than the static allocated array, and allocation does not take extra time (since method 2 and 3 take the same time).
Any ideas and suggestions that to allocate the arrays in a more efficient manner to reduce the performance penalty? Thanks.
!========================================================================= Edit 1:
My program is too long to post here. Thus, I tried a few small codes. The results are a little bit strange. I tried three cases,
Method 1: takes 28.98s
module module_size_is_defined
implicit none
integer(4) :: n
end module
program main
use module_size_is_defined
implicit none
integer(4) :: i
real(8) :: y(50,50),z(50,50),t
n = 50
do i =1,50000
t=dble(i) * 2.0D0
call A(y,t)
z = z + y
end do
write(*,*) z(1,1)
end
subroutine A(y,t)
use module_size_is_defined
implicit none
real(8),intent(out):: y(n,n)
real(8),intent(in) :: t
integer(4) :: j
real(8) :: x(1,50)
y=0.0D0
do j = 1, 200
call getX(x,t,j)
y = y + matmul( transpose(x) + dble(j)**2, x )
end do
endsubroutine A
subroutine getX(x,t,j)
use module_size_is_defined
implicit none
real(8),intent(out) :: x(1,n)
real(8),intent(in) :: t
integer(4),intent(in) :: j
integer(4) :: i
do i =1, n
x(1,i) = dble(i+j) * t ** (1.5D00)
end do
endsubroutine getX
Method 2: takes 30.56s
module module_size_is_defined
implicit none
integer(4) :: n
end module
program main
use module_size_is_defined
implicit none
integer(4) :: i
real(8) :: y(50,50),z(50,50),t
n = 50
do i =1,50000
t=dble(i) * 2.0D0
call A(y,t)
z = z + y
end do
write(*,*) z(1,1)
end
subroutine A(y,t)
use module_size_is_defined
implicit none
real(8),intent(out):: y(n,n)
real(8),intent(in) :: t
integer(4) :: j
real(8),allocatable :: x(:,:)
allocate(x(1,n))
y=0.0D0
do j = 1, 200
call getX(x,t,j)
y = y + matmul( transpose(x) + dble(j)**2, x )
end do
endsubroutine A
subroutine getX(x,t,j)
use module_size_is_defined
implicit none
real(8),intent(out) :: x(1,n)
real(8),intent(in) :: t
integer(4),intent(in) :: j
integer(4) :: i
do i =1, n
x(1,i) = dble(i+j) * t ** (1.5D00)
end do
endsubroutine getX
Method 3: takes 78.72s
module module_size_is_defined
implicit none
integer(4) :: n
endmodule
module module_array_is_allocated
use module_size_is_defined
implicit none
real(8), allocatable,save :: x(:,:)
contains
subroutine init
implicit none
allocate(x(1,n))
endsubroutine
endmodule module_array_is_allocated
program main
use module_size_is_defined
use module_array_is_allocated
implicit none
integer(4) :: i
real(8) :: y(50,50),z(50,50),t
n = 50
call init
do i =1,50000
t=dble(i) * 2.0D0
call A(y,t)
z = z + y
end do
write(*,*) z(1,1)
end
subroutine A(y,t)
use module_size_is_defined
use module_array_is_allocated
implicit none
real(8),intent(out):: y(n,n)
real(8),intent(in) :: t
integer(4) :: j
y=0.0D0
do j = 1, 200
call getX(x,t,j)
y = y + matmul( transpose(x) + dble(j)**2, x )
end do
endsubroutine A
subroutine getX(x,t,j)
use module_size_is_defined
implicit none
real(8),intent(out) :: x(1,n)
real(8),intent(in) :: t
integer(4),intent(in) :: j
integer(4) :: i
do i =1, n
x(1,i) = dble(i+j) * t ** (1.5D00)
end do
endsubroutine getX
Now, with samller size problem, Method 1 and Method 2 is almost same time. But Method 3 should be better than Method 2, since it only allocate x(1,n) once. But it is much slower. But in my previous program, Method 2 gives almost the same time as Method 3. It is strange.
I complied in both Windows and Linux, with release setup, -O2 Optimization, with different version of IVF.