Computer: intel i7-7820 OS: Windows 10 pro 64bit and WinXP pro running in Oracle VM Virtual box Fortran Compilers: Compaq Visual Fortran(CVF) Pro. 6.6.C Intel Fortran ifort version 2024.0.2.27 Gfortran version: 7.3.0
I’ve compiled the exact same source code on three compilers with full optimization on each as 32bit executable. The code performs 50 timing trials of user specified number of complex fast Fourier transforms (FFT) and then averages the number of FFT’s per second. The CVF generated code was generated in WinXP Pro running within a VM. The resultant CVF exe was executed in the XP OS (within the VM) and on the host machine. The gfortran and Intel generated code was compiled in the Win10Pro 64 OS as 32 bit code. I’m seeing over an order of magnitude difference in speed between the CVF generated code and that of the Intel and Gfortran generated code. Am I doing something stupid (I do that on a regular basis) or is this normal?? How can I get CVF speeds with a newer FORTRAN compiler???
Speeds and executable file size:
CVF on XP within VM 37774 FFT/sec 312KB,
CVF on Win10 Pro 64bit 39451FFT/sec 312KB,
Intel ifort on Win10 Pro 64bit 2588 FFT/sec 29KB,
Gfortran on Win10 Pro 64bit 2421 FFT/sec 419 KB,
Source code:
c do serial calculation
c
c
c create some fake data
program main
c
c
character filename*100
integer dtstart(8),dtend(8)
common /ddata/y(16384),n,k,numfft
c
c
c
c numfft=100000 !number of ffts per threa
ntrials=50 !number of trials or random timings
c
c find scaling of 0-1.0 random number to 1 to nthmax
c
write (*,4)
4 format ('Enter number of ffts per trial')
read (*,*) numfft
write (*,8)
8 format ('Enter output file name (e.g. fftout.txt) ')
read (*,*) filename
c
c
open (unit=10,file=filename,
c form='formatted',status='unknown')
write (10,67) numfft,ntrials
67 format (' GNU Serial Processing '/i6,4x,'Number of FFTs'/
ci6,4x,'Number of trials')
c
c generate fake data
pi=3.141592653589793
n=14
k=2**14
do 1 j=1,k
1 y(j)=(1.0*sin(2.0*pi*j/float(k))) + (3.0*sin(20.0*pi*j/float(k)))
c
c loop through the number of trials NTRIALS
c write header
write (10,6)
6 format ('total CPU time, clock time, FFT rate')
c
sumratcpu=0.0
sumrattime=0.0
do 3 itrial=1,ntrials
write (*,5) itrial
5 format (//'ITRIAL: ',i5)
c
c call gettim(ihr0,imin0,isec0,i100th0)
call cpu_time(timestart)
call date_and_time(VALUES=dtstart)
c
ndir=1
do 88 j=1,numfft
call fftx(n,k,ndir)
88 continue
c
c
c call gettim(ihr1,imin1,isec1,i100th1)
call cpu_time(timeend)
call date_and_time(VALUES=dtend)
deltim=timeend-timestart
ratcpu=numfft/deltim
c
tt=(dtend(5)-dtstart(5))*3600 +
c (dtend(6)-dtstart(6))*60 +
c (dtend(7)-dtstart(7)) +
c (dtend(8)-dtstart(7))*0.001
rattime=numfft/tt
c
write (10,7) deltim,ratcpu,tt,rattime
write (*,7) deltim,ratcpu,tt,rattime
7 format (/'cpu time = ',6x,f10.3,' fft/sec = ',f10.3/
c'date_time time = ',f10.3,' fft/sec = ',f10.3)
c
sumratcpu=sumratcpu+ratcpu
sumrattime=sumrattime+rattime
c
3 continue
avecpu=sumratcpu/ntrials
avetime=sumrattime/ntrials
write (10,2) avecpu,avetime
write (*,2) avecpu,avetime
2 format ('average rate (cpu)= ',f10.3/
c'average rate (date_time) = ',f10.3)
close (unit=10)
c
stop
end
SUBROUTINE FFTX(N,NB,NDIR)
C this rountine was checked on 4/3/94
C NDIR=1 FORWARD TRANSFORM
C NDIR=-1 INVERSE TRANSFORM
C IF NDIR=1 forward transform
C INPUT: A(1)-->A(NB/2+1) ARE signal AMPLITUDEs for positive times
c if dx=delta x (i.e. dx=x(n+1)-x(n) then
c x(1),a(1) corresponds to t=0=0*dx
c x(2),a(2) corresponds to t=1*dx
c x(nb/2+1), a(nb/2+1) corresponds to t=(nb/2)*dx
c x(nb),a(nb) corresponds to t=-1*dx
c x(nb-1),a(nb-1) corresponds to t=-2*dx
c x(nb/2+2),a(nb/2+2) corresponds to t=-(nb/2+1)*dx
C OUTPUT: A(1)-->A(NB/2+1) ARE +FREQUENCY AMPLITUDE
C A(NB/2 +2)-->A(NB) -FREQUENCY AMPLITUDE
C X=FREQUENCies orders as described above
C IF NDIR=-1 forward transform
C INPUT: A(1)-->A(NB/2+1) ARE +FREQUENCY AMPLITUDES
C A(NB/2 + 2)<--A(NB) -FREQUENCY AMPLITUDES
C X=FREQUENCIES (as described above)
C OUTPUT: A(1)-A(NB) TIME AMPLITUDE
C X(1)-X(NB) TIME (as described above)
c
C
C NOTE:
C FMAX=1/(DELTA-X)*2=NB/(2*XMAX)
C DELTAF = 1/XMAX
C XMAX=NB/2*FMAX
C
IMPLICIT REAL*4 (A-H,O-Z)
COMPLEX A(16384),U,W,T
common /ddata/y(16384),n11,n22,n33
c
c
do 8 kx=1,nb
8 a(kx)=cmplx(y(kx),0.0)
c
C REORDER SEQUENCE
NBD2=NB/2
NBM1=NB-1
J=1
DO 4 L=1,NBM1
IF (L.GE.J) GO TO 2
T=A(J)
A(J)=A(L)
A(L)=T
2 K=NBD2
3 IF (K.GE.J) GO TO 4
J=J-K
K=K/2
GO TO 3
4 J=J+K
C CALCULATE FFT
PI=3.141592653589793
DO 6 M=1,N
U=CMPLX(1.00,0.00)
ME=2**M
K=ME/2
W=CMPLX(COS(PI/K),-NDIR*SIN(PI/K))
DO 6 J=1,K
DO 5 L=J,NB,ME
LPK=L+K
T=A(LPK)*U
A(LPK)=A(L)-T
5 A(L)=A(L)+T
6 U=U*W
RETURN
END
I was expecting newer compilers to generate code comparable in speed to the old CVF compiler.