Hi
I have previously written ccNUMA aware code in Fortran by initializing my arrays in parallel using the "first touch" principle , but it appears something has happened lately so this no longer works. For memory bandwidth sensitive code I used to see performance scale linearly with the number of NUMA nodes in the system, but running the code below I now obtain virtually identical results for both the NUMA and non-NUMA aware code ...
Any suggestions as to what is causing this? I have tested the code on both Intel 2 socket systems and AMD 4 socket systems with the same result ...
Best regards,
C
program Console6
use ifport
use omp_lib
implicit none
integer*8 :: I,J,N
integer :: Repetitions
real*8,allocatable :: iVector(:),oVector(:)
real*8 :: Runtimebegin,RuntimeEnd,FLops
logical :: Success
N=2e8
allocate(iVector(N))
allocate(oVector(N))
success = SETENVQQ("KMP_AFFINITY=verbose,scatter")
!$OMP PARALLEL
!Do nothing except for initializing the OMP threads ...
!$OMP END PARALLEL
call omp_set_num_Threads(8)
Repetitions=50
!initialize the data structure using first touch - everything will reside on the NUMA node of the master thread
do i=1,N
iVector(i)=1d0
oVector(i)=0d0
end do
!Perform calculation
RuntimeBegin=omp_get_wtime()
!$OMP PARALLEL private(i) shared(iVector,oVector,N)
!$OMP DO SCHEDULE(STATIC)
do j=1,Repetitions
do i=1,N
oVector(i)=oVector(i)+iVector(i)*0.01
end do
end do
!$OMP END DO
!$OMP END PARALLEL
print *,(oVector(1))
RuntimeEnd=omp_get_wtime()
Flops=2.0*N*Repetitions/((RunTimeEnd-RunTimeBegin)*1024**3)
print *,'NO DISTRIBUTION ACROSS NUMA NODES ...'
print *,'Time=',RunTimeEnd-RuntimeBegin,'GFlops=',Flops
!Deallocate the data and repeat the calculation with the data distributed across the NUMA nodes of the system
deallocate(iVector)
deallocate(oVector)
allocate(iVector(N))
allocate(oVector(N))
!Distribute the data across NUMA nodes using the first tough principle ...
!$OMP PARALLEL private(i) shared(iVector,oVector,N)
!$OMP DO SCHEDULE(STATIC)
do i=1,N
iVector(i)=1d0
oVector(i)=0d0
end do
!$OMP END DO
!$OMP END PARALLEL
RuntimeBegin=omp_get_wtime()
!$OMP PARALLEL private(i) shared(iVector,oVector,N)
!$OMP DO SCHEDULE(STATIC)
do j=1,Repetitions
do i=1,N
oVector(i)=oVector(i)+iVector(i)*0.01
end do
end do
!$OMP END DO
!$OMP END PARALLEL
print *,(oVector(1))
RuntimeEnd=omp_get_wtime()
Flops=2.0*N*Repetitions/((RunTimeEnd-RunTimeBegin)*1024**3)
print *,'DATA DISTRIBUTED ACROSS NUMA NODES ...'
print *,'Time=',RunTimeEnd-RuntimeBegin,'GFlops=',Flops
end program Console6