Quantcast
Channel: Intel® Software - Intel® Visual Fortran Compiler for Windows*
Viewing all articles
Browse latest Browse all 5691

Problem distributing data across ccNUMA nodes

$
0
0

Hi

I have previously written ccNUMA-aware code in Fortran by initializing my arrays in parallel using the "first touch" principle, but something appears to have changed recently and this no longer works. For memory-bandwidth-sensitive code I used to see performance scale linearly with the number of NUMA nodes in the system, but when I run the code below I now obtain virtually identical results for both the NUMA-aware and the non-NUMA-aware versions ...

Any suggestions as to what is causing this? I have tested the code on both 2-socket Intel systems and 4-socket AMD systems, with the same result ...

Best regards,

C

 

  

    program Console6

    use ifport

    use omp_lib

    implicit none

    integer*8          :: I,J,N

    integer            :: Repetitions

    real*8,allocatable :: iVector(:),oVector(:)

    real*8             :: Runtimebegin,RuntimeEnd,FLops

    logical            :: Success

    N=2e8

    allocate(iVector(N))

    allocate(oVector(N))   

    success = SETENVQQ("KMP_AFFINITY=verbose,scatter")

!$OMP PARALLEL

!Do nothing except for initializing the OMP threads ...

!$OMP END PARALLEL    

   call omp_set_num_Threads(8)

   Repetitions=50

   !initialize the data structure using first touch - everything will reside on the NUMA node of the master thread

   do i=1,N

     iVector(i)=1d0

     oVector(i)=0d0

   end do

   !Perform calculation  

   RuntimeBegin=omp_get_wtime()

!$OMP PARALLEL private(i) shared(iVector,oVector,N)

!$OMP DO SCHEDULE(STATIC)

   do j=1,Repetitions

     do i=1,N

      oVector(i)=oVector(i)+iVector(i)*0.01

     end do

   end do  

!$OMP END DO

!$OMP END PARALLEL

    print *,(oVector(1))

    RuntimeEnd=omp_get_wtime()

    Flops=2.0*N*Repetitions/((RunTimeEnd-RunTimeBegin)*1024**3)

    print *,'NO DISTRIBUTION ACROSS NUMA NODES ...'

    print *,'Time=',RunTimeEnd-RuntimeBegin,'GFlops=',Flops

   !Deallocate the data and repeat the calculation with the data distributed across the NUMA nodes of the system

   deallocate(iVector)

   deallocate(oVector)

   allocate(iVector(N))

   allocate(oVector(N))  

   !Distribute the data across NUMA nodes using the first tough principle ...

!$OMP PARALLEL private(i) shared(iVector,oVector,N)

!$OMP DO  SCHEDULE(STATIC)

     do i=1,N

       iVector(i)=1d0

       oVector(i)=0d0

     end do

!$OMP END DO

!$OMP END PARALLEL

    

    RuntimeBegin=omp_get_wtime()

!$OMP PARALLEL private(i) shared(iVector,oVector,N)

!$OMP DO  SCHEDULE(STATIC)

   do j=1,Repetitions

     do i=1,N

      oVector(i)=oVector(i)+iVector(i)*0.01

     end do

   end do  

!$OMP END DO

!$OMP END PARALLEL

    print *,(oVector(1))

    RuntimeEnd=omp_get_wtime()

    Flops=2.0*N*Repetitions/((RunTimeEnd-RunTimeBegin)*1024**3)

    print *,'DATA DISTRIBUTED ACROSS NUMA NODES ...'

    print *,'Time=',RunTimeEnd-RuntimeBegin,'GFlops=',Flops

     

    end program Console6


Viewing all articles
Browse latest Browse all 5691

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>