I have a very simple 3D in-place FFT benchmark that uses FFTW with OpenMP multi-threading. I am trying to get the best performance on a Linux machine (Ubuntu, two-socket AMD Genoa CPUs). I built it with the AMD compiler (AOCC 5.0) and AMD-FFTW (built with OpenMP and AVX-512 support) as follows:
clang++ bench_fftw.cpp -o bench_fftw -fopenmp -march=znver4 -O3 -flto -mavx512 -ffast-math -L/opt/AMD/amd-fftw/lib -lfftw3f_omp -lfftw3f -lm -I/opt/AMD/amd-fftw/include
Regarding how to run it, I typically set
export OMP_NUM_THREADS=8 #for 1 CCD/NUMA
export OMP_PLACES=cores #only using the physical core
export OMP_PROC_BIND=close
I also have a version that uses the MKL FFTW interface; it is built with the Intel compiler (icpx) and MKL-FFT:
icpx bench_mkl.cpp -o bench_mkl -qopenmp -O3 -Ofast -ffast-math -axCORE-AVX2,CORE-AVX512 -qmkl
The binary built with icpx + MKL-FFT performs much better than the one built with AOCC + AMD-FFTW: it is almost twice as fast.
#include <stdio.h>
#include <stdlib.h>

#if 1
#include <fftw3.h>
#else
#include "fftw/fftw3_mkl.h"
#endif
#include <omp.h>
#define N 1024 // Size of 3D cube (N x N x N)
int main() {
float *data;
fftwf_plan plan_forward, plan_backward;
fftwf_init_threads();
fftwf_plan_with_nthreads(omp_get_max_threads());
data = (float*)fftwf_malloc(sizeof(float) * N * N * (2*(N/2+1)));
#pragma omp parallel for collapse(3)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
data[i*N*N + j*N + k] = (i + j + k) % 2;
}
}
}
#if 0
plan_forward = fftwf_plan_dft_r2c_3d(N, N, N, data, (fftwf_complex*)data, FFTW_ESTIMATE);
plan_backward = fftwf_plan_dft_c2r_3d(N, N, N, (fftwf_complex*)data, data, FFTW_ESTIMATE);
#else
plan_forward = fftwf_plan_dft_r2c_3d(N, N, N, data, (fftwf_complex*)data, FFTW_MEASURE);
plan_backward = fftwf_plan_dft_c2r_3d(N, N, N, (fftwf_complex*)data, data, FFTW_MEASURE);
#endif
for(int i=0; i<50; i++){
fftwf_execute(plan_forward);
fftwf_execute(plan_backward);
}
#pragma omp parallel for
for (int i = 0; i < N * N * N; i++) {
data[i] /= (N * N * N);
}
fftwf_destroy_plan(plan_forward);
fftwf_destroy_plan(plan_backward);
fftwf_free(data);
fftwf_cleanup_threads();
return 0;
}
Any advice on how to tune this code on AMD Genoa?
gq