# Walkthrough: Intel Advisor offload modeling & GPU roofline on the N-body demo.
# Intended to be run step by step (interactively) on Intel DevCloud with the
# oneAPI 2022.3 toolchain. Requires: $APM set by setvars.sh (Advisor python dir).

# On DevCloud: access node with GPU support
#qsub -I -l nodes=1:gpu:ppn=2 -d .
qsub -I -l nodes=1:gen9:ppn=2 -d .

# Setup Intel oneAPI 2022.3 environment
source /opt/intel/oneapi/setvars.sh
which dpcpp icx sycl-ls advisor

# Check devices
sycl-ls

################
# OpenMP Nbody #
################

# Get Nbody code from https://github.com/fbaru-dev/nbody-demo
git clone https://github.com/fbaru-dev/nbody-demo.git

# Go to best optimized versions
pushd nbody-demo/ver8

# Build the executable (without AVX512 flags: OPTFLAGS=)
#make -j
make -j OPTFLAGS=

# Run without Advisor, showing that OpenMP is enabled
# (KMP_AFFINITY=verbose makes the OpenMP runtime print thread placement)
export KMP_AFFINITY=verbose
./nbody.x 64000
export KMP_AFFINITY=verbose,scatter
./nbody.x 64000
export KMP_AFFINITY=scatter

# Do Offload Advisor data collection targeting Gen9 GPU (takes about 7 minutes)
# * reduce the analysis to outer OpenMP loops: --markup omp
# * switch off the dependency analysis for OpenMP loop: --collect basic
time advisor-python "$APM/run_oa.py" --verbose 3 ./adv_oa_64000 --config gen9_gt2 --markup omp --collect basic --out-dir ./result_oa_64000 -- ./nbody.x 64000
#time advisor-python "$APM/run_oa.py" --verbose 3 ./adv_oa_64000.gen11 --config gen11_icl --markup omp --collect basic --out-dir ./result_oa_64000.gen11 -- ./nbody.x 64000

# Open the HTML result report
firefox ./result_oa_64000/report.html

# Move the "Gen9 GT2 configuration" sliders to virtually increase the target performance
# Download the file or open it and copy-paste into file scalers.toml
# or copy result_oa_64000/config.toml to file scalers.toml and edit it

# Re-do performance estimation with scaled target values in scalers.toml
# EU_count = 192
# L3_BW = 1024000000000
#cat scalers.toml
#time advisor-python "$APM/analyze.py" --verbose 3 ./adv_oa_64000 --config gen9_gt2 --config scalers.toml --out-dir ./result_oa_64000_scalers

# Open the new HTML result report
#firefox ./result_oa_64000_scalers/report.html

# Re-do performance estimation with scaled target values in --set-parameter flag
# (re-uses the existing adv_oa_64000 collection; no re-run of the app needed)
time advisor-python "$APM/analyze.py" --verbose 3 ./adv_oa_64000 --config gen9_gt2 --set-parameter "scale={EU_count=192,L3_BW=1024000000000}" --out-dir ./result_oa_64000_parameters

# Open the new HTML result report
firefox ./result_oa_64000_parameters/report.html

# Go back to main directory
popd

###############
# DPC++ Nbody #
###############

# Get oneAPI samples from https://github.com/oneapi-src/oneAPI-samples
git clone https://github.com/oneapi-src/oneAPI-samples.git

# Go to Nbody code
pushd oneAPI-samples/DirectProgramming/DPC++/N-BodyMethods/Nbody

# Build the executable (-p: safe to re-run if build/ already exists)
mkdir -p build
cd build
cmake ..
make -j VERBOSE=1

# Check available devices
sycl-ls

# Run with checking the device
# (SYCL_PI_TRACE=1 prints the plugin-interface calls, incl. device selection)
SYCL_PI_TRACE=1 src/nbody 64000
#SYCL_PI_TRACE=1 SYCL_DEVICE_FILTER=level_zero:gpu src/nbody 64000
SYCL_PI_TRACE=1 SYCL_DEVICE_FILTER=opencl:gpu src/nbody 64000

# Run Roofline analysis (survey + tripcounts)
time advisor --collect=survey --profile-gpu --project-dir=./adv_roof -- src/nbody 64000
time advisor --collect=tripcounts --stacks --flop --profile-gpu --enable-data-transfer-analysis --data-transfer=light --project-dir=./adv_roof -- src/nbody 64000

# Create Roofline report
time advisor --report=roofline --gpu --project-dir=./adv_roof --report-output=./adv_roof/roofline.html

# Open the new Roofline HTML report
firefox ./adv_roof/roofline.html

# GPU-to-GPU projection
# Possible values: gen11_icl gen12_dg1 gen12_tgl gen9_gt2 gen9_gt3e gen9_gt4e xehp_sdv_480eu xehpg_256xve xehpg_512xve
time advisor --collect=projection --profile-gpu --config=gen11_icl --project-dir=./adv_roof

# Open the new projection HTML reports
#firefox ./adv_roof/e000/report/report.html
firefox ./adv_roof/e000/report/advisor-report.html

# Go back to main directory
popd