On VS 2022 lastest oneapi in debug mode
This is a long shot, but is there a chance that the oneAPI prompt is doing DLL injection, similarly to LD_PRELOAD on Linux?
That looks rather creepy. But the only DLLs that are loaded are the Windows system DLLs. At least that is what DependencyWalker is telling me.
Is hyperthreading activated on your system ? Just another longshot hypothesis. If so, you could try to deactivate it, reboot, retest.
That is a possibility - my system has 16 cores and 24 logical processors. So, indeed, if I understand the information I found correctly, that indicates hyperthreading is on. To turn it off I have to change BIOS settings. I’d rather like to consult my IT department about actions like that ![]()
No,it just sets paths. no magic, just a cmd.exe with a longer list of paths to point to the Intel binaries and includes.
One thought - you said you opened a “oneAPI Command WIndow” - was it an ia32 or x64 window?
What version of Intel Fortran is on your system, how old is it?
YOu have eps set to 0.000 for the tolerance to end, your current program just loops forever, is eps == 0.00000000000000000000 meant to be
subroutine calc_new( u, unew, error )
real, dimension(:,:), intent(in) :: u
real, dimension(:,:), intent(inout) :: unew
real, intent(out) :: error
real :: du
error = 0.0
do iy = 2,ny+1
do ix = 2,nx+1
du = diffx(ix-1,iy) * ( u(ix-1,iy) - u(ix,iy) ) &
+ diffx(ix,iy) * ( u(ix+1,iy) - u(ix,iy) ) &
+ diffy(ix,iy-1) * ( u(ix,iy-1) - u(ix,iy) ) &
+ diffy(ix,iy) * ( u(ix,iy+1) - u(ix,iy) ) &
+ force(ix,iy)
write(*,*)force(ix,iy)
unew(ix,iy) = u(ix,iy) + deltt * du
write(*,*)ix,iy,deltt, du, force(ix,iy)
error = error + abs( deltt * du )
enddo
enddo
force is not passed to calc_new, yet you are using it, debug cannot see it, but the program runs.
I am a bit lost why it does not stop.
! timing.f90 --
! Simple measurement of the performance of a program
!
MODULE timing
USE iso_fortran_env
INTEGER :: time_first, time_last, time_zero, time_rate
REAL :: cpu_first, cpu_last
INTEGER :: lurep = OUTPUT_UNIT, lucsv, luoverall
CONTAINS
! start_timer
! Initialise the time, so that we can keep track of how much wall clock/system time is spent
!
! Arguments:
! None
!
SUBROUTINE start_timer
CALL system_clock( count = time_first, count_rate = time_rate )
CALL cpu_time( cpu_first )
END SUBROUTINE start_timer
! stop_timer
! Measure the elapsed time and report it
!
! Arguments:
! None
!
SUBROUTINE stop_timer
CALL system_clock( count = time_last )
CALL cpu_time( cpu_last )
WRITE( lurep, '(a)' ) 'Report of simulation'
WRITE( lurep, '(a)' ) '--------------------'
WRITE( lurep, '(a)' ) 'Compiler version: ', compiler_version()
WRITE( lurep, '(a)' ) 'Compiler options: ', compiler_options()
WRITE( lurep, '(a,g13.6)' ) 'Wall clock (s): ', (time_last - time_first) / real(time_rate)
WRITE( lurep, '(a,g13.6)' ) 'CPU time (s): ', cpu_last - cpu_first
! WRITE( lucsv, '(100(g0.6,'',''))' ) &
! (time_last - time_first) / real(time_rate), cpu_last - cpu_first, &
! (timeused_new - timeused)
END SUBROUTINE stop_timer
END MODULE timing
! poisson_island_array.f90 --
! This is a naïve version of a solver for Poisson equations.
!
! Notes:
! - The grid is assumed to be rectangular with square cells
! - We reach the steady state using a pseudo-time step
! - Explicit loops used
! - All boundary conditions are Dirichlet: u = 0
! - In addition: there is an isolated "island" in the middle
! that does not allow diffusion.
! - The status of the grid cells is either:
! status = 1: ordinary grid cell
! status = 0: isolated grid cell, no flux
! status = -1: open boundary grid cell, fixed concentration
! - By distinguishing three statuses we can also implement
! zero-flux Neumann boundaries
! - Rather than if-statement, rely on an array of diffusion
! coefficients
!
! The diffusion arrays:
!
! | |
! | Dy(ix,iy) |
! | |
! ---+---------------+---
! | |
! Dx(ix-1,iy) | ix,iy | Dx(ix,iy)
! | |
! ---+---------------+---
! | |
! | Dy(ix,iy-1) |
! | |
!
! Note:
! This version uses pointers to swap the solution
!
program poisson_island_array
use timing
implicit none
integer, dimension(:,:), allocatable :: status
real, dimension(:,:), allocatable :: force, diffx, diffy
real, dimension(:,:), allocatable, target :: u, unew
real, dimension(:,:), pointer :: pu, punew, ptmp
integer :: nx, ny
integer :: ix, iy, iter
integer :: max_iter
real :: x_extent, y_extent, diff, deltt
real :: dx, dy
real :: error, eps
real :: flux_west, flux_east, flux_north, flux_south
open( 10, file = 'poisson_island_array.inp', status = "UNKNOWN")
open( 20, file = 'poisson_island_array.out', status = "UNKNOWN")
lurep = 20 ! Just one output file
read( 10, * ) nx, ny ! Sizes of the grid
write(*,*)nx,ny
read( 10, * ) x_extent, y_extent ! Extent of the grid in the two directions
write(*,*) x_extent, y_extent
read( 10, * ) diff ! Pseudo diffusion coefficient
read( 10, * ) eps ! Error level for finishing the calculation
write(*,*)eps
read( 10, * ) max_iter ! Maximum number of iterations
read( 10, * ) deltt ! Pseudo timestep
close( 10 )
allocate( u(nx+2,ny+2), unew(nx+2,ny+2), force(nx+2,ny+2), status(nx+2,ny+2), &
diffx(nx+2,ny+2), diffy(nx+2,ny+2) )
!
! Set up the calculation: force is set to 1 in the upper part of the grid
!
u = 0.0
unew = 0.0
force = 0.0
force(:,(ny+1)/2:) = 1.0
iter = 0
error = huge(error)
dx = diff / (x_extent/(nx-2)) ** 2
dy = diff / (y_extent/(ny-2)) ** 2
call set_status( status, dx, dy, diffx, diffy )
!
! Correct the force: the inactive cells have a force term zero
!
force = merge( force, 0.0, status /= 0 )
pu => u
punew => unew
call start_timer
do while ( error > eps .and. iter < max_iter )
call calc_new( pu, punew, error , force, deltt)
iter = iter + 1
write( *, * ) iter, error
write( 20, * ) iter, error
ptmp => pu
pu => punew
punew => ptmp
enddo
call stop_timer
write( 20, '(a)' ) 'Result'
do iy = 1,ny+2
write( 20, '(*(g11.4))') u(:,iy)
enddo
contains
! calc_new --
! Calculate the new field
!
! Arguments:
! u Array with values at the current time
! unew Array with the new values
! error Total error
!
subroutine calc_new( u, unew, error , force, deltt)
real, dimension(:,:), intent(in) :: u,force
real, dimension(:,:), intent(inout) :: unew
real, intent(out) :: error
real, intent(in) :: deltt
real duk(nx+2,ny+1)
real :: du
integer i
error = 0.0
do iy = 2,ny+1
do ix = 2,nx+1
du = diffx(ix-1,iy) * ( u(ix-1,iy) - u(ix,iy) ) + diffx(ix,iy) * ( u(ix+1,iy) - u(ix,iy) ) + diffy(ix,iy-1) * ( u(ix,iy-1) - u(ix,iy) ) + diffy(ix,iy) * ( u(ix,iy+1) - u(ix,iy) ) + force(ix,iy)
duk(ix,iy) = du
unew(ix,iy) = u(ix,iy) + deltt * du
error = error + abs( deltt * du )
enddo
write(*,200)iy, (unew(i,iy),i=1,nx+1)
write(*,200)iy, (duk(i,iy),i=1,nx+1)
200 format(i6," ",6(E10.3,2x))
enddo
error = error / (nx-2) / (ny-2)
end subroutine calc_new
! set_status --
! Set the status value for the grid cells
!
! Arguments:
! status Array holding the status encoding
! dx Nominal "diffusion" coefficient x-direction
! dy Nominal "diffusion" coefficient y-direction
! diffx Diffusion array x-direction
! diffy Diffusion array y-direction
!
subroutine set_status( status, dx, dy, diffx, diffy )
integer, dimension(:,:), intent(out) :: status
real, intent(in) :: dx, dy
real, dimension(:,:), intent(out) :: diffx, diffy
integer :: mx, my, ix, iy
mx = size(status,1)
my = size(status,2)
status = -1 ! All sides are "open boundaries"
status(2:mx-1,2:my-1) = 1 ! Inside cells are regular cells
!
! The "island"
!
status( (2*mx)/5:(3*mx)/5, (2*my)/5:(3*my)/5 ) = 0
!
! Encode this in the diffusion arrays
!
diffx = dx
diffy = dy
do iy = 1,my
do ix = 1,mx-1
if ( status(ix,iy) == 0 .or. status(ix+1,iy) == 0 ) then
diffx(ix,iy) = 0.0
endif
enddo
enddo
do iy = 1,my-1
do ix = 1,mx
if ( status(ix,iy) == 0 .or. status(ix,iy+1) == 0 ) then
diffy(ix,iy) = 0.0
endif
enddo
enddo
end subroutine set_status
end program poisson_island_array
This works and all variables passed. It appears to do what you want.
Your change from “subroutine calc_new( u, unew, error )” is unnecessary,
as force and deltt are in scope, as are diffx and diffy.
However, I do not like using contains to transfer these variables, as the coding does not clearly document the use of these variables. I also changed u, unew to u_in, u_ou so there was no confusion that these are provided by “contains”.
I also considered
a) the use of u0 = u(ix,iy), ( the compiler should optimise this anyway) or
b) using " real, dimension(nx+2) :: du, u0" and rewriting the do ix loop with array syntax. (hopeing to improve memory usage and AVX efficiency)
Finally, I tried adding OpenMP as with:
contains
! calc_new --
! Calculate the new field
!
! Arguments:
! u_in Array with values at the current time
! u_ou Array with the new values
! error Total error
!
subroutine calc_new ( u_in, u_ou, error )
real, dimension(:,:), intent(in) :: u_in
real, dimension(:,:), intent(inout) :: u_ou
real, intent(out) :: error
! Contains provides
! nx, ny, deltt
! force(:,:) diffx(:,:), diffy(:,:)
! Local variables
integer :: iy
real, dimension(nx+2) :: du, u0
error = 0.0
!$ threads_used = threads
!$omp parallel do default ( none ) &
!$omp& shared ( u_in, u_ou, force, diffx, diffy, deltt, nx, ny ) &
!$omp& private ( u0, du, iy ) &
!$omp& reduction (+ : error ) &
!$omp& num_threads ( threads ) &
!$omp& schedule ( static )
do iy = 2,ny+1
! do ix = 2,nx+1
! du = diffx(ix-1,iy) * ( u(ix-1,iy) - u(ix,iy) ) &
! + diffx(ix,iy) * ( u(ix+1,iy) - u(ix,iy) ) &
! + diffy(ix,iy-1) * ( u(ix,iy-1) - u(ix,iy) ) &
! + diffy(ix,iy) * ( u(ix,iy+1) - u(ix,iy) ) &
! + force(ix,iy)
! unew(ix,iy) = u(ix,iy) + deltt * du
! error = error + abs( deltt * du )
! enddo
u0(:) = u_in(:,iy)
du(2:nx+1) = force(2:nx+1,iy)
du(2:nx+1) = du(2:nx+1) + diffx(1:nx, iy) * ( u0(1:nx) - u0(2:nx+1) )
du(2:nx+1) = du(2:nx+1) + diffx(2:nx+1,iy) * ( u0(3:nx+2) - u0(2:nx+1) )
du(2:nx+1) = du(2:nx+1) + diffy(2:nx+1,iy-1) * ( u_in(2:nx+1,iy-1) - u0(2:nx+1) )
du(2:nx+1) = du(2:nx+1) + diffy(2:nx+1,iy) * ( u_in(2:nx+1,iy+1) - u0(2:nx+1) )
! u0(:) = u_in(:,iy)
! du(2:nx+1) = force(2:nx+1,iy) &
! + diffx(1:nx, iy) * ( u0(1:nx) - u0(2:nx+1) ) &
! + diffx(2:nx+1,iy) * ( u0(3:nx+2) - u0(2:nx+1) ) &
! + diffy(2:nx+1,iy-1) * ( u_in(2:nx+1,iy-1) - u0(2:nx+1) ) &
! + diffy(2:nx+1,iy) * ( u_in(2:nx+1,iy+1) - u0(2:nx+1) )
u_ou(2:nx+1,iy) = u0(2:nx+1) + deltt * du(2:nx+1)
error = error + sum ( abs( du(2:nx+1) ) ) * deltt
end do
!$omp end parallel do
error = error / (nx-2) / (ny-2)
end subroutine calc_new
Unfortunately, none of the changes provided a significant improvement, as there is a memory usage bottleneck on my Ryzen 5900X.
None of the tests indicated any majic performance at nx = 1000 or nx = 3000.
Although with nx=1000, each column of the arrays u, unew (and others) occupies slightly less than 1 memory page per column ( nx=1022 gives 1 memory page per column), and so would be using 2 memory pages for most column access, so potentially show some memory delays.
My testing of “-fprefetch-loop-arrays” showed no noticeable change.
My use of -fopenmp did not show a significant improvement, so I concluded memory bandwidth is a limitation. It would be interesting to see performance on a newer pc with much faster memory bandwidth.
I have a lot of OpenMP examples where memory transfer bottlenecks are an issue, so perhaps there may be different approaches to overcome these problems ( or use faster memory )
John
Yes, I saw that yesterday, but I could only post three times, I do not use contains in a program unit as it just an alternative form of common, I dislike commons, still having some programs from 1972 with common usage.
Passing the variables allows me to see them in debug, it helped as I tried to understand the algorithm. I do not do diffusion analysis.
Argen:
unew(ix,iy) = u(ix,iy) + deltt * du
This equation steps up at du with deltt being a millioneth so that the unew changes in small linear steps,
see picture of 10 by 10 with unew plotted. The diffusion equation would be better off estimating a new diffusion and then checking – not taking small steps.
Your system is interacting with the computer working arrays and so it is just random errors.
I do not think you can speed it up. As you are diffusing in two dimensions.
John
This has been a problem in the language since f90, but it was corrected in f2023 with the addition of import, only: which may be used within BLOCK and within CONTAINED subprograms. However, compilers are not yet supporting this new feature. This feature potentially affects both program clarity to humans, and program efficiency for the code generated by the compiler, so it could affect performance issues such as those being discussed here.
I am using Intel oneAPI 2025.0.0, 64 bits. To be more precise: the build number is 20241008.
No, it loops over a maximum number of iterations. The reason for setting eps to zero is that that way I know it will run the same number of iterations for all matrix sizes ![]()
(When I started this little research, I did have a non-zero eps, but that leads to incomparable program runs)
Hi Arjen I have installed tcl/tk on
my two main development systems and
have ran your tcl script on both systems.
I don’t get the widely different run times you do.
Here are some brief technical details about the systems.
System 1 - Dell 5820 - 192 GB Ram
gfortran 15.1.0 - equation.com
Visual Studio
Microsoft Visual Studio Community 2022 (64-bit) - Current
Version 17.14.15 (September 2025)
System 2 - Dell 5515 - 32 GB ram.
gfortran 14.2.0 - equation.com
Visual Studio
Microsoft Visual Studio Community 2022 (64-bit) - Current
Version 17.14.15 (September 2025)
Here are the runs
System 1 - dos shell
array 100 0.406000 0.406250
array 200 0.828000 0.828125
array 300 1.64000 1.64062
array 400 2.51500 2.51562
array 500 3.84400 3.84375
array 600 5.42200 5.42188
array 700 7.40600 7.40625
array 800 9.59400 9.59375
array 900 11.9690 11.9688
array 1000 14.5630 14.5625
array 1200 22.0780 22.0781
array 1400 31.1720 31.1719
array 1600 41.2970 41.2969
array 1800 54.9530 54.9531
array 2000 72.1710 72.1719
array 2300 92.7970 92.7812
array 2600 118.609 118.562
array 3000 164.078 164.078
System 1 - visual studio shell
array 100 0.421000 0.421875
array 200 0.843000 0.843750
array 300 1.64100 1.64062
array 400 2.51600 2.51562
array 500 3.82800 3.82812
array 600 5.32800 5.32812
array 700 7.29600 7.29688
array 800 9.15600 9.15625
array 900 11.9060 11.9062
array 1000 14.8750 14.8750
array 1200 21.6720 21.6719
array 1400 31.1410 31.1406
array 1600 42.5000 42.5000
array 1800 56.4380 56.3906
array 2000 71.3910 71.3750
array 2300 94.3910 94.3906
array 2600 125.125 125.062
array 3000 170.625 170.609
System 2 - dos shell
array 100 0.235000 0.234375
array 200 0.734000 0.734375
array 300 1.56200 1.56250
array 400 2.68700 2.68750
array 500 4.31200 4.31250
array 600 6.46900 6.46875
array 700 8.95300 8.95312
array 800 11.1400 11.1406
array 900 14.6560 14.6562
array 1000 19.3750 19.3594
array 1200 26.8280 26.8125
array 1400 34.1250 34.1250
array 1600 44.5940 44.5938
array 1800 55.4850 55.4688
array 2000 68.9060 68.8906
array 2300 90.3590 90.3281
array 2600 117.609 117.453
array 3000 156.344 156.312
System 2 - visual studio shell
array 100 0.266000 0.265625
array 200 0.828000 0.828125
array 300 1.73400 1.73438
array 400 3.12500 3.12500
array 500 4.82800 4.82812
array 600 6.81200 6.81250
array 700 9.64100 9.64062
array 800 11.3590 11.3594
array 900 14.6880 14.6875
array 1000 19.5310 19.5312
array 1200 25.0000 25.0000
array 1400 35.9840 35.9688
array 1600 45.9840 45.9375
array 1800 57.7810 57.7812
array 2000 70.4850 70.4531
array 2300 91.4530 91.4531
array 2600 122.953 122.938
array 3000 163.625 163.516
Do you want details of the paths on both systems and shells?
Do you want more hardware details of the systems?
Hi Ian, I am a bit surprised. My timings are definitely reproducible, but when you say “visual studio shell”, do you mean the command window from the Intel oneAPI installation or something from “Visual Studio”? I use the “Intel oneAPI command prompt for Intel 64 for Visual Studio 22” according to the description.
Apologies Arjen. Here is the Intel oneapi run.
array 100 0.422000 0.421875
array 200 0.843000 0.843750
array 300 1.62500 1.62500
array 400 2.56200 2.56250
array 500 3.82900 3.82812
array 600 5.28100 5.28125
array 700 7.15600 7.15625
array 800 9.04700 9.04688
array 900 11.7340 11.7344
array 1000 14.6090 14.6094
array 1200 23.4220 23.3906
array 1400 34.2500 34.2500
array 1600 42.5160 42.5156
array 1800 54.2960 54.2969
array 2000 70.9530 70.9531
array 2300 92.6250 92.6250
array 2600 124.016 124.016
array 3000 172.375 172.375
on system 1.
I’ll repeat on system 2 and post the results.
Here is the system 2 run
array 100 0.250000 0.250000
array 200 0.750000 0.734375
array 300 1.54700 1.54688
array 400 2.70400 2.70312
array 500 4.32800 4.32812
array 600 6.43700 6.42188
array 700 8.85900 8.85938
array 800 11.1250 11.1250
array 900 13.9840 13.9844
array 1000 17.4370 17.4219
array 1200 24.8280 24.7812
array 1400 33.2500 33.2188
array 1600 44.3910 44.3281
array 1800 55.0460 54.9531
array 2000 69.0160 68.9531
array 2300 89.3750 89.3438
array 2600 116.969 116.875
array 3000 153.688 153.531
ifx
Intel(R) Fortran Compiler for applications running on Intel(R) 64, Version 2025.2.0 Build 20250605
Copyright (C) 1985-2025 Intel Corporation. All rights reserved.
is running on both systems.
Can you post the path from the faulty runs? I’ll
have a look and see what the differences there are between our systems.
I have 3 program variations, which I tested for 1 or 6 threads.
- poisson_island_argen_origo.exe is basically the original Argen version using pointers, although I did introduce “u0 = u(ix,iy” in the calc_new inner loop.
- poisson_island_call_orig0.exe replaces call calc_new with two calls, eliminating pointers by alternating the array arguments..
- poisson_island_call_origv.exe introduces array syntax to “force” AVX calculations where identified. This may improve the cache memory usage (listed below)
The clock times for nx = 3,000 are:
argen_origo.exe 1 th :51.72 s 6 th : 47.33 s
call_orig0.exe 1 th : 114.19 s 6 th : 47.49 s
call_origv.exe 1 th : 88.05 s 6 th : 45.16 s
So,
Replacing Fortran pointers syntax with 2 alternate argument order calls is slower for a single thread.
Introducing 6 threads achieves only 10% run time improvement for the pointer option, but the array syntax approach now achieves a 5% improvement on the pointer approach
There appears to be a significant limitation from memory bandwidth for multi thread performance.
Using Fortran pointer syntax does not appear to adversely affect performance.
These results were achieved with Gfortran 15.1.0 on a Ryzen 5900X with 3200 mhz memory.
These results appear to be faster than @cmaapic 's Dell 5820. I would be interested to know the nominal processor and memory speeds.
PROCESSOR_DESCRIPTION=AMD Ryzen 9 5900X 12-Core Processor 64GB 3200MHz memory
options=-fimplicit-none -O3 -march=native -ffast-math -fstack-arrays -fprefetch-loop-arrays
gcc_path=C:\Program Files (x86)\gcc_eq\gcc_15.1.0\bin;C:\Program Files (x86)\gcc_eq\gcc_15.1.0\libexec\gcc\x86_64-w64-mingw32\15.1.0
+++++++++
test of Original routine with 2xcalls
+++++++++
Report of simulation
--------------------
Program Name: poisson_island_argen_origo
Compiler version: GCC version 15.1.0
Compiler options: -march=znver3 ... -mtune=znver3 -O3 -fimplicit-none -ffast-math -fstack-arrays -fprefetch-loop-arrays
#OrigO 10000 4.9958680E-07 100 0.0150 0.0156 1 100.0%
#OrigO 10000 9.0385913E-07 10 0.0000 0.0000 1 Inf%
#OrigO 10000 4.7870549E-07 200 0.0940 0.0938 1 63.8%
#OrigO 10000 4.7170593E-07 300 0.2180 0.2188 1 61.9%
#OrigO 10000 4.6820298E-07 400 0.3910 0.3906 1 61.4%
#OrigO 10000 4.6610134E-07 500 0.6090 0.6094 1 61.6%
#OrigO 10000 4.6469793E-07 600 1.0000 1.0000 1 54.0%
#OrigO 10000 4.6369848E-07 700 1.2030 1.2031 1 61.1%
#OrigO 10000 4.6294849E-07 800 1.5310 1.5156 1 62.7%
#OrigO 10000 4.6236192E-07 900 2.3600 2.3125 1 51.5%
#OrigO 10000 4.6189652E-07 1000 4.1560 4.1562 1 36.1%
#OrigO 10000 4.6119402E-07 1200 3.8590 3.8594 1 56.0%
#OrigO 10000 4.6069104E-07 1400 7.2500 7.2344 1 40.6%
#OrigO 10000 4.6031641E-07 1600 9.5620 9.5312 1 40.2%
#OrigO 10000 4.6002316E-07 1800 14.1100 14.0781 1 34.4%
#OrigO 10000 4.5978987E-07 2000 19.3280 19.2969 1 31.0%
#OrigO 10000 4.5951694E-07 2300 27.2340 27.1562 1 29.1%
#OrigO 10000 4.5930227E-07 2600 37.0940 36.9375 1 27.3%
#OrigO 10000 4.5908735E-07 3000 51.7190 51.5625 1 26.1%
#OrigO 10000 4.6195572E-07 1024 7.9530 7.9375 1 19.8%
PROCESSOR_DESCRIPTION=AMD Ryzen 9 5900X 12-Core Processor 64GB 3200MHz memory
options=-fimplicit-none -O3 -march=native -ffast-math -fstack-arrays -fprefetch-loop-arrays
gcc_path=C:\Program Files (x86)\gcc_eq\gcc_15.1.0\bin;C:\Program Files (x86)\gcc_eq\gcc_15.1.0\libexec\gcc\x86_64-w64-mingw32\15.1.0
+++++++++
test of Original routine with 2xcalls
+++++++++
Report of simulation
--------------------
Program Name: poisson_island_call_orig0
Compiler version: GCC version 15.1.0
Compiler options: -march=znver3 ... -mtune=znver3 -O3 -fimplicit-none -ffast-math -fstack-arrays -fprefetch-loop-arrays
#Orig0 10000 4.9958851E-07 100 0.1090 0.1094 1 100.0%
#Orig0 10000 9.0385930E-07 10 0.0000 0.0000 1 Inf%
#Orig0 10000 4.7867633E-07 200 0.4690 0.4688 1 93.0%
#Orig0 10000 4.7174834E-07 300 1.0310 1.0312 1 95.2%
#Orig0 10000 4.6822538E-07 400 1.8130 1.8125 1 96.2%
#Orig0 10000 4.6587644E-07 500 2.8120 2.8125 1 96.9%
#Orig0 10000 4.6435241E-07 600 4.0630 4.0625 1 96.6%
#Orig0 10000 4.6327645E-07 700 14.9840 14.9844 1 35.6%
#Orig0 10000 4.6249133E-07 800 7.2190 7.2188 1 96.6%
#Orig0 10000 4.6188163E-07 900 10.6710 10.6250 1 82.7%
#Orig0 10000 4.6142713E-07 1000 11.8130 11.8125 1 92.3%
#Orig0 10000 4.6165377E-07 1200 16.4060 16.4062 1 95.7%
#Orig0 10000 4.6236298E-07 1400 22.7190 22.6094 1 94.0%
#Orig0 10000 4.6306505E-07 1600 29.3590 29.3438 1 95.0%
#Orig0 10000 4.6351329E-07 1800 37.0310 37.0312 1 95.4%
#Orig0 10000 4.6349862E-07 2000 46.3280 46.3281 1 94.1%
#Orig0 10000 4.6025482E-07 2300 61.4530 61.4062 1 93.8%
#Orig0 10000 4.5569377E-07 2600 77.7500 77.6562 1 94.8%
#Orig0 10000 4.5164214E-07 3000 109.0620 109.0000 1 89.9%
#Orig0 10000 4.6144115E-07 1024 14.0470 14.0469 1 81.4%
PROCESSOR_DESCRIPTION=AMD Ryzen 9 5900X 12-Core Processor 64GB 3200MHz memory
options=-fimplicit-none -O3 -march=native -ffast-math -fstack-arrays -fprefetch-loop-arrays
gcc_path=C:\Program Files (x86)\gcc_eq\gcc_15.1.0\bin;C:\Program Files (x86)\gcc_eq\gcc_15.1.0\libexec\gcc\x86_64-w64-mingw32\15.1.0
+++++++++
test of Original routine with 2xcalls
+++++++++
Report of simulation
--------------------
Program Name: poisson_island_call_origv
Compiler version: GCC version 15.1.0
Compiler options: -march=znver3 ... -mtune=znver3 -O3 -fimplicit-none -ffast-math -fstack-arrays -fprefetch-loop-arrays
#OrigV 10000 4.9958675E-07 100 0.0780 0.0781 1 100.0%
#OrigV 10000 9.0385913E-07 10 0.0000 0.0000 1 Inf%
#OrigV 10000 4.7870549E-07 200 0.2500 0.2500 1 124.8%
#OrigV 10000 4.7170568E-07 300 0.5310 0.5312 1 132.2%
#OrigV 10000 4.6820293E-07 400 0.8910 0.8906 1 140.1%
#OrigV 10000 4.6610137E-07 500 1.3900 1.3906 1 140.3%
#OrigV 10000 4.6469779E-07 600 1.9220 1.9062 1 146.1%
#OrigV 10000 4.6369848E-07 700 2.7190 2.7188 1 140.6%
#OrigV 10000 4.6294832E-07 800 3.4530 3.4062 1 144.6%
#OrigV 10000 4.6236195E-07 900 4.3900 4.3906 1 143.9%
#OrigV 10000 4.6189643E-07 1000 5.4530 5.4375 1 143.0%
#OrigV 10000 4.6119391E-07 1200 8.8120 8.7500 1 127.5%
#OrigV 10000 4.6069104E-07 1400 14.4530 14.4062 1 105.8%
#OrigV 10000 4.6031670E-07 1600 20.9530 20.9375 1 95.3%
#OrigV 10000 4.6002310E-07 1800 27.7650 27.7500 1 91.0%
#OrigV 10000 4.5978985E-07 2000 35.7500 35.4375 1 87.3%
#OrigV 10000 4.5951708E-07 2300 49.0000 48.8125 1 84.2%
#OrigV 10000 4.5930227E-07 2600 64.6090 64.2344 1 81.6%
#OrigV 10000 4.5908740E-07 3000 88.0470 87.6094 1 79.7%
#OrigV 10000 4.6195572E-07 1024 6.0460 6.0000 1 135.3%
PROCESSOR_DESCRIPTION=AMD Ryzen 9 5900X 12-Core Processor 64GB 3200MHz memory
options=-fimplicit-none -O3 -fopenmp -march=native -fstack-arrays
gcc_path=C:\Program Files (x86)\gcc_eq\gcc_15.1.0\bin;C:\Program Files (x86)\gcc_eq\gcc_15.1.0\libexec\gcc\x86_64-w64-mingw32\15.1.0
+++++++++
test of Original routine with 2xcalls
+++++++++
Report of simulation
--------------------
Program Name: poisson_island_argen_origo
Compiler version: GCC version 15.1.0
Compiler options: -march=znver3 ... -mtune=znver3 -mthreads -O3 -fimplicit-none -fopenmp -fstack-arrays
#OrigO 10000 4.9958840E-07 100 0.5780 1.2500 6 100.0%
#OrigO 10000 9.0385907E-07 10 0.5630 1.1094 6 1.0%
#OrigO 10000 4.7868451E-07 200 0.4210 1.5469 6 549.2%
#OrigO 10000 4.7170420E-07 300 0.5630 2.2500 6 924.0%
#OrigO 10000 4.6825269E-07 400 0.8280 3.1562 6 1116.9%
#OrigO 10000 4.6607605E-07 500 1.0780 5.0938 6 1340.4%
#OrigO 10000 4.6449975E-07 600 1.4220 6.7500 6 1463.3%
#OrigO 10000 4.6338957E-07 700 8.7820 39.4219 6 322.5%
#OrigO 10000 4.6255104E-07 800 2.5150 11.8750 6 1470.9%
#OrigO 10000 4.6191593E-07 900 3.1400 15.3281 6 1491.0%
#OrigO 10000 4.6147736E-07 1000 3.6560 17.7656 6 1581.0%
#OrigO 10000 4.6073831E-07 1200 5.2190 24.2969 6 1594.8%
#OrigO 10000 4.6081024E-07 1400 7.0620 35.2656 6 1604.2%
#OrigO 10000 4.6142355E-07 1600 9.4220 43.7656 6 1570.5%
#OrigO 10000 4.6196368E-07 1800 12.1720 62.0469 6 1538.5%
#OrigO 10000 4.6241374E-07 2000 17.5000 91.0000 6 1321.1%
#OrigO 10000 4.6298089E-07 2300 23.9690 123.2344 6 1275.7%
#OrigO 10000 4.6229496E-07 2600 32.9220 170.7031 6 1186.8%
#OrigO 10000 4.5759504E-07 3000 47.3280 245.2031 6 1099.1%
#OrigO 10000 4.6149398E-07 1024 4.2190 20.4062 6 1436.5%
PROCESSOR_DESCRIPTION=AMD Ryzen 9 5900X 12-Core Processor 64GB 3200MHz memory
options=-fimplicit-none -O3 -fopenmp -march=native -fstack-arrays
gcc_path=C:\Program Files (x86)\gcc_eq\gcc_15.1.0\bin;C:\Program Files (x86)\gcc_eq\gcc_15.1.0\libexec\gcc\x86_64-w64-mingw32\15.1.0
+++++++++
test of Original routine with 2xcalls
+++++++++
Report of simulation
--------------------
Program Name: poisson_island_call_orig0
Compiler version: GCC version 15.1.0
Compiler options: -march=znver3 ... -mtune=znver3 -mthreads -O3 -fimplicit-none -fopenmp -fstack-arrays
#Orig0 10000 4.9958840E-07 100 0.5940 1.5938 6 100.0%
#Orig0 10000 9.0385913E-07 10 0.5780 1.0938 6 1.0%
#Orig0 10000 4.7868451E-07 200 0.4220 1.7656 6 563.0%
#Orig0 10000 4.7170420E-07 300 0.5620 2.2969 6 951.2%
#Orig0 10000 4.6825269E-07 400 0.7810 3.7656 6 1216.9%
#Orig0 10000 4.6607605E-07 500 1.0780 5.1094 6 1377.6%
#Orig0 10000 4.6449975E-07 600 1.4380 7.0312 6 1487.1%
#Orig0 10000 4.6338960E-07 700 1.9370 9.2969 6 1502.6%
#Orig0 10000 4.6255104E-07 800 2.5780 11.4062 6 1474.6%
#Orig0 10000 4.6191593E-07 900 3.8750 17.5781 6 1241.7%
#Orig0 10000 4.6147736E-07 1000 3.8130 17.9219 6 1557.8%
#Orig0 10000 4.6073831E-07 1200 5.0930 24.6094 6 1679.5%
#Orig0 10000 4.6081024E-07 1400 6.9690 34.4219 6 1670.6%
#Orig0 10000 4.6142353E-07 1600 9.6100 46.2188 6 1582.4%
#Orig0 10000 4.6196365E-07 1800 12.1090 61.8906 6 1589.4%
#Orig0 10000 4.6241371E-07 2000 15.9060 81.6406 6 1493.8%
#Orig0 10000 4.6298089E-07 2300 23.4380 122.1719 6 1340.7%
#Orig0 10000 4.6229496E-07 2600 32.3900 171.0156 6 1239.7%
#Orig0 10000 4.5759504E-07 3000 47.4850 249.4062 6 1125.8%
#Orig0 10000 4.6149395E-07 1024 4.2820 20.0625 6 1454.6%
PROCESSOR_DESCRIPTION=AMD Ryzen 9 5900X 12-Core Processor 64GB 3200MHz memory
options=-fimplicit-none -O3 -fopenmp -march=native -fstack-arrays
gcc_path=C:\Program Files (x86)\gcc_eq\gcc_15.1.0\bin;C:\Program Files (x86)\gcc_eq\gcc_15.1.0\libexec\gcc\x86_64-w64-mingw32\15.1.0
+++++++++
test of Original routine with 2xcalls
+++++++++
Report of simulation
--------------------
Program Name: poisson_island_call_origv
Compiler version: GCC version 15.1.0
Compiler options: -march=znver3 ... -mtune=znver3 -mthreads -O3 -fimplicit-none -fopenmp -fstack-arrays
#OrigV 10000 4.9958680E-07 100 0.5940 1.3125 6 100.0%
#OrigV 10000 9.0385913E-07 10 0.5630 1.1406 6 1.1%
#OrigV 10000 4.7870549E-07 200 0.4530 1.4219 6 524.5%
#OrigV 10000 4.7170599E-07 300 0.5310 1.9844 6 1006.8%
#OrigV 10000 4.6820307E-07 400 0.7030 3.0000 6 1351.9%
#OrigV 10000 4.6610117E-07 500 0.9220 4.6094 6 1610.6%
#OrigV 10000 4.6469916E-07 600 1.1560 5.8750 6 1849.8%
#OrigV 10000 4.6369800E-07 700 1.5320 7.6250 6 1899.9%
#OrigV 10000 4.6294647E-07 800 1.9530 9.6562 6 1946.5%
#OrigV 10000 4.6236210E-07 900 2.3910 12.5938 6 2012.3%
#OrigV 10000 4.6189507E-07 1000 2.9840 14.9062 6 1990.6%
#OrigV 10000 4.6119442E-07 1200 4.2500 21.4844 6 2012.6%
#OrigV 10000 4.6069360E-07 1400 5.9370 30.8906 6 1961.0%
#OrigV 10000 4.6031730E-07 1600 8.5780 46.6094 6 1772.7%
#OrigV 10000 4.6002634E-07 1800 12.1400 66.8125 6 1585.3%
#OrigV 10000 4.5979189E-07 2000 16.1870 89.2656 6 1467.8%
#OrigV 10000 4.5951600E-07 2300 23.1410 127.4062 6 1357.9%
#OrigV 10000 4.5930702E-07 2600 31.9060 179.0625 6 1258.5%
#OrigV 10000 4.5909147E-07 3000 45.1560 252.0000 6 1183.9%
#OrigV 10000 4.6195544E-07 1024 3.0940 15.1719 6 2013.1%
! This is based on origional with changes for
! run loop on nx and
! timing reporting
! append results to output file
! replaces pointers with two solver calls to alternate arrays and use OpenMP
! step 1 is to introduce 2 x call and change the names of arrays in calc_new
! step 2 is to introduce local u0 in calc_new
! step 3 change names u, unew, introduce OpenMP
! step 4 modify inner loop for vector calculation in calc_new
! step 5 use array syntax for inner array routines
! poisson_island_array.f90 --
! This is a naïve version of a solver for Poisson equations.
!
! Notes:
! - The grid is assumed to be rectangular with square cells
! - We reach the steady state using a pseudo-time step
! - Explicit loops used
! - All boundary conditions are Dirichlet: u = 0
! - In addition: there is an isolated "island" in the middle
! that does not allow diffusion.
! - The status of the grid cells is either:
! status = 1: ordinary grid cell
! status = 0: isolated grid cell, no flux
! status = -1: open boundary grid cell, fixed concentration
! - By distinguishing three statuses we can also implement
! zero-flux Neumann boundaries
! - Rather than if-statement, rely on an array of diffusion
! coefficients
!
! The diffusion arrays:
!
! | |
! | Dy(ix,iy) |
! | |
! ---+---------------+---
! | |
! Dx(ix-1,iy) | ix,iy | Dx(ix,iy)
! | |
! ---+---------------+---
! | |
! | Dy(ix,iy-1) |
! | |
!
! Note:
! This version uses pointers to swap the solution
!
program poisson_island_array
use timing
implicit none
!new
integer, parameter :: threads = 6 ! limit threads to limit memory clashes
integer :: threads_used = 1
integer, dimension(:,:), allocatable :: status
real, dimension(:,:), allocatable :: force, diffx, diffy
real, dimension(:,:), allocatable :: u, unew
!z real, dimension(:,:), allocatable, target :: u, unew
!z real, dimension(:,:), pointer :: pu, punew, ptmp
integer :: nx, ny
integer :: ix, iy
integer :: max_iter, iter
real :: x_extent, y_extent, diff, deltt
real :: dx, dy
real :: error, eps, mb, efk, efficiency = -1
!z real :: flux_west, flux_east, flux_north, flux_south
open( 10, file = 'poisson_island_array.inp' )
open( 20, file = 'poisson_island_array.out', position='append' )
lurep = 20 ! Just one output file
read( 10, * ) x_extent, y_extent ! Extent of the grid in the two directions
read( 10, * ) diff ! Pseudo diffusion coefficient
read( 10, * ) eps ! Error level for finishing the calculation
read( 10, * ) max_iter ! Maximum number of iterations
read( 10, * ) deltt ! Pseudo timestep
!new loop
write ( 20, fmt='(/a/a/a)' ) '+++++++++', ' test of Original routine with 2xcalls', '+++++++++'
!z close( 10 )
!z read( 10, * ) nx, ny ! Sizes of the grid
! test each problem size
do
read( 10, * ) nx ! Sizes of the grid
ny = nx
if ( nx < 5 ) exit
mb = (nx+2.)*(ny+2.) * 6 * 4./1024./1024.
write (*,fmt='(" Allocating for nx = ",i0,f8.2," Mbytes")') nx, mb
allocate( u(nx+2,ny+2), unew (nx+2,ny+2), &
force(nx+2,ny+2), status(nx+2,ny+2), &
diffx(nx+2,ny+2), diffy (nx+2,ny+2) )
!
! Set up the calculation: force is set to 1 in the upper part of the grid
!
u = 0.0
unew = 0.0
force = 0.0
force(:,(ny+1)/2:) = 1.0
iter = 0
error = huge(error)
dx = diff / (x_extent/(nx-2)) ** 2
dy = diff / (y_extent/(ny-2)) ** 2
call set_status( status, dx, dy, diffx, diffy )
!
! Correct the force: the inactive cells have a force term zero
!
force = merge( force, 0.0, status /= 0 )
! pu => u
! punew => unew
call start_timer
do while ( error > eps .and. iter < max_iter )
call calc_new ( u, unew, error )
iter = iter + 1
call calc_new ( unew, u, error )
iter = iter + 1
if ( mod(iter,1000) == 2 ) then
write( *, * ) iter, error
!z write( 20, * ) iter, error
end if
! ptmp => pu
! pu => punew
! punew => ptmp
end do
call stop_timer
if ( efficiency < 0 ) efficiency = clock_delta / (nx**2)
efk = (efficiency*nx**2) / clock_delta * 100.0
write( *, 21 ) iter, error, nx, clock_delta, cpu_delta, threads_used, efk
write( 20, 21 ) iter, error, nx, clock_delta, cpu_delta, threads_used, efk
21 format ( '#OrigV',i6,es15.7,i6,2f10.4,i5, f7.1,'%' )
! write( 20, '(a)' ) 'Result'
! do iy = 1,ny+2
! write( 20, '(*(g11.4))') u(:,iy)
! end do
deallocate ( u, unew, &
force, status, &
diffx, diffy )
end do
contains
! calc_new --
! Calculate the new field
!
! Arguments:
! u_in Array with values at the current time
! u_ou Array with the new values
! error Total error
!
subroutine calc_new ( u_in, u_ou, error )
real, dimension(:,:), intent(in) :: u_in
real, dimension(:,:), intent(inout) :: u_ou
real, intent(out) :: error
! Contains provides
! nx, ny, deltt
! force(:,:) diffx(:,:), diffy(:,:)
! Local variables
integer :: iy
real, dimension(nx+2) :: du, u0
error = 0.0
!$ threads_used = threads
!$omp parallel do default ( none ) &
!$omp& shared ( u_in, u_ou, force, diffx, diffy, deltt, nx, ny ) &
!$omp& private ( u0, du, iy ) &
!$omp& reduction (+ : error ) &
!$omp& num_threads ( threads ) &
!$omp& schedule ( static )
do iy = 2,ny+1
! do ix = 2,nx+1
! du = diffx(ix-1,iy) * ( u(ix-1,iy) - u(ix,iy) ) &
! + diffx(ix,iy) * ( u(ix+1,iy) - u(ix,iy) ) &
! + diffy(ix,iy-1) * ( u(ix,iy-1) - u(ix,iy) ) &
! + diffy(ix,iy) * ( u(ix,iy+1) - u(ix,iy) ) &
! + force(ix,iy)
! unew(ix,iy) = u(ix,iy) + deltt * du
! error = error + abs( deltt * du )
! enddo
u0(:) = u_in(:,iy)
du(2:nx+1) = force(2:nx+1,iy)
du(2:nx+1) = du(2:nx+1) + diffx(1:nx, iy) * ( u0(1:nx) - u0(2:nx+1) )
du(2:nx+1) = du(2:nx+1) + diffx(2:nx+1,iy) * ( u0(3:nx+2) - u0(2:nx+1) )
du(2:nx+1) = du(2:nx+1) + diffy(2:nx+1,iy-1) * ( u_in(2:nx+1,iy-1) - u0(2:nx+1) )
du(2:nx+1) = du(2:nx+1) + diffy(2:nx+1,iy) * ( u_in(2:nx+1,iy+1) - u0(2:nx+1) )
! u0(:) = u_in(:,iy)
! du(2:nx+1) = force(2:nx+1,iy) &
! + diffx(1:nx, iy) * ( u0(1:nx) - u0(2:nx+1) ) &
! + diffx(2:nx+1,iy) * ( u0(3:nx+2) - u0(2:nx+1) ) &
! + diffy(2:nx+1,iy-1) * ( u_in(2:nx+1,iy-1) - u0(2:nx+1) ) &
! + diffy(2:nx+1,iy) * ( u_in(2:nx+1,iy+1) - u0(2:nx+1) )
u_ou(2:nx+1,iy) = u0(2:nx+1) + deltt * du(2:nx+1)
error = error + sum ( abs( du(2:nx+1) ) ) * deltt
end do
!$omp end parallel do
error = error / (nx-2) / (ny-2)
end subroutine calc_new
! set_status --
! Set the status value for the grid cells
!
! Arguments:
! status Array holding the status encoding
! dx Nominal "diffusion" coefficient x-direction
! dy Nominal "diffusion" coefficient y-direction
! diffx Diffusion array x-direction
! diffy Diffusion array y-direction
!
subroutine set_status( status, dx, dy, diffx, diffy )
real, intent(in) :: dx, dy
integer, dimension(:,:), intent(out) :: status
real, dimension(:,:), intent(out) :: diffx, diffy
integer :: mx, my, ix, iy
mx = size (status,1) ! = nx+2
my = size (status,2) ! = ny+2
status = -1 ! All sides are "open boundaries"
status(2:mx-1,2:my-1) = 1 ! Inside cells are regular cells
!
! The "island"
!
status( (2*mx)/5:(3*mx)/5, (2*my)/5:(3*my)/5 ) = 0
!
! Encode this in the diffusion arrays
!
diffx = dx
diffy = dy
do iy = 1,my
do ix = 1,mx-1
if ( status(ix,iy) == 0 .or. status(ix+1,iy) == 0 ) then
diffx(ix,iy) = 0.0
end if
end do
end do
do iy = 1,my-1
do ix = 1,mx
if ( status(ix,iy) == 0 .or. status(ix,iy+1) == 0 ) then
diffy(ix,iy) = 0.0
end if
end do
end do
end subroutine set_status
end program poisson_island_array
! timing.f90 --
! Simple measurement of the performance of a program
!
MODULE timing
USE iso_fortran_env
INTEGER :: time_first, time_last, time_zero, time_rate
REAL :: cpu_first, cpu_last
real :: cpu_delta, clock_delta
INTEGER :: lurep = OUTPUT_UNIT, lucsv, luoverall
CONTAINS
! start_timer
! Initialise the time, so that we can keep track of how much wall clock/system time is spent
!
! Arguments:
! None
!
SUBROUTINE start_timer
CALL system_clock( count = time_first, count_rate = time_rate )
CALL cpu_time( cpu_first )
END SUBROUTINE start_timer
! stop_timer
! Measure the elapsed time and report it
!
! Arguments:
! None
!
SUBROUTINE stop_timer
integer :: k, leng
integer :: call_count = 0
character :: program_name*64
CALL system_clock ( count = time_last )
CALL cpu_time ( cpu_last )
clock_delta = (time_last - time_first) / real(time_rate)
cpu_delta = cpu_last - cpu_first
do k = 1,2
if ( k==1 ) lurep = 20
if ( k==2 ) lurep = 6
if ( call_count == 0 ) then
program_name = ' '
call get_command_argument ( 0, program_name, leng )
WRITE( lurep, '(a)' ) 'Report of simulation'
WRITE( lurep, '(a)' ) '--------------------'
WRITE( lurep, '(a,a)' ) 'Program Name: ', trim(program_name)
WRITE( lurep, '(a,a)' ) 'Compiler version: ', compiler_version()
WRITE( lurep, '(a,a)' ) 'Compiler options: ', compiler_options()
end if
!z write( lurep, '(a,i5)' ) ' Grid size',nx
if ( lurep /= 20 ) then
WRITE( lurep, '(a,g12.6)' ) 'Wall clock (s): ', clock_delta
WRITE( lurep, '(a,g12.6)' ) 'CPU time (s): ', cpu_delta
end if
! WRITE( lucsv, '(100(g0.6,'',''))' ) &
! (time_last - time_first) / real(time_rate), cpu_last - cpu_first, &
! (timeused_new - timeused)
end do
call_count = 1
END SUBROUTINE stop_timer
END MODULE timing

