MFlowCode · danieljvickers · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026
diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
@@ -174,6 +174,16 @@
 #endif
 #:enddef
 
+#! Emits the directive that closes an atomic construct for the active GPU backend:
+#! '!$acc end atomic' when MFC_OpenACC is defined, '!$omp end atomic' when
+#! MFC_OpenMP is defined, and nothing when neither is defined.
+#! NOTE(review): presumably paired with a macro that opens an atomic capture
+#! region - confirm against the call sites.
+#:def END_GPU_ATOMIC_CAPTURE()
+    #:set acc_end_directive = '!$acc end atomic'
+    #:set omp_end_directive = '!$omp end atomic'
+#if defined(MFC_OpenACC)
+    $:acc_end_directive
+#elif defined(MFC_OpenMP)
+    $:omp_end_directive
+#endif
+#:enddef
+
 #:def GPU_UPDATE(host=None, device=None, extraAccArgs=None, extraOmpArgs=None)
     #:set acc_code = ACC_UPDATE(host=host, device=device, extraAccArgs=extraAccArgs)
     #:set omp_code = OMP_UPDATE(host=host, device=device, extraOmpArgs=extraOmpArgs)

diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp
@@ -23,7 +23,7 @@ module m_constants
     integer, parameter :: fourier_rings = 5                       !< Fourier filter ring limit
     integer, parameter :: num_fluids_max = 10                     !< Maximum number of fluids in the simulation
     integer, parameter :: num_probes_max = 10                     !< Maximum number of flow probes in the simulation
-    integer, parameter :: num_patches_max = 10
+    integer, parameter :: num_patches_max = 1000
     integer, parameter :: num_bc_patches_max = 10
     integer, parameter :: pathlen_max = 400
     integer, parameter :: nnode = 4    !< Number of QBMM nodes

diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
@@ -183,12 +183,18 @@ module m_derived_types
     end type t_model
 
     type :: t_model_array
+        ! Original CPU-side fields
         type(t_model), allocatable :: model
         real(wp), allocatable, dimension(:, :, :) :: boundary_v
         real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v
-        integer :: boundary_edge_count
-        integer :: total_vertices
-        logical :: interpolate
+        integer :: boundary_edge_count = 0  ! default-initialized so a fresh instance is well-defined
+        integer :: total_vertices = 0       ! default-initialized so a fresh instance is well-defined
+        integer :: interpolate = 0          ! integer flag (formerly logical); 0 by default
+
+        ! GPU-friendly flattened arrays
+        integer :: ntrs = 0  ! copy of model%ntrs
+        real(wp), allocatable, dimension(:, :, :) :: trs_v  ! (3, 3, ntrs) - triangle vertices
+        real(wp), allocatable, dimension(:, :) :: trs_n  ! (3, ntrs)    - triangle normals
     end type t_model_array
 
     !> Derived type adding initial condition (ic) patch parameters as attributes

diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
@@ -333,6 +333,8 @@ contains
     !! @return The cross product of the two vectors.
     pure function f_cross(a, b) result(c)
 
+        $:GPU_ROUTINE(parallelism='[seq]')
+
         real(wp), dimension(3), intent(in) :: a, b
         real(wp), dimension(3) :: c
 

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
@@ -18,12 +18,28 @@ module m_model
 
     private
 
-    public :: f_model_read, s_model_write, s_model_free, f_model_is_inside
+    public :: f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, &
+              gpu_trs_v, gpu_trs_n, gpu_boundary_v, gpu_interpolated_boundary_v, gpu_interpolate, gpu_boundary_edge_count, &
+              gpu_total_vertices, stl_bounding_boxes
 
     ! Subroutines for STL immersed boundaries
     public :: f_check_boundary, f_register_edge, f_check_interpolation_2D, &
               f_check_interpolation_3D, f_interpolate_2D, f_interpolate_3D, &
-              f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area
+              f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area, s_pack_model_for_gpu, &
+              f_model_is_inside_flat, f_distance_normals_3d_flat
+
+    !! array of STL models that can be allocated and then used in IB marker and levelset compute
+    type(t_model_array), allocatable, target :: models(:)
+    !! GPU-friendly flat arrays for STL model data
+    integer, allocatable :: gpu_ntrs(:)
+    real(wp), allocatable, dimension(:, :, :, :) :: gpu_trs_v
+    real(wp), allocatable, dimension(:, :, :) :: gpu_trs_n
+    real(wp), allocatable, dimension(:, :, :, :) :: gpu_boundary_v
+    real(wp), allocatable, dimension(:, :, :) :: gpu_interpolated_boundary_v
+    integer, allocatable :: gpu_interpolate(:)
+    integer, allocatable :: gpu_boundary_edge_count(:)
+    integer, allocatable :: gpu_total_vertices(:)
+    real(wp), allocatable :: stl_bounding_boxes(:, :, :)
 
 contains
 
@@ -481,6 +497,23 @@ contains
         is_buffered = .true.
     end subroutine s_skip_ignored_lines
 
+    !> Deterministic pseudo-random number generator that replaces the
+    !! intrinsic random_number, which cannot be called from GPU
+    !! routines/functions. The caller-held state is advanced with three
+    !! shift-and-xor steps (shifts of 13, -17 and 5) and the new state is
+    !! mapped to a real value.
+    !! @param seed Generator state; overwritten with the advanced state.
+    !!             A zero seed is a fixed point of the update, so callers
+    !!             must pass a non-zero value.
+    !! @return     abs(state)/huge(state), evaluated on the advanced state.
+    function f_model_random_number(seed) result(rval)
+
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        integer, intent(inout) :: seed
+        real(wp) :: rval
+        integer :: state
+
+        ! advance the state: each step xors the state with a shifted copy of itself
+        state = ieor(seed, ishft(seed, 13))
+        state = ieor(state, ishft(state, -17))
+        state = ieor(state, ishft(state, 5))
+        seed = state
+
+        ! fold the signed state onto a non-negative real
+        rval = abs(real(state, wp))/real(huge(state), wp)
+    end function f_model_random_number
+
     !> This procedure, recursively, finds whether a point is inside an octree.
     !! @param model    Model to search in.
     !! @param point    Point to test.
@@ -495,58 +528,116 @@ contains
         real(wp), dimension(1:3), intent(in) :: point
         real(wp), dimension(1:3), intent(in) :: spacing
         integer, intent(in) :: spc
+        real(wp) :: phi, theta
+        integer :: rand_seed
 
         real(wp) :: fraction
 
         type(t_ray) :: ray
-        integer :: i, j, nInOrOut, nHits
+        integer :: i, j, k, nInOrOut, nHits
 
         real(wp), dimension(1:spc, 1:3) :: ray_origins, ray_dirs
 
-        ! TODO :: The random number generation prohibits GPU compute due to the subroutine not being able to be called in kernels
-        ! This should be swapped out with something that allows GPU compute. I recommend the fibonacci sphere:
-        ! do i = 1, spc
-        !   phi = acos(1.0 - 2.0*(i-1.0)/(spc-1.0))
-        !   theta = pi * (1.0 + sqrt(5.0)) * (i-1.0)
-        !   ray_dirs(i,:) = [cos(theta)*sin(phi), sin(theta)*sin(phi), cos(phi)]
-        !   ray_origins(i,:) = point
-        ! end do
+        rand_seed = int(point(1)*73856093_wp) + &
+                    int(point(2)*19349663_wp) + &
+                    int(point(3)*83492791_wp)
-        rand_seed = int(point(1)*73856093_wp) + &
-                    int(point(2)*19349663_wp) + &
-                    int(point(3)*83492791_wp)
+        rand_seed = int(point(1)*real(73856093, wp)) + &
+                    int(point(2)*real(19349663, wp)) + &
+                    int(point(3)*real(83492791, wp))
-        rand_seed = int(point(1)*73856093_wp) + &
-                    int(point(2)*19349663_wp) + &
-                    int(point(3)*83492791_wp)
+        rand_seed = int(point(1)*real(73856093, wp)) + &
+                    int(point(2)*real(19349663, wp)) + &
+                    int(point(3)*real(83492791, wp))
+        if (rand_seed == 0) rand_seed = 1
 
+        ! generate our random collection of rays
         do i = 1, spc
-            call random_number(ray_origins(i, :))
-            ray_origins(i, :) = point + (ray_origins(i, :) - 0.5_wp)*spacing(:)
-
-            call random_number(ray_dirs(i, :))
-            ray_dirs(i, :) = ray_dirs(i, :) - 0.5_wp
+            do k = 1, 3
+                ! random jitter in the origin helps us estimate volume fraction instead of only at the cell center
+                ray_origins(i, k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp)*spacing(k)
+                ! cast sample rays in all directions
+                ray_dirs(i, k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
-                ray_dirs(i, k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
+                ray_dirs(i, k) = f_model_random_number(rand_seed) - 0.5_wp
-                ray_dirs(i, k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
+                ray_dirs(i, k) = f_model_random_number(rand_seed) - 0.5_wp
+            end do
             ray_dirs(i, :) = ray_dirs(i, :)/sqrt(sum(ray_dirs(i, :)*ray_dirs(i, :)))
         end do
 
+        ! ray trace
         nInOrOut = 0
         do i = 1, spc
             ray%o = ray_origins(i, :)
             ray%d = ray_dirs(i, :)
 
             nHits = 0
             do j = 1, model%ntrs
+                ! count the number of triangles this ray intersects
                 if (f_intersects_triangle(ray, model%trs(j))) then
                     nHits = nHits + 1
                 end if
             end do
 
+            ! if the ray hits an odd number of triangles on its way out, then
+            ! it must be on the inside of the model
             nInOrOut = nInOrOut + mod(nHits, 2)
         end do
 
         fraction = real(nInOrOut)/real(spc)
-        fraction = real(nInOrOut)/real(spc)
+        fraction = real(nInOrOut, wp)/real(spc, wp)
-        fraction = real(nInOrOut)/real(spc)
+        fraction = real(nInOrOut, wp)/real(spc, wp)
-        fraction = real(nInOrOut)/real(spc)
+        fraction = real(nInOrOut, wp)/real(spc, wp)
-        fraction = real(nInOrOut)/real(spc)
+        fraction = real(nInOrOut, wp)/real(spc, wp)
 
     end function f_model_is_inside
 
+    !> Ray-casting inside/outside estimate that reads the triangles from flat
+    !! arrays instead of a t_model, so it can be called from GPU routines.
+    !! For each of spc rays a jittered origin and a pseudo-random direction
+    !! are generated, the triangles hit by the ray are counted, and a ray with
+    !! an odd hit count is taken to start inside the model.
+    !! @param ntrs    Number of triangles to test for the selected model.
+    !! @param trs_v   Triangle vertices; trs_v(:, :, j, pid) is copied into t_triangle%v.
+    !! @param trs_n   Triangle normals; trs_n(:, j, pid) is copied into t_triangle%n.
+    !! @param pid     Index selecting the model along the last dimension of trs_v/trs_n.
+    !! @param point   Point around which the ray origins are jittered.
+    !! @param spacing Width of the jitter window in each direction.
+    !! @param spc     Number of rays to cast.
+    !! @return        Fraction of the spc rays that registered an odd number of hits.
+    impure function f_model_is_inside_flat(ntrs, trs_v, trs_n, pid, point, spacing, spc) result(fraction)
+
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        integer, intent(in) :: ntrs
+        real(wp), dimension(:, :, :, :), intent(in) :: trs_v
+        real(wp), dimension(:, :, :), intent(in) :: trs_n
+        integer, intent(in) :: pid
+        real(wp), dimension(1:3), intent(in) :: point
+        real(wp), dimension(1:3), intent(in) :: spacing
+        integer, intent(in) :: spc
+
+        real(wp) :: fraction
+        real(wp) :: origin(1:3), dir(1:3), dir_mag
+        type(t_ray) :: ray
+        type(t_triangle) :: tri
+        integer :: i, j, k, nInOrOut, nHits
+        integer :: rand_seed
+
+        ! Seed the generator from the point so the result is reproducible per point.
+        ! The multipliers are converted with real(..., wp); a "_wp" suffix on an
+        ! integer literal would request an integer kind, not a real one.
+        ! NOTE(review): int() of a product outside the default integer range is
+        ! processor-dependent - confirm coordinates stay small enough.
+        rand_seed = int(point(1)*real(73856093, wp)) + &
+                    int(point(2)*real(19349663, wp)) + &
+                    int(point(3)*real(83492791, wp))
+        ! a zero state would never change in f_model_random_number
+        if (rand_seed == 0) rand_seed = 1
+
+        nInOrOut = 0
+        do i = 1, spc
+            ! generate one ray at a time so no per-ray arrays are needed
+            do k = 1, 3
+                ! random jitter in the origin samples the cell instead of only its center
+                origin(k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp)*spacing(k)
+                ! direction components are centered on zero and independent of the point
+                dir(k) = f_model_random_number(rand_seed) - 0.5_wp
+            end do
+
+            ! normalize, guarding against a zero-length direction
+            dir_mag = sqrt(dir(1)*dir(1) + dir(2)*dir(2) + dir(3)*dir(3))
+            if (dir_mag > 0._wp) then
+                dir(:) = dir(:)/dir_mag
+            else
+                dir(:) = 0._wp
+            end if
+
+            ray%o = origin
+            ray%d = dir
+
+            ! count the number of triangles this ray intersects
+            nHits = 0
+            do j = 1, ntrs
+                tri%v(:, :) = trs_v(:, :, j, pid)
+                tri%n(:) = trs_n(:, j, pid)
+                if (f_intersects_triangle(ray, tri)) then
+                    nHits = nHits + 1
+                end if
+            end do
+
+            ! an odd number of hits means the ray started inside the model
+            nInOrOut = nInOrOut + mod(nHits, 2)
+        end do
+
+        ! convert in working precision rather than default real
+        fraction = real(nInOrOut, wp)/real(spc, wp)
+    end function f_model_is_inside_flat
+
+
     ! From https://www.scratchapixel.com/lessons/3e-basic-rendering/ray-tracing-rendering-a-triangle/ray-triangle-intersection-geometric-solution.html
     !> This procedure checks if a ray intersects a triangle.
     !! @param ray      Ray.
     !! @param triangle Triangle.
     !! @return         True if the ray intersects the triangle, false otherwise.
     elemental function f_intersects_triangle(ray, triangle) result(intersects)
 
+        $:GPU_ROUTINE(parallelism='[seq]')
+
         type(t_ray), intent(in) :: ray
         type(t_triangle), intent(in) :: triangle
 
@@ -1109,6 +1200,66 @@ contains
 
     end subroutine f_distance_normals_3D
 
+    !> Flat-array variant of the 3D distance/normal search, callable from GPU
+    !! routines. The returned distance is the smallest distance from the point
+    !! to any triangle vertex; the returned normal belongs to the triangle
+    !! whose centroid is closest to the point.
+    !! @param ntrs     Number of triangles to scan for the selected model.
+    !! @param trs_v    Triangle vertices, read as trs_v(vertex, coordinate, triangle, pid).
+    !! @param trs_n    Triangle normals, read as trs_n(coordinate, triangle, pid).
+    !! @param pid      Index selecting the model along the last dimension of trs_v/trs_n.
+    !! @param point    Query point.
+    !! @param normals  Normal of the triangle with the nearest centroid
+    !!                 (zero when no triangle was selected).
+    !! @param distance Smallest point-to-vertex distance (zero when ntrs <= 0).
+    subroutine f_distance_normals_3D_flat(ntrs, trs_v, trs_n, pid, point, normals, distance)
+
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        integer, intent(in) :: ntrs
+        real(wp), dimension(:, :, :, :), intent(in) :: trs_v
+        real(wp), dimension(:, :, :), intent(in) :: trs_n
+        integer, intent(in) :: pid
+        real(wp), dimension(1:3), intent(in) :: point
+        real(wp), dimension(1:3), intent(out) :: normals
+        real(wp), intent(out) :: distance
+
+        real(wp), dimension(1:3, 1:3) :: tri
+        real(wp) :: dist_min, dist_t_min
+        real(wp) :: dist_min_normal, dist_buffer_normal
+        real(wp), dimension(1:3) :: midp
+        real(wp), dimension(1:3) :: dist_buffer
+        integer :: i, j, tri_idx
+
+        ! Handle degenerate case with no triangles to avoid invalid indexing
+        if (ntrs <= 0) then
+            normals = 0._wp
+            distance = 0._wp
+            return
+        end if
+
+        ! both running minima must start above any real distance
+        dist_min = huge(dist_min)
+        dist_min_normal = huge(dist_min_normal)
+
+        tri_idx = 0
+        do i = 1, ntrs
+            ! load vertex j of triangle i and measure its distance to the point
+            do j = 1, 3
+                tri(j, 1) = trs_v(j, 1, i, pid)
+                tri(j, 2) = trs_v(j, 2, i, pid)
+                tri(j, 3) = trs_v(j, 3, i, pid)
+                dist_buffer(j) = sqrt((point(1) - tri(j, 1))**2 + &
+                                      (point(2) - tri(j, 2))**2 + &
+                                      (point(3) - tri(j, 3))**2)
+            end do
+
+            ! centroid of the triangle, one coordinate at a time
+            do j = 1, 3
+                midp(j) = (tri(1, j) + tri(2, j) + tri(3, j))/3._wp
+            end do
+
+            dist_t_min = minval(dist_buffer(1:3))
+            dist_buffer_normal = sqrt((point(1) - midp(1))**2 + &
+                                      (point(2) - midp(2))**2 + &
+                                      (point(3) - midp(3))**2)
+
+            ! nearest vertex decides the distance
+            if (dist_t_min < dist_min) then
+                dist_min = dist_t_min
+            end if
+
+            ! nearest centroid decides which triangle supplies the normal
+            if (dist_buffer_normal < dist_min_normal) then
+                dist_min_normal = dist_buffer_normal
+                tri_idx = i
+            end if
+        end do
+
+        ! tri_idx stays 0 only if no centroid distance compared below the
+        ! sentinel (e.g. non-finite input); never index trs_n with 0
+        if (tri_idx > 0) then
+            normals(1) = trs_n(1, tri_idx, pid)
+            normals(2) = trs_n(2, tri_idx, pid)
+            normals(3) = trs_n(3, tri_idx, pid)
+        else
+            normals = 0._wp
+        end if
+        distance = dist_min
+
+    end subroutine f_distance_normals_3D_flat
+
+
     !> This procedure determines the levelset distance of 2D models without interpolation.
     !! @param boundary_v                   Group of all the boundary vertices of the 2D model without interpolation
     !! @param boundary_edge_count          Output the total number of boundary edges
@@ -1240,4 +1391,18 @@ contains
 
     end function f_interpolated_distance
 
+    !> Copies the triangles of ma%model into the flat component arrays
+    !! ma%trs_v (3, 3, ntrs) and ma%trs_n (3, ntrs) and records the triangle
+    !! count in ma%ntrs. Safe to call repeatedly: previously packed arrays
+    !! are released before the new ones are allocated.
+    !! @param ma Model container; its trs_v, trs_n and ntrs components are overwritten.
+    !!           If ma%model is not allocated the arrays are allocated with zero triangles.
+    subroutine s_pack_model_for_gpu(ma)
+        type(t_model_array), intent(inout) :: ma
+        integer :: i
+
+        ! release any earlier packing BEFORE allocating, never after
+        if (allocated(ma%trs_v)) deallocate (ma%trs_v)
+        if (allocated(ma%trs_n)) deallocate (ma%trs_n)
+
+        if (allocated(ma%model)) then
+            ma%ntrs = ma%model%ntrs
+        else
+            ma%ntrs = 0
+        end if
+
+        allocate (ma%trs_v(1:3, 1:3, 1:ma%ntrs))
+        allocate (ma%trs_n(1:3, 1:ma%ntrs))
+
+        do i = 1, ma%ntrs
+            ma%trs_v(:, :, i) = ma%model%trs(i)%v(:, :)
+            ma%trs_n(:, i) = ma%model%trs(i)%n(:)
+        end do
+    end subroutine s_pack_model_for_gpu
+
 end module m_model