From 0a304a207422cb92b8c7b0ac2bbf05898d45fc69 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Mon, 16 Feb 2026 12:19:04 -0500
Subject: [PATCH 01/22] Moved all of the IB marker calculation to the GPU
 without copy

---
 src/simulation/m_ib_patches.fpp | 122 ++++++++++++++++----------------
 src/simulation/m_ibm.fpp        |   9 ++-
 2 files changed, 65 insertions(+), 66 deletions(-)

diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index d983a9f1de..0509593918 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -61,9 +61,9 @@ module m_ib_patches
 
 contains
 
-    impure subroutine s_apply_ib_patches(ib_markers_sf)
+    impure subroutine s_apply_ib_patches(ib_markers)
 
-        integer, dimension(:, :, :), intent(inout), optional :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         integer :: i
 
@@ -76,16 +76,16 @@ contains
             do i = 1, num_ibs
 
                 if (patch_ib(i)%geometry == 8) then
-                    call s_ib_sphere(i, ib_markers_sf)
+                    call s_ib_sphere(i, ib_markers)
                 elseif (patch_ib(i)%geometry == 9) then
-                    call s_ib_cuboid(i, ib_markers_sf)
+                    call s_ib_cuboid(i, ib_markers)
                 elseif (patch_ib(i)%geometry == 10) then
-                    call s_ib_cylinder(i, ib_markers_sf)
+                    call s_ib_cylinder(i, ib_markers)
                 elseif (patch_ib(i)%geometry == 11) then
-                    call s_ib_3D_airfoil(i, ib_markers_sf)
+                    call s_ib_3D_airfoil(i, ib_markers)
                     ! STL+IBM patch
                 elseif (patch_ib(i)%geometry == 12) then
-                    call s_ib_model(i, ib_markers_sf)
+                    call s_ib_model(i, ib_markers)
                 end if
             end do
             !> @}
@@ -97,16 +97,16 @@ contains
             !> @{
             do i = 1, num_ibs
                 if (patch_ib(i)%geometry == 2) then
-                    call s_ib_circle(i, ib_markers_sf)
+                    call s_ib_circle(i, ib_markers)
                 elseif (patch_ib(i)%geometry == 3) then
-                    call s_ib_rectangle(i, ib_markers_sf)
+                    call s_ib_rectangle(i, ib_markers)
                 elseif (patch_ib(i)%geometry == 4) then
-                    call s_ib_airfoil(i, ib_markers_sf)
+                    call s_ib_airfoil(i, ib_markers)
                     ! STL+IBM patch
                 elseif (patch_ib(i)%geometry == 5) then
-                    call s_ib_model(i, ib_markers_sf)
+                    call s_ib_model(i, ib_markers)
                 elseif (patch_ib(i)%geometry == 6) then
-                    call s_ib_ellipse(i, ib_markers_sf)
+                    call s_ib_ellipse(i, ib_markers)
                 end if
             end do
             !> @}
@@ -248,12 +248,12 @@ contains
         !!              are provided. Note that the circular patch DOES allow for
         !!              the smoothing of its boundary.
         !! @param patch_id is the patch identifier
-        !! @param ib_markers_sf Array to track patch ids
+        !! @param ib_markers Array to track patch ids
         !! @param ib True if this patch is an immersed boundary
-    subroutine s_ib_circle(patch_id, ib_markers_sf)
+    subroutine s_ib_circle(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         real(wp), dimension(1:2) :: center
         real(wp) :: radius
@@ -272,14 +272,14 @@ contains
         ! that cell. If both queries check out, the primitive variables of
         ! the current patch are assigned to this cell.
 
-        $:GPU_PARALLEL_LOOP(private='[i,j]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j]',&
                   & copyin='[patch_id,center,radius]', collapse=2)
         do j = 0, n
             do i = 0, m
                 if ((x_cc(i) - center(1))**2 &
                     + (y_cc(j) - center(2))**2 <= radius**2) &
                     then
-                    ib_markers_sf(i, j, 0) = patch_id
+                    ib_markers%sf(i, j, 0) = patch_id
                 end if
             end do
         end do
@@ -288,11 +288,11 @@ contains
     end subroutine s_ib_circle
 
     !! @param patch_id is the patch identifier
-    !! @param ib_markers_sf Array to track patch ids
-    subroutine s_ib_airfoil(patch_id, ib_markers_sf)
+    !! @param ib_markers Array to track patch ids
+    subroutine s_ib_airfoil(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         real(wp) :: f, ca_in, pa, ma, ta
         real(wp) :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
@@ -376,7 +376,7 @@ contains
 
         end if
 
-        $:GPU_PARALLEL_LOOP(private='[i,j,xy_local,k,f]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j,xy_local,k,f]', &
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l]', collapse=2)
         do j = 0, n
             do i = 0, m
@@ -401,13 +401,13 @@ contains
                         if (f_approx_equal(airfoil_grid_u(k)%x, xy_local(1))) then
                             if (xy_local(2) <= airfoil_grid_u(k)%y) then
                                 !!IB
-                                ib_markers_sf(i, j, 0) = patch_id
+                                ib_markers%sf(i, j, 0) = patch_id
                             end if
                         else
                             f = (airfoil_grid_u(k)%x - xy_local(1))/(airfoil_grid_u(k)%x - airfoil_grid_u(k - 1)%x)
                             if (xy_local(2) <= ((1._wp - f)*airfoil_grid_u(k)%y + f*airfoil_grid_u(k - 1)%y)) then
                                 !!IB
-                                ib_markers_sf(i, j, 0) = patch_id
+                                ib_markers%sf(i, j, 0) = patch_id
                             end if
                         end if
                     else
@@ -418,14 +418,14 @@ contains
                         if (f_approx_equal(airfoil_grid_l(k)%x, xy_local(1))) then
                             if (xy_local(2) >= airfoil_grid_l(k)%y) then
                                 !!IB
-                                ib_markers_sf(i, j, 0) = patch_id
+                                ib_markers%sf(i, j, 0) = patch_id
                             end if
                         else
                             f = (airfoil_grid_l(k)%x - xy_local(1))/(airfoil_grid_l(k)%x - airfoil_grid_l(k - 1)%x)
 
                             if (xy_local(2) >= ((1._wp - f)*airfoil_grid_l(k)%y + f*airfoil_grid_l(k - 1)%y)) then
                                 !!IB
-                                ib_markers_sf(i, j, 0) = patch_id
+                                ib_markers%sf(i, j, 0) = patch_id
                             end if
                         end if
                     end if
@@ -437,12 +437,12 @@ contains
     end subroutine s_ib_airfoil
 
     !! @param patch_id is the patch identifier
-    !! @param ib_markers_sf Array to track patch ids
+    !! @param ib_markers Array to track patch ids
     !! @param ib True if this patch is an immersed boundary
-    subroutine s_ib_3D_airfoil(patch_id, ib_markers_sf)
+    subroutine s_ib_3D_airfoil(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         real(wp) :: lz, z_max, z_min, f, ca_in, pa, ma, ta, xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
         integer :: i, j, k, l
@@ -527,7 +527,7 @@ contains
             $:GPU_UPDATE(device='[airfoil_grid_l,airfoil_grid_u]')
         end if
 
-        $:GPU_PARALLEL_LOOP(private='[i,j,l,xyz_local,k,f]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j,l,xyz_local,k,f]',&
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l,z_min,z_max]', collapse=3)
         do l = 0, p
             do j = 0, n
@@ -547,13 +547,13 @@ contains
                                 if (f_approx_equal(airfoil_grid_u(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) <= airfoil_grid_u(k)%y) then
                                         !IB
-                                        ib_markers_sf(i, j, l) = patch_id
+                                        ib_markers%sf(i, j, l) = patch_id
                                     end if
                                 else
                                     f = (airfoil_grid_u(k)%x - xyz_local(1))/(airfoil_grid_u(k)%x - airfoil_grid_u(k - 1)%x)
                                     if (xyz_local(2) <= ((1._wp - f)*airfoil_grid_u(k)%y + f*airfoil_grid_u(k - 1)%y)) then
                                         !!IB
-                                        ib_markers_sf(i, j, l) = patch_id
+                                        ib_markers%sf(i, j, l) = patch_id
                                     end if
                                 end if
                             else
@@ -564,14 +564,14 @@ contains
                                 if (f_approx_equal(airfoil_grid_l(k)%x, xyz_local(1))) then
                                     if (xyz_local(2) >= airfoil_grid_l(k)%y) then
                                         !!IB
-                                        ib_markers_sf(i, j, l) = patch_id
+                                        ib_markers%sf(i, j, l) = patch_id
                                     end if
                                 else
                                     f = (airfoil_grid_l(k)%x - xyz_local(1))/(airfoil_grid_l(k)%x - airfoil_grid_l(k - 1)%x)
 
                                     if (xyz_local(2) >= ((1._wp - f)*airfoil_grid_l(k)%y + f*airfoil_grid_l(k - 1)%y)) then
                                         !!IB
-                                        ib_markers_sf(i, j, l) = patch_id
+                                        ib_markers%sf(i, j, l) = patch_id
                                     end if
                                 end if
                             end if
@@ -593,12 +593,12 @@ contains
         !!              rectangular patch DOES NOT allow for the smoothing of its
         !!              boundaries.
         !! @param patch_id is the patch identifier
-        !! @param ib_markers_sf Array to track patch ids
+        !! @param ib_markers Array to track patch ids
         !! @param ib True if this patch is an immersed boundary
-    subroutine s_ib_rectangle(patch_id, ib_markers_sf)
+    subroutine s_ib_rectangle(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         integer :: i, j, k !< generic loop iterators
         real(wp) :: pi_inf, gamma, lit_gamma !< Equation of state parameters
@@ -621,7 +621,7 @@ contains
         ! domain and verifying whether the current patch has the permission
         ! to write to that cell. If both queries check out, the primitive
         ! variables of the current patch are assigned to this cell.
-        $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,length,inverse_rotation,x_cc,y_cc]', collapse=2)
         do j = 0, n
             do i = 0, m
@@ -635,7 +635,7 @@ contains
                     0.5_wp*length(2) >= xy_local(2)) then
 
                     ! Updating the patch identities bookkeeping variable
-                    ib_markers_sf(i, j, 0) = patch_id
+                    ib_markers%sf(i, j, 0) = patch_id
 
                 end if
             end do
@@ -650,12 +650,12 @@ contains
         !!              provided. Please note that the spherical patch DOES allow
         !!              for the smoothing of its boundary.
         !! @param patch_id is the patch identifier
-        !! @param ib_markers_sf Array to track patch ids
+        !! @param ib_markers Array to track patch ids
         !! @param ib True if this patch is an immersed boundary
-    subroutine s_ib_sphere(patch_id, ib_markers_sf)
+    subroutine s_ib_sphere(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         ! Generic loop iterators
         integer :: i, j, k
@@ -676,7 +676,7 @@ contains
         ! and verifying whether the current patch has permission to write to
         ! that cell. If both queries check out, the primitive variables of
         ! the current patch are assigned to this cell.
-        $:GPU_PARALLEL_LOOP(private='[i,j,k,cart_y,cart_z]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,cart_y,cart_z]',&
                   & copyin='[patch_id,center,radius]', collapse=3)
         do k = 0, p
             do j = 0, n
@@ -691,7 +691,7 @@ contains
                     if (((x_cc(i) - center(1))**2 &
                          + (cart_y - center(2))**2 &
                          + (cart_z - center(3))**2 <= radius**2)) then
-                        ib_markers_sf(i, j, k) = patch_id
+                        ib_markers%sf(i, j, k) = patch_id
                     end if
                 end do
             end do
@@ -709,11 +709,11 @@ contains
         !!              the cuboidal patch DOES NOT allow for the smearing of its
         !!              boundaries.
         !! @param patch_id is the patch identifier
-        !! @param ib_markers_sf Array to track patch ids
-    subroutine s_ib_cuboid(patch_id, ib_markers_sf)
+        !! @param ib_markers Array to track patch ids
+    subroutine s_ib_cuboid(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         integer :: i, j, k !< Generic loop iterators
         real(wp), dimension(1:3) :: xyz_local, center, length !< x and y coordinates in local IB frame
@@ -732,7 +732,7 @@ contains
         ! and verifying whether the current patch has permission to write to
         ! to that cell. If both queries check out, the primitive variables
         ! of the current patch are assigned to this cell.
-        $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,inverse_rotation]', collapse=3)
         do k = 0, p
             do j = 0, n
@@ -756,7 +756,7 @@ contains
                         0.5*length(3) >= xyz_local(3)) then
 
                         ! Updating the patch identities bookkeeping variable
-                        ib_markers_sf(i, j, k) = patch_id
+                        ib_markers%sf(i, j, k) = patch_id
                     end if
                 end do
             end do
@@ -774,12 +774,12 @@ contains
         !!              that the cylindrical patch DOES allow for the smoothing
         !!              of its lateral boundary.
         !! @param patch_id is the patch identifier
-        !! @param ib_markers_sf Array to track patch ids
+        !! @param ib_markers Array to track patch ids
         !! @param ib True if this patch is an immersed boundary
-    subroutine s_ib_cylinder(patch_id, ib_markers_sf)
+    subroutine s_ib_cylinder(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         integer :: i, j, k !< Generic loop iterators
         real(wp) :: radius
@@ -802,7 +802,7 @@ contains
         ! domain and verifying whether the current patch has the permission
         ! to write to that cell. If both queries check out, the primitive
         ! variables of the current patch are assigned to this cell.
-        $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,radius,inverse_rotation]', collapse=3)
         do k = 0, p
             do j = 0, n
@@ -836,7 +836,7 @@ contains
                           0.5_wp*length(3) >= xyz_local(3)))) then
 
                         ! Updating the patch identities bookkeeping variable
-                        ib_markers_sf(i, j, k) = patch_id
+                        ib_markers%sf(i, j, k) = patch_id
                     end if
                 end do
             end do
@@ -845,10 +845,10 @@ contains
 
     end subroutine s_ib_cylinder
 
-    subroutine s_ib_ellipse(patch_id, ib_markers_sf)
+    subroutine s_ib_ellipse(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         integer :: i, j, k !< generic loop iterators
         real(wp), dimension(1:3) :: xy_local !< x and y coordinates in local IB frame
@@ -865,7 +865,7 @@ contains
 
         ! Checking whether the ellipse covers a particular cell in the
         ! domain
-        $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]', copy='[ib_markers_sf]',&
+        $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,ellipse_coeffs,inverse_rotation,x_cc,y_cc]', collapse=2)
         do j = 0, n
             do i = 0, m
@@ -876,7 +876,7 @@ contains
                 ! Ellipse condition (x/a)^2 + (y/b)^2 <= 1
                 if ((xy_local(1)/ellipse_coeffs(1))**2 + (xy_local(2)/ellipse_coeffs(2))**2 <= 1._wp) then
                     ! Updating the patch identities bookkeeping variable
-                    ib_markers_sf(i, j, 0) = patch_id
+                    ib_markers%sf(i, j, 0) = patch_id
                 end if
             end do
         end do
@@ -886,13 +886,13 @@ contains
 
     !> The STL patch is a 2/3D geometry that is imported from an STL file.
     !! @param patch_id is the patch identifier
-    !! @param ib_markers_sf Array to track patch ids
+    !! @param ib_markers Array to track patch ids
     !! @param STL_levelset STL levelset
     !! @param STL_levelset_norm STL levelset normals
-    subroutine s_ib_model(patch_id, ib_markers_sf)
+    subroutine s_ib_model(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
-        integer, dimension(0:m, 0:n, 0:p), intent(inout) :: ib_markers_sf
+        type(integer_field), intent(inout) :: ib_markers
 
         integer :: i, j, k !< Generic loop iterators
 
@@ -934,7 +934,7 @@ contains
 
                     ! Reading STL boundary vertices and compute the levelset and levelset_norm
                     if (eta > patch_ib(patch_id)%model_threshold) then
-                        ib_markers_sf(i, j, k) = patch_id
+                        ib_markers%sf(i, j, k) = patch_id
                     end if
 
                 end do
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index bd497231f6..965fa68950 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -106,9 +106,9 @@ contains
 
         ! recompute the new ib_patch locations and broadcast them.
         ib_markers%sf = 0._wp
-        call s_apply_ib_patches(ib_markers%sf(0:m, 0:n, 0:p))
         $:GPU_UPDATE(device='[ib_markers%sf]')
-        call s_populate_ib_buffers()
+        call s_apply_ib_patches(ib_markers)
+        ! call s_populate_ib_buffers()
         $:GPU_UPDATE(host='[ib_markers%sf]')
         do i = 1, num_ibs
             if (patch_ib(i)%moving_ibm /= 0) call s_compute_centroid_offset(i) ! offsets are computed after IB markers are generated
@@ -1002,9 +1002,8 @@ contains
         $:GPU_UPDATE(device='[patch_ib]')
 
         ! recompute the new ib_patch locations and broadcast them.
-        call s_apply_ib_patches(ib_markers%sf(0:m, 0:n, 0:p))
-        $:GPU_UPDATE(device='[ib_markers%sf]')
-        call s_populate_ib_buffers()
+         $:GPU_UPDATE(device='[ib_markers%sf]')
+        call s_apply_ib_patches(ib_markers)
         $:GPU_UPDATE(host='[ib_markers%sf]')
 
         ! recalculate the ghost point locations and coefficients

From e8c778e5034ee35929ee74fc2ead0acfec4aa03a Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Mon, 16 Feb 2026 13:36:11 -0500
Subject: [PATCH 02/22] Added profiling and increased maximum num IBs to 1000

---
 src/common/m_constants.fpp          |  2 +-
 src/simulation/m_ibm.fpp            | 17 +++++++++++++++--
 src/simulation/m_time_steppers.fpp  |  4 ++++
 toolchain/mfc/case_validator.py     |  2 +-
 toolchain/mfc/params/definitions.py |  4 ++--
 5 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp
index d728447f5f..039e7d5d2b 100644
--- a/src/common/m_constants.fpp
+++ b/src/common/m_constants.fpp
@@ -22,7 +22,7 @@ module m_constants
     integer, parameter :: fourier_rings = 5                       !< Fourier filter ring limit
     integer, parameter :: num_fluids_max = 10                     !< Maximum number of fluids in the simulation
     integer, parameter :: num_probes_max = 10                     !< Maximum number of flow probes in the simulation
-    integer, parameter :: num_patches_max = 10
+    integer, parameter :: num_patches_max = 1000
     integer, parameter :: num_bc_patches_max = 10
     integer, parameter :: pathlen_max = 400
     integer, parameter :: nnode = 4    !< Number of QBMM nodes
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 965fa68950..d42d58d782 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -84,6 +84,8 @@ contains
         integer :: i, j, k
         integer :: max_num_gps, max_num_inner_gps
 
+        call nvtxStartRange("SETUP-IBM-MODULE")
+
         ! do all set up for moving immersed boundaries
         moving_immersed_boundary_flag = .false.
         do i = 1, num_ibs
@@ -108,7 +110,7 @@ contains
         ib_markers%sf = 0._wp
         $:GPU_UPDATE(device='[ib_markers%sf]')
         call s_apply_ib_patches(ib_markers)
-        ! call s_populate_ib_buffers()
+        call s_populate_ib_buffers()
         $:GPU_UPDATE(host='[ib_markers%sf]')
         do i = 1, num_ibs
             if (patch_ib(i)%moving_ibm /= 0) call s_compute_centroid_offset(i) ! offsets are computed after IB markers are generated
@@ -141,6 +143,8 @@ contains
         call s_compute_interpolation_coeffs(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')
 
+        call nvtxEndRange
+
     end subroutine s_ibm_setup
 
     subroutine s_populate_ib_buffers()
@@ -989,6 +993,8 @@ contains
 
         integer :: i, ierr
 
+        call nvtxStartRange("UPDATE-MIB")
+
         ! Clears the existing immersed boundary indices
         ib_markers%sf = 0._wp
 
@@ -1002,9 +1008,12 @@ contains
         $:GPU_UPDATE(device='[patch_ib]')
 
         ! recompute the new ib_patch locations and broadcast them.
-         $:GPU_UPDATE(device='[ib_markers%sf]')
+        call nvtxStartRange("COMPUTE-IB-MARKERS")
+        $:GPU_UPDATE(device='[ib_markers%sf]')
         call s_apply_ib_patches(ib_markers)
+        call s_populate_ib_buffers()
         $:GPU_UPDATE(host='[ib_markers%sf]')
+        call nvtxEndRange
 
         ! recalculate the ghost point locations and coefficients
         call s_find_num_ghost_points(num_gps, num_inner_gps)
@@ -1013,8 +1022,10 @@ contains
         call s_find_ghost_points(ghost_points, inner_points)
         $:GPU_UPDATE(device='[ghost_points, inner_points]')
 
+        call nvtxStartRange("APPLY-LEVELSET")
         call s_apply_levelset(ghost_points, num_gps)
         $:GPU_UPDATE(device='[ghost_points]')
+        call nvtxEndRange
 
         call s_compute_image_points(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')
@@ -1022,6 +1033,8 @@ contains
         call s_compute_interpolation_coeffs(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')
 
+        call nvtxEndRange
+
     end subroutine s_update_mib
 
     ! compute the surface integrals of the IB via a volume integraion method described in
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 94ea98bf8c..f6eb0b530d 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -802,6 +802,8 @@ contains
         integer :: i
         logical :: forces_computed
 
+        call nvtxStartRange("PROPAGATE-IMERSED-BOUNDARIES")
+
         forces_computed = .false.
 
         do i = 1, num_ibs
@@ -849,6 +851,8 @@ contains
 
         call s_update_mib(num_ibs)
 
+        call nvtxEndRange
+
     end subroutine s_propagate_immersed_boundaries
 
     !> This subroutine saves the temporary q_prim_vf vector
diff --git a/toolchain/mfc/case_validator.py b/toolchain/mfc/case_validator.py
index 2c29c75e7f..9b071e9e66 100644
--- a/toolchain/mfc/case_validator.py
+++ b/toolchain/mfc/case_validator.py
@@ -684,7 +684,7 @@ def check_ibm(self):
 
         self.prohibit(ib and n <= 0,
                      "Immersed Boundaries do not work in 1D (requires n > 0)")
-        self.prohibit(ib and (num_ibs <= 0 or num_ibs > 10),
+        self.prohibit(ib and (num_ibs <= 0 or num_ibs > 1000),
                      "num_ibs must be between 1 and num_patches_max (10)")
         self.prohibit(not ib and num_ibs > 0,
                      "num_ibs is set, but ib is not enabled")
diff --git a/toolchain/mfc/params/definitions.py b/toolchain/mfc/params/definitions.py
index d44249f664..7c456b36fc 100644
--- a/toolchain/mfc/params/definitions.py
+++ b/toolchain/mfc/params/definitions.py
@@ -11,7 +11,7 @@
 from .registry import REGISTRY
 
 # Index limits
-NP, NF, NI, NA, NPR, NB = 10, 10, 10, 4, 10, 10  # patches, fluids, ibs, acoustic, probes, bc_patches
+NP, NF, NI, NA, NPR, NB = 10, 10, 1000, 4, 10, 10  # patches, fluids, ibs, acoustic, probes, bc_patches
 
 
 # =============================================================================
@@ -672,7 +672,7 @@ def get_value_label(param_name: str, value: int) -> str:
     # Counts (must be positive)
     "num_fluids": {"min": 1, "max": 10},
     "num_patches": {"min": 0, "max": 10},
-    "num_ibs": {"min": 0, "max": 10},
+    "num_ibs": {"min": 0, "max": 1000},
     "num_source": {"min": 1},
     "num_probes": {"min": 1},
     "num_integrals": {"min": 1},

From 5d885bbcfecd3db17b4acd9eb0a4ef14be6f0606 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Mon, 16 Feb 2026 14:31:49 -0500
Subject: [PATCH 03/22] Performance tuning complted for marker generation

---
 src/simulation/m_ib_patches.fpp | 114 +++++++++++++++++++++-----------
 src/simulation/m_ibm.fpp        |  11 ++-
 2 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 0509593918..c0ab727dd7 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -85,7 +85,7 @@ contains
                     call s_ib_3D_airfoil(i, ib_markers)
                     ! STL+IBM patch
                 elseif (patch_ib(i)%geometry == 12) then
-                    call s_ib_model(i, ib_markers)
+                    call s_ib_3d_model(i, ib_markers)
                 end if
             end do
             !> @}
@@ -274,8 +274,8 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j]',&
                   & copyin='[patch_id,center,radius]', collapse=2)
-        do j = 0, n
-            do i = 0, m
+        do j = -gp_layers, n+gp_layers
+            do i = -gp_layers, m+gp_layers
                 if ((x_cc(i) - center(1))**2 &
                     + (y_cc(j) - center(2))**2 <= radius**2) &
                     then
@@ -378,8 +378,8 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j,xy_local,k,f]', &
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l]', collapse=2)
-        do j = 0, n
-            do i = 0, m
+        do j = -gp_layers, n+gp_layers
+            do i = -gp_layers, m+gp_layers
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB
                 xy_local = matmul(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinates
                 xy_local = xy_local - offset ! airfoils are a patch that require a centroid offset
@@ -529,9 +529,9 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j,l,xyz_local,k,f]',&
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l,z_min,z_max]', collapse=3)
-        do l = 0, p
-            do j = 0, n
-                do i = 0, m
+        do l = -gp_layers, p+gp_layers
+            do j = -gp_layers, n+gp_layers
+                do i = -gp_layers, m+gp_layers
                     xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(l) - center(3)] ! get coordinate frame centered on IB
                     xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates
                     xyz_local = xyz_local - offset ! airfoils are a patch that require a centroid offset
@@ -623,8 +623,8 @@ contains
         ! variables of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,length,inverse_rotation,x_cc,y_cc]', collapse=2)
-        do j = 0, n
-            do i = 0, m
+        do j = -gp_layers, n+gp_layers
+            do i = -gp_layers, m+gp_layers
                 ! get the x and y coordinates in the local IB frame
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
                 xy_local = matmul(inverse_rotation, xy_local)
@@ -678,9 +678,9 @@ contains
         ! the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,cart_y,cart_z]',&
                   & copyin='[patch_id,center,radius]', collapse=3)
-        do k = 0, p
-            do j = 0, n
-                do i = 0, m
+        do k = -gp_layers, p+gp_layers
+            do j = -gp_layers, n+gp_layers
+                do i = -gp_layers, m+gp_layers
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))
                     else
@@ -734,9 +734,9 @@ contains
         ! of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,inverse_rotation]', collapse=3)
-        do k = 0, p
-            do j = 0, n
-                do i = 0, m
+        do k = -gp_layers, p+gp_layers
+            do j = -gp_layers, n+gp_layers
+                do i = -gp_layers, m+gp_layers
 
                     if (grid_geometry == 3) then
                         ! TODO :: This does not work and is not covered by any tests. This should be fixed
@@ -804,9 +804,9 @@ contains
         ! variables of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,radius,inverse_rotation]', collapse=3)
-        do k = 0, p
-            do j = 0, n
-                do i = 0, m
+         do k = -gp_layers, p+gp_layers
+            do j = -gp_layers, n+gp_layers
+                do i = -gp_layers, m+gp_layers
 
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))
@@ -867,8 +867,8 @@ contains
         ! domain
         $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,ellipse_coeffs,inverse_rotation,x_cc,y_cc]', collapse=2)
-        do j = 0, n
-            do i = 0, m
+        do j = -gp_layers, n+gp_layers
+            do i = -gp_layers, m+gp_layers
                 ! get the x and y coordinates in the local IB frame
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
                 xy_local = matmul(inverse_rotation, xy_local)
@@ -884,11 +884,9 @@ contains
 
     end subroutine s_ib_ellipse
 
-    !> The STL patch is a 2/3D geometry that is imported from an STL file.
+    !> The STL patch is a 2D geometry that is imported from an STL file.
     !! @param patch_id is the patch identifier
     !! @param ib_markers Array to track patch ids
-    !! @param STL_levelset STL levelset
-    !! @param STL_levelset_norm STL levelset normals
     subroutine s_ib_model(patch_id, ib_markers)
 
         integer, intent(in) :: patch_id
@@ -900,25 +898,69 @@ contains
 
         real(wp) :: eta
         real(wp), dimension(1:3) :: point, local_point, offset
-        real(wp), dimension(1:3) :: center, xyz_local
+        real(wp), dimension(1:3) :: center, xy_local
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
 
         model => models(patch_id)%model
         center = 0._wp
         center(1) = patch_ib(patch_id)%x_centroid
         center(2) = patch_ib(patch_id)%y_centroid
-        if (p > 0) center(3) = patch_ib(patch_id)%z_centroid
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
         offset(:) = patch_ib(patch_id)%centroid_offset(:)
 
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p
+        do i = 0-gp_layers, m+gp_layers
+            do j = -gp_layers, n+gp_layers
 
-                    xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
-                    if (p > 0) then
-                        xyz_local(3) = z_cc(k) - center(3)
+                    xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
+                    xy_local = matmul(inverse_rotation, xy_local)
+                    xy_local = xy_local - offset
+
+                    if (grid_geometry == 3) then
+                        xy_local = f_convert_cyl_to_cart(xy_local)
                     end if
+
+                    eta = f_model_is_inside(model, xy_local, (/dx(i), dy(j), 0._wp/), patch_ib(patch_id)%model_spc)
+
+                    ! Reading STL boundary vertices and compute the levelset and levelset_norm
+                    if (eta > patch_ib(patch_id)%model_threshold) then
+                        ib_markers%sf(i, j, 0) = patch_id
+                    end if
+
+            end do
+        end do
+
+    end subroutine s_ib_model
+
+    !> The STL patch is a 3D geometry that is imported from an STL file.
+    !! @param patch_id is the patch identifier
+    !! @param ib_markers Array to track patch ids
+    subroutine s_ib_3d_model(patch_id, ib_markers)
+
+        integer, intent(in) :: patch_id
+        type(integer_field), intent(inout) :: ib_markers
+
+        integer :: i, j, k !< Generic loop iterators
+
+        type(t_model), pointer :: model
+
+        real(wp) :: eta
+        real(wp), dimension(1:3) :: point, local_point, offset
+        real(wp), dimension(1:3) :: center, xyz_local
+        real(wp), dimension(1:3, 1:3) :: inverse_rotation
+
+        model => models(patch_id)%model
+        center = 0._wp
+        center(1) = patch_ib(patch_id)%x_centroid
+        center(2) = patch_ib(patch_id)%y_centroid
+        center(3) = patch_ib(patch_id)%z_centroid
+        inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
+        offset(:) = patch_ib(patch_id)%centroid_offset(:)
+
+        do i = 0-gp_layers, m+gp_layers
+            do j = -gp_layers, n+gp_layers
+                do k = -gp_layers, p+gp_layers
+
+                    xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(k) - center(3)]
                     xyz_local = matmul(inverse_rotation, xyz_local)
                     xyz_local = xyz_local - offset
 
@@ -926,11 +968,7 @@ contains
                         xyz_local = f_convert_cyl_to_cart(xyz_local)
                     end if
 
-                    if (p == 0) then
-                        eta = f_model_is_inside(model, xyz_local, (/dx(i), dy(j), 0._wp/), patch_ib(patch_id)%model_spc)
-                    else
-                        eta = f_model_is_inside(model, xyz_local, (/dx(i), dy(j), dz(k)/), patch_ib(patch_id)%model_spc)
-                    end if
+                    eta = f_model_is_inside(model, xyz_local, (/dx(i), dy(j), dz(k)/), patch_ib(patch_id)%model_spc)
 
                     ! Reading STL boundary vertices and compute the levelset and levelset_norm
                     if (eta > patch_ib(patch_id)%model_threshold) then
@@ -941,7 +979,7 @@ contains
             end do
         end do
 
-    end subroutine s_ib_model
+    end subroutine s_ib_3d_model
 
     !> Subroutine that computes a rotation matrix for converting to the rotating frame of the boundary
     subroutine s_update_ib_rotation_matrix(patch_id)
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index d42d58d782..7c6c609d4a 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -110,7 +110,6 @@ contains
         ib_markers%sf = 0._wp
         $:GPU_UPDATE(device='[ib_markers%sf]')
         call s_apply_ib_patches(ib_markers)
-        call s_populate_ib_buffers()
         $:GPU_UPDATE(host='[ib_markers%sf]')
         do i = 1, num_ibs
             if (patch_ib(i)%moving_ibm /= 0) call s_compute_centroid_offset(i) ! offsets are computed after IB markers are generated
@@ -1008,13 +1007,13 @@ contains
         $:GPU_UPDATE(device='[patch_ib]')
 
         ! recompute the new ib_patch locations and broadcast them.
-        call nvtxStartRange("COMPUTE-IB-MARKERS")
+        call nvtxStartRange("APPLY-IB-PATCHES")
         $:GPU_UPDATE(device='[ib_markers%sf]')
         call s_apply_ib_patches(ib_markers)
-        call s_populate_ib_buffers()
-        $:GPU_UPDATE(host='[ib_markers%sf]')
         call nvtxEndRange
 
+        call nvtxStartRange("COMPUTE-GHOST-POINTS")
+
         ! recalculate the ghost point locations and coefficients
         call s_find_num_ghost_points(num_gps, num_inner_gps)
         $:GPU_UPDATE(device='[num_gps, num_inner_gps]')
@@ -1022,10 +1021,10 @@ contains
         call s_find_ghost_points(ghost_points, inner_points)
         $:GPU_UPDATE(device='[ghost_points, inner_points]')
 
-        call nvtxStartRange("APPLY-LEVELSET")
+        call nvtxEndRange
+        
         call s_apply_levelset(ghost_points, num_gps)
         $:GPU_UPDATE(device='[ghost_points]')
-        call nvtxEndRange
 
         call s_compute_image_points(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')

From 85889c059443a20d7eccf19ca39c05819cee2691 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Mon, 16 Feb 2026 16:34:21 -0500
Subject: [PATCH 04/22] Bindary search for IB index region beginning for
 reduced IB marker compute

---
 src/simulation/m_ib_patches.fpp | 58 +++++++++++++++++++++++++++++++--
 src/simulation/m_ibm.fpp        |  7 +++-
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index c0ab727dd7..fd8e22f574 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -659,6 +659,7 @@ contains
 
         ! Generic loop iterators
         integer :: i, j, k
+        integer :: il, ir, jl, jr, kl, kr
         real(wp) :: radius
         real(wp), dimension(1:3) :: center
 
@@ -672,15 +673,26 @@ contains
         center(3) = patch_ib(patch_id)%z_centroid
         radius = patch_ib(patch_id)%radius
 
+        il = -gp_layers
+        jl = -gp_layers
+        kl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        kr = p + gp_layers
+        call get_bounding_indices(center(1) - radius, center(1) + radius, x_cc, il, ir)
+        call get_bounding_indices(center(2) - radius, center(2) + radius, y_cc, jl, jr)
+        call get_bounding_indices(center(3) - radius, center(3) + radius, z_cc, kl, kr)
+
         ! Checking whether the sphere covers a particular cell in the domain
         ! and verifying whether the current patch has permission to write to
         ! that cell. If both queries check out, the primitive variables of
         ! the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,cart_y,cart_z]',&
                   & copyin='[patch_id,center,radius]', collapse=3)
-        do k = -gp_layers, p+gp_layers
-            do j = -gp_layers, n+gp_layers
-                do i = -gp_layers, m+gp_layers
+        do k = kl-1, kr+1
+            do j = jl-1, jr+1
+                do i = il-1, ir+1
+                ! do i = -gp_layers, m+gp_layers
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))
                     else
@@ -1058,6 +1070,46 @@ contains
 
     end subroutine s_convert_cylindrical_to_spherical_coord
 
+    subroutine get_bounding_indices(left_bound, right_bound, cell_centers, left_index, right_index)
+
+      real(wp), intent(in) :: left_bound, right_bound
+      integer, intent(inout) :: left_index, right_index
+      real(wp), dimension(-buff_size:), intent(in) :: cell_centers
+
+      integer :: itr_left, itr_middle, itr_right
+
+        itr_left = left_index
+        itr_right = right_index
+
+        do while (itr_left + 1 < itr_right)
+            itr_middle = (itr_left + itr_right) / 2
+            if (cell_centers(itr_middle) < left_bound) then
+              itr_left = itr_middle
+            else if (cell_centers(itr_middle) > left_bound) then
+              itr_right = itr_middle
+            else
+                itr_left = itr_middle
+                exit
+            end if
+        end do
+        left_index = itr_left
+
+        itr_right = right_index
+        do while (itr_left + 1 < itr_right)
+            itr_middle = (itr_left + itr_right) / 2
+            if (cell_centers(itr_middle) < right_bound) then
+              itr_left = itr_middle
+            else if (cell_centers(itr_middle) > right_bound) then
+              itr_right = itr_middle
+            else
+                itr_right = itr_middle
+                exit
+            end if
+        end do
+        right_index = itr_right
+
+    end subroutine get_bounding_indices
+
     !> Archimedes spiral function
     !! @param myth Angle
     !! @param offset Thickness
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 7c6c609d4a..71d4b12768 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -992,7 +992,7 @@ contains
 
         integer :: i, ierr
 
-        call nvtxStartRange("UPDATE-MIB")
+        call nvtxStartRange("UPDATE-MIBM")
 
         ! Clears the existing immersed boundary indices
         ib_markers%sf = 0._wp
@@ -1057,6 +1057,9 @@ contains
         #:else
             real(wp), dimension(num_fluids) :: dynamic_viscosities
         #:endif
+
+        call nvtxStartRange("COMPUTE-IB-FORCES")
+
         forces = 0._wp
         torques = 0._wp
 
@@ -1187,6 +1190,8 @@ contains
             patch_ib(i)%torque(:) = matmul(patch_ib(i)%rotation_matrix_inverse, torques(i, :)) ! torques must be converted to the local coordinates of the IB
         end do
 
+        call nvtxEndRange
+
     end subroutine s_compute_ib_forces
 
     !> Subroutine to deallocate memory reserved for the IBM module

From f0085c93bd8cf4dae3c7460cb7a2d338d04d86e1 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Tue, 17 Feb 2026 09:15:25 -0500
Subject: [PATCH 05/22] ghost points are now computed on the GPU

---
 src/common/include/parallel_macros.fpp |  10 ++
 src/simulation/m_compute_levelset.fpp  |   4 -
 src/simulation/m_ib_patches.fpp        |  95 +++++------
 src/simulation/m_ibm.fpp               | 222 +++++++++++--------------
 4 files changed, 155 insertions(+), 176 deletions(-)

diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp
index bfe4b3beaf..c5ad5c1fb3 100644
--- a/src/common/include/parallel_macros.fpp
+++ b/src/common/include/parallel_macros.fpp
@@ -174,6 +174,16 @@
 #endif
 #:enddef
 
+#:def END_GPU_ATOMIC_CAPTURE()
+    #:set acc_end_directive = '!$acc end atomic'
+    #:set omp_end_directive = '!$omp end atomic'
+#if defined(MFC_OpenACC)
+    $:acc_end_directive
+#elif defined(MFC_OpenMP)
+    $:omp_end_directive
+#endif
+#:enddef
+
 #:def GPU_UPDATE(host=None, device=None, extraAccArgs=None, extraOmpArgs=None)
     #:set acc_code = ACC_UPDATE(host=host, device=device, extraAccArgs=extraAccArgs)
     #:set omp_code = OMP_UPDATE(host=host, device=device, extraOmpArgs=extraOmpArgs)
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 69bcd4e8b5..ef3333745b 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -33,8 +33,6 @@ contains
 
         integer :: i, patch_id, patch_geometry
 
-        $:GPU_UPDATE(device='[gps(1:num_gps)]')
-
         !  3D Patch Geometries
         if (p > 0) then
 
@@ -92,8 +90,6 @@ contains
             end if
         end do
 
-        $:GPU_UPDATE(host='[gps(1:num_gps)]')
-
     end subroutine s_apply_levelset
 
     subroutine s_circle_levelset(gp)
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index fd8e22f574..d662bff389 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -274,8 +274,8 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j]',&
                   & copyin='[patch_id,center,radius]', collapse=2)
-        do j = -gp_layers, n+gp_layers
-            do i = -gp_layers, m+gp_layers
+        do j = -gp_layers, n + gp_layers
+            do i = -gp_layers, m + gp_layers
                 if ((x_cc(i) - center(1))**2 &
                     + (y_cc(j) - center(2))**2 <= radius**2) &
                     then
@@ -378,8 +378,8 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j,xy_local,k,f]', &
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l]', collapse=2)
-        do j = -gp_layers, n+gp_layers
-            do i = -gp_layers, m+gp_layers
+        do j = -gp_layers, n + gp_layers
+            do i = -gp_layers, m + gp_layers
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB
                 xy_local = matmul(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinates
                 xy_local = xy_local - offset ! airfoils are a patch that require a centroid offset
@@ -529,9 +529,9 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j,l,xyz_local,k,f]',&
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l,z_min,z_max]', collapse=3)
-        do l = -gp_layers, p+gp_layers
-            do j = -gp_layers, n+gp_layers
-                do i = -gp_layers, m+gp_layers
+        do l = -gp_layers, p + gp_layers
+            do j = -gp_layers, n + gp_layers
+                do i = -gp_layers, m + gp_layers
                     xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(l) - center(3)] ! get coordinate frame centered on IB
                     xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates
                     xyz_local = xyz_local - offset ! airfoils are a patch that require a centroid offset
@@ -623,8 +623,8 @@ contains
         ! variables of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,length,inverse_rotation,x_cc,y_cc]', collapse=2)
-        do j = -gp_layers, n+gp_layers
-            do i = -gp_layers, m+gp_layers
+        do j = -gp_layers, n + gp_layers
+            do i = -gp_layers, m + gp_layers
                 ! get the x and y coordinates in the local IB frame
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
                 xy_local = matmul(inverse_rotation, xy_local)
@@ -673,6 +673,7 @@ contains
         center(3) = patch_ib(patch_id)%z_centroid
         radius = patch_ib(patch_id)%radius
 
+        ! find the indices to the left and right of the IB in i, j, k
         il = -gp_layers
         jl = -gp_layers
         kl = -gp_layers
@@ -689,10 +690,10 @@ contains
         ! the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,cart_y,cart_z]',&
                   & copyin='[patch_id,center,radius]', collapse=3)
-        do k = kl-1, kr+1
-            do j = jl-1, jr+1
-                do i = il-1, ir+1
-                ! do i = -gp_layers, m+gp_layers
+        do k = kl - 1, kr + 1
+            do j = jl - 1, jr + 1
+                do i = il - 1, ir + 1
+                    ! do i = -gp_layers, m+gp_layers
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))
                     else
@@ -746,9 +747,9 @@ contains
         ! of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,inverse_rotation]', collapse=3)
-        do k = -gp_layers, p+gp_layers
-            do j = -gp_layers, n+gp_layers
-                do i = -gp_layers, m+gp_layers
+        do k = -gp_layers, p + gp_layers
+            do j = -gp_layers, n + gp_layers
+                do i = -gp_layers, m + gp_layers
 
                     if (grid_geometry == 3) then
                         ! TODO :: This does not work and is not covered by any tests. This should be fixed
@@ -816,9 +817,9 @@ contains
         ! variables of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,radius,inverse_rotation]', collapse=3)
-         do k = -gp_layers, p+gp_layers
-            do j = -gp_layers, n+gp_layers
-                do i = -gp_layers, m+gp_layers
+        do k = -gp_layers, p + gp_layers
+            do j = -gp_layers, n + gp_layers
+                do i = -gp_layers, m + gp_layers
 
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))
@@ -879,8 +880,8 @@ contains
         ! domain
         $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,ellipse_coeffs,inverse_rotation,x_cc,y_cc]', collapse=2)
-        do j = -gp_layers, n+gp_layers
-            do i = -gp_layers, m+gp_layers
+        do j = -gp_layers, n + gp_layers
+            do i = -gp_layers, m + gp_layers
                 ! get the x and y coordinates in the local IB frame
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
                 xy_local = matmul(inverse_rotation, xy_local)
@@ -920,23 +921,23 @@ contains
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
         offset(:) = patch_ib(patch_id)%centroid_offset(:)
 
-        do i = 0-gp_layers, m+gp_layers
-            do j = -gp_layers, n+gp_layers
+        do i = 0 - gp_layers, m + gp_layers
+            do j = -gp_layers, n + gp_layers
 
-                    xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
-                    xy_local = matmul(inverse_rotation, xy_local)
-                    xy_local = xy_local - offset
+                xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
+                xy_local = matmul(inverse_rotation, xy_local)
+                xy_local = xy_local - offset
 
-                    if (grid_geometry == 3) then
-                        xy_local = f_convert_cyl_to_cart(xy_local)
-                    end if
+                if (grid_geometry == 3) then
+                    xy_local = f_convert_cyl_to_cart(xy_local)
+                end if
 
-                    eta = f_model_is_inside(model, xy_local, (/dx(i), dy(j), 0._wp/), patch_ib(patch_id)%model_spc)
+                eta = f_model_is_inside(model, xy_local, (/dx(i), dy(j), 0._wp/), patch_ib(patch_id)%model_spc)
 
-                    ! Reading STL boundary vertices and compute the levelset and levelset_norm
-                    if (eta > patch_ib(patch_id)%model_threshold) then
-                        ib_markers%sf(i, j, 0) = patch_id
-                    end if
+                ! Reading STL boundary vertices and compute the levelset and levelset_norm
+                if (eta > patch_ib(patch_id)%model_threshold) then
+                    ib_markers%sf(i, j, 0) = patch_id
+                end if
 
             end do
         end do
@@ -968,9 +969,9 @@ contains
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
         offset(:) = patch_ib(patch_id)%centroid_offset(:)
 
-        do i = 0-gp_layers, m+gp_layers
-            do j = -gp_layers, n+gp_layers
-                do k = -gp_layers, p+gp_layers
+        do i = 0 - gp_layers, m + gp_layers
+            do j = -gp_layers, n + gp_layers
+                do k = -gp_layers, p + gp_layers
 
                     xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(k) - center(3)]
                     xyz_local = matmul(inverse_rotation, xyz_local)
@@ -1072,21 +1073,21 @@ contains
 
     subroutine get_bounding_indices(left_bound, right_bound, cell_centers, left_index, right_index)
 
-      real(wp), intent(in) :: left_bound, right_bound
-      integer, intent(inout) :: left_index, right_index
-      real(wp), dimension(-buff_size:), intent(in) :: cell_centers
+        real(wp), intent(in) :: left_bound, right_bound
+        integer, intent(inout) :: left_index, right_index
+        real(wp), dimension(-buff_size:), intent(in) :: cell_centers
 
-      integer :: itr_left, itr_middle, itr_right
+        integer :: itr_left, itr_middle, itr_right
 
         itr_left = left_index
         itr_right = right_index
 
         do while (itr_left + 1 < itr_right)
-            itr_middle = (itr_left + itr_right) / 2
+            itr_middle = (itr_left + itr_right)/2
             if (cell_centers(itr_middle) < left_bound) then
-              itr_left = itr_middle
+                itr_left = itr_middle
             else if (cell_centers(itr_middle) > left_bound) then
-              itr_right = itr_middle
+                itr_right = itr_middle
             else
                 itr_left = itr_middle
                 exit
@@ -1096,11 +1097,11 @@ contains
 
         itr_right = right_index
         do while (itr_left + 1 < itr_right)
-            itr_middle = (itr_left + itr_right) / 2
+            itr_middle = (itr_left + itr_right)/2
             if (cell_centers(itr_middle) < right_bound) then
-              itr_left = itr_middle
+                itr_left = itr_middle
             else if (cell_centers(itr_middle) > right_bound) then
-              itr_right = itr_middle
+                itr_right = itr_middle
             else
                 itr_right = itr_middle
                 exit
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 71d4b12768..9267ed668a 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -129,12 +129,9 @@ contains
         @:ALLOCATE(inner_points(1:int((max_num_gps + max_num_inner_gps) * 2.0)))
 
         $:GPU_ENTER_DATA(copyin='[ghost_points,inner_points]')
-
         call s_find_ghost_points(ghost_points, inner_points)
-        $:GPU_UPDATE(device='[ghost_points, inner_points]')
-
         call s_apply_levelset(ghost_points, num_gps)
-        $:GPU_UPDATE(device='[ghost_points]')
+        $:GPU_UPDATE(host='[ghost_points, inner_points]')
 
         call s_compute_image_points(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')
@@ -534,52 +531,55 @@ contains
 
     end subroutine s_compute_image_points
 
-    !> Function that finds the number of ghost points, used for allocating
+    !> Subroutine that finds the number of ghost points, used for allocating
     !! memory.
     subroutine s_find_num_ghost_points(num_gps_out, num_inner_gps_out)
 
         integer, intent(out) :: num_gps_out
         integer, intent(out) :: num_inner_gps_out
 
-        integer, dimension(2*gp_layers + 1, 2*gp_layers + 1) &
-            :: subsection_2D
-        integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) &
-            :: subsection_3D
-        integer :: i, j, k!< Iterator variables
+        integer :: i, j, k, ii, jj, kk, gp_layers_z !< Iterator variables
+        integer :: num_gps_local, num_inner_gps_local !< local copies of the gp count to support GPU compute
+        logical :: is_gp
 
-        num_gps_out = 0
-        num_inner_gps_out = 0
+        num_gps_local = 0
+        num_inner_gps_local = 0
+        gp_layers_z = gp_layers
+        if (p == 0) gp_layers_z = 0
 
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,ii,jj,kk,is_gp]', copy='[num_gps_local,num_inner_gps_local]', firstprivate='[gp_layers,gp_layers_z]', collapse=3)
         do i = 0, m
             do j = 0, n
-                if (p == 0) then
-                    if (ib_markers%sf(i, j, 0) /= 0) then
-                        subsection_2D = ib_markers%sf( &
-                                        i - gp_layers:i + gp_layers, &
-                                        j - gp_layers:j + gp_layers, 0)
-                        if (any(subsection_2D == 0)) then
-                            num_gps_out = num_gps_out + 1
+                do k = 0, p
+                    if (ib_markers%sf(i, j, k) /= 0) then
+                        is_gp = .false.
+                        marker_search: do ii = i - gp_layers, i + gp_layers
+                            do jj = j - gp_layers, j + gp_layers
+                                do kk = k - gp_layers_z, k + gp_layers_z
+                                    if (ib_markers%sf(ii, jj, kk) == 0) then
+                                        ! if any neighbors are not in the IB, it is a ghost point
+                                        is_gp = .true.
+                                        exit marker_search
+                                    end if
+                                end do
+                            end do
+                        end do marker_search
+
+                        if (is_gp) then
+                            $:GPU_ATOMIC(atomic='update')
+                            num_gps_local = num_gps_local + 1
                         else
-                            num_inner_gps_out = num_inner_gps_out + 1
+                            $:GPU_ATOMIC(atomic='update')
+                            num_inner_gps_local = num_inner_gps_local + 1
                         end if
                     end if
-                else
-                    do k = 0, p
-                        if (ib_markers%sf(i, j, k) /= 0) then
-                            subsection_3D = ib_markers%sf( &
-                                            i - gp_layers:i + gp_layers, &
-                                            j - gp_layers:j + gp_layers, &
-                                            k - gp_layers:k + gp_layers)
-                            if (any(subsection_3D == 0)) then
-                                num_gps_out = num_gps_out + 1
-                            else
-                                num_inner_gps_out = num_inner_gps_out + 1
-                            end if
-                        end if
-                    end do
-                end if
+                end do
             end do
         end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        num_gps_out = num_gps_local
+        num_inner_gps_out = num_inner_gps_local
 
     end subroutine s_find_num_ghost_points
 
@@ -588,116 +588,89 @@ contains
 
         type(ghost_point), dimension(num_gps), intent(INOUT) :: ghost_points_in
         type(ghost_point), dimension(num_inner_gps), intent(INOUT) :: inner_points_in
-        integer, dimension(2*gp_layers + 1, 2*gp_layers + 1) &
-            :: subsection_2D
-        integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) &
-            :: subsection_3D
-        integer :: i, j, k !< Iterator variables
-        integer :: count, count_i
+        integer :: i, j, k, ii, jj, kk, gp_layers_z !< Iterator variables
+        integer :: count, count_i, local_idx
         integer :: patch_id
+        logical :: is_gp
 
-        count = 1
-        count_i = 1
+        count = 0
+        count_i = 0
+        gp_layers_z = gp_layers
+        if (p == 0) gp_layers_z = 0
 
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,ii,jj,kk,is_gp,local_idx]', copyin='[count,count_i, x_domain, y_domain, z_domain]', firstprivate='[gp_layers,gp_layers_z]', collapse=3)
         do i = 0, m
             do j = 0, n
-                if (p == 0) then
-                    ! 2D
-                    if (ib_markers%sf(i, j, 0) /= 0) then
-                        subsection_2D = ib_markers%sf( &
-                                        i - gp_layers:i + gp_layers, &
-                                        j - gp_layers:j + gp_layers, 0)
-                        if (any(subsection_2D == 0)) then
-                            ghost_points_in(count)%loc = [i, j, 0]
-                            patch_id = ib_markers%sf(i, j, 0)
-                            ghost_points_in(count)%ib_patch_id = &
-                                patch_id
-
-                            ghost_points_in(count)%slip = patch_ib(patch_id)%slip
-                            ! ghost_points(count)%rank = proc_rank
+                do k = 0, p
+                    if (ib_markers%sf(i, j, k) /= 0) then
+                        is_gp = .false.
+                        marker_search: do ii = i - gp_layers, i + gp_layers
+                            do jj = j - gp_layers, j + gp_layers
+                                do kk = k - gp_layers_z, k + gp_layers_z
+                                    if (ib_markers%sf(ii, jj, kk) == 0) then
+                                        ! if any neighbors are not in the IB, it is a ghost point
+                                        is_gp = .true.
+                                        exit marker_search
+                                    end if
+                                end do
+                            end do
+                        end do marker_search
+
+                        if (is_gp) then
+                            $:GPU_ATOMIC(atomic='capture')
+                            count = count + 1
+                            local_idx = count
+                            $:END_GPU_ATOMIC_CAPTURE()
+
+                            ghost_points_in(local_idx)%loc = [i, j, k]
+                            patch_id = ib_markers%sf(i, j, k)
+                            ghost_points_in(local_idx)%ib_patch_id = patch_id
+
+                            ghost_points_in(local_idx)%slip = patch_ib(patch_id)%slip
 
                             if ((x_cc(i) - dx(i)) < x_domain%beg) then
-                                ghost_points_in(count)%DB(1) = -1
+                                ghost_points_in(local_idx)%DB(1) = -1
                             else if ((x_cc(i) + dx(i)) > x_domain%end) then
-                                ghost_points_in(count)%DB(1) = 1
+                                ghost_points_in(local_idx)%DB(1) = 1
                             else
-                                ghost_points_in(count)%DB(1) = 0
+                                ghost_points_in(local_idx)%DB(1) = 0
                             end if
 
                             if ((y_cc(j) - dy(j)) < y_domain%beg) then
-                                ghost_points_in(count)%DB(2) = -1
+                                ghost_points_in(local_idx)%DB(2) = -1
                             else if ((y_cc(j) + dy(j)) > y_domain%end) then
-                                ghost_points_in(count)%DB(2) = 1
+                                ghost_points_in(local_idx)%DB(2) = 1
                             else
-                                ghost_points_in(count)%DB(2) = 0
+                                ghost_points_in(local_idx)%DB(2) = 0
                             end if
 
-                            count = count + 1
-
-                        else
-                            inner_points_in(count_i)%loc = [i, j, 0]
-                            patch_id = ib_markers%sf(i, j, 0)
-                            inner_points_in(count_i)%ib_patch_id = &
-                                patch_id
-                            inner_points_in(count_i)%slip = patch_ib(patch_id)%slip
-                            count_i = count_i + 1
-
-                        end if
-                    end if
-                else
-                    ! 3D
-                    do k = 0, p
-                        if (ib_markers%sf(i, j, k) /= 0) then
-                            subsection_3D = ib_markers%sf( &
-                                            i - gp_layers:i + gp_layers, &
-                                            j - gp_layers:j + gp_layers, &
-                                            k - gp_layers:k + gp_layers)
-                            if (any(subsection_3D == 0)) then
-                                ghost_points_in(count)%loc = [i, j, k]
-                                patch_id = ib_markers%sf(i, j, k)
-                                ghost_points_in(count)%ib_patch_id = &
-                                    ib_markers%sf(i, j, k)
-                                ghost_points_in(count)%slip = patch_ib(patch_id)%slip
-
-                                if ((x_cc(i) - dx(i)) < x_domain%beg) then
-                                    ghost_points_in(count)%DB(1) = -1
-                                else if ((x_cc(i) + dx(i)) > x_domain%end) then
-                                    ghost_points_in(count)%DB(1) = 1
-                                else
-                                    ghost_points_in(count)%DB(1) = 0
-                                end if
-
-                                if ((y_cc(j) - dy(j)) < y_domain%beg) then
-                                    ghost_points_in(count)%DB(2) = -1
-                                else if ((y_cc(j) + dy(j)) > y_domain%end) then
-                                    ghost_points_in(count)%DB(2) = 1
-                                else
-                                    ghost_points_in(count)%DB(2) = 0
-                                end if
-
+                            if (p /= 0) then
                                 if ((z_cc(k) - dz(k)) < z_domain%beg) then
-                                    ghost_points_in(count)%DB(3) = -1
+                                    ghost_points_in(local_idx)%DB(3) = -1
                                 else if ((z_cc(k) + dz(k)) > z_domain%end) then
-                                    ghost_points_in(count)%DB(3) = 1
+                                    ghost_points_in(local_idx)%DB(3) = 1
                                 else
-                                    ghost_points_in(count)%DB(3) = 0
+                                    ghost_points_in(local_idx)%DB(3) = 0
                                 end if
+                            end if
 
-                                count = count + 1
-                            else
-                                inner_points_in(count_i)%loc = [i, j, k]
-                                patch_id = ib_markers%sf(i, j, k)
-                                inner_points_in(count_i)%ib_patch_id = &
-                                    ib_markers%sf(i, j, k)
-                                inner_points_in(count_i)%slip = patch_ib(patch_id)%slip
+                        else
+                            $:GPU_ATOMIC(atomic='capture')
+                            count_i = count_i + 1
+                            local_idx = count_i
+                            $:END_GPU_ATOMIC_CAPTURE()
+
+                            inner_points_in(local_idx)%loc = [i, j, k]
+                            patch_id = ib_markers%sf(i, j, k)
+                            inner_points_in(local_idx)%ib_patch_id = patch_id
+                            inner_points_in(local_idx)%slip = patch_ib(patch_id)%slip
 
-                                count_i = count_i + 1
-                            end if
                         end if
-                    end do
-                end if
+                    end if
+                end do
             end do
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
     end subroutine s_find_ghost_points
 
@@ -1013,24 +986,23 @@ contains
         call nvtxEndRange
 
         call nvtxStartRange("COMPUTE-GHOST-POINTS")
-
         ! recalculate the ghost point locations and coefficients
         call s_find_num_ghost_points(num_gps, num_inner_gps)
         $:GPU_UPDATE(device='[num_gps, num_inner_gps]')
 
         call s_find_ghost_points(ghost_points, inner_points)
-        $:GPU_UPDATE(device='[ghost_points, inner_points]')
-
         call nvtxEndRange
-        
+
+        call nvtxStartRange("COMPUTE-IMAGE-POINTS")
         call s_apply_levelset(ghost_points, num_gps)
-        $:GPU_UPDATE(device='[ghost_points]')
+        $:GPU_UPDATE(host='[ghost_points, inner_points]')
 
         call s_compute_image_points(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')
 
         call s_compute_interpolation_coeffs(ghost_points)
         $:GPU_UPDATE(device='[ghost_points]')
+        call nvtxEndRange
 
         call nvtxEndRange
 

From bfcc5935eb6953ab2e66d4ab6c3f06465ca31d71 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Tue, 17 Feb 2026 10:58:11 -0500
Subject: [PATCH 06/22] image points computed on the GPU for x4 performance in
 that subroutine

---
 src/simulation/m_ibm.fpp | 209 +++++++++++++++------------------------
 1 file changed, 80 insertions(+), 129 deletions(-)

diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 9267ed668a..01cdc632c6 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -131,13 +131,9 @@ contains
         $:GPU_ENTER_DATA(copyin='[ghost_points,inner_points]')
         call s_find_ghost_points(ghost_points, inner_points)
         call s_apply_levelset(ghost_points, num_gps)
-        $:GPU_UPDATE(host='[ghost_points, inner_points]')
 
         call s_compute_image_points(ghost_points)
-        $:GPU_UPDATE(device='[ghost_points]')
-
         call s_compute_interpolation_coeffs(ghost_points)
-        $:GPU_UPDATE(device='[ghost_points]')
 
         call nvtxEndRange
 
@@ -458,6 +454,7 @@ contains
         integer :: dir
         integer :: index
 
+        $:GPU_PARALLEL_LOOP(private='[q,gp,i,j,k,physical_loc,patch_id,dist,norm,dim,bound,dir,index,temp_loc,s_cc]')
         do q = 1, num_gps
             gp = ghost_points_in(q)
             i = gp%loc(1)
@@ -507,6 +504,7 @@ contains
                     do while ((temp_loc < s_cc(index) &
                                .or. temp_loc > s_cc(index + 1)))
                         index = index + dir
+#if !defined(MFC_OpenACC) && !defined(MFC_OpenMP)
                         if (index < -buff_size .or. index > bound) then
                             print *, "A required image point is not located in this computational domain."
                             print *, "Ghost Point is located at ", [x_cc(i), y_cc(j), z_cc(k)], " while moving in dimension ", dim
@@ -518,6 +516,7 @@ contains
                             print *, "A short term fix may include increasing buff_size further in m_helper_basic (currently set to a minimum of 10)"
                             error stop "Ghost Point and Image Point on Different Processors"
                         end if
+#endif
                     end do
                     ghost_points_in(q)%ip_grid(dim) = index
                     if (ghost_points_in(q)%DB(dim) == -1) then
@@ -528,6 +527,7 @@ contains
                 end if
             end do
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
     end subroutine s_compute_image_points
 
@@ -685,51 +685,74 @@ contains
         real(wp) :: buf
         real(wp), dimension(2, 2, 2) :: eta
         type(ghost_point) :: gp
-        integer :: i !< Iterator variables
-        integer :: i1, i2, j1, j2, k1, k2 !< Grid indexes
+        integer :: q, i, j, k, ii, jj, kk !< Grid indexes and iterators
         integer :: patch_id
+        logical is_cell_center
 
-        ! 2D
-        if (p <= 0) then
-            do i = 1, num_gps
-                gp = ghost_points_in(i)
-                ! Get the interpolation points
-                i1 = gp%ip_grid(1); i2 = i1 + 1
-                j1 = gp%ip_grid(2); j2 = j1 + 1
-
-                dist = 0._wp
-                buf = 1._wp
-                dist(1, 1, 1) = sqrt( &
-                                (x_cc(i1) - gp%ip_loc(1))**2 + &
-                                (y_cc(j1) - gp%ip_loc(2))**2)
-                dist(2, 1, 1) = sqrt( &
-                                (x_cc(i2) - gp%ip_loc(1))**2 + &
-                                (y_cc(j1) - gp%ip_loc(2))**2)
-                dist(1, 2, 1) = sqrt( &
-                                (x_cc(i1) - gp%ip_loc(1))**2 + &
-                                (y_cc(j2) - gp%ip_loc(2))**2)
-                dist(2, 2, 1) = sqrt( &
-                                (x_cc(i2) - gp%ip_loc(1))**2 + &
-                                (y_cc(j2) - gp%ip_loc(2))**2)
-
-                interp_coeffs = 0._wp
-
-                if (dist(1, 1, 1) <= 1.e-16_wp) then
-                    interp_coeffs(1, 1, 1) = 1._wp
-                else if (dist(2, 1, 1) <= 1.e-16_wp) then
-                    interp_coeffs(2, 1, 1) = 1._wp
-                else if (dist(1, 2, 1) <= 1.e-16_wp) then
-                    interp_coeffs(1, 2, 1) = 1._wp
-                else if (dist(2, 2, 1) <= 1.e-16_wp) then
-                    interp_coeffs(2, 2, 1) = 1._wp
-                else
+        $:GPU_PARALLEL_LOOP(private='[q,i,j,k,ii,jj,kk,dist,buf,gp,interp_coeffs,eta,alpha,patch_id,is_cell_center]')
+        do q = 1, num_gps
+            gp = ghost_points_in(q)
+            ! Get the interpolation points
+            i = gp%ip_grid(1)
+            j = gp%ip_grid(2)
+            if (p /= 0) then
+                k = gp%ip_grid(3)
+            else
+                k = 0; 
+            end if
+
+            ! get the distance to a cell in each direction
+            dist = 0._wp
+            buf = 1._wp
+            do ii = 0, 1
+                do jj = 0, 1
+                    if (p == 0) then
+                        dist(1 + ii, 1 + jj, 1) = sqrt( &
+                                                  (x_cc(i + ii) - gp%ip_loc(1))**2 + &
+                                                  (y_cc(j + jj) - gp%ip_loc(2))**2)
+                    else
+                        do kk = 0, 1
+                            dist(1 + ii, 1 + jj, 1 + kk) = sqrt( &
+                                                           (x_cc(i + ii) - gp%ip_loc(1))**2 + &
+                                                           (y_cc(j + jj) - gp%ip_loc(2))**2 + &
+                                                           (z_cc(k + kk) - gp%ip_loc(3))**2)
+                        end do
+                    end if
+                end do
+            end do
+
+            ! check if we are arbitrarily close to a cell center
+            interp_coeffs = 0._wp
+            is_cell_center = .false.
+            check_is_cell_center: do ii = 0, 1
+                do jj = 0, 1
+                    if (dist(ii + 1, jj + 1, 1) <= 1.e-16_wp) then
+                        interp_coeffs(ii + 1, jj + 1, 1) = 1._wp
+                        is_cell_center = .true.
+                        exit check_is_cell_center
+                    else
+                        if (p /= 0) then
+                            if (dist(ii + 1, jj + 1, 2) <= 1.e-16_wp) then
+                                interp_coeffs(ii + 1, jj + 1, 2) = 1._wp
+                                is_cell_center = .true.
+                                exit check_is_cell_center
+                            end if
+                        end if
+                    end if
+                end do
+            end do check_is_cell_center
+
+            if (.not. is_cell_center) then
+                ! if we are not arbitrarily close, interpolate
+                alpha = 1._wp
+                patch_id = gp%ib_patch_id
+                if (ib_markers%sf(i, j, k) /= 0) alpha(1, 1, 1) = 0._wp
+                if (ib_markers%sf(i + 1, j, k) /= 0) alpha(2, 1, 1) = 0._wp
+                if (ib_markers%sf(i, j + 1, k) /= 0) alpha(1, 2, 1) = 0._wp
+                if (ib_markers%sf(i + 1, j + 1, k) /= 0) alpha(2, 2, 1) = 0._wp
+
+                if (p == 0) then
                     eta(:, :, 1) = 1._wp/dist(:, :, 1)**2
-                    alpha = 1._wp
-                    patch_id = gp%ib_patch_id
-                    if (ib_markers%sf(i1, j1, 0) /= 0) alpha(1, 1, 1) = 0._wp
-                    if (ib_markers%sf(i2, j1, 0) /= 0) alpha(2, 1, 1) = 0._wp
-                    if (ib_markers%sf(i1, j2, 0) /= 0) alpha(1, 2, 1) = 0._wp
-                    if (ib_markers%sf(i2, j2, 0) /= 0) alpha(2, 2, 1) = 0._wp
                     buf = sum(alpha(:, :, 1)*eta(:, :, 1))
                     if (buf > 0._wp) then
                         interp_coeffs(:, :, 1) = alpha(:, :, 1)*eta(:, :, 1)/buf
@@ -737,82 +760,15 @@ contains
                         buf = sum(eta(:, :, 1))
                         interp_coeffs(:, :, 1) = eta(:, :, 1)/buf
                     end if
-                end if
-
-                ghost_points_in(i)%interp_coeffs = interp_coeffs
-            end do
-
-        else
-            do i = 1, num_gps
-                gp = ghost_points_in(i)
-                ! Get the interpolation points
-                i1 = gp%ip_grid(1); i2 = i1 + 1
-                j1 = gp%ip_grid(2); j2 = j1 + 1
-                k1 = gp%ip_grid(3); k2 = k1 + 1
-
-                ! Get interpolation weights (Chaudhuri et al. 2011, JCP)
-                dist(1, 1, 1) = sqrt( &
-                                (x_cc(i1) - gp%ip_loc(1))**2 + &
-                                (y_cc(j1) - gp%ip_loc(2))**2 + &
-                                (z_cc(k1) - gp%ip_loc(3))**2)
-                dist(2, 1, 1) = sqrt( &
-                                (x_cc(i2) - gp%ip_loc(1))**2 + &
-                                (y_cc(j1) - gp%ip_loc(2))**2 + &
-                                (z_cc(k1) - gp%ip_loc(3))**2)
-                dist(1, 2, 1) = sqrt( &
-                                (x_cc(i1) - gp%ip_loc(1))**2 + &
-                                (y_cc(j2) - gp%ip_loc(2))**2 + &
-                                (z_cc(k1) - gp%ip_loc(3))**2)
-                dist(2, 2, 1) = sqrt( &
-                                (x_cc(i2) - gp%ip_loc(1))**2 + &
-                                (y_cc(j2) - gp%ip_loc(2))**2 + &
-                                (z_cc(k1) - gp%ip_loc(3))**2)
-                dist(1, 1, 2) = sqrt( &
-                                (x_cc(i1) - gp%ip_loc(1))**2 + &
-                                (y_cc(j1) - gp%ip_loc(2))**2 + &
-                                (z_cc(k2) - gp%ip_loc(3))**2)
-                dist(2, 1, 2) = sqrt( &
-                                (x_cc(i2) - gp%ip_loc(1))**2 + &
-                                (y_cc(j1) - gp%ip_loc(2))**2 + &
-                                (z_cc(k2) - gp%ip_loc(3))**2)
-                dist(1, 2, 2) = sqrt( &
-                                (x_cc(i1) - gp%ip_loc(1))**2 + &
-                                (y_cc(j2) - gp%ip_loc(2))**2 + &
-                                (z_cc(k2) - gp%ip_loc(3))**2)
-                dist(2, 2, 2) = sqrt( &
-                                (x_cc(i2) - gp%ip_loc(1))**2 + &
-                                (y_cc(j2) - gp%ip_loc(2))**2 + &
-                                (z_cc(k2) - gp%ip_loc(3))**2)
-                interp_coeffs = 0._wp
-                buf = 1._wp
-                if (dist(1, 1, 1) <= 1.e-16_wp) then
-                    interp_coeffs(1, 1, 1) = 1._wp
-                else if (dist(2, 1, 1) <= 1.e-16_wp) then
-                    interp_coeffs(2, 1, 1) = 1._wp
-                else if (dist(1, 2, 1) <= 1.e-16_wp) then
-                    interp_coeffs(1, 2, 1) = 1._wp
-                else if (dist(2, 2, 1) <= 1.e-16_wp) then
-                    interp_coeffs(2, 2, 1) = 1._wp
-                else if (dist(1, 1, 2) <= 1.e-16_wp) then
-                    interp_coeffs(1, 1, 2) = 1._wp
-                else if (dist(2, 1, 2) <= 1.e-16_wp) then
-                    interp_coeffs(2, 1, 2) = 1._wp
-                else if (dist(1, 2, 2) <= 1.e-16_wp) then
-                    interp_coeffs(1, 2, 2) = 1._wp
-                else if (dist(2, 2, 2) <= 1.e-16_wp) then
-                    interp_coeffs(2, 2, 2) = 1._wp
                 else
+
+                    if (ib_markers%sf(i, j, k + 1) /= 0) alpha(1, 1, 2) = 0._wp
+                    if (ib_markers%sf(i + 1, j, k + 1) /= 0) alpha(2, 1, 2) = 0._wp
+                    if (ib_markers%sf(i, j + 1, k + 1) /= 0) alpha(1, 2, 2) = 0._wp
+                    if (ib_markers%sf(i + 1, j + 1, k + 1) /= 0) alpha(2, 2, 2) = 0._wp
                     eta = 1._wp/dist**2
-                    alpha = 1._wp
-                    if (ib_markers%sf(i1, j1, k1) /= 0) alpha(1, 1, 1) = 0._wp
-                    if (ib_markers%sf(i2, j1, k1) /= 0) alpha(2, 1, 1) = 0._wp
-                    if (ib_markers%sf(i1, j2, k1) /= 0) alpha(1, 2, 1) = 0._wp
-                    if (ib_markers%sf(i2, j2, k1) /= 0) alpha(2, 2, 1) = 0._wp
-                    if (ib_markers%sf(i1, j1, k2) /= 0) alpha(1, 1, 2) = 0._wp
-                    if (ib_markers%sf(i2, j1, k2) /= 0) alpha(2, 1, 2) = 0._wp
-                    if (ib_markers%sf(i1, j2, k2) /= 0) alpha(1, 2, 2) = 0._wp
-                    if (ib_markers%sf(i2, j2, k2) /= 0) alpha(2, 2, 2) = 0._wp
                     buf = sum(alpha*eta)
+
                     if (buf > 0._wp) then
                         interp_coeffs = alpha*eta/buf
                     else
@@ -821,9 +777,11 @@ contains
                     end if
                 end if
 
-                ghost_points_in(i)%interp_coeffs = interp_coeffs
-            end do
-        end if
+            end if
+
+            ghost_points_in(q)%interp_coeffs = interp_coeffs
+        end do
+        $:END_GPU_PARALLEL_LOOP()
 
     end subroutine s_compute_interpolation_coeffs
 
@@ -988,20 +946,13 @@ contains
         call nvtxStartRange("COMPUTE-GHOST-POINTS")
         ! recalculate the ghost point locations and coefficients
         call s_find_num_ghost_points(num_gps, num_inner_gps)
-        $:GPU_UPDATE(device='[num_gps, num_inner_gps]')
-
         call s_find_ghost_points(ghost_points, inner_points)
         call nvtxEndRange
 
         call nvtxStartRange("COMPUTE-IMAGE-POINTS")
         call s_apply_levelset(ghost_points, num_gps)
-        $:GPU_UPDATE(host='[ghost_points, inner_points]')
-
         call s_compute_image_points(ghost_points)
-        $:GPU_UPDATE(device='[ghost_points]')
-
         call s_compute_interpolation_coeffs(ghost_points)
-        $:GPU_UPDATE(device='[ghost_points]')
         call nvtxEndRange
 
         call nvtxEndRange

From 5201ee7a6c3d6f023572eb6db0aa57cf76601915 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Tue, 17 Feb 2026 11:20:32 -0500
Subject: [PATCH 07/22] Need WAY more parameters in the case file... We should
 probably do something about that...

---
 toolchain/mfc/params_tests/test_definitions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/toolchain/mfc/params_tests/test_definitions.py b/toolchain/mfc/params_tests/test_definitions.py
index 7b9710bf43..20e0774526 100644
--- a/toolchain/mfc/params_tests/test_definitions.py
+++ b/toolchain/mfc/params_tests/test_definitions.py
@@ -161,10 +161,10 @@ class TestParameterCounts(unittest.TestCase):
     """Tests for expected parameter counts."""
 
     def test_total_param_count(self):
-        """Total parameter count should be around 3400."""
+        """Total parameter count should be around 40000."""
         count = len(REGISTRY.all_params)
-        self.assertGreater(count, 3000, "Too few parameters")
-        self.assertLess(count, 4000, "Too many parameters")
+        self.assertGreater(count, 39000, f"Too few parameters. Got {count}.")
+        self.assertLess(count, 41000, f"Too many parameters. Got {count}.")
 
     def test_log_params_count(self):
         """Should have many LOG type parameters."""

From 622edb0c50a688287e6cbdae9f02c735b4ef3933 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Tue, 17 Feb 2026 11:56:04 -0500
Subject: [PATCH 08/22] Extended the binary search reduction to all 3D IB
 geometries

---
 src/simulation/m_ib_patches.fpp | 55 +++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index d662bff389..5bc294c8fb 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -445,7 +445,7 @@ contains
         type(integer_field), intent(inout) :: ib_markers
 
         real(wp) :: lz, z_max, z_min, f, ca_in, pa, ma, ta, xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer :: i, j, k, l
+        integer :: i, j, k, l, il, ir, jl, jr, ll, lr
         integer :: Np1, Np2
 
         real(wp), dimension(1:3) :: xyz_local, center, offset !< x, y, z coordinates in local IB frame
@@ -527,11 +527,23 @@ contains
             $:GPU_UPDATE(device='[airfoil_grid_l,airfoil_grid_u]')
         end if
 
+        ! find the indices to the left and right of the IB in i, j, k
+        il = -gp_layers
+        jl = -gp_layers
+        ll = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        lr = p + gp_layers
+        ! maximum distance any marker can be from the center is the legnth of the airfoil
+        call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir)
+        call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
+        call get_bounding_indices(center(3) - ca_in, center(3) + ca_in, z_cc, ll, lr)
+
         $:GPU_PARALLEL_LOOP(private='[i,j,l,xyz_local,k,f]',&
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l,z_min,z_max]', collapse=3)
-        do l = -gp_layers, p + gp_layers
-            do j = -gp_layers, n + gp_layers
-                do i = -gp_layers, m + gp_layers
+        do l = ll, lr
+            do j = jl, jr
+                do i = il, ir
                     xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(l) - center(3)] ! get coordinate frame centered on IB
                     xyz_local = matmul(inverse_rotation, xyz_local) ! rotate the frame into the IB's coordinates
                     xyz_local = xyz_local - offset ! airfoils are a patch that require a centroid offset
@@ -728,9 +740,10 @@ contains
         integer, intent(in) :: patch_id
         type(integer_field), intent(inout) :: ib_markers
 
-        integer :: i, j, k !< Generic loop iterators
+        integer :: i, j, k, ir, il, jr, jl, kr, kl !< Generic loop iterators
         real(wp), dimension(1:3) :: xyz_local, center, length !< x and y coordinates in local IB frame
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
+        real(wp) :: corner_distance
 
         ! Transferring the cuboid's centroid and length information
         center(1) = patch_ib(patch_id)%x_centroid
@@ -741,15 +754,27 @@ contains
         length(3) = patch_ib(patch_id)%length_z
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
 
+        ! find the indices to the left and right of the IB in i, j, k
+        il = -gp_layers
+        jl = -gp_layers
+        kl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        kr = p + gp_layers
+        corner_distance = sqrt(dot_product(length, length))/2._wp ! maximum distance any marker can be from the center
+        call get_bounding_indices(center(1) - corner_distance, center(1) + corner_distance, x_cc, il, ir)
+        call get_bounding_indices(center(2) - corner_distance, center(2) + corner_distance, y_cc, jl, jr)
+        call get_bounding_indices(center(3) - corner_distance, center(3) + corner_distance, z_cc, kl, kr)
+
         ! Checking whether the cuboid covers a particular cell in the domain
         ! and verifying whether the current patch has permission to write to
         ! to that cell. If both queries check out, the primitive variables
         ! of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,inverse_rotation]', collapse=3)
-        do k = -gp_layers, p + gp_layers
-            do j = -gp_layers, n + gp_layers
-                do i = -gp_layers, m + gp_layers
+        do k = kl, kr
+            do j = jl, jr
+                do i = il, ir
 
                     if (grid_geometry == 3) then
                         ! TODO :: This does not work and is not covered by any tests. This should be fixed
@@ -794,10 +819,11 @@ contains
         integer, intent(in) :: patch_id
         type(integer_field), intent(inout) :: ib_markers
 
-        integer :: i, j, k !< Generic loop iterators
+        integer :: i, j, k, il, ir, jl, jr, kl, kr !< Generic loop iterators
         real(wp) :: radius
         real(wp), dimension(1:3) :: xyz_local, center, length !< x and y coordinates in local IB frame
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
+        real(wp) :: corner_distance
 
         ! Transferring the cylindrical patch's centroid, length, radius,
         ! smoothing patch identity and smoothing coefficient information
@@ -811,6 +837,17 @@ contains
         radius = patch_ib(patch_id)%radius
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
 
+        il = -gp_layers
+        jl = -gp_layers
+        kl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        kr = p + gp_layers
+        corner_distance = sqrt(radius**2 + maxval(length)**2) ! distance to rim of cylinder
+        call get_bounding_indices(center(1) - corner_distance, center(1) + corner_distance, x_cc, il, ir)
+        call get_bounding_indices(center(2) - corner_distance, center(2) + corner_distance, y_cc, jl, jr)
+        call get_bounding_indices(center(3) - corner_distance, center(3) + corner_distance, z_cc, kl, kr)
+
         ! Checking whether the cylinder covers a particular cell in the
         ! domain and verifying whether the current patch has the permission
         ! to write to that cell. If both queries check out, the primitive

From 0a089ce4e6c884f4a13ba48bc0ef8619a5426ba4 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Tue, 17 Feb 2026 12:19:02 -0500
Subject: [PATCH 09/22] Extended area reduction to all non-model IBs

---
 src/simulation/m_ib_patches.fpp | 64 ++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 5bc294c8fb..9bded72b72 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -258,7 +258,7 @@ contains
         real(wp), dimension(1:2) :: center
         real(wp) :: radius
 
-        integer :: i, j, k !< Generic loop iterators
+        integer :: i, j, il, ir, jl, jr !< Generic loop iterators
 
         ! Transferring the circular patch's radius, centroid, smearing patch
         ! identity and smearing coefficient information
@@ -267,6 +267,14 @@ contains
         center(2) = patch_ib(patch_id)%y_centroid
         radius = patch_ib(patch_id)%radius
 
+        ! find the indices to the left and right of the IB in i, j, k
+        il = -gp_layers
+        jl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        call get_bounding_indices(center(1) - radius, center(1) + radius, x_cc, il, ir)
+        call get_bounding_indices(center(2) - radius, center(2) + radius, y_cc, jl, jr)
+
         ! Checking whether the circle covers a particular cell in the domain
         ! and verifying whether the current patch has permission to write to
         ! that cell. If both queries check out, the primitive variables of
@@ -274,8 +282,8 @@ contains
 
         $:GPU_PARALLEL_LOOP(private='[i,j]',&
                   & copyin='[patch_id,center,radius]', collapse=2)
-        do j = -gp_layers, n + gp_layers
-            do i = -gp_layers, m + gp_layers
+        do j = jl, jr
+            do i = il, ir
                 if ((x_cc(i) - center(1))**2 &
                     + (y_cc(j) - center(2))**2 <= radius**2) &
                     then
@@ -296,7 +304,7 @@ contains
 
         real(wp) :: f, ca_in, pa, ma, ta
         real(wp) :: xa, yt, xu, yu, xl, yl, xc, yc, dycdxc, sin_c, cos_c
-        integer :: i, j, k
+        integer :: i, j, k, il, ir, jl, jr
         integer :: Np1, Np2
 
         real(wp), dimension(1:3) :: xy_local, offset !< x and y coordinates in local IB frame
@@ -376,10 +384,19 @@ contains
 
         end if
 
+        ! find the indices to the left and right of the IB in i, j, k
+        il = -gp_layers
+        jl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        ! maximum distance any marker can be from the center is the length of the airfoil
+        call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir)
+        call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
+
         $:GPU_PARALLEL_LOOP(private='[i,j,xy_local,k,f]', &
                   & copyin='[patch_id,center,inverse_rotation,offset,ma,ca_in,airfoil_grid_u,airfoil_grid_l]', collapse=2)
-        do j = -gp_layers, n + gp_layers
-            do i = -gp_layers, m + gp_layers
+        do j = jl, jr
+            do i = il, ir
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp] ! get coordinate frame centered on IB
                 xy_local = matmul(inverse_rotation, xy_local) ! rotate the frame into the IB's coordinates
                 xy_local = xy_local - offset ! airfoils are a patch that require a centroid offset
@@ -534,7 +551,7 @@ contains
         ir = m + gp_layers
         jr = n + gp_layers
         lr = p + gp_layers
-        ! maximum distance any marker can be from the center is the legnth of the airfoil
+        ! maximum distance any marker can be from the center is the length of the airfoil
         call get_bounding_indices(center(1) - ca_in, center(1) + ca_in, x_cc, il, ir)
         call get_bounding_indices(center(2) - ca_in, center(2) + ca_in, y_cc, jl, jr)
         call get_bounding_indices(center(3) - ca_in, center(3) + ca_in, z_cc, ll, lr)
@@ -612,16 +629,12 @@ contains
         integer, intent(in) :: patch_id
         type(integer_field), intent(inout) :: ib_markers
 
-        integer :: i, j, k !< generic loop iterators
-        real(wp) :: pi_inf, gamma, lit_gamma !< Equation of state parameters
+        integer :: i, j, il, ir, jl, jr !< generic loop iterators
+        real(wp) :: corner_distance !< Equation of state parameters
         real(wp), dimension(1:3) :: xy_local !< x and y coordinates in local IB frame
         real(wp), dimension(1:2) :: length, center !< x and y coordinates in local IB frame
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
 
-        pi_inf = fluid_pp(1)%pi_inf
-        gamma = fluid_pp(1)%gamma
-        lit_gamma = (1._wp + gamma)/gamma
-
         ! Transferring the rectangle's centroid and length information
         center(1) = patch_ib(patch_id)%x_centroid
         center(2) = patch_ib(patch_id)%y_centroid
@@ -629,14 +642,23 @@ contains
         length(2) = patch_ib(patch_id)%length_y
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
 
+        ! find the indices to the left and right of the IB in i, j, k
+        il = -gp_layers
+        jl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        corner_distance = sqrt(dot_product(length, length))/2._wp ! maximum distance any marker can be from the center
+        call get_bounding_indices(center(1) - corner_distance, center(1) + corner_distance, x_cc, il, ir)
+        call get_bounding_indices(center(2) - corner_distance, center(2) + corner_distance, y_cc, jl, jr)
+
         ! Checking whether the rectangle covers a particular cell in the
         ! domain and verifying whether the current patch has the permission
         ! to write to that cell. If both queries check out, the primitive
         ! variables of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j, xy_local]',&
                   & copyin='[patch_id,center,length,inverse_rotation,x_cc,y_cc]', collapse=2)
-        do j = -gp_layers, n + gp_layers
-            do i = -gp_layers, m + gp_layers
+        do j = jl, jr
+            do i = il, ir
                 ! get the x and y coordinates in the local IB frame
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
                 xy_local = matmul(inverse_rotation, xy_local)
@@ -702,9 +724,9 @@ contains
         ! the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,cart_y,cart_z]',&
                   & copyin='[patch_id,center,radius]', collapse=3)
-        do k = kl - 1, kr + 1
-            do j = jl - 1, jr + 1
-                do i = il - 1, ir + 1
+        do k = kl, kr
+            do j = jl, jr
+                do i = il, ir
                     ! do i = -gp_layers, m+gp_layers
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))
@@ -854,9 +876,9 @@ contains
         ! variables of the current patch are assigned to this cell.
         $:GPU_PARALLEL_LOOP(private='[i,j,k,xyz_local,cart_y,cart_z]',&
                   & copyin='[patch_id,center,length,radius,inverse_rotation]', collapse=3)
-        do k = -gp_layers, p + gp_layers
-            do j = -gp_layers, n + gp_layers
-                do i = -gp_layers, m + gp_layers
+        do k = kl, kr
+            do j = jl, jr
+                do i = il, ir
 
                     if (grid_geometry == 3) then
                         call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k))

From 804a2865442584e99a551cea6422b3e912165687 Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Wed, 18 Feb 2026 11:45:46 -0500
Subject: [PATCH 10/22] Intermittent commit for GPU STLs

---
 src/common/m_model.fpp          | 49 +++++++++++++++++++++++----------
 src/simulation/m_ib_patches.fpp | 29 +++++++++++++++++--
 2 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 67c19fa454..90f72eda04 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -479,6 +479,23 @@ contains
         is_buffered = .true.
     end subroutine s_skip_ignored_lines
 
+    !> This function is used to replace the fortran random number
+    !! generator because the native generator is not compatible being called
+    !! from GPU routines/functions
+    function f_model_random_number(seed) result(rval)
+      
+        ! $:GPU_ROUTINE(parallelism='[seq]')
+        
+        integer, intent(inout) :: seed
+        real(wp) :: rval
+        
+        seed = ieor(seed, ishft(seed, 13))
+        seed = ieor(seed, ishft(seed, -17))
+        seed = ieor(seed, ishft(seed, 5))
+        
+        rval = abs(real(seed, wp)) / real(huge(seed), wp)
+    end function f_model_random_number
+
     !> This procedure, recursively, finds whether a point is inside an octree.
     !! @param model    Model to search in.
     !! @param point    Point to test.
@@ -493,32 +510,33 @@ contains
         real(wp), dimension(1:3), intent(in) :: point
         real(wp), dimension(1:3), intent(in) :: spacing
         integer, intent(in) :: spc
+        real(wp) :: phi, theta
+        integer :: rand_seed
 
         real(wp) :: fraction
 
         type(t_ray) :: ray
-        integer :: i, j, nInOrOut, nHits
+        integer :: i, j, k, nInOrOut, nHits
 
         real(wp), dimension(1:spc, 1:3) :: ray_origins, ray_dirs
 
-        ! TODO :: The random number generation prohibits GPU compute due to the subroutine not being able to be called in kernels
-        ! This should be swapped out with something that allows GPU compute. I recommend the fibonacci sphere:
-        ! do i = 1, spc
-        !   phi = acos(1.0 - 2.0*(i-1.0)/(spc-1.0))
-        !   theta = pi * (1.0 + sqrt(5.0)) * (i-1.0)
-        !   ray_dirs(i,:) = [cos(theta)*sin(phi), sin(theta)*sin(phi), cos(phi)]
-        !   ray_origins(i,:) = point
-        ! end do
+        rand_seed = int(point(1) * 73856093_wp) + &
+                    int(point(2) * 19349663_wp) + &
+                    int(point(3) * 83492791_wp)
+        if (rand_seed == 0) rand_seed = 1
 
+        ! generate our random collection or rays
         do i = 1, spc
-            call random_number(ray_origins(i, :))
-            ray_origins(i, :) = point + (ray_origins(i, :) - 0.5_wp)*spacing(:)
-
-            call random_number(ray_dirs(i, :))
-            ray_dirs(i, :) = ray_dirs(i, :) - 0.5_wp
+            do k = 1, 3
+                ! random jitter in the origin helps us estimate volume fraction instead of only at the cell center
+                ray_origins(i, k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp) * spacing(k)
+                ! cast sample rays in all directions
+                ray_dirs(i, k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
+            end do
             ray_dirs(i, :) = ray_dirs(i, :)/sqrt(sum(ray_dirs(i, :)*ray_dirs(i, :)))
         end do
 
+        ! ray trace
         nInOrOut = 0
         do i = 1, spc
             ray%o = ray_origins(i, :)
@@ -526,11 +544,14 @@ contains
 
             nHits = 0
             do j = 1, model%ntrs
+                ! count the number of triangles this ray intersects
                 if (f_intersects_triangle(ray, model%trs(j))) then
                     nHits = nHits + 1
                 end if
             end do
 
+            ! if thje ray hits an odd number of triangles on its way out, then
+            ! it must be on the inside of the model
             nInOrOut = nInOrOut + mod(nHits, 2)
         end do
 
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 9bded72b72..8c4dfd696e 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -138,11 +138,9 @@ contains
 
         real(wp), dimension(1:4, 1:4) :: transform, transform_n
 
-        call random_seed()
-
         do patch_id = 1, num_ibs
             if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then
-                allocate (models(patch_id)%model)
+                @:ALLOCATE(models(patch_id)%model)
                 print *, " * Reading model: "//trim(patch_ib(patch_id)%model_filepath)
 
                 model = f_model_read(patch_ib(patch_id)%model_filepath)
@@ -237,6 +235,30 @@ contains
                     models(patch_id)%total_vertices = total_vertices
                 end if
 
+                ! update allocatables
+                $:GPU_ENTER_DATA(copyin='[models(patch_id)]')
+                $:GPU_ENTER_DATA(copyin='[models(patch_id)%model]')
+                $:GPU_ENTER_DATA(copyin='[models(patch_id)%model%trs]')
+                $:GPU_ENTER_DATA(copyin='[models(patch_id)%model%ntrs]')
+                $:GPU_ENTER_DATA(copyin='[models(patch_id)%boundary_v]')
+                do i = 1, models(patch_id)%model%ntrs
+                    $:GPU_ENTER_DATA(copyin='[models(patch_id)%model%trs(i)]')
+                end do
+                if (interpolate) then
+                    $:GPU_ENTER_DATA(copyin='[models(patch_id)%interpolated_boundary_v]')
+                end if
+
+                print *, "Entering GPU loop to read"
+
+                ! TODO :: REMOVE ME
+                $:GPU_PARALLEL_LOOP(private='[i]')
+                do i = 1, 3
+                    print *, "Num Triangles", models(1)%model%ntrs
+                    ! print *, "interpolate", models(i)%interpolate
+                    ! print *, "Vertices", models(i)%model%trs(1)%v
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
             end if
         end do
 
@@ -995,6 +1017,7 @@ contains
 
                 ! Reading STL boundary vertices and compute the levelset and levelset_norm
                 if (eta > patch_ib(patch_id)%model_threshold) then
+                  print *, eta, i, j
                     ib_markers%sf(i, j, 0) = patch_id
                 end if
 

From 64bc3480260f9d32de0f8a78109a52503bb4ae3b Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Wed, 18 Feb 2026 15:34:31 -0500
Subject: [PATCH 11/22] Ib markers computed on GPU working

---
 src/common/m_derived_types.fpp  |   8 ++-
 src/common/m_helper.fpp         |   2 +
 src/common/m_model.fpp          |  73 +++++++++++++++++++-
 src/simulation/m_ib_patches.fpp | 116 +++++++++++++++++++++-----------
 src/simulation/m_ibm.fpp        |   4 +-
 5 files changed, 160 insertions(+), 43 deletions(-)

diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
index 7aab3c96e2..e0e84b3b06 100644
--- a/src/common/m_derived_types.fpp
+++ b/src/common/m_derived_types.fpp
@@ -184,12 +184,18 @@ module m_derived_types
     end type t_model
 
     type :: t_model_array
+        ! --- Original CPU-side fields (unchanged) ---
         type(t_model), allocatable :: model
         real(wp), allocatable, dimension(:, :, :) :: boundary_v
         real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v
         integer :: boundary_edge_count
         integer :: total_vertices
-        logical :: interpolate
+        integer :: interpolate
+
+        ! --- GPU-friendly flattened arrays ---
+        integer :: ntrs  ! copy of model%ntrs
+        real(wp), allocatable, dimension(:, :, :) :: trs_v  ! (3, 3, ntrs) - triangle vertices
+        real(wp), allocatable, dimension(:, :)    :: trs_n  ! (3, ntrs)    - triangle normals
     end type t_model_array
 
     !> Derived type adding initial condition (ic) patch parameters as attributes
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index d990a83345..f50b99a965 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -328,6 +328,8 @@ contains
     !! @return The cross product of the two vectors.
     pure function f_cross(a, b) result(c)
 
+        $:GPU_ROUTINE(parallelism='[seq]')
+
         real(wp), dimension(3), intent(in) :: a, b
         real(wp), dimension(3) :: c
 
diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 90f72eda04..afb46ac7f4 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -22,7 +22,8 @@ module m_model
     ! Subroutines for STL immersed boundaries
     public :: f_check_boundary, f_register_edge, f_check_interpolation_2D, &
               f_check_interpolation_3D, f_interpolate_2D, f_interpolate_3D, &
-              f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area
+              f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area, s_pack_model_for_gpu, &
+              f_model_is_inside_flat
 
 contains
 
@@ -484,7 +485,7 @@ contains
     !! from GPU routines/functions
     function f_model_random_number(seed) result(rval)
       
-        ! $:GPU_ROUTINE(parallelism='[seq]')
+        $:GPU_ROUTINE(parallelism='[seq]')
         
         integer, intent(inout) :: seed
         real(wp) :: rval
@@ -559,6 +560,58 @@ contains
 
     end function f_model_is_inside
 
+    impure function f_model_is_inside_flat(ntrs, trs_v, trs_n, pid, point, spacing, spc) result(fraction)
+        
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        integer, intent(in) :: ntrs
+        real(wp), dimension(:,:,:,:), intent(in) :: trs_v
+        real(wp), dimension(:,:,:), intent(in) :: trs_n
+        integer, intent(in) :: pid
+        real(wp), dimension(1:3), intent(in) :: point
+        real(wp), dimension(1:3), intent(in) :: spacing
+        integer, intent(in) :: spc
+
+        real(wp) :: fraction
+        real(wp) :: origin(1:3), dir(1:3), dir_mag
+        type(t_ray) :: ray
+        type(t_triangle) :: tri
+        integer :: i, j, k, nInOrOut, nHits
+        integer :: rand_seed
+
+        rand_seed = int(point(1) * 73856093_wp) + &
+                    int(point(2) * 19349663_wp) + &
+                    int(point(3) * 83492791_wp)
+        if (rand_seed == 0) rand_seed = 1
+
+        ! generate our random collection of rays
+        nInOrOut = 0
+        do i = 1, spc
+            ! Generate one ray at a time — no arrays needed
+            do k = 1, 3
+                origin(k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp) * spacing(k)
+                dir(k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
+            end do
+            dir_mag = sqrt(dir(1)*dir(1) + dir(2)*dir(2) + dir(3)*dir(3))
+            dir(:) = dir(:) / dir_mag
+
+            ray%o = origin
+            ray%d = dir
+
+            nHits = 0
+            do j = 1, ntrs
+                tri%v(:, :) = trs_v(:, :, j, pid)
+                tri%n(:) = trs_n(:, j, pid)
+                if (f_intersects_triangle(ray, tri)) then
+                    nHits = nHits + 1
+                end if
+            end do
+            nInOrOut = nInOrOut + mod(nHits, 2)
+        end do
+
+        fraction = real(nInOrOut)/real(spc)
+    end function f_model_is_inside_flat
+
     ! From https://www.scratchapixel.com/lessons/3e-basic-rendering/ray-tracing-rendering-a-triangle/ray-triangle-intersection-geometric-solution.html
     !> This procedure checks if a ray intersects a triangle.
     !! @param ray      Ray.
@@ -566,6 +619,8 @@ contains
     !! @return         True if the ray intersects the triangle, false otherwise.
     elemental function f_intersects_triangle(ray, triangle) result(intersects)
 
+        $:GPU_ROUTINE(parallelism='[seq]')
+
         type(t_ray), intent(in) :: ray
         type(t_triangle), intent(in) :: triangle
 
@@ -1269,4 +1324,18 @@ contains
 
     end function f_interpolated_distance
 
+    subroutine s_pack_model_for_gpu(ma)
+        type(t_model_array), intent(inout) :: ma
+        integer :: i
+    
+        ma%ntrs = ma%model%ntrs
+        allocate(ma%trs_v(1:3, 1:3, 1:ma%ntrs))
+        allocate(ma%trs_n(1:3, 1:ma%ntrs))
+    
+        do i = 1, ma%ntrs
+            ma%trs_v(:, :, i) = ma%model%trs(i)%v(:, :)
+            ma%trs_n(:, i)    = ma%model%trs(i)%n(:)
+        end do
+    end subroutine
+
 end module m_model
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 8c4dfd696e..df7641daab 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -25,7 +25,7 @@ module m_ib_patches
 
     implicit none
 
-    private; public :: s_apply_ib_patches, s_update_ib_rotation_matrix, s_instantiate_STL_models, models, f_convert_cyl_to_cart
+    private; public :: s_apply_ib_patches, s_update_ib_rotation_matrix, s_instantiate_STL_models, models, f_convert_cyl_to_cart, gpu_ntrs, gpu_trs_v, gpu_trs_n
 
     real(wp) :: x_centroid, y_centroid, z_centroid
     real(wp) :: length_x, length_y, length_z
@@ -55,9 +55,12 @@ module m_ib_patches
 
     character(len=5) :: istr ! string to store int to string result for error checking
 
-    type(t_model_array), allocatable, target :: models(:)
-    $:GPU_DECLARE(create='[models]')
     !! array of STL models that can be allocated and then used in IB marker and levelset compute
+    type(t_model_array), allocatable, target :: models(:)
+    !! GPU-friendly flat arrays for STL model data
+    integer, allocatable :: gpu_ntrs(:)
+    real(wp), allocatable, dimension(:,:,:,:) :: gpu_trs_v
+    real(wp), allocatable, dimension(:,:,:)   :: gpu_trs_n
 
 contains
 
@@ -140,7 +143,7 @@ contains
 
         do patch_id = 1, num_ibs
             if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then
-                @:ALLOCATE(models(patch_id)%model)
+                allocate(models(patch_id)%model)
                 print *, " * Reading model: "//trim(patch_ib(patch_id)%model_filepath)
 
                 model = f_model_read(patch_ib(patch_id)%model_filepath)
@@ -229,38 +232,52 @@ contains
                 models(patch_id)%model = model
                 models(patch_id)%boundary_v = boundary_v
                 models(patch_id)%boundary_edge_count = boundary_edge_count
-                models(patch_id)%interpolate = interpolate
+                if (interpolate) then
+                    models(patch_id)%interpolate = 1
+                else
+                    models(patch_id)%interpolate = 0
+                end if
                 if (interpolate) then
                     models(patch_id)%interpolated_boundary_v = interpolated_boundary_v
                     models(patch_id)%total_vertices = total_vertices
                 end if
 
-                ! update allocatables
-                $:GPU_ENTER_DATA(copyin='[models(patch_id)]')
-                $:GPU_ENTER_DATA(copyin='[models(patch_id)%model]')
-                $:GPU_ENTER_DATA(copyin='[models(patch_id)%model%trs]')
-                $:GPU_ENTER_DATA(copyin='[models(patch_id)%model%ntrs]')
-                $:GPU_ENTER_DATA(copyin='[models(patch_id)%boundary_v]')
-                do i = 1, models(patch_id)%model%ntrs
-                    $:GPU_ENTER_DATA(copyin='[models(patch_id)%model%trs(i)]')
-                end do
-                if (interpolate) then
-                    $:GPU_ENTER_DATA(copyin='[models(patch_id)%interpolated_boundary_v]')
+            end if
+        end do
+
+        ! Pack and upload flat arrays for GPU (AFTER the loop)
+        block
+            integer :: pid, max_ntrs
+
+            max_ntrs = 0
+            do pid = 1, num_ibs
+                if (allocated(models(pid)%model)) then
+                    call s_pack_model_for_gpu(models(pid))
+                    max_ntrs = max(max_ntrs, models(pid)%ntrs)
                 end if
+            end do
+
+            if (max_ntrs > 0) then
+                allocate(gpu_ntrs(1:num_ibs))
+                allocate(gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
+                allocate(gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
 
-                print *, "Entering GPU loop to read"
+                gpu_ntrs = 0
+                gpu_trs_v = 0._wp
+                gpu_trs_n = 0._wp
 
-                ! TODO :: REMOVE ME
-                $:GPU_PARALLEL_LOOP(private='[i]')
-                do i = 1, 3
-                    print *, "Num Triangles", models(1)%model%ntrs
-                    ! print *, "interpolate", models(i)%interpolate
-                    ! print *, "Vertices", models(i)%model%trs(1)%v
+                do pid = 1, num_ibs
+                    if (allocated(models(pid)%model)) then
+                        gpu_ntrs(pid) = models(pid)%ntrs
+                        gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
+                        gpu_trs_n(:, 1:models(pid)%ntrs, pid)    = models(pid)%trs_n
+                    end if
                 end do
-                $:END_GPU_PARALLEL_LOOP()
+
+                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n]')
 
             end if
-        end do
+        end block
 
     end subroutine s_instantiate_STL_models
 
@@ -986,23 +1003,25 @@ contains
         integer, intent(in) :: patch_id
         type(integer_field), intent(inout) :: ib_markers
 
-        integer :: i, j, k !< Generic loop iterators
+        integer :: i, j, k  !< Generic loop iterators
+        integer :: spc
 
-        type(t_model), pointer :: model
-
-        real(wp) :: eta
+        real(wp) :: eta, threshold
         real(wp), dimension(1:3) :: point, local_point, offset
         real(wp), dimension(1:3) :: center, xy_local
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
 
-        model => models(patch_id)%model
         center = 0._wp
         center(1) = patch_ib(patch_id)%x_centroid
         center(2) = patch_ib(patch_id)%y_centroid
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
         offset(:) = patch_ib(patch_id)%centroid_offset(:)
+        spc = patch_ib(patch_id)%model_spc
+        threshold = patch_ib(patch_id)%model_threshold
 
-        do i = 0 - gp_layers, m + gp_layers
+        $:GPU_PARALLEL_LOOP(private='[i,j, xy_local, eta]',&
+                  & copyin='[patch_id,center,inverse_rotation, offset, spc, threshold]', collapse=2)
+        do i = -gp_layers, m + gp_layers
             do j = -gp_layers, n + gp_layers
 
                 xy_local = [x_cc(i) - center(1), y_cc(j) - center(2), 0._wp]
@@ -1013,16 +1032,30 @@ contains
                     xy_local = f_convert_cyl_to_cart(xy_local)
                 end if
 
-                eta = f_model_is_inside(model, xy_local, (/dx(i), dy(j), 0._wp/), patch_ib(patch_id)%model_spc)
+                if (i == 13 .and. j == 16) then
+                  print *, "spc:", spc
+                  print *, "ntrs:", gpu_ntrs(patch_id)
+                  print *, "threshold:", threshold
+                  print *, "dx:", dx(i), dy(j)
+                  print *, "xy_local:", xy_local(1)
+
+                end if
+
+                eta = f_model_is_inside_flat(gpu_ntrs(patch_id), &
+                                              gpu_trs_v, gpu_trs_n, &
+                                              patch_id, &
+                                              xy_local, (/dx(i), dy(j), 0._wp/), &
+                                              spc)
 
                 ! Reading STL boundary vertices and compute the levelset and levelset_norm
-                if (eta > patch_ib(patch_id)%model_threshold) then
+                if (eta > threshold) then
                   print *, eta, i, j
                     ib_markers%sf(i, j, 0) = patch_id
                 end if
 
             end do
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
     end subroutine s_ib_model
 
@@ -1035,10 +1068,9 @@ contains
         type(integer_field), intent(inout) :: ib_markers
 
         integer :: i, j, k !< Generic loop iterators
+        integer :: spc
 
-        type(t_model), pointer :: model
-
-        real(wp) :: eta
+        real(wp) :: eta, threshold
         real(wp), dimension(1:3) :: point, local_point, offset
         real(wp), dimension(1:3) :: center, xyz_local
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
@@ -1050,8 +1082,12 @@ contains
         center(3) = patch_ib(patch_id)%z_centroid
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
         offset(:) = patch_ib(patch_id)%centroid_offset(:)
+        spc = patch_ib(patch_id)%model_spc
+        threshold = patch_ib(patch_id)%model_threshold
 
-        do i = 0 - gp_layers, m + gp_layers
+        $:GPU_PARALLEL_LOOP(private='[i,j,k, xyz_local, eta]',&
+                  & copyin='[patch_id,center,inverse_rotation, offset, spc, threshold]', collapse=2)
+        do i = -gp_layers, m + gp_layers
             do j = -gp_layers, n + gp_layers
                 do k = -gp_layers, p + gp_layers
 
@@ -1063,7 +1099,11 @@ contains
                         xyz_local = f_convert_cyl_to_cart(xyz_local)
                     end if
 
-                    eta = f_model_is_inside(model, xyz_local, (/dx(i), dy(j), dz(k)/), patch_ib(patch_id)%model_spc)
+                    eta = f_model_is_inside_flat(gpu_ntrs(patch_id), &
+                                              gpu_trs_v, gpu_trs_n, &
+                                              patch_id, &
+                                              xyz_local, (/dx(i), dy(j), dz(k)/), &
+                                              spc)
 
                     ! Reading STL boundary vertices and compute the levelset and levelset_norm
                     if (eta > patch_ib(patch_id)%model_threshold) then
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 01cdc632c6..92400123bf 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -98,9 +98,9 @@ contains
         $:GPU_UPDATE(device='[patch_ib(1:num_ibs)]')
 
         ! GPU routines require updated cell centers
-        $:GPU_UPDATE(device='[x_cc, y_cc]')
+        $:GPU_UPDATE(device='[x_cc, y_cc, dx, dy]')
         if (p /= 0) then
-            $:GPU_UPDATE(device='[z_cc]')
+            $:GPU_UPDATE(device='[z_cc, dz]')
         end if
 
         ! allocate STL models

From bc972cabd2437eddfe63b6c9268bad9cbec5b334 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Wed, 18 Feb 2026 15:54:17 -0500
Subject: [PATCH 12/22] Passes STL tests with GPU compute for IB markers (not
 added levelset yet)

---
 src/common/m_derived_types.fpp        |  2 +-
 src/common/m_model.fpp                | 44 +++++++++++++--------------
 src/simulation/m_compute_levelset.fpp |  2 ++
 src/simulation/m_ib_patches.fpp       | 17 ++++-------
 src/simulation/m_igr.fpp              | 26 ++++++++--------
 5 files changed, 44 insertions(+), 47 deletions(-)

diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
index e0e84b3b06..cbdb3e2aa4 100644
--- a/src/common/m_derived_types.fpp
+++ b/src/common/m_derived_types.fpp
@@ -195,7 +195,7 @@ module m_derived_types
         ! --- GPU-friendly flattened arrays ---
         integer :: ntrs  ! copy of model%ntrs
         real(wp), allocatable, dimension(:, :, :) :: trs_v  ! (3, 3, ntrs) - triangle vertices
-        real(wp), allocatable, dimension(:, :)    :: trs_n  ! (3, ntrs)    - triangle normals
+        real(wp), allocatable, dimension(:, :) :: trs_n  ! (3, ntrs)    - triangle normals
     end type t_model_array
 
     !> Derived type adding initial condition (ic) patch parameters as attributes
diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index afb46ac7f4..2c62a1444f 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -484,17 +484,17 @@ contains
     !! generator because the native generator is not compatible being called
     !! from GPU routines/functions
     function f_model_random_number(seed) result(rval)
-      
+
         $:GPU_ROUTINE(parallelism='[seq]')
-        
+
         integer, intent(inout) :: seed
         real(wp) :: rval
-        
+
         seed = ieor(seed, ishft(seed, 13))
         seed = ieor(seed, ishft(seed, -17))
         seed = ieor(seed, ishft(seed, 5))
-        
-        rval = abs(real(seed, wp)) / real(huge(seed), wp)
+
+        rval = abs(real(seed, wp))/real(huge(seed), wp)
     end function f_model_random_number
 
     !> This procedure, recursively, finds whether a point is inside an octree.
@@ -521,16 +521,16 @@ contains
 
         real(wp), dimension(1:spc, 1:3) :: ray_origins, ray_dirs
 
-        rand_seed = int(point(1) * 73856093_wp) + &
-                    int(point(2) * 19349663_wp) + &
-                    int(point(3) * 83492791_wp)
+        rand_seed = int(point(1)*73856093_wp) + &
+                    int(point(2)*19349663_wp) + &
+                    int(point(3)*83492791_wp)
         if (rand_seed == 0) rand_seed = 1
 
         ! generate our random collection or rays
         do i = 1, spc
             do k = 1, 3
                 ! random jitter in the origin helps us estimate volume fraction instead of only at the cell center
-                ray_origins(i, k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp) * spacing(k)
+                ray_origins(i, k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp)*spacing(k)
                 ! cast sample rays in all directions
                 ray_dirs(i, k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
             end do
@@ -561,12 +561,12 @@ contains
     end function f_model_is_inside
 
     impure function f_model_is_inside_flat(ntrs, trs_v, trs_n, pid, point, spacing, spc) result(fraction)
-        
+
         $:GPU_ROUTINE(parallelism='[seq]')
 
         integer, intent(in) :: ntrs
-        real(wp), dimension(:,:,:,:), intent(in) :: trs_v
-        real(wp), dimension(:,:,:), intent(in) :: trs_n
+        real(wp), dimension(:, :, :, :), intent(in) :: trs_v
+        real(wp), dimension(:, :, :), intent(in) :: trs_n
         integer, intent(in) :: pid
         real(wp), dimension(1:3), intent(in) :: point
         real(wp), dimension(1:3), intent(in) :: spacing
@@ -579,9 +579,9 @@ contains
         integer :: i, j, k, nInOrOut, nHits
         integer :: rand_seed
 
-        rand_seed = int(point(1) * 73856093_wp) + &
-                    int(point(2) * 19349663_wp) + &
-                    int(point(3) * 83492791_wp)
+        rand_seed = int(point(1)*73856093_wp) + &
+                    int(point(2)*19349663_wp) + &
+                    int(point(3)*83492791_wp)
         if (rand_seed == 0) rand_seed = 1
 
         ! generate our random collection of rays
@@ -589,11 +589,11 @@ contains
         do i = 1, spc
             ! Generate one ray at a time — no arrays needed
             do k = 1, 3
-                origin(k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp) * spacing(k)
+                origin(k) = point(k) + (f_model_random_number(rand_seed) - 0.5_wp)*spacing(k)
                 dir(k) = point(k) + f_model_random_number(rand_seed) - 0.5_wp
             end do
             dir_mag = sqrt(dir(1)*dir(1) + dir(2)*dir(2) + dir(3)*dir(3))
-            dir(:) = dir(:) / dir_mag
+            dir(:) = dir(:)/dir_mag
 
             ray%o = origin
             ray%d = dir
@@ -1327,14 +1327,14 @@ contains
     subroutine s_pack_model_for_gpu(ma)
         type(t_model_array), intent(inout) :: ma
         integer :: i
-    
+
         ma%ntrs = ma%model%ntrs
-        allocate(ma%trs_v(1:3, 1:3, 1:ma%ntrs))
-        allocate(ma%trs_n(1:3, 1:ma%ntrs))
-    
+        allocate (ma%trs_v(1:3, 1:3, 1:ma%ntrs))
+        allocate (ma%trs_n(1:3, 1:ma%ntrs))
+
         do i = 1, ma%ntrs
             ma%trs_v(:, :, i) = ma%model%trs(i)%v(:, :)
-            ma%trs_n(:, i)    = ma%model%trs(i)%n(:)
+            ma%trs_n(:, i) = ma%model%trs(i)%n(:)
         end do
     end subroutine
 
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index ef3333745b..1f72b6a37a 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -729,6 +729,8 @@ contains
 
         end if
 
+        ! print gp%levelset, gp%levelset_norm(1), gp%levelset_norm(2), gp%levelset_norm(3)
+
     end subroutine s_model_levelset
 
 end module m_compute_levelset
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index df7641daab..4617e98ff7 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -61,6 +61,8 @@ module m_ib_patches
     integer, allocatable :: gpu_ntrs(:)
     real(wp), allocatable, dimension(:,:,:,:) :: gpu_trs_v
     real(wp), allocatable, dimension(:,:,:)   :: gpu_trs_n
+    real(wp), allocatable, dimension(:,:,:) :: gpu_boundary_v
+    real(wp), allocatable, dimension(:,:,:) :: gpu_interpolated_boundary_v
 
 contains
 
@@ -1032,15 +1034,6 @@ contains
                     xy_local = f_convert_cyl_to_cart(xy_local)
                 end if
 
-                if (i == 13 .and. j == 16) then
-                  print *, "spc:", spc
-                  print *, "ntrs:", gpu_ntrs(patch_id)
-                  print *, "threshold:", threshold
-                  print *, "dx:", dx(i), dy(j)
-                  print *, "xy_local:", xy_local(1)
-
-                end if
-
                 eta = f_model_is_inside_flat(gpu_ntrs(patch_id), &
                                               gpu_trs_v, gpu_trs_n, &
                                               patch_id, &
@@ -1049,7 +1042,6 @@ contains
 
                 ! Reading STL boundary vertices and compute the levelset and levelset_norm
                 if (eta > threshold) then
-                  print *, eta, i, j
                     ib_markers%sf(i, j, 0) = patch_id
                 end if
 
@@ -1057,6 +1049,8 @@ contains
         end do
         $:END_GPU_PARALLEL_LOOP()
 
+        $:GPU_UPDATE(host='[ib_markers%sf]')
+
     end subroutine s_ib_model
 
     !> The STL patch is a 3D geometry that is imported from an STL file.
@@ -1075,7 +1069,6 @@ contains
         real(wp), dimension(1:3) :: center, xyz_local
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
 
-        model => models(patch_id)%model
         center = 0._wp
         center(1) = patch_ib(patch_id)%x_centroid
         center(2) = patch_ib(patch_id)%y_centroid
@@ -1114,6 +1107,8 @@ contains
             end do
         end do
 
+        $:GPU_UPDATE(host='[ib_markers%sf]')
+
     end subroutine s_ib_3d_model
 
     !> Subroutine that computes a rotation matrix for converting to the rotating frame of the boundary
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index fa06ef9092..3f98d0e7e4 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -269,7 +269,7 @@ contains
 
     subroutine s_igr_iterative_solve(q_cons_vf, bc_type, t_step)
 #ifdef _CRAYFTN
-        !DIR$ OPTIMIZE (-haggress)
+    !DIR$ OPTIMIZE (-haggress)
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
@@ -370,7 +370,7 @@ contains
 
     subroutine s_igr_sigma_x(q_cons_vf, rhs_vf)
 #ifdef _CRAYFTN
-        !DIR$ OPTIMIZE (-haggress)
+    !DIR$ OPTIMIZE (-haggress)
 #endif
         type(scalar_field), &
             dimension(sys_size), &
@@ -455,7 +455,7 @@ contains
 
     subroutine s_igr_riemann_solver(q_cons_vf, rhs_vf, idir)
 #ifdef _CRAYFTN
-        !DIR$ OPTIMIZE (-haggress)
+    !DIR$ OPTIMIZE (-haggress)
 #endif
         type(scalar_field), &
             dimension(sys_size), &
@@ -495,9 +495,9 @@ contains
 
                                 #:if MFC_CASE_OPTIMIZATION
                                     #:if igr_order == 5
-                                        !DIR$ unroll 6
+                                    !DIR$ unroll 6
                                     #:elif igr_order == 3
-                                        !DIR$ unroll 4
+                                    !DIR$ unroll 4
                                     #:endif
                                 #:endif
                                 $:GPU_LOOP(parallelism='[seq]')
@@ -912,9 +912,9 @@ contains
 
                                 #:if MFC_CASE_OPTIMIZATION
                                     #:if igr_order == 5
-                                        !DIR$ unroll 6
+                                    !DIR$ unroll 6
                                     #:elif igr_order == 3
-                                        !DIR$ unroll 4
+                                    !DIR$ unroll 4
                                     #:endif
                                 #:endif
                                 $:GPU_LOOP(parallelism='[seq]')
@@ -1429,9 +1429,9 @@ contains
 
                                     #:if MFC_CASE_OPTIMIZATION
                                         #:if igr_order == 5
-                                            !DIR$ unroll 6
+                                        !DIR$ unroll 6
                                         #:elif igr_order == 3
-                                            !DIR$ unroll 4
+                                        !DIR$ unroll 4
                                         #:endif
                                     #:endif
                                     $:GPU_LOOP(parallelism='[seq]')
@@ -1826,9 +1826,9 @@ contains
 
                                     #:if MFC_CASE_OPTIMIZATION
                                         #:if igr_order == 5
-                                            !DIR$ unroll 6
+                                        !DIR$ unroll 6
                                         #:elif igr_order == 3
-                                            !DIR$ unroll 4
+                                        !DIR$ unroll 4
                                         #:endif
                                     #:endif
                                     $:GPU_LOOP(parallelism='[seq]')
@@ -2311,9 +2311,9 @@ contains
 
                                 #:if MFC_CASE_OPTIMIZATION
                                     #:if igr_order == 5
-                                        !DIR$ unroll 6
+                                    !DIR$ unroll 6
                                     #:elif igr_order == 3
-                                        !DIR$ unroll 4
+                                    !DIR$ unroll 4
                                     #:endif
                                 #:endif
                                 $:GPU_LOOP(parallelism='[seq]')

From a1769d06c5958342878eeefbb85235ff5437e056 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Thu, 19 Feb 2026 04:18:18 -0500
Subject: [PATCH 13/22] Moved mdoel-specific code to the model file for
 cleanliness

---
 src/common/m_model.fpp          | 182 +++++++++++++++++++++++++++++-
 src/simulation/m_ib_patches.fpp | 190 ++------------------------------
 src/simulation/m_ibm.fpp        |   2 +
 src/simulation/m_igr.fpp        |  26 ++---
 4 files changed, 205 insertions(+), 195 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 2c62a1444f..ff8e0d57f9 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -17,7 +17,7 @@ module m_model
 
     private
 
-    public :: f_model_read, s_model_write, s_model_free, f_model_is_inside
+    public :: s_instantiate_STL_models, f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, gpu_trs_v, gpu_trs_n
 
     ! Subroutines for STL immersed boundaries
     public :: f_check_boundary, f_register_edge, f_check_interpolation_2D, &
@@ -25,8 +25,188 @@ module m_model
               f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area, s_pack_model_for_gpu, &
               f_model_is_inside_flat
 
+    !! array of STL models that can be allocated and then used in IB marker and levelset compute
+    type(t_model_array), allocatable, target :: models(:)
+    !! GPU-friendly flat arrays for STL model data
+    integer, allocatable :: gpu_ntrs(:)
+    real(wp), allocatable, dimension(:, :, :, :) :: gpu_trs_v
+    real(wp), allocatable, dimension(:, :, :) :: gpu_trs_n
+    real(wp), allocatable, dimension(:, :, :) :: gpu_boundary_v
+    real(wp), allocatable, dimension(:, :, :) :: gpu_interpolated_boundary_v
+
 contains
 
+    subroutine s_instantiate_STL_models()
+
+        ! Variables for IBM+STL
+        real(wp) :: normals(1:3) !< Boundary normal buffer
+        integer :: boundary_vertex_count, boundary_edge_count, total_vertices !< Boundary vertex
+        real(wp), allocatable, dimension(:, :, :) :: boundary_v !< Boundary vertex buffer
+        real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v !< Interpolated vertex buffer
+        real(wp) :: dx_local, dy_local, dz_local !< Levelset distance buffer
+        logical :: interpolate !< Logical variable to determine whether or not the model should be interpolated
+
+        integer :: i, j, k !< Generic loop iterators
+        integer :: patch_id
+
+        type(t_bbox) :: bbox, bbox_old
+        type(t_model) :: model
+        type(ic_model_parameters) :: params
+
+        real(wp) :: eta
+        real(wp), dimension(1:3) :: point, model_center
+        real(wp) :: grid_mm(1:3, 1:2)
+
+        real(wp), dimension(1:4, 1:4) :: transform, transform_n
+
+#ifdef MFC_PRE_PROCESS
+        dx_local = dx; dy_local = dy
+        if (p /= 0) dz_local = dz
+#else
+        dx_local = minval(dx); dy_local = minval(dy)
+        if (p /= 0) dz_local = minval(dz)
+#endif
+
+        do patch_id = 1, num_ibs
+            if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then
+                allocate (models(patch_id)%model)
+                print *, " * Reading model: "//trim(patch_ib(patch_id)%model_filepath)
+
+                model = f_model_read(patch_ib(patch_id)%model_filepath)
+                params%scale(:) = patch_ib(patch_id)%model_scale(:)
+                params%translate(:) = patch_ib(patch_id)%model_translate(:)
+                params%rotate(:) = patch_ib(patch_id)%model_rotate(:)
+                params%spc = patch_ib(patch_id)%model_spc
+                params%threshold = patch_ib(patch_id)%model_threshold
+
+                if (f_approx_equal(dot_product(params%scale, params%scale), 0._wp)) then
+                    params%scale(:) = 1._wp
+                end if
+
+                if (proc_rank == 0) then
+                    print *, " * Transforming model."
+                end if
+
+                ! Get the model center before transforming the model
+                bbox_old = f_create_bbox(model)
+                model_center(1:3) = (bbox_old%min(1:3) + bbox_old%max(1:3))/2._wp
+
+                ! Compute the transform matrices for vertices and normals
+                transform = f_create_transform_matrix(params, model_center)
+                transform_n = f_create_transform_matrix(params)
+
+                call s_transform_model(model, transform, transform_n)
+
+                ! Recreate the bounding box after transformation
+                bbox = f_create_bbox(model)
+
+                ! Show the number of vertices in the original STL model
+                if (proc_rank == 0) then
+                    print *, ' * Number of input model vertices:', 3*model%ntrs
+                end if
+
+                call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
+
+                ! Check if the model needs interpolation
+                if (p > 0) then
+                    call f_check_interpolation_3D(model, (/dx_local, dy_local, dz_local/), interpolate)
+                else
+                    call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx_local, dy_local, 0._wp/), interpolate)
+                end if
+
+                ! Show the number of edges and boundary edges in 2D STL models
+                if (proc_rank == 0 .and. p == 0) then
+                    print *, ' * Number of 2D model boundary edges:', boundary_edge_count
+                end if
+
+                ! Interpolate the STL model along the edges (2D) and on triangle facets (3D)
+                if (interpolate) then
+                    if (proc_rank == 0) then
+                        print *, ' * Interpolating STL vertices.'
+                    end if
+
+                    if (p > 0) then
+                        call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
+                    else
+                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
+                    end if
+
+                    if (proc_rank == 0) then
+                        print *, ' * Total number of interpolated boundary vertices:', total_vertices
+                    end if
+                end if
+
+                if (proc_rank == 0) then
+                    write (*, "(A, 3(2X, F20.10))") "    > Model:  Min:", bbox%min(1:3)
+                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (bbox%min(1:3) + bbox%max(1:3))/2._wp
+                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", bbox%max(1:3)
+
+                    grid_mm(1, :) = (/minval(x_cc(0:m)) - 0.e5_wp*dx_local, maxval(x_cc(0:m)) + 0.e5_wp*dx_local/)
+                    grid_mm(2, :) = (/minval(y_cc(0:n)) - 0.e5_wp*dy_local, maxval(y_cc(0:n)) + 0.e5_wp*dy_local/)
+
+                    if (p > 0) then
+                        grid_mm(3, :) = (/minval(z_cc(0:p)) - 0.e5_wp*dz_local, maxval(z_cc(0:p)) + 0.e5_wp*dz_local/)
+                    else
+                        grid_mm(3, :) = (/0._wp, 0._wp/)
+                    end if
+
+                    write (*, "(A, 3(2X, F20.10))") "    > Domain: Min:", grid_mm(:, 1)
+                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (grid_mm(:, 1) + grid_mm(:, 2))/2._wp
+                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", grid_mm(:, 2)
+                end if
+
+                models(patch_id)%model = model
+                models(patch_id)%boundary_v = boundary_v
+                models(patch_id)%boundary_edge_count = boundary_edge_count
+                if (interpolate) then
+                    models(patch_id)%interpolate = 1
+                else
+                    models(patch_id)%interpolate = 0
+                end if
+                if (interpolate) then
+                    models(patch_id)%interpolated_boundary_v = interpolated_boundary_v
+                    models(patch_id)%total_vertices = total_vertices
+                end if
+
+            end if
+        end do
+
+        ! Pack and upload flat arrays for GPU (AFTER the loop)
+        block
+            integer :: pid, max_ntrs
+
+            max_ntrs = 0
+            do pid = 1, num_ibs
+                if (allocated(models(pid)%model)) then
+                    call s_pack_model_for_gpu(models(pid))
+                    max_ntrs = max(max_ntrs, models(pid)%ntrs)
+                end if
+            end do
+
+            if (max_ntrs > 0) then
+                allocate (gpu_ntrs(1:num_ibs))
+                allocate (gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
+                allocate (gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
+
+                gpu_ntrs = 0
+                gpu_trs_v = 0._wp
+                gpu_trs_n = 0._wp
+
+                do pid = 1, num_ibs
+                    if (allocated(models(pid)%model)) then
+                        gpu_ntrs(pid) = models(pid)%ntrs
+                        gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
+                        gpu_trs_n(:, 1:models(pid)%ntrs, pid) = models(pid)%trs_n
+                    end if
+                end do
+
+                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n]')
+
+            end if
+        end block
+
+    end subroutine s_instantiate_STL_models
+
     !> This procedure reads a binary STL file.
     !! @param filepath Path to the STL file.
     !! @param model The binary of the STL file.
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 4617e98ff7..719a9e56cc 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -25,7 +25,7 @@ module m_ib_patches
 
     implicit none
 
-    private; public :: s_apply_ib_patches, s_update_ib_rotation_matrix, s_instantiate_STL_models, models, f_convert_cyl_to_cart, gpu_ntrs, gpu_trs_v, gpu_trs_n
+    private; public :: s_apply_ib_patches, s_update_ib_rotation_matrix, f_convert_cyl_to_cart
 
     real(wp) :: x_centroid, y_centroid, z_centroid
     real(wp) :: length_x, length_y, length_z
@@ -55,15 +55,6 @@ module m_ib_patches
 
     character(len=5) :: istr ! string to store int to string result for error checking
 
-    !! array of STL models that can be allocated and then used in IB marker and levelset compute
-    type(t_model_array), allocatable, target :: models(:)
-    !! GPU-friendly flat arrays for STL model data
-    integer, allocatable :: gpu_ntrs(:)
-    real(wp), allocatable, dimension(:,:,:,:) :: gpu_trs_v
-    real(wp), allocatable, dimension(:,:,:)   :: gpu_trs_n
-    real(wp), allocatable, dimension(:,:,:) :: gpu_boundary_v
-    real(wp), allocatable, dimension(:,:,:) :: gpu_interpolated_boundary_v
-
 contains
 
     impure subroutine s_apply_ib_patches(ib_markers)
@@ -120,169 +111,6 @@ contains
 
     end subroutine s_apply_ib_patches
 
-    subroutine s_instantiate_STL_models()
-
-        ! Variables for IBM+STL
-        real(wp) :: normals(1:3) !< Boundary normal buffer
-        integer :: boundary_vertex_count, boundary_edge_count, total_vertices !< Boundary vertex
-        real(wp), allocatable, dimension(:, :, :) :: boundary_v !< Boundary vertex buffer
-        real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v !< Interpolated vertex buffer
-        real(wp) :: distance !< Levelset distance buffer
-        logical :: interpolate !< Logical variable to determine whether or not the model should be interpolated
-
-        integer :: i, j, k !< Generic loop iterators
-        integer :: patch_id
-
-        type(t_bbox) :: bbox, bbox_old
-        type(t_model) :: model
-        type(ic_model_parameters) :: params
-
-        real(wp) :: eta
-        real(wp), dimension(1:3) :: point, model_center
-        real(wp) :: grid_mm(1:3, 1:2)
-
-        real(wp), dimension(1:4, 1:4) :: transform, transform_n
-
-        do patch_id = 1, num_ibs
-            if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then
-                allocate(models(patch_id)%model)
-                print *, " * Reading model: "//trim(patch_ib(patch_id)%model_filepath)
-
-                model = f_model_read(patch_ib(patch_id)%model_filepath)
-                params%scale(:) = patch_ib(patch_id)%model_scale(:)
-                params%translate(:) = patch_ib(patch_id)%model_translate(:)
-                params%rotate(:) = patch_ib(patch_id)%model_rotate(:)
-                params%spc = patch_ib(patch_id)%model_spc
-                params%threshold = patch_ib(patch_id)%model_threshold
-
-                if (f_approx_equal(dot_product(params%scale, params%scale), 0._wp)) then
-                    params%scale(:) = 1._wp
-                end if
-
-                if (proc_rank == 0) then
-                    print *, " * Transforming model."
-                end if
-
-                ! Get the model center before transforming the model
-                bbox_old = f_create_bbox(model)
-                model_center(1:3) = (bbox_old%min(1:3) + bbox_old%max(1:3))/2._wp
-
-                ! Compute the transform matrices for vertices and normals
-                transform = f_create_transform_matrix(params, model_center)
-                transform_n = f_create_transform_matrix(params)
-
-                call s_transform_model(model, transform, transform_n)
-
-                ! Recreate the bounding box after transformation
-                bbox = f_create_bbox(model)
-
-                ! Show the number of vertices in the original STL model
-                if (proc_rank == 0) then
-                    print *, ' * Number of input model vertices:', 3*model%ntrs
-                end if
-
-                call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
-
-                ! Check if the model needs interpolation
-                if (p > 0) then
-                    call f_check_interpolation_3D(model, (/dx, dy, dz/), interpolate)
-                else
-                    call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/minval(dx), minval(dy), 0._wp/), interpolate)
-                end if
-
-                ! Show the number of edges and boundary edges in 2D STL models
-                if (proc_rank == 0 .and. p == 0) then
-                    print *, ' * Number of 2D model boundary edges:', boundary_edge_count
-                end if
-
-                ! Interpolate the STL model along the edges (2D) and on triangle facets (3D)
-                if (interpolate) then
-                    if (proc_rank == 0) then
-                        print *, ' * Interpolating STL vertices.'
-                    end if
-
-                    if (p > 0) then
-                        call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-                    else
-                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-                    end if
-
-                    if (proc_rank == 0) then
-                        print *, ' * Total number of interpolated boundary vertices:', total_vertices
-                    end if
-                end if
-
-                if (proc_rank == 0) then
-                    write (*, "(A, 3(2X, F20.10))") "    > Model:  Min:", bbox%min(1:3)
-                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (bbox%min(1:3) + bbox%max(1:3))/2._wp
-                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", bbox%max(1:3)
-
-                    grid_mm(1, :) = (/minval(x_cc(0:m)) - 0.e5_wp*dx(0), maxval(x_cc(0:m)) + 0.e5_wp*dx(m)/)
-                    grid_mm(2, :) = (/minval(y_cc(0:n)) - 0.e5_wp*dy(0), maxval(y_cc(0:n)) + 0.e5_wp*dy(n)/)
-
-                    if (p > 0) then
-                        grid_mm(3, :) = (/minval(z_cc(0:p)) - 0.e5_wp*dz(0), maxval(z_cc(0:p)) + 0.e5_wp*dz(p)/)
-                    else
-                        grid_mm(3, :) = (/0._wp, 0._wp/)
-                    end if
-
-                    write (*, "(A, 3(2X, F20.10))") "    > Domain: Min:", grid_mm(:, 1)
-                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (grid_mm(:, 1) + grid_mm(:, 2))/2._wp
-                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", grid_mm(:, 2)
-                end if
-
-                models(patch_id)%model = model
-                models(patch_id)%boundary_v = boundary_v
-                models(patch_id)%boundary_edge_count = boundary_edge_count
-                if (interpolate) then
-                    models(patch_id)%interpolate = 1
-                else
-                    models(patch_id)%interpolate = 0
-                end if
-                if (interpolate) then
-                    models(patch_id)%interpolated_boundary_v = interpolated_boundary_v
-                    models(patch_id)%total_vertices = total_vertices
-                end if
-
-            end if
-        end do
-
-        ! Pack and upload flat arrays for GPU (AFTER the loop)
-        block
-            integer :: pid, max_ntrs
-
-            max_ntrs = 0
-            do pid = 1, num_ibs
-                if (allocated(models(pid)%model)) then
-                    call s_pack_model_for_gpu(models(pid))
-                    max_ntrs = max(max_ntrs, models(pid)%ntrs)
-                end if
-            end do
-
-            if (max_ntrs > 0) then
-                allocate(gpu_ntrs(1:num_ibs))
-                allocate(gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
-                allocate(gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
-
-                gpu_ntrs = 0
-                gpu_trs_v = 0._wp
-                gpu_trs_n = 0._wp
-
-                do pid = 1, num_ibs
-                    if (allocated(models(pid)%model)) then
-                        gpu_ntrs(pid) = models(pid)%ntrs
-                        gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
-                        gpu_trs_n(:, 1:models(pid)%ntrs, pid)    = models(pid)%trs_n
-                    end if
-                end do
-
-                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n]')
-
-            end if
-        end block
-
-    end subroutine s_instantiate_STL_models
-
     !> The circular patch is a 2D geometry that may be used, for
         !!              example, in creating a bubble or a droplet. The geometry
         !!              of the patch is well-defined when its centroid and radius
@@ -1035,10 +863,10 @@ contains
                 end if
 
                 eta = f_model_is_inside_flat(gpu_ntrs(patch_id), &
-                                              gpu_trs_v, gpu_trs_n, &
-                                              patch_id, &
-                                              xy_local, (/dx(i), dy(j), 0._wp/), &
-                                              spc)
+                                             gpu_trs_v, gpu_trs_n, &
+                                             patch_id, &
+                                             xy_local, (/dx(i), dy(j), 0._wp/), &
+                                             spc)
 
                 ! Reading STL boundary vertices and compute the levelset and levelset_norm
                 if (eta > threshold) then
@@ -1093,10 +921,10 @@ contains
                     end if
 
                     eta = f_model_is_inside_flat(gpu_ntrs(patch_id), &
-                                              gpu_trs_v, gpu_trs_n, &
-                                              patch_id, &
-                                              xyz_local, (/dx(i), dy(j), dz(k)/), &
-                                              spc)
+                                                 gpu_trs_v, gpu_trs_n, &
+                                                 patch_id, &
+                                                 xyz_local, (/dx(i), dy(j), dz(k)/), &
+                                                 spc)
 
                     ! Reading STL boundary vertices and compute the levelset and levelset_norm
                     if (eta > patch_ib(patch_id)%model_threshold) then
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 92400123bf..7855a60abf 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -28,6 +28,8 @@ module m_ibm
 
     use m_viscous
 
+    use m_model
+
     implicit none
 
     private :: s_compute_image_points, &
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 3f98d0e7e4..fa06ef9092 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -269,7 +269,7 @@ contains
 
     subroutine s_igr_iterative_solve(q_cons_vf, bc_type, t_step)
 #ifdef _CRAYFTN
-    !DIR$ OPTIMIZE (-haggress)
+        !DIR$ OPTIMIZE (-haggress)
 #endif
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
@@ -370,7 +370,7 @@ contains
 
     subroutine s_igr_sigma_x(q_cons_vf, rhs_vf)
 #ifdef _CRAYFTN
-    !DIR$ OPTIMIZE (-haggress)
+        !DIR$ OPTIMIZE (-haggress)
 #endif
         type(scalar_field), &
             dimension(sys_size), &
@@ -455,7 +455,7 @@ contains
 
     subroutine s_igr_riemann_solver(q_cons_vf, rhs_vf, idir)
 #ifdef _CRAYFTN
-    !DIR$ OPTIMIZE (-haggress)
+        !DIR$ OPTIMIZE (-haggress)
 #endif
         type(scalar_field), &
             dimension(sys_size), &
@@ -495,9 +495,9 @@ contains
 
                                 #:if MFC_CASE_OPTIMIZATION
                                     #:if igr_order == 5
-                                    !DIR$ unroll 6
+                                        !DIR$ unroll 6
                                     #:elif igr_order == 3
-                                    !DIR$ unroll 4
+                                        !DIR$ unroll 4
                                     #:endif
                                 #:endif
                                 $:GPU_LOOP(parallelism='[seq]')
@@ -912,9 +912,9 @@ contains
 
                                 #:if MFC_CASE_OPTIMIZATION
                                     #:if igr_order == 5
-                                    !DIR$ unroll 6
+                                        !DIR$ unroll 6
                                     #:elif igr_order == 3
-                                    !DIR$ unroll 4
+                                        !DIR$ unroll 4
                                     #:endif
                                 #:endif
                                 $:GPU_LOOP(parallelism='[seq]')
@@ -1429,9 +1429,9 @@ contains
 
                                     #:if MFC_CASE_OPTIMIZATION
                                         #:if igr_order == 5
-                                        !DIR$ unroll 6
+                                            !DIR$ unroll 6
                                         #:elif igr_order == 3
-                                        !DIR$ unroll 4
+                                            !DIR$ unroll 4
                                         #:endif
                                     #:endif
                                     $:GPU_LOOP(parallelism='[seq]')
@@ -1826,9 +1826,9 @@ contains
 
                                     #:if MFC_CASE_OPTIMIZATION
                                         #:if igr_order == 5
-                                        !DIR$ unroll 6
+                                            !DIR$ unroll 6
                                         #:elif igr_order == 3
-                                        !DIR$ unroll 4
+                                            !DIR$ unroll 4
                                         #:endif
                                     #:endif
                                     $:GPU_LOOP(parallelism='[seq]')
@@ -2311,9 +2311,9 @@ contains
 
                                 #:if MFC_CASE_OPTIMIZATION
                                     #:if igr_order == 5
-                                    !DIR$ unroll 6
+                                        !DIR$ unroll 6
                                     #:elif igr_order == 3
-                                    !DIR$ unroll 4
+                                        !DIR$ unroll 4
                                     #:endif
                                 #:endif
                                 $:GPU_LOOP(parallelism='[seq]')

From 5e586554ed2ea59db34f70d9bc3b0481a0121230 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Thu, 19 Feb 2026 05:29:18 -0500
Subject: [PATCH 14/22] STLs appear to be working on the GPU with NVHPC!

---
 src/common/m_derived_types.fpp        |   4 +-
 src/common/m_model.fpp                | 123 ++++++++++++++++++++++++--
 src/simulation/m_compute_levelset.fpp |  39 ++++----
 src/simulation/m_ib_patches.fpp       |   4 -
 4 files changed, 134 insertions(+), 36 deletions(-)

diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
index cbdb3e2aa4..21ccbf3bbd 100644
--- a/src/common/m_derived_types.fpp
+++ b/src/common/m_derived_types.fpp
@@ -184,7 +184,7 @@ module m_derived_types
     end type t_model
 
     type :: t_model_array
-        ! --- Original CPU-side fields (unchanged) ---
+        ! Original CPU-side fields (unchanged)
         type(t_model), allocatable :: model
         real(wp), allocatable, dimension(:, :, :) :: boundary_v
         real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v
@@ -192,7 +192,7 @@ module m_derived_types
         integer :: total_vertices
         integer :: interpolate
 
-        ! --- GPU-friendly flattened arrays ---
+        ! GPU-friendly flattened arrays
         integer :: ntrs  ! copy of model%ntrs
         real(wp), allocatable, dimension(:, :, :) :: trs_v  ! (3, 3, ntrs) - triangle vertices
         real(wp), allocatable, dimension(:, :) :: trs_n  ! (3, ntrs)    - triangle normals
diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index ff8e0d57f9..3e8812485d 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -17,13 +17,15 @@ module m_model
 
     private
 
-    public :: s_instantiate_STL_models, f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, gpu_trs_v, gpu_trs_n
+    public :: s_instantiate_STL_models, f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, &
+              gpu_trs_v, gpu_trs_n, gpu_boundary_v, gpu_interpolated_boundary_v, gpu_interpolate, gpu_boundary_edge_count, &
+              gpu_total_vertices
 
     ! Subroutines for STL immersed boundaries
     public :: f_check_boundary, f_register_edge, f_check_interpolation_2D, &
               f_check_interpolation_3D, f_interpolate_2D, f_interpolate_3D, &
               f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area, s_pack_model_for_gpu, &
-              f_model_is_inside_flat
+              f_model_is_inside_flat, f_distance_normals_3d_flat
 
     !! array of STL models that can be allocated and then used in IB marker and levelset compute
     type(t_model_array), allocatable, target :: models(:)
@@ -31,8 +33,11 @@ module m_model
     integer, allocatable :: gpu_ntrs(:)
     real(wp), allocatable, dimension(:, :, :, :) :: gpu_trs_v
     real(wp), allocatable, dimension(:, :, :) :: gpu_trs_n
-    real(wp), allocatable, dimension(:, :, :) :: gpu_boundary_v
+    real(wp), allocatable, dimension(:, :, :, :) :: gpu_boundary_v
     real(wp), allocatable, dimension(:, :, :) :: gpu_interpolated_boundary_v
+    integer, allocatable :: gpu_interpolate(:)
+    integer, allocatable :: gpu_boundary_edge_count(:)
+    integer, allocatable :: gpu_total_vertices(:)
 
 contains
 
@@ -174,34 +179,80 @@ contains
         ! Pack and upload flat arrays for GPU (AFTER the loop)
         block
             integer :: pid, max_ntrs
+            integer :: max_bv1, max_bv2, max_bv3, max_iv1, max_iv2
 
             max_ntrs = 0
+            max_bv1 = 0; max_bv2 = 0; max_bv3 = 0
+            max_iv1 = 0; max_iv2 = 0
+
             do pid = 1, num_ibs
                 if (allocated(models(pid)%model)) then
                     call s_pack_model_for_gpu(models(pid))
                     max_ntrs = max(max_ntrs, models(pid)%ntrs)
                 end if
+                if (allocated(models(pid)%boundary_v)) then
+                    max_bv1 = max(max_bv1, size(models(pid)%boundary_v, 1))
+                    max_bv2 = max(max_bv2, size(models(pid)%boundary_v, 2))
+                    max_bv3 = max(max_bv3, size(models(pid)%boundary_v, 3))
+                end if
+                if (allocated(models(pid)%interpolated_boundary_v)) then
+                    max_iv1 = max(max_iv1, size(models(pid)%interpolated_boundary_v, 1))
+                    max_iv2 = max(max_iv2, size(models(pid)%interpolated_boundary_v, 2))
+                end if
             end do
 
             if (max_ntrs > 0) then
                 allocate (gpu_ntrs(1:num_ibs))
                 allocate (gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
                 allocate (gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
+                allocate (gpu_interpolate(1:num_ibs))
+                allocate (gpu_boundary_edge_count(1:num_ibs))
+                allocate (gpu_total_vertices(1:num_ibs))
 
                 gpu_ntrs = 0
                 gpu_trs_v = 0._wp
                 gpu_trs_n = 0._wp
+                gpu_interpolate = 0
+                gpu_boundary_edge_count = 0
+                gpu_total_vertices = 0
+
+                if (max_bv1 > 0) then
+                    allocate (gpu_boundary_v(1:max_bv1, 1:max_bv2, 1:max_bv3, 1:num_ibs))
+                    gpu_boundary_v = 0._wp
+                end if
+
+                if (max_iv1 > 0) then
+                    allocate (gpu_interpolated_boundary_v(1:max_iv1, 1:max_iv2, 1:num_ibs))
+                    gpu_interpolated_boundary_v = 0._wp
+                end if
 
                 do pid = 1, num_ibs
                     if (allocated(models(pid)%model)) then
                         gpu_ntrs(pid) = models(pid)%ntrs
                         gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
                         gpu_trs_n(:, 1:models(pid)%ntrs, pid) = models(pid)%trs_n
+                        gpu_interpolate(pid) = models(pid)%interpolate
+                        gpu_boundary_edge_count(pid) = models(pid)%boundary_edge_count
+                        gpu_total_vertices(pid) = models(pid)%total_vertices
+                    end if
+                    if (allocated(models(pid)%boundary_v)) then
+                        gpu_boundary_v(1:size(models(pid)%boundary_v, 1), &
+                                       1:size(models(pid)%boundary_v, 2), &
+                                       1:size(models(pid)%boundary_v, 3), pid) = models(pid)%boundary_v
+                    end if
+                    if (allocated(models(pid)%interpolated_boundary_v)) then
+                        gpu_interpolated_boundary_v(1:size(models(pid)%interpolated_boundary_v, 1), &
+                                                    1:size(models(pid)%interpolated_boundary_v, 2), pid) = models(pid)%interpolated_boundary_v
                     end if
                 end do
 
-                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n]')
-
+                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n, gpu_interpolate, gpu_boundary_edge_count, gpu_total_vertices]')
+                if (allocated(gpu_boundary_v)) then
+                    $:GPU_ENTER_DATA(copyin='[gpu_boundary_v]')
+                end if
+                if (allocated(gpu_interpolated_boundary_v)) then
+                    $:GPU_ENTER_DATA(copyin='[gpu_interpolated_boundary_v]')
+                end if
             end if
         end block
 
@@ -731,7 +782,7 @@ contains
                 end if
             end do
 
-            ! if thje ray hits an odd number of triangles on its way out, then
+            ! if the ray hits an odd number of triangles on its way out, then
             ! it must be on the inside of the model
             nInOrOut = nInOrOut + mod(nHits, 2)
         end do
@@ -1371,6 +1422,66 @@ contains
 
     end subroutine f_distance_normals_3D
 
+    subroutine f_distance_normals_3D_flat(ntrs, trs_v, trs_n, pid, point, normals, distance)
+
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        integer, intent(in) :: ntrs
+        real(wp), dimension(:, :, :, :), intent(in) :: trs_v
+        real(wp), dimension(:, :, :), intent(in) :: trs_n
+        integer, intent(in) :: pid
+        real(wp), dimension(1:3), intent(in) :: point
+        real(wp), dimension(1:3), intent(out) :: normals
+        real(wp), intent(out) :: distance
+
+        real(wp), dimension(1:3, 1:3) :: tri
+        real(wp) :: dist_min, dist_t_min
+        real(wp) :: dist_min_normal, dist_buffer_normal
+        real(wp), dimension(1:3) :: midp
+        real(wp), dimension(1:3) :: dist_buffer
+        integer :: i, j, tri_idx
+
+        dist_min = 1.e12_wp
+        dist_min_normal = 1.e12_wp
+        distance = 0._wp
+
+        tri_idx = 0
+        do i = 1, ntrs
+            do j = 1, 3
+                tri(j, 1) = trs_v(j, 1, i, pid)
+                tri(j, 2) = trs_v(j, 2, i, pid)
+                tri(j, 3) = trs_v(j, 3, i, pid)
+                dist_buffer(j) = sqrt((point(1) - tri(j, 1))**2 + &
+                                      (point(2) - tri(j, 2))**2 + &
+                                      (point(3) - tri(j, 3))**2)
+            end do
+
+            do j = 1, 3
+                midp(j) = (tri(1, j) + tri(2, j) + tri(3, j))/3
+            end do
+
+            dist_t_min = minval(dist_buffer(1:3))
+            dist_buffer_normal = sqrt((point(1) - midp(1))**2 + &
+                                      (point(2) - midp(2))**2 + &
+                                      (point(3) - midp(3))**2)
+
+            if (dist_t_min < dist_min) then
+                dist_min = dist_t_min
+            end if
+
+            if (dist_buffer_normal < dist_min_normal) then
+                dist_min_normal = dist_buffer_normal
+                tri_idx = i
+            end if
+        end do
+
+        normals(1) = trs_n(1, tri_idx, pid)
+        normals(2) = trs_n(2, tri_idx, pid)
+        normals(3) = trs_n(3, tri_idx, pid)
+        distance = dist_min
+
+    end subroutine f_distance_normals_3D_flat
+
     !> This procedure determines the levelset distance of 2D models without interpolation.
     !! @param boundary_v                   Group of all the boundary vertices of the 2D model without interpolation
     !! @param boundary_vertex_count        Output the total number of boundary vertices
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 1f72b6a37a..263c1bcacc 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -50,6 +50,8 @@ contains
                     call s_cylinder_levelset(gps(i))
                 elseif (patch_geometry == 11) then
                     call s_3d_airfoil_levelset(gps(i))
+                elseif (patch_geometry == 12) then
+                    call s_model_levelset(gps(i))
                 end if
             end do
             $:END_GPU_PARALLEL_LOOP()
@@ -70,6 +72,8 @@ contains
                     call s_rectangle_levelset(gps(i))
                 elseif (patch_geometry == 4) then
                     call s_airfoil_levelset(gps(i))
+                elseif (patch_geometry == 5) then
+                    call s_model_levelset(gps(i))
                 elseif (patch_geometry == 6) then
                     call s_ellipse_levelset(gps(i))
                 end if
@@ -79,17 +83,6 @@ contains
 
         end if
 
-        ! STL models computed on the CPU for now
-        do i = 1, num_gps
-            patch_id = gps(i)%ib_patch_id
-            patch_geometry = patch_ib(patch_id)%geometry
-
-            if (patch_geometry == 5 .or. patch_geometry == 12) then
-                call s_model_levelset(gps(i))
-                $:GPU_UPDATE(device='[gps(i)]')
-            end if
-        end do
-
     end subroutine s_apply_levelset
 
     subroutine s_circle_levelset(gp)
@@ -651,7 +644,7 @@ contains
         type(ghost_point), intent(inout) :: gp
 
         integer :: i, j, k, patch_id, boundary_edge_count, total_vertices
-        logical :: interpolate
+        integer :: interpolate
         real(wp), dimension(1:3) :: center, xyz_local
         real(wp) :: normals(1:3) !< Boundary normal buffer
         real(wp) :: distance
@@ -663,9 +656,9 @@ contains
         k = gp%loc(3)
 
         ! load in model values
-        interpolate = models(patch_id)%interpolate
-        boundary_edge_count = models(patch_id)%boundary_edge_count
-        total_vertices = models(patch_id)%total_vertices
+        interpolate = gpu_interpolate(patch_id)
+        boundary_edge_count = gpu_boundary_edge_count(patch_id)
+        total_vertices = gpu_total_vertices(patch_id)
 
         center = 0._wp
         if (.not. f_is_default(patch_ib(patch_id)%x_centroid)) center(1) = patch_ib(patch_id)%x_centroid
@@ -673,6 +666,7 @@ contains
         if (p > 0) then
             if (.not. f_is_default(patch_ib(patch_id)%z_centroid)) center(3) = patch_ib(patch_id)%z_centroid
         end if
+
         inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :)
         rotation(:, :) = patch_ib(patch_id)%rotation_matrix(:, :)
 
@@ -691,11 +685,11 @@ contains
         if (p > 0) then
 
             ! Get the boundary normals and shortest distance between the cell center and the model boundary
-            call f_distance_normals_3D(models(patch_id)%model, xyz_local, normals, distance)
+            call f_distance_normals_3D_flat(gpu_ntrs(patch_id), gpu_trs_v, gpu_trs_n, patch_id, xyz_local, normals, distance)
 
             ! Get the shortest distance between the cell center and the interpolated model boundary
             if (interpolate) then
-                gp%levelset = f_interpolated_distance(models(patch_id)%interpolated_boundary_v, total_vertices, xyz_local)
+                gp%levelset = f_interpolated_distance(gpu_interpolated_boundary_v(:, :, patch_id), total_vertices, xyz_local)
             else
                 gp%levelset = distance
             end if
@@ -707,19 +701,18 @@ contains
             gp%levelset_norm = matmul(rotation, normals(1:3))
         else
             ! 2D models
-            if (interpolate) then
+            if (interpolate == 1) then
                 ! Get the shortest distance between the cell center and the model boundary
-                gp%levelset = f_interpolated_distance(models(patch_id)%interpolated_boundary_v, total_vertices, xyz_local)
+                gp%levelset = f_interpolated_distance(gpu_interpolated_boundary_v(:, :, patch_id), total_vertices, xyz_local)  ! Change 7
             else
-                ! Get the shortest distance between the cell center and the interpolated model boundary
-                gp%levelset = f_distance(models(patch_id)%boundary_v, boundary_edge_count, xyz_local)
+                gp%levelset = f_distance(gpu_boundary_v(:, :, :, patch_id), boundary_edge_count, xyz_local)  ! Change 8
             end if
 
             ! Correct the sign of the levelset
             gp%levelset = -abs(gp%levelset)
 
             ! Get the boundary normals
-            call f_normals(models(patch_id)%boundary_v, &
+            call f_normals(gpu_boundary_v(:, :, :, patch_id), &
                            boundary_edge_count, &
                            xyz_local, &
                            normals)
@@ -729,8 +722,6 @@ contains
 
         end if
 
-        ! print gp%levelset, gp%levelset_norm(1), gp%levelset_norm(2), gp%levelset_norm(3)
-
     end subroutine s_model_levelset
 
 end module m_compute_levelset
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 719a9e56cc..cab36bcda7 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -877,8 +877,6 @@ contains
         end do
         $:END_GPU_PARALLEL_LOOP()
 
-        $:GPU_UPDATE(host='[ib_markers%sf]')
-
     end subroutine s_ib_model
 
     !> The STL patch is a 3D geometry that is imported from an STL file.
@@ -935,8 +933,6 @@ contains
             end do
         end do
 
-        $:GPU_UPDATE(host='[ib_markers%sf]')
-
     end subroutine s_ib_3d_model
 
     !> Subroutine that computes a rotation matrix for converting to the rotating frame of the boundary

From 6cc7accce8b08b447dd41d6f740fbadf25d5bd5b Mon Sep 17 00:00:00 2001
From: Daniel J Vickers <dnlvickers5@gmail.com>
Date: Thu, 19 Feb 2026 09:49:48 -0500
Subject: [PATCH 15/22] STLs ran on GPU in 3D!

---
 src/common/m_model.fpp          | 222 +---------------------------
 src/simulation/m_ib_patches.fpp | 248 +++++++++++++++++++++++++++++++-
 2 files changed, 244 insertions(+), 226 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 3e8812485d..272eb2dfea 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -17,9 +17,9 @@ module m_model
 
     private
 
-    public :: s_instantiate_STL_models, f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, &
+    public :: f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, &
               gpu_trs_v, gpu_trs_n, gpu_boundary_v, gpu_interpolated_boundary_v, gpu_interpolate, gpu_boundary_edge_count, &
-              gpu_total_vertices
+              gpu_total_vertices, stl_bounding_boxes
 
     ! Subroutines for STL immersed boundaries
     public :: f_check_boundary, f_register_edge, f_check_interpolation_2D, &
@@ -38,226 +38,10 @@ module m_model
     integer, allocatable :: gpu_interpolate(:)
     integer, allocatable :: gpu_boundary_edge_count(:)
     integer, allocatable :: gpu_total_vertices(:)
+    real(wp), allocatable :: stl_bounding_boxes(:, :, :)
 
 contains
 
-    subroutine s_instantiate_STL_models()
-
-        ! Variables for IBM+STL
-        real(wp) :: normals(1:3) !< Boundary normal buffer
-        integer :: boundary_vertex_count, boundary_edge_count, total_vertices !< Boundary vertex
-        real(wp), allocatable, dimension(:, :, :) :: boundary_v !< Boundary vertex buffer
-        real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v !< Interpolated vertex buffer
-        real(wp) :: dx_local, dy_local, dz_local !< Levelset distance buffer
-        logical :: interpolate !< Logical variable to determine whether or not the model should be interpolated
-
-        integer :: i, j, k !< Generic loop iterators
-        integer :: patch_id
-
-        type(t_bbox) :: bbox, bbox_old
-        type(t_model) :: model
-        type(ic_model_parameters) :: params
-
-        real(wp) :: eta
-        real(wp), dimension(1:3) :: point, model_center
-        real(wp) :: grid_mm(1:3, 1:2)
-
-        real(wp), dimension(1:4, 1:4) :: transform, transform_n
-
-#ifdef MFC_PRE_PROCESS
-        dx_local = dx; dy_local = dy
-        if (p /= 0) dz_local = dz
-#else
-        dx_local = minval(dx); dy_local = minval(dy)
-        if (p /= 0) dz_local = minval(dz)
-#endif
-
-        do patch_id = 1, num_ibs
-            if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then
-                allocate (models(patch_id)%model)
-                print *, " * Reading model: "//trim(patch_ib(patch_id)%model_filepath)
-
-                model = f_model_read(patch_ib(patch_id)%model_filepath)
-                params%scale(:) = patch_ib(patch_id)%model_scale(:)
-                params%translate(:) = patch_ib(patch_id)%model_translate(:)
-                params%rotate(:) = patch_ib(patch_id)%model_rotate(:)
-                params%spc = patch_ib(patch_id)%model_spc
-                params%threshold = patch_ib(patch_id)%model_threshold
-
-                if (f_approx_equal(dot_product(params%scale, params%scale), 0._wp)) then
-                    params%scale(:) = 1._wp
-                end if
-
-                if (proc_rank == 0) then
-                    print *, " * Transforming model."
-                end if
-
-                ! Get the model center before transforming the model
-                bbox_old = f_create_bbox(model)
-                model_center(1:3) = (bbox_old%min(1:3) + bbox_old%max(1:3))/2._wp
-
-                ! Compute the transform matrices for vertices and normals
-                transform = f_create_transform_matrix(params, model_center)
-                transform_n = f_create_transform_matrix(params)
-
-                call s_transform_model(model, transform, transform_n)
-
-                ! Recreate the bounding box after transformation
-                bbox = f_create_bbox(model)
-
-                ! Show the number of vertices in the original STL model
-                if (proc_rank == 0) then
-                    print *, ' * Number of input model vertices:', 3*model%ntrs
-                end if
-
-                call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
-
-                ! Check if the model needs interpolation
-                if (p > 0) then
-                    call f_check_interpolation_3D(model, (/dx_local, dy_local, dz_local/), interpolate)
-                else
-                    call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx_local, dy_local, 0._wp/), interpolate)
-                end if
-
-                ! Show the number of edges and boundary edges in 2D STL models
-                if (proc_rank == 0 .and. p == 0) then
-                    print *, ' * Number of 2D model boundary edges:', boundary_edge_count
-                end if
-
-                ! Interpolate the STL model along the edges (2D) and on triangle facets (3D)
-                if (interpolate) then
-                    if (proc_rank == 0) then
-                        print *, ' * Interpolating STL vertices.'
-                    end if
-
-                    if (p > 0) then
-                        call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-                    else
-                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-                    end if
-
-                    if (proc_rank == 0) then
-                        print *, ' * Total number of interpolated boundary vertices:', total_vertices
-                    end if
-                end if
-
-                if (proc_rank == 0) then
-                    write (*, "(A, 3(2X, F20.10))") "    > Model:  Min:", bbox%min(1:3)
-                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (bbox%min(1:3) + bbox%max(1:3))/2._wp
-                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", bbox%max(1:3)
-
-                    grid_mm(1, :) = (/minval(x_cc(0:m)) - 0.e5_wp*dx_local, maxval(x_cc(0:m)) + 0.e5_wp*dx_local/)
-                    grid_mm(2, :) = (/minval(y_cc(0:n)) - 0.e5_wp*dy_local, maxval(y_cc(0:n)) + 0.e5_wp*dy_local/)
-
-                    if (p > 0) then
-                        grid_mm(3, :) = (/minval(z_cc(0:p)) - 0.e5_wp*dz_local, maxval(z_cc(0:p)) + 0.e5_wp*dz_local/)
-                    else
-                        grid_mm(3, :) = (/0._wp, 0._wp/)
-                    end if
-
-                    write (*, "(A, 3(2X, F20.10))") "    > Domain: Min:", grid_mm(:, 1)
-                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (grid_mm(:, 1) + grid_mm(:, 2))/2._wp
-                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", grid_mm(:, 2)
-                end if
-
-                models(patch_id)%model = model
-                models(patch_id)%boundary_v = boundary_v
-                models(patch_id)%boundary_edge_count = boundary_edge_count
-                if (interpolate) then
-                    models(patch_id)%interpolate = 1
-                else
-                    models(patch_id)%interpolate = 0
-                end if
-                if (interpolate) then
-                    models(patch_id)%interpolated_boundary_v = interpolated_boundary_v
-                    models(patch_id)%total_vertices = total_vertices
-                end if
-
-            end if
-        end do
-
-        ! Pack and upload flat arrays for GPU (AFTER the loop)
-        block
-            integer :: pid, max_ntrs
-            integer :: max_bv1, max_bv2, max_bv3, max_iv1, max_iv2
-
-            max_ntrs = 0
-            max_bv1 = 0; max_bv2 = 0; max_bv3 = 0
-            max_iv1 = 0; max_iv2 = 0
-
-            do pid = 1, num_ibs
-                if (allocated(models(pid)%model)) then
-                    call s_pack_model_for_gpu(models(pid))
-                    max_ntrs = max(max_ntrs, models(pid)%ntrs)
-                end if
-                if (allocated(models(pid)%boundary_v)) then
-                    max_bv1 = max(max_bv1, size(models(pid)%boundary_v, 1))
-                    max_bv2 = max(max_bv2, size(models(pid)%boundary_v, 2))
-                    max_bv3 = max(max_bv3, size(models(pid)%boundary_v, 3))
-                end if
-                if (allocated(models(pid)%interpolated_boundary_v)) then
-                    max_iv1 = max(max_iv1, size(models(pid)%interpolated_boundary_v, 1))
-                    max_iv2 = max(max_iv2, size(models(pid)%interpolated_boundary_v, 2))
-                end if
-            end do
-
-            if (max_ntrs > 0) then
-                allocate (gpu_ntrs(1:num_ibs))
-                allocate (gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
-                allocate (gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
-                allocate (gpu_interpolate(1:num_ibs))
-                allocate (gpu_boundary_edge_count(1:num_ibs))
-                allocate (gpu_total_vertices(1:num_ibs))
-
-                gpu_ntrs = 0
-                gpu_trs_v = 0._wp
-                gpu_trs_n = 0._wp
-                gpu_interpolate = 0
-                gpu_boundary_edge_count = 0
-                gpu_total_vertices = 0
-
-                if (max_bv1 > 0) then
-                    allocate (gpu_boundary_v(1:max_bv1, 1:max_bv2, 1:max_bv3, 1:num_ibs))
-                    gpu_boundary_v = 0._wp
-                end if
-
-                if (max_iv1 > 0) then
-                    allocate (gpu_interpolated_boundary_v(1:max_iv1, 1:max_iv2, 1:num_ibs))
-                    gpu_interpolated_boundary_v = 0._wp
-                end if
-
-                do pid = 1, num_ibs
-                    if (allocated(models(pid)%model)) then
-                        gpu_ntrs(pid) = models(pid)%ntrs
-                        gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
-                        gpu_trs_n(:, 1:models(pid)%ntrs, pid) = models(pid)%trs_n
-                        gpu_interpolate(pid) = models(pid)%interpolate
-                        gpu_boundary_edge_count(pid) = models(pid)%boundary_edge_count
-                        gpu_total_vertices(pid) = models(pid)%total_vertices
-                    end if
-                    if (allocated(models(pid)%boundary_v)) then
-                        gpu_boundary_v(1:size(models(pid)%boundary_v, 1), &
-                                       1:size(models(pid)%boundary_v, 2), &
-                                       1:size(models(pid)%boundary_v, 3), pid) = models(pid)%boundary_v
-                    end if
-                    if (allocated(models(pid)%interpolated_boundary_v)) then
-                        gpu_interpolated_boundary_v(1:size(models(pid)%interpolated_boundary_v, 1), &
-                                                    1:size(models(pid)%interpolated_boundary_v, 2), pid) = models(pid)%interpolated_boundary_v
-                    end if
-                end do
-
-                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n, gpu_interpolate, gpu_boundary_edge_count, gpu_total_vertices]')
-                if (allocated(gpu_boundary_v)) then
-                    $:GPU_ENTER_DATA(copyin='[gpu_boundary_v]')
-                end if
-                if (allocated(gpu_interpolated_boundary_v)) then
-                    $:GPU_ENTER_DATA(copyin='[gpu_interpolated_boundary_v]')
-                end if
-            end if
-        end block
-
-    end subroutine s_instantiate_STL_models
-
     !> This procedure reads a binary STL file.
     !! @param filepath Path to the STL file.
     !! @param model The binary of the STL file.
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index cab36bcda7..82405566ec 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -25,7 +25,7 @@ module m_ib_patches
 
     implicit none
 
-    private; public :: s_apply_ib_patches, s_update_ib_rotation_matrix, f_convert_cyl_to_cart
+    private; public :: s_apply_ib_patches, s_update_ib_rotation_matrix, f_convert_cyl_to_cart, s_instantiate_STL_models
 
     real(wp) :: x_centroid, y_centroid, z_centroid
     real(wp) :: length_x, length_y, length_z
@@ -887,10 +887,10 @@ contains
         integer, intent(in) :: patch_id
         type(integer_field), intent(inout) :: ib_markers
 
-        integer :: i, j, k !< Generic loop iterators
+        integer :: i, j, k, il, ir, jl, jr, kl, kr !< Generic loop iterators
         integer :: spc
 
-        real(wp) :: eta, threshold
+        real(wp) :: eta, threshold, corner_distance
         real(wp), dimension(1:3) :: point, local_point, offset
         real(wp), dimension(1:3) :: center, xyz_local
         real(wp), dimension(1:3, 1:3) :: inverse_rotation
@@ -904,11 +904,26 @@ contains
         spc = patch_ib(patch_id)%model_spc
         threshold = patch_ib(patch_id)%model_threshold
 
+        il = -gp_layers
+        jl = -gp_layers
+        kl = -gp_layers
+        ir = m + gp_layers
+        jr = n + gp_layers
+        kr = p + gp_layers
+        corner_distance = 0._wp
+        do i = 1, 3
+            corner_distance = corner_distance + maxval(abs(stl_bounding_boxes(patch_id, i, 1:3)))**2 ! distance to rim of cylinder
+        end do
+        corner_distance = sqrt(corner_distance)
+        call get_bounding_indices(center(1) - corner_distance, center(1) + corner_distance, x_cc, il, ir)
+        call get_bounding_indices(center(2) - corner_distance, center(2) + corner_distance, y_cc, jl, jr)
+        call get_bounding_indices(center(3) - corner_distance, center(3) + corner_distance, z_cc, kl, kr)
+
         $:GPU_PARALLEL_LOOP(private='[i,j,k, xyz_local, eta]',&
-                  & copyin='[patch_id,center,inverse_rotation, offset, spc, threshold]', collapse=2)
-        do i = -gp_layers, m + gp_layers
-            do j = -gp_layers, n + gp_layers
-                do k = -gp_layers, p + gp_layers
+                  & copyin='[patch_id,center,inverse_rotation, offset, spc, threshold]', collapse=3)
+        do i = il, ir
+            do j = jl, jr
+                do k = kl, kr
 
                     xyz_local = [x_cc(i) - center(1), y_cc(j) - center(2), z_cc(k) - center(3)]
                     xyz_local = matmul(inverse_rotation, xyz_local)
@@ -932,9 +947,228 @@ contains
                 end do
             end do
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
     end subroutine s_ib_3d_model
 
+    subroutine s_instantiate_STL_models()
+
+        ! Variables for IBM+STL
+        real(wp) :: normals(1:3) !< Boundary normal buffer
+        integer :: boundary_vertex_count, boundary_edge_count, total_vertices !< Boundary vertex
+        real(wp), allocatable, dimension(:, :, :) :: boundary_v !< Boundary vertex buffer
+        real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v !< Interpolated vertex buffer
+        real(wp) :: dx_local, dy_local, dz_local !< Levelset distance buffer
+        logical :: interpolate !< Logical variable to determine whether or not the model should be interpolated
+
+        integer :: i, j, k !< Generic loop iterators
+        integer :: patch_id
+
+        type(t_bbox) :: bbox, bbox_old
+        type(t_model) :: model
+        type(ic_model_parameters) :: params
+
+        real(wp) :: eta
+        real(wp), dimension(1:3) :: point, model_center
+        real(wp) :: grid_mm(1:3, 1:2)
+
+        real(wp), dimension(1:4, 1:4) :: transform, transform_n
+
+        dx_local = minval(dx); dy_local = minval(dy)
+        if (p /= 0) dz_local = minval(dz)
+
+        do patch_id = 1, num_ibs
+            if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then
+                allocate (models(patch_id)%model)
+                print *, " * Reading model: "//trim(patch_ib(patch_id)%model_filepath)
+
+                model = f_model_read(patch_ib(patch_id)%model_filepath)
+                params%scale(:) = patch_ib(patch_id)%model_scale(:)
+                params%translate(:) = patch_ib(patch_id)%model_translate(:)
+                params%rotate(:) = patch_ib(patch_id)%model_rotate(:)
+                params%spc = patch_ib(patch_id)%model_spc
+                params%threshold = patch_ib(patch_id)%model_threshold
+
+                if (f_approx_equal(dot_product(params%scale, params%scale), 0._wp)) then
+                    params%scale(:) = 1._wp
+                end if
+
+                if (proc_rank == 0) then
+                    print *, " * Transforming model."
+                end if
+
+                ! Get the model center before transforming the model
+                bbox_old = f_create_bbox(model)
+                model_center(1:3) = (bbox_old%min(1:3) + bbox_old%max(1:3))/2._wp
+
+                ! Compute the transform matrices for vertices and normals
+                transform = f_create_transform_matrix(params, model_center)
+                transform_n = f_create_transform_matrix(params)
+
+                call s_transform_model(model, transform, transform_n)
+
+                ! Recreate the bounding box after transformation
+                bbox = f_create_bbox(model)
+
+                ! Show the number of vertices in the original STL model
+                if (proc_rank == 0) then
+                    print *, ' * Number of input model vertices:', 3*model%ntrs
+                end if
+
+                call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
+
+                ! Check if the model needs interpolation
+                if (p > 0) then
+                    call f_check_interpolation_3D(model, (/dx_local, dy_local, dz_local/), interpolate)
+                else
+                    call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx_local, dy_local, 0._wp/), interpolate)
+                end if
+                interpolate = .false.
+
+                ! Show the number of edges and boundary edges in 2D STL models
+                if (proc_rank == 0 .and. p == 0) then
+                    print *, ' * Number of 2D model boundary edges:', boundary_edge_count
+                end if
+
+                ! Interpolate the STL model along the edges (2D) and on triangle facets (3D)
+                if (interpolate) then
+                    if (proc_rank == 0) then
+                        print *, ' * Interpolating STL vertices.'
+                    end if
+
+                    if (p > 0) then
+                        call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
+                    else
+                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
+                    end if
+
+                    if (proc_rank == 0) then
+                        print *, ' * Total number of interpolated boundary vertices:', total_vertices
+                    end if
+                end if
+
+                if (proc_rank == 0) then
+                    write (*, "(A, 3(2X, F20.10))") "    > Model:  Min:", bbox%min(1:3)
+                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (bbox%min(1:3) + bbox%max(1:3))/2._wp
+                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", bbox%max(1:3)
+
+                    grid_mm(1, :) = (/minval(x_cc(0:m)) - 0.5_wp*dx_local, maxval(x_cc(0:m)) + 0.5_wp*dx_local/)
+                    grid_mm(2, :) = (/minval(y_cc(0:n)) - 0.5_wp*dy_local, maxval(y_cc(0:n)) + 0.5_wp*dy_local/)
+
+                    if (p > 0) then
+                        grid_mm(3, :) = (/minval(z_cc(0:p)) - 0.5_wp*dz_local, maxval(z_cc(0:p)) + 0.5_wp*dz_local/)
+                    else
+                        grid_mm(3, :) = (/0._wp, 0._wp/)
+                    end if
+
+                    write (*, "(A, 3(2X, F20.10))") "    > Domain: Min:", grid_mm(:, 1)
+                    write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (grid_mm(:, 1) + grid_mm(:, 2))/2._wp
+                    write (*, "(A, 3(2X, F20.10))") "    >         Max:", grid_mm(:, 2)
+                end if
+
+                allocate (stl_bounding_boxes(patch_id, 1:3, 1:3))
+                stl_bounding_boxes(patch_id, 1, 1:3) = [bbox%min(1), (bbox%min(1) + bbox%max(1))/2._wp, bbox%max(1)]
+                stl_bounding_boxes(patch_id, 2, 1:3) = [bbox%min(2), (bbox%min(2) + bbox%max(2))/2._wp, bbox%max(2)]
+                stl_bounding_boxes(patch_id, 3, 1:3) = [bbox%min(3), (bbox%min(3) + bbox%max(3))/2._wp, bbox%max(3)]
+
+                models(patch_id)%model = model
+                models(patch_id)%boundary_v = boundary_v
+                models(patch_id)%boundary_edge_count = boundary_edge_count
+                if (interpolate) then
+                    models(patch_id)%interpolate = 1
+                else
+                    models(patch_id)%interpolate = 0
+                end if
+                if (interpolate) then
+                    models(patch_id)%interpolated_boundary_v = interpolated_boundary_v
+                    models(patch_id)%total_vertices = total_vertices
+                end if
+
+            end if
+        end do
+
+        ! Pack and upload flat arrays for GPU (AFTER the loop)
+        block
+            integer :: pid, max_ntrs
+            integer :: max_bv1, max_bv2, max_bv3, max_iv1, max_iv2
+
+            max_ntrs = 0
+            max_bv1 = 0; max_bv2 = 0; max_bv3 = 0
+            max_iv1 = 0; max_iv2 = 0
+
+            do pid = 1, num_ibs
+                if (allocated(models(pid)%model)) then
+                    call s_pack_model_for_gpu(models(pid))
+                    max_ntrs = max(max_ntrs, models(pid)%ntrs)
+                end if
+                if (allocated(models(pid)%boundary_v)) then
+                    max_bv1 = max(max_bv1, size(models(pid)%boundary_v, 1))
+                    max_bv2 = max(max_bv2, size(models(pid)%boundary_v, 2))
+                    max_bv3 = max(max_bv3, size(models(pid)%boundary_v, 3))
+                end if
+                if (allocated(models(pid)%interpolated_boundary_v)) then
+                    max_iv1 = max(max_iv1, size(models(pid)%interpolated_boundary_v, 1))
+                    max_iv2 = max(max_iv2, size(models(pid)%interpolated_boundary_v, 2))
+                end if
+            end do
+
+            if (max_ntrs > 0) then
+                allocate (gpu_ntrs(1:num_ibs))
+                allocate (gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
+                allocate (gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
+                allocate (gpu_interpolate(1:num_ibs))
+                allocate (gpu_boundary_edge_count(1:num_ibs))
+                allocate (gpu_total_vertices(1:num_ibs))
+
+                gpu_ntrs = 0
+                gpu_trs_v = 0._wp
+                gpu_trs_n = 0._wp
+                gpu_interpolate = 0
+                gpu_boundary_edge_count = 0
+                gpu_total_vertices = 0
+
+                if (max_bv1 > 0) then
+                    allocate (gpu_boundary_v(1:max_bv1, 1:max_bv2, 1:max_bv3, 1:num_ibs))
+                    gpu_boundary_v = 0._wp
+                end if
+
+                if (max_iv1 > 0) then
+                    allocate (gpu_interpolated_boundary_v(1:max_iv1, 1:max_iv2, 1:num_ibs))
+                    gpu_interpolated_boundary_v = 0._wp
+                end if
+
+                do pid = 1, num_ibs
+                    if (allocated(models(pid)%model)) then
+                        gpu_ntrs(pid) = models(pid)%ntrs
+                        gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
+                        gpu_trs_n(:, 1:models(pid)%ntrs, pid) = models(pid)%trs_n
+                        gpu_interpolate(pid) = models(pid)%interpolate
+                        gpu_boundary_edge_count(pid) = models(pid)%boundary_edge_count
+                        gpu_total_vertices(pid) = models(pid)%total_vertices
+                    end if
+                    if (allocated(models(pid)%boundary_v)) then
+                        gpu_boundary_v(1:size(models(pid)%boundary_v, 1), &
+                                       1:size(models(pid)%boundary_v, 2), &
+                                       1:size(models(pid)%boundary_v, 3), pid) = models(pid)%boundary_v
+                    end if
+                    if (allocated(models(pid)%interpolated_boundary_v)) then
+                        gpu_interpolated_boundary_v(1:size(models(pid)%interpolated_boundary_v, 1), &
+                                                    1:size(models(pid)%interpolated_boundary_v, 2), pid) = models(pid)%interpolated_boundary_v
+                    end if
+                end do
+
+                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n, gpu_interpolate, gpu_boundary_edge_count, gpu_total_vertices]')
+                if (allocated(gpu_boundary_v)) then
+                    $:GPU_ENTER_DATA(copyin='[gpu_boundary_v]')
+                end if
+                if (allocated(gpu_interpolated_boundary_v)) then
+                    $:GPU_ENTER_DATA(copyin='[gpu_interpolated_boundary_v]')
+                end if
+            end if
+        end block
+
+    end subroutine s_instantiate_STL_models
+
     !> Subroutine that computes a rotation matrix for converting to the rotating frame of the boundary
     subroutine s_update_ib_rotation_matrix(patch_id)
 

From 4dc9072a0421edb834ed11725b12433d541aaba3 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Fri, 20 Feb 2026 10:27:10 -0500
Subject: [PATCH 16/22] Missed on comparison that NVHPC allows, but GNU does
 not

---
 src/simulation/m_compute_levelset.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index eaae58667b..1c254e0d9f 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -691,7 +691,7 @@ contains
             call f_distance_normals_3D_flat(gpu_ntrs(patch_id), gpu_trs_v, gpu_trs_n, patch_id, xyz_local, normals, distance)
 
             ! Get the shortest distance between the cell center and the interpolated model boundary
-            if (interpolate) then
+            if (interpolate == 1) then
                 gp%levelset = f_interpolated_distance(gpu_interpolated_boundary_v(:, :, patch_id), total_vertices, xyz_local)
             else
                 gp%levelset = distance

From 0fd5492941bd02ebb3c9df792a32d0312104cade Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Fri, 20 Feb 2026 11:46:11 -0500
Subject: [PATCH 17/22] Resolved issues with GPU arrays allocation for 3D STLs

---
 src/simulation/m_ib_patches.fpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 7d0256e414..5aa302b32b 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -1028,7 +1028,7 @@ contains
                 else
                     call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx_local, dy_local, 0._wp/), interpolate)
                 end if
-                interpolate = .false.
+                print *, interpolate
 
                 ! Show the number of edges and boundary edges in 2D STL models
                 if (proc_rank == 0 .and. p == 0) then
@@ -1151,12 +1151,13 @@ contains
                         gpu_boundary_edge_count(pid) = models(pid)%boundary_edge_count
                         gpu_total_vertices(pid) = models(pid)%total_vertices
                     end if
-                    if (allocated(models(pid)%boundary_v)) then
+                    if (allocated(models(pid)%boundary_v) .and. p == 0) then
+                      print *, size(models(pid)%boundary_v, 1), size(models(pid)%boundary_v, 2), size(models(pid)%boundary_v, 3)
                         gpu_boundary_v(1:size(models(pid)%boundary_v, 1), &
                                        1:size(models(pid)%boundary_v, 2), &
                                        1:size(models(pid)%boundary_v, 3), pid) = models(pid)%boundary_v
                     end if
-                    if (allocated(models(pid)%interpolated_boundary_v)) then
+                    if (allocated(models(pid)%interpolated_boundary_v) .and. gpu_interpolate(pid) == 1) then
                         gpu_interpolated_boundary_v(1:size(models(pid)%interpolated_boundary_v, 1), &
                                                     1:size(models(pid)%interpolated_boundary_v, 2), pid) = models(pid)%interpolated_boundary_v
                     end if

From 6edad1b9ceb8f4f5227c1913245fce7399005bc3 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Fri, 20 Feb 2026 14:37:17 -0500
Subject: [PATCH 18/22] Finished GPU implementation of all subroutines. Final
 check before regenerating golden file

---
 src/common/m_model.fpp                | 177 ++++++++++++++++----------
 src/simulation/m_compute_levelset.fpp |   1 -
 src/simulation/m_ib_patches.fpp       |   7 +-
 3 files changed, 115 insertions(+), 70 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 443f6f399e..54704c7551 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -734,6 +734,7 @@ contains
         end do
 
         ! Check all edges and count repeated edges
+        $:GPU_PARALLEL_LOOP(private='[i,j]', copy='[temp_boundary_v,edge_occurrence]', copyin='[threshold_edge_zero]', collapse=2)
         do i = 1, edge_count
             do j = 1, edge_count
                 if (i /= j) then
@@ -746,11 +747,13 @@ contains
                          (abs(temp_boundary_v(i, 2, 1) - temp_boundary_v(j, 1, 1)) < threshold_edge_zero) .and. &
                          (abs(temp_boundary_v(i, 2, 2) - temp_boundary_v(j, 1, 2)) < threshold_edge_zero))) then
 
+                        $:GPU_ATOMIC(atomic='update')
                         edge_occurrence(i) = edge_occurrence(i) + 1
                     end if
                 end if
             end do
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
         ! Count the number of boundary vertices/edges
         boundary_vertex_count = 0
@@ -818,7 +821,7 @@ contains
 
     end subroutine f_register_edge
 
-    !> This procedure check if interpolates is needed for 2D models.
+    !> This procedure check if interpolation is needed for 2D models.
     !! @param boundary_v                Temporary edge end vertex buffer
     !! @param spacing                   Dimensions of the current levelset cell
     subroutine f_check_interpolation_2D(boundary_v, boundary_edge_count, spacing, interpolate)
@@ -830,24 +833,29 @@ contains
 
         real(wp) :: l1, cell_width !< Length of each boundary edge and cell width
         integer :: j !< Boundary edge index iterator
+        integer :: interpolate_integer !< integer form of interpolate logical to support GPU checking
 
         cell_width = minval(spacing(1:2))
-        interpolate = .false.
+        interpolate_integer = 0
 
+        $:GPU_PARALLEL_LOOP(private='[j,l1]', copyin='[boundary_v,cell_width,Ifactor_2D]', copy='[interpolate_integer]')
         do j = 1, boundary_edge_count
-            l1 = sqrt((boundary_v(j, 2, 1) - boundary_v(j, 1, 1))**2 + &
-                      (boundary_v(j, 2, 2) - boundary_v(j, 1, 2))**2)
+            if (interpolate_integer == 0) then
+                l1 = sqrt((boundary_v(j, 2, 1) - boundary_v(j, 1, 1))**2 + &
+                          (boundary_v(j, 2, 2) - boundary_v(j, 1, 2))**2)
 
-            if ((l1 > cell_width)) then
-                interpolate = .true.
-            else
-                interpolate = .false.
+                if ((l1*Ifactor_2D > cell_width)) then
+                    interpolate_integer = 1
+                end if
             end if
         end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        interpolate = (interpolate_integer == 1)
 
     end subroutine f_check_interpolation_2D
 
-    !> This procedure check if interpolates is needed for 3D models.
+    !> This procedure check if interpolation is needed for 3D models.
     !! @param model              Model to search in.
     !! @param spacing            Dimensions of the current levelset cell
     !! @param interpolate        Logical output
@@ -858,37 +866,45 @@ contains
         real(wp), dimension(1:3), intent(in) :: spacing
         real(wp), dimension(1:3) :: edge_l
         real(wp) :: cell_width
-        real(wp), dimension(1:3, 1:3) :: tri_v
-        integer :: i, j !< Loop iterator
+        real(wp), dimension(1:model%ntrs, 1:3, 1:3) :: tri_v
+        integer :: i, j, interpolate_integer !< Loop iterator
 
         cell_width = minval(spacing)
-        interpolate = .false.
+        interpolate_integer = 0
 
+        ! load up the array of triangles for GPU packing
         do i = 1, model%ntrs
             do j = 1, 3
-                tri_v(1, j) = model%trs(i)%v(1, j)
-                tri_v(2, j) = model%trs(i)%v(2, j)
-                tri_v(3, j) = model%trs(i)%v(3, j)
+                tri_v(i, 1, j) = model%trs(i)%v(1, j)
+                tri_v(i, 2, j) = model%trs(i)%v(2, j)
+                tri_v(i, 3, j) = model%trs(i)%v(3, j)
             end do
+        end do
 
-            edge_l(1) = sqrt((tri_v(1, 2) - tri_v(1, 1))**2 + &
-                             (tri_v(2, 2) - tri_v(2, 1))**2 + &
-                             (tri_v(3, 2) - tri_v(3, 1))**2)
-            edge_l(2) = sqrt((tri_v(1, 3) - tri_v(1, 2))**2 + &
-                             (tri_v(2, 3) - tri_v(2, 2))**2 + &
-                             (tri_v(3, 3) - tri_v(3, 2))**2)
-            edge_l(3) = sqrt((tri_v(1, 1) - tri_v(1, 3))**2 + &
-                             (tri_v(2, 1) - tri_v(2, 3))**2 + &
-                             (tri_v(3, 1) - tri_v(3, 3))**2)
-
-            if ((edge_l(1) > cell_width) .or. &
-                (edge_l(2) > cell_width) .or. &
-                (edge_l(3) > cell_width)) then
-                interpolate = .true.
-            else
-                interpolate = .false.
+        ! compare the side of all
+        $:GPU_PARALLEL_LOOP(private='[i,edge_l]', copyin='[cell_width,tri_v,Ifactor_3D]', copy='[interpolate_integer]')
+        do i = 1, model%ntrs
+            if (interpolate_integer == 0) then
+                edge_l(1) = sqrt((tri_v(i, 1, 2) - tri_v(i, 1, 1))**2 + &
+                                 (tri_v(i, 2, 2) - tri_v(i, 2, 1))**2 + &
+                                 (tri_v(i, 3, 2) - tri_v(i, 3, 1))**2)
+                edge_l(2) = sqrt((tri_v(i, 1, 3) - tri_v(i, 1, 2))**2 + &
+                                 (tri_v(i, 2, 3) - tri_v(i, 2, 2))**2 + &
+                                 (tri_v(i, 3, 3) - tri_v(i, 3, 2))**2)
+                edge_l(3) = sqrt((tri_v(i, 1, 1) - tri_v(i, 1, 3))**2 + &
+                                 (tri_v(i, 2, 1) - tri_v(i, 2, 3))**2 + &
+                                 (tri_v(i, 3, 1) - tri_v(i, 3, 3))**2)
+
+                if ((edge_l(1) > cell_width*Ifactor_3D) .or. &
+                    (edge_l(2) > cell_width*Ifactor_3D) .or. &
+                    (edge_l(3) > cell_width*Ifactor_3D)) then
+                    interpolate_integer = 1
+                end if
             end if
         end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        interpolate = (interpolate_integer == 1)
 
     end subroutine f_check_interpolation_3D
 
@@ -905,7 +921,7 @@ contains
         real(wp), allocatable, intent(inout), dimension(:, :) :: interpolated_boundary_v
 
         integer, intent(inout) :: total_vertices, boundary_edge_count
-        integer :: num_segments
+        integer :: num_segments, vertex_idx
         integer :: i, j
 
         real(wp) :: edge_length, cell_width
@@ -917,6 +933,7 @@ contains
 
         ! First pass: Calculate the total number of vertices including interpolated ones
         total_vertices = 1
+        $:GPU_PARALLEL_LOOP(private='[i,edge_x,edge_y,edge_length,num_segments]', copyin='[Ifactor_2D]', copy='[total_vertices]')
         do i = 1, boundary_edge_count
             ! Get the coordinates of the two ends of the current edge
             edge_x(1) = boundary_v(i, 1, 1)
@@ -936,14 +953,17 @@ contains
             end if
 
             ! Each edge contributes num_segments vertices
+            $:GPU_ATOMIC(atomic='update')
             total_vertices = total_vertices + num_segments
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
         ! Allocate memory for the new boundary vertices array
         allocate (interpolated_boundary_v(1:total_vertices, 1:3))
 
         ! Fill the new boundary vertices array with original and interpolated vertices
         total_vertices = 1
+        $:GPU_PARALLEL_LOOP(private='[i,edge_x,edge_y,edge_length,num_segments,edge_del,vertex_idx]', copyin='[Ifactor_2D]', copy='[total_vertices,interpolated_boundary_v]')
         do i = 1, boundary_edge_count
             ! Get the coordinates of the two ends of the current edge
             edge_x(1) = boundary_v(i, 1, 1)
@@ -972,18 +992,25 @@ contains
 
             ! Add original and interpolated vertices to the output array
             do j = 1, num_segments - 1
+                $:GPU_ATOMIC(atomic='capture')
                 total_vertices = total_vertices + 1
-                interpolated_boundary_v(total_vertices, 1) = edge_x(1) + j*edge_del(1)
-                interpolated_boundary_v(total_vertices, 2) = edge_y(1) + j*edge_del(2)
+                vertex_idx = total_vertices
+                $:END_GPU_ATOMIC_CAPTURE()
+                interpolated_boundary_v(vertex_idx, 1) = edge_x(1) + j*edge_del(1)
+                interpolated_boundary_v(vertex_idx, 2) = edge_y(1) + j*edge_del(2)
             end do
 
             ! Add the last vertex of the edge
             if (num_segments > 0) then
+                $:GPU_ATOMIC(atomic='capture')
                 total_vertices = total_vertices + 1
-                interpolated_boundary_v(total_vertices, 1) = edge_x(2)
-                interpolated_boundary_v(total_vertices, 2) = edge_y(2)
+                vertex_idx = total_vertices
+                $:END_GPU_ATOMIC_CAPTURE()
+                interpolated_boundary_v(vertex_idx, 1) = edge_x(2)
+                interpolated_boundary_v(vertex_idx, 2) = edge_y(2)
             end if
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
     end subroutine f_interpolate_2D
 
@@ -998,12 +1025,16 @@ contains
         real(wp), allocatable, intent(inout), dimension(:, :) :: interpolated_boundary_v
         integer, intent(out) :: total_vertices
 
-        integer :: i, j, k, num_triangles, num_segments, num_inner_vertices
+        integer :: i, j, k, num_triangles, num_segments, num_inner_vertices, vertex_idx
         real(wp), dimension(1:3, 1:3) :: tri
         real(wp), dimension(1:3) :: edge_del, cell_area
         real(wp), dimension(1:3) :: bary_coord !< Barycentric coordinates
         real(wp) :: edge_length, cell_width, cell_area_min, tri_area
 
+        ! GPU-friendly flat arrays packed from model
+        real(wp), allocatable, dimension(:, :, :) :: flat_v  ! (3, 3, num_triangles)
+        real(wp), allocatable, dimension(:, :) :: flat_n  ! (3, num_triangles)
+
         ! Number of triangles in the model
         num_triangles = model%ntrs
         cell_width = minval(spacing)
@@ -1015,18 +1046,23 @@ contains
         cell_area_min = minval(cell_area)
         num_inner_vertices = 0
 
+        ! Pack model into flat arrays on CPU
+        allocate (flat_v(1:3, 1:3, 1:num_triangles))
+        allocate (flat_n(1:3, 1:num_triangles))
+        do i = 1, num_triangles
+            flat_v(:, :, i) = model%trs(i)%v(:, :)
+            flat_n(:, i) = model%trs(i)%n(:)
+        end do
+
         ! Calculate the total number of vertices including interpolated ones
         total_vertices = 0
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,tri,edge_length,num_segments,num_inner_vertices]', copyin='[Ifactor_3D,Ifactor_bary_3D,flat_v,flat_n]', copy='[total_vertices]', collapse=1)
         do i = 1, num_triangles
             do j = 1, 3
                 ! Get the coordinates of the two vertices of the current edge
-                tri(1, 1) = model%trs(i)%v(j, 1)
-                tri(1, 2) = model%trs(i)%v(j, 2)
-                tri(1, 3) = model%trs(i)%v(j, 3)
+                tri(1, :) = flat_v(j, :, i)
                 ! Next vertex in the triangle (cyclic)
-                tri(2, 1) = model%trs(i)%v(mod(j, 3) + 1, 1)
-                tri(2, 2) = model%trs(i)%v(mod(j, 3) + 1, 2)
-                tri(2, 3) = model%trs(i)%v(mod(j, 3) + 1, 3)
+                tri(2, :) = flat_v(mod(j, 3) + 1, :, i)
 
                 ! Compute the length of the edge
                 edge_length = sqrt((tri(2, 1) - tri(1, 1))**2 + &
@@ -1041,39 +1077,35 @@ contains
                 end if
 
                 ! Each edge contributes num_segments vertices
+                $:GPU_ATOMIC(atomic='update')
                 total_vertices = total_vertices + num_segments + 1
             end do
 
             ! Add vertices inside the triangle
             do k = 1, 3
-                tri(k, 1) = model%trs(i)%v(k, 1)
-                tri(k, 2) = model%trs(i)%v(k, 2)
-                tri(k, 3) = model%trs(i)%v(k, 3)
+                tri(k, :) = flat_v(k, :, i)
             end do
             call f_tri_area(tri, tri_area)
 
             if (tri_area > threshold_bary*cell_area_min) then
                 num_inner_vertices = Ifactor_bary_3D*ceiling(tri_area/cell_area_min)
+                $:GPU_ATOMIC(atomic='update')
                 total_vertices = total_vertices + num_inner_vertices
             end if
         end do
+        $:END_GPU_PARALLEL_LOOP()
 
         ! Allocate memory for the new boundary vertices array
         allocate (interpolated_boundary_v(1:total_vertices, 1:3))
 
         ! Fill the new boundary vertices array with original and interpolated vertices
         total_vertices = 0
+        $:GPU_PARALLEL_LOOP(private='[i,j,k,tri,edge_length,num_segments,num_inner_vertices,edge_del,bary_coord,vertex_idx]', copyin='[Ifactor_3D,Ifactor_bary_3D]', copy='[total_vertices]', collapse=1)
         do i = 1, num_triangles
             ! Loop through the 3 edges of each triangle
             do j = 1, 3
-                ! Get the coordinates of the two vertices of the current edge
-                tri(1, 1) = model%trs(i)%v(j, 1)
-                tri(1, 2) = model%trs(i)%v(j, 2)
-                tri(1, 3) = model%trs(i)%v(j, 3)
-                ! Next vertex in the triangle (cyclic)
-                tri(2, 1) = model%trs(i)%v(mod(j, 3) + 1, 1)
-                tri(2, 2) = model%trs(i)%v(mod(j, 3) + 1, 2)
-                tri(2, 3) = model%trs(i)%v(mod(j, 3) + 1, 3)
+                tri(1, :) = flat_v(j, :, i)
+                tri(2, :) = flat_v(mod(j, 3) + 1, :, i)
 
                 ! Compute the length of the edge
                 edge_length = sqrt((tri(2, 1) - tri(1, 1))**2 + &
@@ -1093,33 +1125,42 @@ contains
 
                 ! Add original and interpolated vertices to the output array
                 do k = 0, num_segments - 1
+                    $:GPU_ATOMIC(atomic='capture')
                     total_vertices = total_vertices + 1
-                    interpolated_boundary_v(total_vertices, 1) = tri(1, 1) + k*edge_del(1)
-                    interpolated_boundary_v(total_vertices, 2) = tri(1, 2) + k*edge_del(2)
-                    interpolated_boundary_v(total_vertices, 3) = tri(1, 3) + k*edge_del(3)
+                    vertex_idx = total_vertices
+                    $:END_GPU_ATOMIC_CAPTURE()
+                    interpolated_boundary_v(vertex_idx, 1) = tri(1, 1) + k*edge_del(1)
+                    interpolated_boundary_v(vertex_idx, 2) = tri(1, 2) + k*edge_del(2)
+                    interpolated_boundary_v(vertex_idx, 3) = tri(1, 3) + k*edge_del(3)
                 end do
 
                 ! Add the last vertex of the edge
+                $:GPU_ATOMIC(atomic='capture')
                 total_vertices = total_vertices + 1
-                interpolated_boundary_v(total_vertices, 1) = tri(2, 1)
-                interpolated_boundary_v(total_vertices, 2) = tri(2, 2)
-                interpolated_boundary_v(total_vertices, 3) = tri(2, 3)
+                vertex_idx = total_vertices
+                $:END_GPU_ATOMIC_CAPTURE()
+                interpolated_boundary_v(vertex_idx, 1) = tri(2, 1)
+                interpolated_boundary_v(vertex_idx, 2) = tri(2, 2)
+                interpolated_boundary_v(vertex_idx, 3) = tri(2, 3)
             end do
 
             ! Interpolate verties that are not on edges
             do k = 1, 3
-                tri(k, 1) = model%trs(i)%v(k, 1)
-                tri(k, 2) = model%trs(i)%v(k, 2)
-                tri(k, 3) = model%trs(i)%v(k, 3)
+                tri(k, :) = flat_v(k, :, i)
             end do
             call f_tri_area(tri, tri_area)
 
             if (tri_area > threshold_bary*cell_area_min) then
                 num_inner_vertices = Ifactor_bary_3D*ceiling(tri_area/cell_area_min)
                 !Use barycentric coordinates for randomly distributed points
+
+                ! Deterministic seed per triangle
+                rand_seed = i*73856093 + 19349663
+                if (rand_seed == 0) rand_seed = 1
+
                 do k = 1, num_inner_vertices
-                    call random_number(bary_coord(1))
-                    call random_number(bary_coord(2))
+                    bary_coord(1) = f_model_random_number(rand_seed)
+                    bary_coord(2) = f_model_random_number(rand_seed)
 
                     if ((bary_coord(1) + bary_coord(2)) >= 1._wp) then
                         bary_coord(1) = 1._wp - bary_coord(1)
@@ -1127,6 +1168,7 @@ contains
                     end if
                     bary_coord(3) = 1._wp - bary_coord(1) - bary_coord(2)
 
+                    $:GPU_ATOMIC(atomic='update')
                     total_vertices = total_vertices + 1
                     interpolated_boundary_v(total_vertices, 1) = dot_product(bary_coord, tri(1:3, 1))
                     interpolated_boundary_v(total_vertices, 2) = dot_product(bary_coord, tri(1:3, 2))
@@ -1134,6 +1176,9 @@ contains
                 end do
             end if
         end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        deallocate (flat_v, flat_n)
 
     end subroutine f_interpolate_3D
 
@@ -1344,6 +1389,8 @@ contains
         real(wp), dimension(1:3) :: AB, AC, cross
         integer :: i !< Loop iterator
 
+        $:GPU_ROUTINE(parallelism='[seq]')
+
         do i = 1, 3
             AB(i) = tri(2, i) - tri(1, i)
             AC(i) = tri(3, i) - tri(1, i)
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 1c254e0d9f..8b04f66b11 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -722,7 +722,6 @@ contains
 
             ! Assign the levelset_norm
             gp%levelset_norm = matmul(rotation, normals(1:3))
-
         end if
 
     end subroutine s_model_levelset
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 5aa302b32b..050bfdb35b 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -1020,7 +1020,8 @@ contains
                     print *, ' * Number of input model vertices:', 3*model%ntrs
                 end if
 
-                call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
+                ! Need the cells that form the boundary of the flat projection in 2D
+                if (p == 0) call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
 
                 ! Check if the model needs interpolation
                 if (p > 0) then
@@ -1028,7 +1029,6 @@ contains
                 else
                     call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx_local, dy_local, 0._wp/), interpolate)
                 end if
-                print *, interpolate
 
                 ! Show the number of edges and boundary edges in 2D STL models
                 if (proc_rank == 0 .and. p == 0) then
@@ -1044,7 +1044,7 @@ contains
                     if (p > 0) then
                         call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
                     else
-                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
+                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, 0._wp/), interpolated_boundary_v, total_vertices)
                     end if
 
                     if (proc_rank == 0) then
@@ -1152,7 +1152,6 @@ contains
                         gpu_total_vertices(pid) = models(pid)%total_vertices
                     end if
                     if (allocated(models(pid)%boundary_v) .and. p == 0) then
-                      print *, size(models(pid)%boundary_v, 1), size(models(pid)%boundary_v, 2), size(models(pid)%boundary_v, 3)
                         gpu_boundary_v(1:size(models(pid)%boundary_v, 1), &
                                        1:size(models(pid)%boundary_v, 2), &
                                        1:size(models(pid)%boundary_v, 3), pid) = models(pid)%boundary_v

From 1e5326a6430cc1cc822209fa0f526c1b22e52f5b Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Sat, 21 Feb 2026 00:24:09 -0500
Subject: [PATCH 19/22] Refactored interpolation out of the code and replaced
 it with a projection-based distance method that is more accurate, faster, and
 simpler. Also removed some redundant subroutine calls from ICPP patches to
 better align it with IB patches

---
 src/common/m_model.fpp                | 760 ++++++--------------------
 src/pre_process/m_icpp_patches.fpp    |  25 -
 src/simulation/m_compute_levelset.fpp |  38 +-
 src/simulation/m_ib_patches.fpp       |  57 +-
 src/simulation/m_ibm.fpp              |  13 -
 src/simulation/m_mpi_proxy.fpp        | 185 -------
 6 files changed, 186 insertions(+), 892 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 54704c7551..b474fec80e 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -19,14 +19,12 @@ module m_model
     private
 
     public :: f_model_read, s_model_write, s_model_free, f_model_is_inside, models, gpu_ntrs, &
-              gpu_trs_v, gpu_trs_n, gpu_boundary_v, gpu_interpolated_boundary_v, gpu_interpolate, gpu_boundary_edge_count, &
+              gpu_trs_v, gpu_trs_n, gpu_boundary_v, gpu_boundary_edge_count, &
               gpu_total_vertices, stl_bounding_boxes
 
     ! Subroutines for STL immersed boundaries
-    public :: f_check_boundary, f_register_edge, f_check_interpolation_2D, &
-              f_check_interpolation_3D, f_interpolate_2D, f_interpolate_3D, &
-              f_interpolated_distance, f_normals, f_distance, f_distance_normals_3D, f_tri_area, s_pack_model_for_gpu, &
-              f_model_is_inside_flat, f_distance_normals_3d_flat
+    public :: f_check_boundary, f_register_edge, f_model_is_inside_flat, &
+              f_distance_normals_3D, f_distance_normals_2D, s_pack_model_for_gpu
 
     !! array of STL models that can be allocated and then used in IB marker and levelset compute
     type(t_model_array), allocatable, target :: models(:)
@@ -35,8 +33,6 @@ module m_model
     real(wp), allocatable, dimension(:, :, :, :) :: gpu_trs_v
     real(wp), allocatable, dimension(:, :, :) :: gpu_trs_n
     real(wp), allocatable, dimension(:, :, :, :) :: gpu_boundary_v
-    real(wp), allocatable, dimension(:, :, :) :: gpu_interpolated_boundary_v
-    integer, allocatable :: gpu_interpolate(:)
     integer, allocatable :: gpu_boundary_edge_count(:)
     integer, allocatable :: gpu_total_vertices(:)
     real(wp), allocatable :: stl_bounding_boxes(:, :, :)
@@ -56,7 +52,7 @@ contains
         character(kind=c_char, len=80) :: header
         integer(kind=c_int32_t) :: nTriangles
 
-        real(kind=c_float) :: normal(3), v(3, 3)
+        real(kind=c_float) :: normal(3), v(3, 3), v_norm
         integer(kind=c_int16_t) :: attribute
 
         open (newunit=iunit, file=filepath, action='READ', &
@@ -86,6 +82,8 @@ contains
 
             model%trs(i)%v = v
             model%trs(i)%n = normal
+            v_norm = sqrt(normal(1)**2 + normal(2)**2 + normal(3)**2)
+            if (v_norm > 0._wp) model%trs(i)%n = normal/v_norm
         end do
 
         close (iunit)
@@ -102,6 +100,7 @@ contains
         integer :: i, j, iunit, iostat
         character(80) :: line, buffered_line
         logical :: is_buffered
+        real(wp) :: normal(3), v_norm
 
         is_buffered = .false.
 
@@ -150,7 +149,9 @@ contains
             end if
 
             call s_skip_ignored_lines(iunit, buffered_line, is_buffered)
-            read (line(13:), *) model%trs(i)%n
+            read (line(13:), *) normal
+            v_norm = sqrt(normal(1)**2 + normal(2)**2 + normal(3)**2)
+            if (v_norm > 0._wp) model%trs(i)%n = normal/v_norm
 
             call s_skip_ignored_lines(iunit, buffered_line, is_buffered)
             if (is_buffered) then
@@ -821,432 +822,16 @@ contains
 
     end subroutine f_register_edge
 
-    !> This procedure check if interpolation is needed for 2D models.
-    !! @param boundary_v                Temporary edge end vertex buffer
-    !! @param spacing                   Dimensions of the current levelset cell
-    subroutine f_check_interpolation_2D(boundary_v, boundary_edge_count, spacing, interpolate)
-
-        logical, intent(inout) :: interpolate !< Logical indicator of interpolation
-        integer, intent(in) :: boundary_edge_count !< Number of boundary edges
-        real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v
-        real(wp), dimension(1:3), intent(in) :: spacing
-
-        real(wp) :: l1, cell_width !< Length of each boundary edge and cell width
-        integer :: j !< Boundary edge index iterator
-        integer :: interpolate_integer !< integer form of interpolate logical to support GPU checking
-
-        cell_width = minval(spacing(1:2))
-        interpolate_integer = 0
-
-        $:GPU_PARALLEL_LOOP(private='[j,l1]', copyin='[boundary_v,cell_width,Ifactor_2D]', copy='[interpolate_integer]')
-        do j = 1, boundary_edge_count
-            if (interpolate_integer == 0) then
-                l1 = sqrt((boundary_v(j, 2, 1) - boundary_v(j, 1, 1))**2 + &
-                          (boundary_v(j, 2, 2) - boundary_v(j, 1, 2))**2)
-
-                if ((l1*Ifactor_2D > cell_width)) then
-                    interpolate_integer = 1
-                end if
-            end if
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        interpolate = (interpolate_integer == 1)
-
-    end subroutine f_check_interpolation_2D
-
-    !> This procedure check if interpolation is needed for 3D models.
-    !! @param model              Model to search in.
-    !! @param spacing            Dimensions of the current levelset cell
-    !! @param interpolate        Logical output
-    subroutine f_check_interpolation_3D(model, spacing, interpolate)
-
-        logical, intent(inout) :: interpolate
-        type(t_model), intent(in) :: model
-        real(wp), dimension(1:3), intent(in) :: spacing
-        real(wp), dimension(1:3) :: edge_l
-        real(wp) :: cell_width
-        real(wp), dimension(1:model%ntrs, 1:3, 1:3) :: tri_v
-        integer :: i, j, interpolate_integer !< Loop iterator
-
-        cell_width = minval(spacing)
-        interpolate_integer = 0
-
-        ! load up the array of triangles for GPU packing
-        do i = 1, model%ntrs
-            do j = 1, 3
-                tri_v(i, 1, j) = model%trs(i)%v(1, j)
-                tri_v(i, 2, j) = model%trs(i)%v(2, j)
-                tri_v(i, 3, j) = model%trs(i)%v(3, j)
-            end do
-        end do
-
-        ! compare the side of all
-        $:GPU_PARALLEL_LOOP(private='[i,edge_l]', copyin='[cell_width,tri_v,Ifactor_3D]', copy='[interpolate_integer]')
-        do i = 1, model%ntrs
-            if (interpolate_integer == 0) then
-                edge_l(1) = sqrt((tri_v(i, 1, 2) - tri_v(i, 1, 1))**2 + &
-                                 (tri_v(i, 2, 2) - tri_v(i, 2, 1))**2 + &
-                                 (tri_v(i, 3, 2) - tri_v(i, 3, 1))**2)
-                edge_l(2) = sqrt((tri_v(i, 1, 3) - tri_v(i, 1, 2))**2 + &
-                                 (tri_v(i, 2, 3) - tri_v(i, 2, 2))**2 + &
-                                 (tri_v(i, 3, 3) - tri_v(i, 3, 2))**2)
-                edge_l(3) = sqrt((tri_v(i, 1, 1) - tri_v(i, 1, 3))**2 + &
-                                 (tri_v(i, 2, 1) - tri_v(i, 2, 3))**2 + &
-                                 (tri_v(i, 3, 1) - tri_v(i, 3, 3))**2)
-
-                if ((edge_l(1) > cell_width*Ifactor_3D) .or. &
-                    (edge_l(2) > cell_width*Ifactor_3D) .or. &
-                    (edge_l(3) > cell_width*Ifactor_3D)) then
-                    interpolate_integer = 1
-                end if
-            end if
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        interpolate = (interpolate_integer == 1)
-
-    end subroutine f_check_interpolation_3D
-
-    !> This procedure interpolates 2D models.
-    !! @param boundary_v                   Group of all the boundary vertices of the 2D model without interpolation
-    !! @param boundary_edge_count          Output total number of boundary edges
-    !! @param spacing                      Dimensions of the current levelset cell
-    !! @param interpolated_boundary_v      Output all the boundary vertices of the interpolated 2D model
-    !! @param total_vertices               Total number of vertices after interpolation
-    subroutine f_interpolate_2D(boundary_v, boundary_edge_count, spacing, interpolated_boundary_v, total_vertices)
-
-        real(wp), intent(in), dimension(:, :, :) :: boundary_v
-        real(wp), dimension(1:3), intent(in) :: spacing
-        real(wp), allocatable, intent(inout), dimension(:, :) :: interpolated_boundary_v
-
-        integer, intent(inout) :: total_vertices, boundary_edge_count
-        integer :: num_segments, vertex_idx
-        integer :: i, j
-
-        real(wp) :: edge_length, cell_width
-        real(wp), dimension(1:2) :: edge_x, edge_y, edge_del
-
-        ! Get the number of boundary edges
-        cell_width = minval(spacing(1:2))
-        num_segments = 0
-
-        ! First pass: Calculate the total number of vertices including interpolated ones
-        total_vertices = 1
-        $:GPU_PARALLEL_LOOP(private='[i,edge_x,edge_y,edge_length,num_segments]', copyin='[Ifactor_2D]', copy='[total_vertices]')
-        do i = 1, boundary_edge_count
-            ! Get the coordinates of the two ends of the current edge
-            edge_x(1) = boundary_v(i, 1, 1)
-            edge_y(1) = boundary_v(i, 1, 2)
-            edge_x(2) = boundary_v(i, 2, 1)
-            edge_y(2) = boundary_v(i, 2, 2)
-
-            ! Compute the length of the edge
-            edge_length = sqrt((edge_x(2) - edge_x(1))**2 + &
-                               (edge_y(2) - edge_y(1))**2)
-
-            ! Determine the number of segments
-            if (edge_length > cell_width) then
-                num_segments = Ifactor_2D*ceiling(edge_length/cell_width)
-            else
-                num_segments = 1
-            end if
-
-            ! Each edge contributes num_segments vertices
-            $:GPU_ATOMIC(atomic='update')
-            total_vertices = total_vertices + num_segments
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        ! Allocate memory for the new boundary vertices array
-        allocate (interpolated_boundary_v(1:total_vertices, 1:3))
-
-        ! Fill the new boundary vertices array with original and interpolated vertices
-        total_vertices = 1
-        $:GPU_PARALLEL_LOOP(private='[i,edge_x,edge_y,edge_length,num_segments,edge_del,vertex_idx]', copyin='[Ifactor_2D]', copy='[total_vertices,interpolated_boundary_v]')
-        do i = 1, boundary_edge_count
-            ! Get the coordinates of the two ends of the current edge
-            edge_x(1) = boundary_v(i, 1, 1)
-            edge_y(1) = boundary_v(i, 1, 2)
-            edge_x(2) = boundary_v(i, 2, 1)
-            edge_y(2) = boundary_v(i, 2, 2)
-
-            ! Compute the length of the edge
-            edge_length = sqrt((edge_x(2) - edge_x(1))**2 + &
-                               (edge_y(2) - edge_y(1))**2)
-
-            ! Determine the number of segments and interpolation step
-            if (edge_length > cell_width) then
-                num_segments = Ifactor_2D*ceiling(edge_length/cell_width)
-                edge_del(1) = (edge_x(2) - edge_x(1))/num_segments
-                edge_del(2) = (edge_y(2) - edge_y(1))/num_segments
-            else
-                num_segments = 1
-                edge_del(1) = 0._wp
-                edge_del(2) = 0._wp
-            end if
-
-            interpolated_boundary_v(1, 1) = edge_x(1)
-            interpolated_boundary_v(1, 2) = edge_y(1)
-            interpolated_boundary_v(1, 3) = 0._wp
-
-            ! Add original and interpolated vertices to the output array
-            do j = 1, num_segments - 1
-                $:GPU_ATOMIC(atomic='capture')
-                total_vertices = total_vertices + 1
-                vertex_idx = total_vertices
-                $:END_GPU_ATOMIC_CAPTURE()
-                interpolated_boundary_v(vertex_idx, 1) = edge_x(1) + j*edge_del(1)
-                interpolated_boundary_v(vertex_idx, 2) = edge_y(1) + j*edge_del(2)
-            end do
-
-            ! Add the last vertex of the edge
-            if (num_segments > 0) then
-                $:GPU_ATOMIC(atomic='capture')
-                total_vertices = total_vertices + 1
-                vertex_idx = total_vertices
-                $:END_GPU_ATOMIC_CAPTURE()
-                interpolated_boundary_v(vertex_idx, 1) = edge_x(2)
-                interpolated_boundary_v(vertex_idx, 2) = edge_y(2)
-            end if
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-    end subroutine f_interpolate_2D
-
-    !> This procedure interpolates 3D models.
-    !! @param model                        Model to search in.
-    !! @param spacing                      Dimensions of the current levelset cell
-    !! @param interpolated_boundary_v      Output all the boundary vertices of the interpolated 3D model
-    !! @param total_vertices               Total number of vertices after interpolation
-    impure subroutine f_interpolate_3D(model, spacing, interpolated_boundary_v, total_vertices)
-        real(wp), dimension(1:3), intent(in) :: spacing
-        type(t_model), intent(in) :: model
-        real(wp), allocatable, intent(inout), dimension(:, :) :: interpolated_boundary_v
-        integer, intent(out) :: total_vertices
-
-        integer :: i, j, k, num_triangles, num_segments, num_inner_vertices, vertex_idx
-        real(wp), dimension(1:3, 1:3) :: tri
-        real(wp), dimension(1:3) :: edge_del, cell_area
-        real(wp), dimension(1:3) :: bary_coord !< Barycentric coordinates
-        real(wp) :: edge_length, cell_width, cell_area_min, tri_area
-
-        ! GPU-friendly flat arrays packed from model
-        real(wp), allocatable, dimension(:, :, :) :: flat_v  ! (3, 3, num_triangles)
-        real(wp), allocatable, dimension(:, :) :: flat_n  ! (3, num_triangles)
-
-        ! Number of triangles in the model
-        num_triangles = model%ntrs
-        cell_width = minval(spacing)
-
-        ! Find the minimum surface area
-        cell_area(1) = spacing(1)*spacing(2)
-        cell_area(2) = spacing(1)*spacing(3)
-        cell_area(3) = spacing(2)*spacing(3)
-        cell_area_min = minval(cell_area)
-        num_inner_vertices = 0
-
-        ! Pack model into flat arrays on CPU
-        allocate (flat_v(1:3, 1:3, 1:num_triangles))
-        allocate (flat_n(1:3, 1:num_triangles))
-        do i = 1, num_triangles
-            flat_v(:, :, i) = model%trs(i)%v(:, :)
-            flat_n(:, i) = model%trs(i)%n(:)
-        end do
-
-        ! Calculate the total number of vertices including interpolated ones
-        total_vertices = 0
-        $:GPU_PARALLEL_LOOP(private='[i,j,k,tri,edge_length,num_segments,num_inner_vertices]', copyin='[Ifactor_3D,Ifactor_bary_3D,flat_v,flat_n]', copy='[total_vertices]', collapse=1)
-        do i = 1, num_triangles
-            do j = 1, 3
-                ! Get the coordinates of the two vertices of the current edge
-                tri(1, :) = flat_v(j, :, i)
-                ! Next vertex in the triangle (cyclic)
-                tri(2, :) = flat_v(mod(j, 3) + 1, :, i)
-
-                ! Compute the length of the edge
-                edge_length = sqrt((tri(2, 1) - tri(1, 1))**2 + &
-                                   (tri(2, 2) - tri(1, 2))**2 + &
-                                   (tri(2, 3) - tri(1, 3))**2)
-
-                ! Determine the number of segments
-                if (edge_length > cell_width) then
-                    num_segments = Ifactor_3D*ceiling(edge_length/cell_width)
-                else
-                    num_segments = 1
-                end if
-
-                ! Each edge contributes num_segments vertices
-                $:GPU_ATOMIC(atomic='update')
-                total_vertices = total_vertices + num_segments + 1
-            end do
-
-            ! Add vertices inside the triangle
-            do k = 1, 3
-                tri(k, :) = flat_v(k, :, i)
-            end do
-            call f_tri_area(tri, tri_area)
-
-            if (tri_area > threshold_bary*cell_area_min) then
-                num_inner_vertices = Ifactor_bary_3D*ceiling(tri_area/cell_area_min)
-                $:GPU_ATOMIC(atomic='update')
-                total_vertices = total_vertices + num_inner_vertices
-            end if
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        ! Allocate memory for the new boundary vertices array
-        allocate (interpolated_boundary_v(1:total_vertices, 1:3))
-
-        ! Fill the new boundary vertices array with original and interpolated vertices
-        total_vertices = 0
-        $:GPU_PARALLEL_LOOP(private='[i,j,k,tri,edge_length,num_segments,num_inner_vertices,edge_del,bary_coord,vertex_idx]', copyin='[Ifactor_3D,Ifactor_bary_3D]', copy='[total_vertices]', collapse=1)
-        do i = 1, num_triangles
-            ! Loop through the 3 edges of each triangle
-            do j = 1, 3
-                tri(1, :) = flat_v(j, :, i)
-                tri(2, :) = flat_v(mod(j, 3) + 1, :, i)
-
-                ! Compute the length of the edge
-                edge_length = sqrt((tri(2, 1) - tri(1, 1))**2 + &
-                                   (tri(2, 2) - tri(1, 2))**2 + &
-                                   (tri(2, 3) - tri(1, 3))**2)
-
-                ! Determine the number of segments and interpolation step
-                if (edge_length > cell_width) then
-                    num_segments = Ifactor_3D*ceiling(edge_length/cell_width)
-                    edge_del(1) = (tri(2, 1) - tri(1, 1))/num_segments
-                    edge_del(2) = (tri(2, 2) - tri(1, 2))/num_segments
-                    edge_del(3) = (tri(2, 3) - tri(1, 3))/num_segments
-                else
-                    num_segments = 1
-                    edge_del = 0._wp
-                end if
-
-                ! Add original and interpolated vertices to the output array
-                do k = 0, num_segments - 1
-                    $:GPU_ATOMIC(atomic='capture')
-                    total_vertices = total_vertices + 1
-                    vertex_idx = total_vertices
-                    $:END_GPU_ATOMIC_CAPTURE()
-                    interpolated_boundary_v(vertex_idx, 1) = tri(1, 1) + k*edge_del(1)
-                    interpolated_boundary_v(vertex_idx, 2) = tri(1, 2) + k*edge_del(2)
-                    interpolated_boundary_v(vertex_idx, 3) = tri(1, 3) + k*edge_del(3)
-                end do
-
-                ! Add the last vertex of the edge
-                $:GPU_ATOMIC(atomic='capture')
-                total_vertices = total_vertices + 1
-                vertex_idx = total_vertices
-                $:END_GPU_ATOMIC_CAPTURE()
-                interpolated_boundary_v(vertex_idx, 1) = tri(2, 1)
-                interpolated_boundary_v(vertex_idx, 2) = tri(2, 2)
-                interpolated_boundary_v(vertex_idx, 3) = tri(2, 3)
-            end do
-
-            ! Interpolate verties that are not on edges
-            do k = 1, 3
-                tri(k, :) = flat_v(k, :, i)
-            end do
-            call f_tri_area(tri, tri_area)
-
-            if (tri_area > threshold_bary*cell_area_min) then
-                num_inner_vertices = Ifactor_bary_3D*ceiling(tri_area/cell_area_min)
-                !Use barycentric coordinates for randomly distributed points
-
-                ! Deterministic seed per triangle
-                rand_seed = i*73856093 + 19349663
-                if (rand_seed == 0) rand_seed = 1
-
-                do k = 1, num_inner_vertices
-                    bary_coord(1) = f_model_random_number(rand_seed)
-                    bary_coord(2) = f_model_random_number(rand_seed)
-
-                    if ((bary_coord(1) + bary_coord(2)) >= 1._wp) then
-                        bary_coord(1) = 1._wp - bary_coord(1)
-                        bary_coord(2) = 1._wp - bary_coord(2)
-                    end if
-                    bary_coord(3) = 1._wp - bary_coord(1) - bary_coord(2)
-
-                    $:GPU_ATOMIC(atomic='update')
-                    total_vertices = total_vertices + 1
-                    interpolated_boundary_v(total_vertices, 1) = dot_product(bary_coord, tri(1:3, 1))
-                    interpolated_boundary_v(total_vertices, 2) = dot_product(bary_coord, tri(1:3, 2))
-                    interpolated_boundary_v(total_vertices, 3) = dot_product(bary_coord, tri(1:3, 3))
-                end do
-            end if
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        deallocate (flat_v, flat_n)
-
-    end subroutine f_interpolate_3D
-
-    !> This procedure determines the levelset distance and normals of the 3D models without interpolation.
-    !! @param model        Model to search in.
-    !! @param point        The cell centers of the current level cell
-    !! @param normals      The output levelset normals
-    !! @param distance     The output levelset distance
-    subroutine f_distance_normals_3D(model, point, normals, distance)
-
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        type(t_model), intent(IN) :: model
-        real(wp), dimension(1:3), intent(in) :: point
-        real(wp), dimension(1:3), intent(out) :: normals
-        real(wp), intent(out) :: distance
-
-        real(wp), dimension(1:3, 1:3) :: tri
-        real(wp) :: dist_min, dist_t_min
-        real(wp) :: dist_min_normal, dist_buffer_normal
-        real(wp), dimension(1:3) :: midp !< Centers of the triangle facets
-        real(wp), dimension(1:3) :: dist_buffer !< Distance between the cell center and the vertices
-        integer :: i, j, tri_idx !< Iterator
-
-        dist_min = 1.e12_wp
-        dist_min_normal = 1.e12_wp
-        distance = 0._wp
-
-        tri_idx = 0
-        do i = 1, model%ntrs
-            do j = 1, 3
-                tri(j, 1) = model%trs(i)%v(j, 1)
-                tri(j, 2) = model%trs(i)%v(j, 2)
-                tri(j, 3) = model%trs(i)%v(j, 3)
-                dist_buffer(j) = sqrt((point(1) - tri(j, 1))**2 + &
-                                      (point(2) - tri(j, 2))**2 + &
-                                      (point(3) - tri(j, 3))**2)
-            end do
-
-            ! Get the surface center of each triangle facet
-            do j = 1, 3
-                midp(j) = (tri(1, j) + tri(2, j) + tri(3, j))/3
-            end do
-
-            dist_t_min = minval(dist_buffer(1:3))
-            dist_buffer_normal = sqrt((point(1) - midp(1))**2 + &
-                                      (point(2) - midp(2))**2 + &
-                                      (point(3) - midp(3))**2)
-
-            if (dist_t_min < dist_min) then
-                dist_min = dist_t_min
-            end if
-
-            if (dist_buffer_normal < dist_min_normal) then
-                dist_min_normal = dist_buffer_normal
-                tri_idx = i
-            end if
-        end do
-
-        normals(1) = model%trs(tri_idx)%n(1)
-        normals(2) = model%trs(tri_idx)%n(2)
-        normals(3) = model%trs(tri_idx)%n(3)
-        distance = dist_min
-
-    end subroutine f_distance_normals_3D
-
-    subroutine f_distance_normals_3D_flat(ntrs, trs_v, trs_n, pid, point, normals, distance)
-
+    !> This procedure determines the levelset distance and normals of 3D models
+    !! by computing the exact closest point via projection onto triangle surfaces.
+    !! @param ntrs                  Number of triangles for this patch
+    !! @param trs_v                 Flat GPU array of triangle vertices for all patches
+    !! @param trs_n                 Flat GPU array of triangle normals for all patches
+    !! @param pid                   Patch index into the arrays
+    !! @param point                 The cell center of the current levelset cell
+    !! @param normals               Output levelset normals
+    !! @param distance              Output levelset distance
+    subroutine f_distance_normals_3D(ntrs, trs_v, trs_n, pid, point, normals, distance)
         $:GPU_ROUTINE(parallelism='[seq]')
 
         integer, intent(in) :: ntrs
@@ -1257,186 +842,193 @@ contains
         real(wp), dimension(1:3), intent(out) :: normals
         real(wp), intent(out) :: distance
 
-        real(wp), dimension(1:3, 1:3) :: tri
-        real(wp) :: dist_min, dist_t_min
-        real(wp) :: dist_min_normal, dist_buffer_normal
-        real(wp), dimension(1:3) :: midp
-        real(wp), dimension(1:3) :: dist_buffer
-        integer :: i, j, tri_idx
+        integer :: i, j
+        real(wp) :: dist_min, dist_proj, dist_v, dist_e, t
+        real(wp) :: v1(1:3), v2(1:3), v3(1:3)
+        real(wp) :: e0(1:3), e1(1:3), pv(1:3)
+        real(wp) :: n(1:3), proj(1:3)
+        real(wp) :: d, ndot, denom
+        real(wp) :: u, v_bary, w
+        real(wp) :: d00, d01, d11, d20, d21
+        real(wp) :: edge(1:3), pe(1:3)
+        real(wp) :: verts(1:3, 1:3)
 
         dist_min = 1.e12_wp
-        dist_min_normal = 1.e12_wp
-        distance = 0._wp
+        normals = 0._wp
 
-        tri_idx = 0
         do i = 1, ntrs
-            do j = 1, 3
-                tri(j, 1) = trs_v(j, 1, i, pid)
-                tri(j, 2) = trs_v(j, 2, i, pid)
-                tri(j, 3) = trs_v(j, 3, i, pid)
-                dist_buffer(j) = sqrt((point(1) - tri(j, 1))**2 + &
-                                      (point(2) - tri(j, 2))**2 + &
-                                      (point(3) - tri(j, 3))**2)
-            end do
+            ! Triangle vertices
+            v1(:) = trs_v(1, :, i, pid)
+            v2(:) = trs_v(2, :, i, pid)
+            v3(:) = trs_v(3, :, i, pid)
+
+            ! Triangle normal
+            n(:) = trs_n(:, i, pid)
+
+            ! Project point onto triangle plane
+            pv(:) = point(:) - v1(:)
+            d = dot_product(pv, n)
+            if (abs(d) >= dist_min) cycle ! minimum distance is not small enough, no need to check validity
+            proj(:) = point(:) - d*n(:)
+
+            ! Check if projection is inside triangle using barycentric coordinates
+            e0(:) = v2(:) - v1(:)
+            e1(:) = v3(:) - v1(:)
+            pv(:) = proj(:) - v1(:)
+
+            d00 = dot_product(e0, e0)
+            d01 = dot_product(e0, e1)
+            d11 = dot_product(e1, e1)
+            d20 = dot_product(pv, e0)
+            d21 = dot_product(pv, e1)
+
+            denom = d00*d11 - d01*d01
+
+            if (abs(denom) > 0._wp) then
+                v_bary = (d11*d20 - d01*d21)/denom
+                w = (d00*d21 - d01*d20)/denom
+                u = 1._wp - v_bary - w
+            else
+                u = -1._wp
+                v_bary = -1._wp
+                w = -1._wp
+            end if
 
-            do j = 1, 3
-                midp(j) = (tri(1, j) + tri(2, j) + tri(3, j))/3
-            end do
+            ! If projection is inside triangle
+            if (u >= 0._wp .and. v_bary >= 0._wp .and. w >= 0._wp) then
+                dist_proj = sqrt((point(1) - proj(1))**2 + &
+                                 (point(2) - proj(2))**2 + &
+                                 (point(3) - proj(3))**2)
 
-            dist_t_min = minval(dist_buffer(1:3))
-            dist_buffer_normal = sqrt((point(1) - midp(1))**2 + &
-                                      (point(2) - midp(2))**2 + &
-                                      (point(3) - midp(3))**2)
+                if (dist_proj < dist_min) then
+                    dist_min = dist_proj
+                    normals(:) = n(:)
+                end if
+            else
+                ! Projection outside triangle: check edges and vertices
+                verts(:, 1) = v1(:)
+                verts(:, 2) = v2(:)
+                verts(:, 3) = v3(:)
+
+                ! Check three edges
+                do j = 1, 3
+                    edge(:) = verts(:, mod(j, 3) + 1) - verts(:, j)
+                    pe(:) = point(:) - verts(:, j)
+
+                    t = dot_product(pe, edge)/max(dot_product(edge, edge), 1.e-30_wp)
+
+                    if (t >= 0._wp .and. t <= 1._wp) then
+                        proj(:) = verts(:, j) + t*edge(:)
+                        dist_e = sqrt((point(1) - proj(1))**2 + &
+                                      (point(2) - proj(2))**2 + &
+                                      (point(3) - proj(3))**2)
+
+                        if (dist_e < dist_min) then
+                            dist_min = dist_e
+                            normals(:) = n(:)
+                        end if
+                    end if
+                end do
 
-            if (dist_t_min < dist_min) then
-                dist_min = dist_t_min
-            end if
+                ! Check three vertices
+                do j = 1, 3
+                    dist_v = sqrt((point(1) - verts(1, j))**2 + &
+                                  (point(2) - verts(2, j))**2 + &
+                                  (point(3) - verts(3, j))**2)
 
-            if (dist_buffer_normal < dist_min_normal) then
-                dist_min_normal = dist_buffer_normal
-                tri_idx = i
+                    if (dist_v < dist_min) then
+                        dist_min = dist_v
+                        normals(:) = n(:)
+                    end if
+                end do
             end if
         end do
 
-        normals(1) = trs_n(1, tri_idx, pid)
-        normals(2) = trs_n(2, tri_idx, pid)
-        normals(3) = trs_n(3, tri_idx, pid)
         distance = dist_min
 
-    end subroutine f_distance_normals_3D_flat
-
-    !> This procedure determines the levelset distance of 2D models without interpolation.
-    !! @param boundary_v                   Group of all the boundary vertices of the 2D model without interpolation
-    !! @param boundary_edge_count          Output the total number of boundary edges
-    !! @param point                        The cell centers of the current levelset cell
-    !! @return                             Distance which the levelset distance without interpolation
-    function f_distance(boundary_v, boundary_edge_count, point) result(distance)
-
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        integer, intent(in) :: boundary_edge_count
-        real(wp), intent(in), dimension(:, :, :) :: boundary_v
-        ! real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v
-        real(wp), dimension(1:3), intent(in) :: point
-
-        integer :: i
-        real(wp) :: dist_buffer1, dist_buffer2
-        real(wp), dimension(1:boundary_edge_count) :: dist_buffer
-        real(wp) :: distance
-
-        distance = 0._wp
-        do i = 1, boundary_edge_count
-            dist_buffer1 = sqrt((point(1) - boundary_v(i, 1, 1))**2 + &
-                                & (point(2) - boundary_v(i, 1, 2))**2)
-
-            dist_buffer2 = sqrt((point(1) - boundary_v(i, 2, 1))**2 + &
-                                & (point(2) - boundary_v(i, 2, 2))**2)
-
-            dist_buffer(i) = minval((/dist_buffer1, dist_buffer2/))
-        end do
-
-        distance = minval(dist_buffer)
-
-    end function f_distance
-
-    !> This procedure determines the levelset normals of 2D models without interpolation.
-    !! @param boundary_v                   Group of all the boundary vertices of the 2D model without interpolation
-    !! @param boundary_edge_count          Output the total number of boundary edges
-    !! @param point                        The cell centers of the current levelset cell
-    !! @param normals                      Output levelset normals without interpolation
-    subroutine f_normals(boundary_v, boundary_edge_count, point, normals)
+    end subroutine f_distance_normals_3D
 
+    !> This procedure determines the levelset distance and normals of 2D models
+    !! by computing the exact closest point via projection onto boundary edges.
+    !! @param boundary_v            Flat GPU array of boundary vertices/normals for all patches
+    !! @param pid                   Patch index into the boundary_v array
+    !! @param boundary_edge_count   Total number of boundary edges for this patch
+    !! @param point                 The cell center of the current levelset cell
+    !! @param normals               Output levelset normals
+    !! @param distance              Output levelset distance
+    subroutine f_distance_normals_2D(boundary_v, pid, boundary_edge_count, point, normals, distance)
         $:GPU_ROUTINE(parallelism='[seq]')
 
+        real(wp), dimension(:, :, :, :), intent(in) :: boundary_v
+        integer, intent(in) :: pid
         integer, intent(in) :: boundary_edge_count
-        real(wp), intent(in), dimension(:, :, :) :: boundary_v
         real(wp), dimension(1:3), intent(in) :: point
         real(wp), dimension(1:3), intent(out) :: normals
+        real(wp), intent(out) :: distance
 
-        integer :: i, idx_buffer
-        real(wp) :: dist_min, dist_buffer
-        real(wp) :: midp(1:3)
+        integer :: i
+        real(wp) :: dist_min, dist_proj, dist_v1, dist_v2, t
+        real(wp) :: v1(1:2), v2(1:2), edge(1:2), pv(1:2)
+        real(wp) :: edge_len_sq, proj(1:2)
 
-        dist_buffer = 0._wp
-        dist_min = initial_distance_buffer
-        idx_buffer = 0
+        dist_min = 1.e12_wp
+        normals = 0._wp
 
         do i = 1, boundary_edge_count
-            midp(1) = (boundary_v(i, 2, 1) + boundary_v(i, 1, 1))/2
-            midp(2) = (boundary_v(i, 2, 2) + boundary_v(i, 1, 2))/2
-            midp(3) = 0._wp
-
-            dist_buffer = sqrt((point(1) - midp(1))**2 + &
-                                & (point(2) - midp(2))**2)
-
-            if (dist_buffer < dist_min) then
-                dist_min = dist_buffer
-                idx_buffer = i
+            ! Edge endpoints
+            v1(1) = boundary_v(i, 1, 1, pid)
+            v1(2) = boundary_v(i, 1, 2, pid)
+            v2(1) = boundary_v(i, 2, 1, pid)
+            v2(2) = boundary_v(i, 2, 2, pid)
+
+            ! Edge vector and point-to-v1 vector
+            edge(1) = v2(1) - v1(1)
+            edge(2) = v2(2) - v1(2)
+            pv(1) = point(1) - v1(1)
+            pv(2) = point(2) - v1(2)
+
+            edge_len_sq = dot_product(edge, edge)
+
+            ! Parameter of projection onto the edge line
+            if (edge_len_sq > 0._wp) then
+                t = dot_product(pv, edge)/edge_len_sq
+            else
+                t = 0._wp
             end if
-        end do
-
-        normals(1) = boundary_v(idx_buffer, 3, 1)
-        normals(2) = boundary_v(idx_buffer, 3, 2)
-        normals(3) = 0._wp
-
-    end subroutine f_normals
-
-    !> This procedure calculates the barycentric facet area
-    subroutine f_tri_area(tri, tri_area)
-
-        real(wp), dimension(1:3, 1:3), intent(in) :: tri
-        real(wp), intent(out) :: tri_area
-        real(wp), dimension(1:3) :: AB, AC, cross
-        integer :: i !< Loop iterator
-
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        do i = 1, 3
-            AB(i) = tri(2, i) - tri(1, i)
-            AC(i) = tri(3, i) - tri(1, i)
-        end do
-
-        cross(1) = AB(2)*AC(3) - AB(3)*AC(2)
-        cross(2) = AB(3)*AC(1) - AB(1)*AC(3)
-        cross(3) = AB(1)*AC(2) - AB(2)*AC(1)
-        tri_area = 0.5_wp*sqrt(cross(1)**2 + cross(2)**2 + cross(3)**2)
-
-    end subroutine f_tri_area
 
-    !> This procedure determines the levelset of interpolated 2D models.
-    !! @param interpolated_boundary_v      Group of all the boundary vertices of the interpolated 2D model
-    !! @param total_vertices               Total number of vertices after interpolation
-    !! @param point                        The cell centers of the current levelset cell
-    !! @return                             Distance which the levelset distance without interpolation
-    function f_interpolated_distance(interpolated_boundary_v, total_vertices, point) result(distance)
+            ! Check if projection falls within the segment
+            if (t >= 0._wp .and. t <= 1._wp) then
+                proj(1) = v1(1) + t*edge(1)
+                proj(2) = v1(2) + t*edge(2)
+                dist_proj = sqrt((point(1) - proj(1))**2 + (point(2) - proj(2))**2)
 
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        integer, intent(in) :: total_vertices
-        real(wp), intent(in), dimension(:, :) :: interpolated_boundary_v
-        real(wp), dimension(1:3), intent(in) :: point
-
-        integer :: i !< Loop iterator
-        real(wp) :: dist_buffer, min_dist
-        real(wp) :: distance
-
-        distance = initial_distance_buffer
-        dist_buffer = initial_distance_buffer
-        min_dist = initial_distance_buffer
-
-        do i = 1, total_vertices
-            dist_buffer = sqrt((point(1) - interpolated_boundary_v(i, 1))**2 + &
-                               (point(2) - interpolated_boundary_v(i, 2))**2 + &
-                               (point(3) - interpolated_boundary_v(i, 3))**2)
+                if (dist_proj < dist_min) then
+                    dist_min = dist_proj
+                    normals(1) = boundary_v(i, 3, 1, pid)
+                    normals(2) = boundary_v(i, 3, 2, pid)
+                end if
+            else
+                ! Check distance to first vertex
+                dist_v1 = sqrt(pv(1)*pv(1) + pv(2)*pv(2))
+                if (dist_v1 < dist_min) then
+                    dist_min = dist_v1
+                    normals(1) = boundary_v(i, 3, 1, pid)
+                    normals(2) = boundary_v(i, 3, 2, pid)
+                end if
 
-            if (min_dist > dist_buffer) then
-                min_dist = dist_buffer
+                ! Check distance to second vertex
+                dist_v2 = sqrt((point(1) - v2(1))**2 + (point(2) - v2(2))**2)
+                if (dist_v2 < dist_min) then
+                    dist_min = dist_v2
+                    normals(1) = boundary_v(i, 3, 1, pid)
+                    normals(2) = boundary_v(i, 3, 2, pid)
+                end if
             end if
         end do
 
-        distance = min_dist
+        distance = dist_min
 
-    end function f_interpolated_distance
+    end subroutine f_distance_normals_2D
 
     subroutine s_pack_model_for_gpu(ma)
         type(t_model_array), intent(inout) :: ma
diff --git a/src/pre_process/m_icpp_patches.fpp b/src/pre_process/m_icpp_patches.fpp
index 70af860ce0..0a2a6fe85c 100644
--- a/src/pre_process/m_icpp_patches.fpp
+++ b/src/pre_process/m_icpp_patches.fpp
@@ -1601,7 +1601,6 @@ contains
         real(wp) :: normals(1:3) !< Boundary normal buffer
         integer :: boundary_vertex_count, boundary_edge_count, total_vertices !< Boundary vertex
         real(wp), allocatable, dimension(:, :, :) :: boundary_v !< Boundary vertex buffer
-        real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v !< Interpolated vertex buffer
         real(wp) :: distance !< Levelset distance buffer
         logical :: interpolate !< Logical variable to determine whether or not the model should be interpolated
 
@@ -1655,35 +1654,11 @@ contains
 
         call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
 
-        ! Check if the model needs interpolation
-        if (p > 0) then
-            call f_check_interpolation_3D(model, (/dx, dy, dz/), interpolate)
-        else
-            call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolate)
-        end if
-
         ! Show the number of edges and boundary edges in 2D STL models
         if (proc_rank == 0 .and. p == 0) then
             print *, ' * Number of 2D model boundary edges:', boundary_edge_count
         end if
 
-        ! Interpolate the STL model along the edges (2D) and on triangle facets (3D)
-        if (interpolate) then
-            if (proc_rank == 0) then
-                print *, ' * Interpolating STL vertices.'
-            end if
-
-            if (p > 0) then
-                call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-            else
-                call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-            end if
-
-            if (proc_rank == 0) then
-                print *, ' * Total number of interpolated boundary vertices:', total_vertices
-            end if
-        end if
-
         if (proc_rank == 0) then
             write (*, "(A, 3(2X, F20.10))") "    > Model:  Min:", bbox%min(1:3)
             write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (bbox%min(1:3) + bbox%max(1:3))/2._wp
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 8b04f66b11..19301f2e30 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -647,7 +647,6 @@ contains
         type(ghost_point), intent(inout) :: gp
 
         integer :: i, j, k, patch_id, boundary_edge_count, total_vertices
-        integer :: interpolate
         real(wp), dimension(1:3) :: center, xyz_local
         real(wp) :: normals(1:3) !< Boundary normal buffer
         real(wp) :: distance
@@ -659,7 +658,6 @@ contains
         k = gp%loc(3)
 
         ! load in model values
-        interpolate = gpu_interpolate(patch_id)
         boundary_edge_count = gpu_boundary_edge_count(patch_id)
         total_vertices = gpu_total_vertices(patch_id)
 
@@ -686,44 +684,26 @@ contains
 
         ! 3D models
         if (p > 0) then
-
             ! Get the boundary normals and shortest distance between the cell center and the model boundary
-            call f_distance_normals_3D_flat(gpu_ntrs(patch_id), gpu_trs_v, gpu_trs_n, patch_id, xyz_local, normals, distance)
-
-            ! Get the shortest distance between the cell center and the interpolated model boundary
-            if (interpolate == 1) then
-                gp%levelset = f_interpolated_distance(gpu_interpolated_boundary_v(:, :, patch_id), total_vertices, xyz_local)
-            else
-                gp%levelset = distance
-            end if
+            call f_distance_normals_3D(gpu_ntrs(patch_id), gpu_trs_v, gpu_trs_n, patch_id, xyz_local, normals, distance)
 
-            ! Correct the sign of the levelset
+            ! Get the shortest distance between the cell center and the model boundary
+            gp%levelset = distance
             gp%levelset = -abs(gp%levelset)
 
             ! Assign the levelset_norm
             gp%levelset_norm = matmul(rotation, normals(1:3))
         else
             ! 2D models
-            if (interpolate == 1) then
-                ! Get the shortest distance between the cell center and the model boundary
-                gp%levelset = f_interpolated_distance(gpu_interpolated_boundary_v(:, :, patch_id), total_vertices, xyz_local)  ! Change 7
-            else
-                gp%levelset = f_distance(gpu_boundary_v(:, :, :, patch_id), boundary_edge_count, xyz_local)  ! Change 8
-            end if
-
-            ! Correct the sign of the levelset
-            gp%levelset = -abs(gp%levelset)
-
-            ! Get the boundary normals
-            call f_normals(gpu_boundary_v(:, :, :, patch_id), &
-                           boundary_edge_count, &
-                           xyz_local, &
-                           normals)
-
-            ! Assign the levelset_norm
+            call f_distance_normals_2D(gpu_boundary_v, patch_id, &
+                                       boundary_edge_count, &
+                                       xyz_local, normals, distance)
+            gp%levelset = -abs(distance)
             gp%levelset_norm = matmul(rotation, normals(1:3))
         end if
 
+        print *, gp%levelset, gp%levelset_norm
+
     end subroutine s_model_levelset
 
 end module m_compute_levelset
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 050bfdb35b..686ba6fbb3 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -962,9 +962,7 @@ contains
         real(wp) :: normals(1:3) !< Boundary normal buffer
         integer :: boundary_vertex_count, boundary_edge_count, total_vertices !< Boundary vertex
         real(wp), allocatable, dimension(:, :, :) :: boundary_v !< Boundary vertex buffer
-        real(wp), allocatable, dimension(:, :) :: interpolated_boundary_v !< Interpolated vertex buffer
         real(wp) :: dx_local, dy_local, dz_local !< Levelset distance buffer
-        logical :: interpolate !< Logical variable to determine whether or not the model should be interpolated
 
         integer :: i, j, k !< Generic loop iterators
         integer :: patch_id
@@ -1023,35 +1021,11 @@ contains
                 ! Need the cells that form the boundary of the flat projection in 2D
                 if (p == 0) call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
 
-                ! Check if the model needs interpolation
-                if (p > 0) then
-                    call f_check_interpolation_3D(model, (/dx_local, dy_local, dz_local/), interpolate)
-                else
-                    call f_check_interpolation_2D(boundary_v, boundary_edge_count, (/dx_local, dy_local, 0._wp/), interpolate)
-                end if
-
                 ! Show the number of edges and boundary edges in 2D STL models
                 if (proc_rank == 0 .and. p == 0) then
                     print *, ' * Number of 2D model boundary edges:', boundary_edge_count
                 end if
 
-                ! Interpolate the STL model along the edges (2D) and on triangle facets (3D)
-                if (interpolate) then
-                    if (proc_rank == 0) then
-                        print *, ' * Interpolating STL vertices.'
-                    end if
-
-                    if (p > 0) then
-                        call f_interpolate_3D(model, (/dx, dy, dz/), interpolated_boundary_v, total_vertices)
-                    else
-                        call f_interpolate_2D(boundary_v, boundary_edge_count, (/dx, dy, 0._wp/), interpolated_boundary_v, total_vertices)
-                    end if
-
-                    if (proc_rank == 0) then
-                        print *, ' * Total number of interpolated boundary vertices:', total_vertices
-                    end if
-                end if
-
                 if (proc_rank == 0) then
                     write (*, "(A, 3(2X, F20.10))") "    > Model:  Min:", bbox%min(1:3)
                     write (*, "(A, 3(2X, F20.10))") "    >         Cen:", (bbox%min(1:3) + bbox%max(1:3))/2._wp
@@ -1079,16 +1053,6 @@ contains
                 models(patch_id)%model = model
                 models(patch_id)%boundary_v = boundary_v
                 models(patch_id)%boundary_edge_count = boundary_edge_count
-                if (interpolate) then
-                    models(patch_id)%interpolate = 1
-                else
-                    models(patch_id)%interpolate = 0
-                end if
-                if (interpolate) then
-                    models(patch_id)%interpolated_boundary_v = interpolated_boundary_v
-                    models(patch_id)%total_vertices = total_vertices
-                end if
-
             end if
         end do
 
@@ -1111,24 +1075,18 @@ contains
                     max_bv2 = max(max_bv2, size(models(pid)%boundary_v, 2))
                     max_bv3 = max(max_bv3, size(models(pid)%boundary_v, 3))
                 end if
-                if (allocated(models(pid)%interpolated_boundary_v)) then
-                    max_iv1 = max(max_iv1, size(models(pid)%interpolated_boundary_v, 1))
-                    max_iv2 = max(max_iv2, size(models(pid)%interpolated_boundary_v, 2))
-                end if
             end do
 
             if (max_ntrs > 0) then
                 allocate (gpu_ntrs(1:num_ibs))
                 allocate (gpu_trs_v(1:3, 1:3, 1:max_ntrs, 1:num_ibs))
                 allocate (gpu_trs_n(1:3, 1:max_ntrs, 1:num_ibs))
-                allocate (gpu_interpolate(1:num_ibs))
                 allocate (gpu_boundary_edge_count(1:num_ibs))
                 allocate (gpu_total_vertices(1:num_ibs))
 
                 gpu_ntrs = 0
                 gpu_trs_v = 0._wp
                 gpu_trs_n = 0._wp
-                gpu_interpolate = 0
                 gpu_boundary_edge_count = 0
                 gpu_total_vertices = 0
 
@@ -1137,17 +1095,11 @@ contains
                     gpu_boundary_v = 0._wp
                 end if
 
-                if (max_iv1 > 0) then
-                    allocate (gpu_interpolated_boundary_v(1:max_iv1, 1:max_iv2, 1:num_ibs))
-                    gpu_interpolated_boundary_v = 0._wp
-                end if
-
                 do pid = 1, num_ibs
                     if (allocated(models(pid)%model)) then
                         gpu_ntrs(pid) = models(pid)%ntrs
                         gpu_trs_v(:, :, 1:models(pid)%ntrs, pid) = models(pid)%trs_v
                         gpu_trs_n(:, 1:models(pid)%ntrs, pid) = models(pid)%trs_n
-                        gpu_interpolate(pid) = models(pid)%interpolate
                         gpu_boundary_edge_count(pid) = models(pid)%boundary_edge_count
                         gpu_total_vertices(pid) = models(pid)%total_vertices
                     end if
@@ -1156,19 +1108,12 @@ contains
                                        1:size(models(pid)%boundary_v, 2), &
                                        1:size(models(pid)%boundary_v, 3), pid) = models(pid)%boundary_v
                     end if
-                    if (allocated(models(pid)%interpolated_boundary_v) .and. gpu_interpolate(pid) == 1) then
-                        gpu_interpolated_boundary_v(1:size(models(pid)%interpolated_boundary_v, 1), &
-                                                    1:size(models(pid)%interpolated_boundary_v, 2), pid) = models(pid)%interpolated_boundary_v
-                    end if
                 end do
 
-                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n, gpu_interpolate, gpu_boundary_edge_count, gpu_total_vertices]')
+                $:GPU_ENTER_DATA(copyin='[gpu_ntrs, gpu_trs_v, gpu_trs_n, gpu_boundary_edge_count, gpu_total_vertices]')
                 if (allocated(gpu_boundary_v)) then
                     $:GPU_ENTER_DATA(copyin='[gpu_boundary_v]')
                 end if
-                if (allocated(gpu_interpolated_boundary_v)) then
-                    $:GPU_ENTER_DATA(copyin='[gpu_interpolated_boundary_v]')
-                end if
             end if
         end block
 
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index a7f1e20aad..3a02eec5e7 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -140,19 +140,6 @@ contains
 
     end subroutine s_ibm_setup
 
-    !> @brief Exchanges immersed boundary marker data across MPI subdomain boundaries.
-    subroutine s_populate_ib_buffers()
-
-        #:for DIRC, DIRI in [('x', 1), ('y', 2), ('z', 3)]
-            #:for LOCC, LOCI in [('beg', -1), ('end', 1)]
-                if (bc_${DIRC}$%${LOCC}$ >= 0) then
-                    call s_mpi_sendrecv_ib_buffers(ib_markers, ${DIRI}$, ${LOCI}$)
-                end if
-            #:endfor
-        #:endfor
-
-    end subroutine s_populate_ib_buffers
-
     !>  Subroutine that updates the conservative variables at the ghost points
         !!  @param pb_in Internal bubble pressure
         !!  @param mv_in Mass of vapor in bubble
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index be737782be..3da31fbac8 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -255,191 +255,6 @@ contains
 
     end subroutine s_mpi_bcast_user_inputs
 
-    !> @brief Packs, exchanges, and unpacks immersed boundary marker buffers between neighboring MPI ranks.
-    subroutine s_mpi_sendrecv_ib_buffers(ib_markers, mpi_dir, pbc_loc)
-
-        type(integer_field), intent(inout) :: ib_markers
-
-        integer, intent(in) :: mpi_dir, pbc_loc
-
-        integer :: i, j, k, l, r, q !< Generic loop iterators
-
-        integer :: buffer_counts(1:3), buffer_count
-
-        type(int_bounds_info) :: boundary_conditions(1:3)
-        integer :: beg_end(1:2), grid_dims(1:3)
-        integer :: dst_proc, src_proc, recv_tag, send_tag
-
-        logical :: beg_end_geq_0, qbmm_comm
-
-        integer :: pack_offset, unpack_offset
-
-#ifdef MFC_MPI
-        integer :: ierr !< Generic flag used to identify and report MPI errors
-
-        call nvtxStartRange("IB-MARKER-COMM-PACKBUF")
-
-        buffer_counts = (/ &
-                        buff_size*(n + 1)*(p + 1), &
-                        buff_size*(m + 2*buff_size + 1)*(p + 1), &
-                        buff_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
-                        /)
-
-        buffer_count = buffer_counts(mpi_dir)
-        boundary_conditions = (/bc_x, bc_y, bc_z/)
-        beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/)
-        beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0
-
-        ! Implements:
-        ! pbc_loc  bc_x >= 0 -> [send/recv]_tag  [dst/src]_proc
-        ! -1 (=0)      0            ->     [1,0]       [0,0]      | 0 0 [1,0] [beg,beg]
-        ! -1 (=0)      1            ->     [0,0]       [1,0]      | 0 1 [0,0] [end,beg]
-        ! +1 (=1)      0            ->     [0,1]       [1,1]      | 1 0 [0,1] [end,end]
-        ! +1 (=1)      1            ->     [1,1]       [0,1]      | 1 1 [1,1] [beg,end]
-
-        send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1))
-        recv_tag = f_logical_to_int(pbc_loc == 1)
-
-        dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0)))
-        src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1))
-
-        grid_dims = (/m, n, p/)
-
-        pack_offset = 0
-        if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
-            pack_offset = grid_dims(mpi_dir) - buff_size + 1
-        end if
-
-        unpack_offset = 0
-        if (pbc_loc == 1) then
-            unpack_offset = grid_dims(mpi_dir) + buff_size + 1
-        end if
-
-        ! Pack Buffer to Send
-        #:for mpi_dir in [1, 2, 3]
-            if (mpi_dir == ${mpi_dir}$) then
-                #:if mpi_dir == 1
-                    $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]')
-                    do l = 0, p
-                        do k = 0, n
-                            do j = 0, buff_size - 1
-                                r = (j + buff_size*(k + (n + 1)*l))
-                                ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]')
-                    do l = 0, p
-                        do k = 0, buff_size - 1
-                            do j = -buff_size, m + buff_size
-                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
-                                     (k + buff_size*l))
-                                ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:else
-                    $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]')
-                    do l = 0, buff_size - 1
-                        do k = -buff_size, n + buff_size
-                            do j = -buff_size, m + buff_size
-                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
-                                     ((k + buff_size) + (n + 2*buff_size + 1)*l))
-                                ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:endif
-            end if
-        #:endfor
-        call nvtxEndRange ! Packbuf
-
-        #:for rdma_mpi in [False, True]
-            if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
-                #:if rdma_mpi
-                    #:call GPU_HOST_DATA(use_device_addr='[ib_buff_send, ib_buff_recv]')
-
-                        call nvtxStartRange("IB-MARKER-SENDRECV-RDMA")
-                        call MPI_SENDRECV( &
-                            ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
-                            ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
-                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                        call nvtxEndRange
-
-                    #:endcall GPU_HOST_DATA
-                    $:GPU_WAIT()
-                #:else
-                    call nvtxStartRange("IB-MARKER-DEV2HOST")
-                    $:GPU_UPDATE(host='[ib_buff_send]')
-                    call nvtxEndRange
-
-                    call nvtxStartRange("IB-MARKER-SENDRECV-NO-RMDA")
-                    call MPI_SENDRECV( &
-                        ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, &
-                        ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, &
-                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-                    call nvtxEndRange
-
-                    call nvtxStartRange("IB-MARKER-HOST2DEV")
-                    $:GPU_UPDATE(device='[ib_buff_recv]')
-                    call nvtxEndRange
-                #:endif
-            end if
-        #:endfor
-
-        ! Unpack Received Buffer
-        call nvtxStartRange("IB-MARKER-COMM-UNPACKBUF")
-        #:for mpi_dir in [1, 2, 3]
-            if (mpi_dir == ${mpi_dir}$) then
-                #:if mpi_dir == 1
-                    $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]')
-                    do l = 0, p
-                        do k = 0, n
-                            do j = -buff_size, -1
-                                r = (j + buff_size*((k + 1) + (n + 1)*l))
-                                ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:elif mpi_dir == 2
-                    $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]')
-                    do l = 0, p
-                        do k = -buff_size, -1
-                            do j = -buff_size, m + buff_size
-                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
-                                     ((k + buff_size) + buff_size*l))
-                                ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:else
-                    ! Unpacking buffer from bc_z%beg
-                    $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]')
-                    do l = -buff_size, -1
-                        do k = -buff_size, n + buff_size
-                            do j = -buff_size, m + buff_size
-                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
-                                     ((k + buff_size) + (n + 2*buff_size + 1)* &
-                                      (l + buff_size)))
-                                ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:endif
-            end if
-        #:endfor
-        call nvtxEndRange
-#endif
-
-    end subroutine s_mpi_sendrecv_ib_buffers
-
     !> @brief Broadcasts random phase numbers from rank 0 to all MPI processes.
     impure subroutine s_mpi_send_random_number(phi_rn, num_freq)
         integer, intent(in) :: num_freq

From f42982e3f9cb9a6bf31e9a99ca7c9837a519e143 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Sat, 21 Feb 2026 00:36:05 -0500
Subject: [PATCH 20/22] The code really wanted me to run formatting

---
 src/common/m_model.fpp                | 18 +++++++++---------
 src/simulation/m_compute_levelset.fpp |  2 --
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index b474fec80e..c68cf5ffba 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -849,7 +849,7 @@ contains
         real(wp) :: n(1:3), proj(1:3)
         real(wp) :: d, ndot, denom
         real(wp) :: u, v_bary, w
-        real(wp) :: d00, d01, d11, d20, d21
+        real(wp) :: l00, l01, l11, l20, l21
         real(wp) :: edge(1:3), pe(1:3)
         real(wp) :: verts(1:3, 1:3)
 
@@ -876,17 +876,17 @@ contains
             e1(:) = v3(:) - v1(:)
             pv(:) = proj(:) - v1(:)
 
-            d00 = dot_product(e0, e0)
-            d01 = dot_product(e0, e1)
-            d11 = dot_product(e1, e1)
-            d20 = dot_product(pv, e0)
-            d21 = dot_product(pv, e1)
+            l00 = dot_product(e0, e0)
+            l01 = dot_product(e0, e1)
+            l11 = dot_product(e1, e1)
+            l20 = dot_product(pv, e0)
+            l21 = dot_product(pv, e1)
 
-            denom = d00*d11 - d01*d01
+            denom = l00*l11 - l01*l01
 
             if (abs(denom) > 0._wp) then
-                v_bary = (d11*d20 - d01*d21)/denom
-                w = (d00*d21 - d01*d20)/denom
+                v_bary = (l11*l20 - l01*l21)/denom
+                w = (l00*l21 - l01*l20)/denom
                 u = 1._wp - v_bary - w
             else
                 u = -1._wp
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 19301f2e30..0971e8fbdf 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -702,8 +702,6 @@ contains
             gp%levelset_norm = matmul(rotation, normals(1:3))
         end if
 
-        print *, gp%levelset, gp%levelset_norm
-
     end subroutine s_model_levelset
 
 end module m_compute_levelset

From 0099975bce44a7ebf35b7f4b89391e4092f1a5fe Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Sat, 21 Feb 2026 01:06:49 -0500
Subject: [PATCH 21/22] Some more line deletions for the model normal vector

---
 src/common/m_model.fpp | 41 +++++++++++++----------------------------
 1 file changed, 13 insertions(+), 28 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index c68cf5ffba..763f7e69a1 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -967,7 +967,7 @@ contains
         real(wp), intent(out) :: distance
 
         integer :: i
-        real(wp) :: dist_min, dist_proj, dist_v1, dist_v2, t
+        real(wp) :: dist_min, dist, t
         real(wp) :: v1(1:2), v2(1:2), edge(1:2), pv(1:2)
         real(wp) :: edge_len_sq, proj(1:2)
 
@@ -982,11 +982,9 @@ contains
             v2(2) = boundary_v(i, 2, 2, pid)
 
             ! Edge vector and point-to-v1 vector
-            edge(1) = v2(1) - v1(1)
-            edge(2) = v2(2) - v1(2)
+            edge = v2 - v1
             pv(1) = point(1) - v1(1)
             pv(2) = point(2) - v1(2)
-
             edge_len_sq = dot_product(edge, edge)
 
             ! Parameter of projection onto the edge line
@@ -998,31 +996,18 @@ contains
 
             ! Check if projection falls within the segment
             if (t >= 0._wp .and. t <= 1._wp) then
-                proj(1) = v1(1) + t*edge(1)
-                proj(2) = v1(2) + t*edge(2)
-                dist_proj = sqrt((point(1) - proj(1))**2 + (point(2) - proj(2))**2)
-
-                if (dist_proj < dist_min) then
-                    dist_min = dist_proj
-                    normals(1) = boundary_v(i, 3, 1, pid)
-                    normals(2) = boundary_v(i, 3, 2, pid)
-                end if
-            else
-                ! Check distance to first vertex
-                dist_v1 = sqrt(pv(1)*pv(1) + pv(2)*pv(2))
-                if (dist_v1 < dist_min) then
-                    dist_min = dist_v1
-                    normals(1) = boundary_v(i, 3, 1, pid)
-                    normals(2) = boundary_v(i, 3, 2, pid)
-                end if
+                proj = v1 + t*edge
+                dist = sqrt((point(1) - proj(1))**2 + (point(2) - proj(2))**2)
+            else if (t < 0._wp) then ! negative t means that v1 is the closest point on the edge
+                dist = sqrt((point(1) - v1(1))**2 + (point(2) - v1(2))**2)
+            else ! t > 1 means that v2 is the closest point on the line edge
+                dist = sqrt((point(1) - v2(1))**2 + (point(2) - v2(2))**2)
+            end if
 
-                ! Check distance to second vertex
-                dist_v2 = sqrt((point(1) - v2(1))**2 + (point(2) - v2(2))**2)
-                if (dist_v2 < dist_min) then
-                    dist_min = dist_v2
-                    normals(1) = boundary_v(i, 3, 1, pid)
-                    normals(2) = boundary_v(i, 3, 2, pid)
-                end if
+            if (dist < dist_min) then
+                dist_min = dist
+                normals(1) = boundary_v(i, 3, 1, pid)
+                normals(2) = boundary_v(i, 3, 2, pid)
             end if
         end do
 

From 8709609dae3f5dc319e4af095860438bcfa03f96 Mon Sep 17 00:00:00 2001
From: danieljvickers <dnlvickers5@gmail.com>
Date: Sat, 21 Feb 2026 01:10:31 -0500
Subject: [PATCH 22/22] Renamed model subroutines to use the s_ prefix for
 consistency with the rest of the code

---
 src/common/m_model.fpp                | 26 +++++++++++++-------------
 src/pre_process/m_icpp_patches.fpp    |  2 +-
 src/simulation/m_compute_levelset.fpp |  4 ++--
 src/simulation/m_ib_patches.fpp       |  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 763f7e69a1..2bbdda5a91 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -23,8 +23,8 @@ module m_model
               gpu_total_vertices, stl_bounding_boxes
 
     ! Subroutines for STL immersed boundaries
-    public :: f_check_boundary, f_register_edge, f_model_is_inside_flat, &
-              f_distance_normals_3D, f_distance_normals_2D, s_pack_model_for_gpu
+    public :: s_check_boundary, s_register_edge, f_model_is_inside_flat, &
+              s_distance_normals_3D, s_distance_normals_2D, s_pack_model_for_gpu
 
     !! array of STL models that can be allocated and then used in IB marker and levelset compute
     type(t_model_array), allocatable, target :: models(:)
@@ -695,7 +695,7 @@ contains
     !> This procedure checks and labels edges shared by two or more triangles facets of the 2D STL model.
     !! @param model                      Model to search in.
     !! @param boundary_vertex_count      Output total boundary vertex count
-    subroutine f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
+    subroutine s_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
 
         type(t_model), intent(in) :: model
         real(wp), allocatable, intent(out), dimension(:, :, :) :: boundary_v !< Output boundary vertices/normals
@@ -721,17 +721,17 @@ contains
             ! First edge (v1, v2)
             edge(1, 1:2) = model%trs(i)%v(1, 1:2)
             edge(2, 1:2) = model%trs(i)%v(2, 1:2)
-            call f_register_edge(temp_boundary_v, edge, edge_index, edge_count)
+            call s_register_edge(temp_boundary_v, edge, edge_index, edge_count)
 
             ! Second edge (v2, v3)
             edge(1, 1:2) = model%trs(i)%v(2, 1:2)
             edge(2, 1:2) = model%trs(i)%v(3, 1:2)
-            call f_register_edge(temp_boundary_v, edge, edge_index, edge_count)
+            call s_register_edge(temp_boundary_v, edge, edge_index, edge_count)
 
             ! Third edge (v3, v1)
             edge(1, 1:2) = model%trs(i)%v(3, 1:2)
             edge(2, 1:2) = model%trs(i)%v(1, 1:2)
-            call f_register_edge(temp_boundary_v, edge, edge_index, edge_count)
+            call s_register_edge(temp_boundary_v, edge, edge_index, edge_count)
         end do
 
         ! Check all edges and count repeated edges
@@ -805,10 +805,10 @@ contains
             boundary_v(i, 3, 2) = ynormal/v_norm
         end do
 
-    end subroutine f_check_boundary
+    end subroutine s_check_boundary
 
     !> This procedure appends the edge end vertices to a temporary buffer.
-    subroutine f_register_edge(temp_boundary_v, edge, edge_index, edge_count)
+    subroutine s_register_edge(temp_boundary_v, edge, edge_index, edge_count)
 
         integer, intent(inout) :: edge_index !< Edge index iterator
         integer, intent(inout) :: edge_count !< Total number of edges
@@ -820,7 +820,7 @@ contains
         temp_boundary_v(edge_index, 1, 1:2) = edge(1, 1:2)
         temp_boundary_v(edge_index, 2, 1:2) = edge(2, 1:2)
 
-    end subroutine f_register_edge
+    end subroutine s_register_edge
 
     !> This procedure determines the levelset distance and normals of 3D models
     !! by computing the exact closest point via projection onto triangle surfaces.
@@ -831,7 +831,7 @@ contains
     !! @param point                 The cell center of the current levelset cell
     !! @param normals               Output levelset normals
     !! @param distance              Output levelset distance
-    subroutine f_distance_normals_3D(ntrs, trs_v, trs_n, pid, point, normals, distance)
+    subroutine s_distance_normals_3D(ntrs, trs_v, trs_n, pid, point, normals, distance)
         $:GPU_ROUTINE(parallelism='[seq]')
 
         integer, intent(in) :: ntrs
@@ -946,7 +946,7 @@ contains
 
         distance = dist_min
 
-    end subroutine f_distance_normals_3D
+    end subroutine s_distance_normals_3D
 
     !> This procedure determines the levelset distance and normals of 2D models
     !! by computing the exact closest point via projection onto boundary edges.
@@ -956,7 +956,7 @@ contains
     !! @param point                 The cell center of the current levelset cell
     !! @param normals               Output levelset normals
     !! @param distance              Output levelset distance
-    subroutine f_distance_normals_2D(boundary_v, pid, boundary_edge_count, point, normals, distance)
+    subroutine s_distance_normals_2D(boundary_v, pid, boundary_edge_count, point, normals, distance)
         $:GPU_ROUTINE(parallelism='[seq]')
 
         real(wp), dimension(:, :, :, :), intent(in) :: boundary_v
@@ -1013,7 +1013,7 @@ contains
 
         distance = dist_min
 
-    end subroutine f_distance_normals_2D
+    end subroutine s_distance_normals_2D
 
     subroutine s_pack_model_for_gpu(ma)
         type(t_model_array), intent(inout) :: ma
diff --git a/src/pre_process/m_icpp_patches.fpp b/src/pre_process/m_icpp_patches.fpp
index 0a2a6fe85c..b2d101188b 100644
--- a/src/pre_process/m_icpp_patches.fpp
+++ b/src/pre_process/m_icpp_patches.fpp
@@ -1652,7 +1652,7 @@ contains
             print *, ' * Number of input model vertices:', 3*model%ntrs
         end if
 
-        call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
+        call s_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
 
         ! Show the number of edges and boundary edges in 2D STL models
         if (proc_rank == 0 .and. p == 0) then
diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp
index 0971e8fbdf..2fedac4ea9 100644
--- a/src/simulation/m_compute_levelset.fpp
+++ b/src/simulation/m_compute_levelset.fpp
@@ -685,7 +685,7 @@ contains
         ! 3D models
         if (p > 0) then
             ! Get the boundary normals and shortest distance between the cell center and the model boundary
-            call f_distance_normals_3D(gpu_ntrs(patch_id), gpu_trs_v, gpu_trs_n, patch_id, xyz_local, normals, distance)
+            call s_distance_normals_3D(gpu_ntrs(patch_id), gpu_trs_v, gpu_trs_n, patch_id, xyz_local, normals, distance)
 
             ! Get the shortest distance between the cell center and the model boundary
             gp%levelset = distance
@@ -695,7 +695,7 @@ contains
             gp%levelset_norm = matmul(rotation, normals(1:3))
         else
             ! 2D models
-            call f_distance_normals_2D(gpu_boundary_v, patch_id, &
+            call s_distance_normals_2D(gpu_boundary_v, patch_id, &
                                        boundary_edge_count, &
                                        xyz_local, normals, distance)
             gp%levelset = -abs(distance)
diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp
index 686ba6fbb3..fd5fee175b 100644
--- a/src/simulation/m_ib_patches.fpp
+++ b/src/simulation/m_ib_patches.fpp
@@ -1019,7 +1019,7 @@ contains
                 end if
 
                 ! Need the cells that form the boundary of the flat projection in 2D
-                if (p == 0) call f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
+                if (p == 0) call s_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count)
 
                 ! Show the number of edges and boundary edges in 2D STL models
                 if (proc_rank == 0 .and. p == 0) then