diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..188fb31
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,46 @@
+BasedOnStyle: LLVM
+
+IndentWidth: 4
+TabWidth: 4
+UseTab: Never
+
+ColumnLimit: 120
+
+BreakBeforeBraces: Allman
+
+PointerAlignment: Right
+DerivePointerAlignment: false
+
+BinPackParameters: false
+BinPackArguments: false
+
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: false
+
+IndentCaseLabels: true
+InsertNewlineAtEOF: true
+
+SortIncludes: true
+
+ReflowComments: false
+AlignOperands: false
+
+StatementMacros:
+  - ASSERT
+  - CHECK
+  - REQUIRE
+  - ENSURE
+  - EXPECT
+  - CUDA_CHECK
+  - CUBLAS_CHECK
+  - CUSPARSE_CHECK
+
+AttributeMacros: 
+  - __global__
+  - __device__
+  - __host__
+  - __shared__
+  - __constant__
+  - __managed__
+  - __launch_bounds__
\ No newline at end of file
diff --git a/.github/workflows/check-clang-format.yml b/.github/workflows/check-clang-format.yml
new file mode 100644
index 0000000..b486bae
--- /dev/null
+++ b/.github/workflows/check-clang-format.yml
@@ -0,0 +1,21 @@
+name: Format
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+jobs:
+  build:
+    name: Clang Format Check
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - uses: DoozyX/clang-format-lint-action@v0.20
+      with:
+        source: '.'
+        extensions: 'h,c,cu'
+        clangFormatVersion: 20
\ No newline at end of file
diff --git a/.github/workflows/spelling.yml b/.github/workflows/spelling.yml
new file mode 100644
index 0000000..f003f40
--- /dev/null
+++ b/.github/workflows/spelling.yml
@@ -0,0 +1,24 @@
+name: Spelling
+
+permissions:
+  contents: read
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+env:
+  CLICOLOR: 1
+
+jobs:
+  spelling:
+    name: Spell Check with Typos
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Actions Repository
+      uses: actions/checkout@v5
+    - name: Spell Check Repo
+      uses: crate-ci/typos@v1.42.3
\ No newline at end of file
diff --git a/include/cupdlpx.h b/include/cupdlpx.h
index d18c242..5ad5e11 100644
--- a/include/cupdlpx.h
+++ b/include/cupdlpx.h
@@ -24,22 +24,19 @@ extern "C"
 #endif
 
     // create an lp_problem_t from a matrix descriptor
-    lp_problem_t *create_lp_problem(
-        const double *objective_c,
-        const matrix_desc_t *A_desc,
-        const double *con_lb,
-        const double *con_ub,
-        const double *var_lb,
-        const double *var_ub,
-        const double *objective_constant);
+    lp_problem_t *create_lp_problem(const double *objective_c,
+                                    const matrix_desc_t *A_desc,
+                                    const double *con_lb,
+                                    const double *con_ub,
+                                    const double *var_lb,
+                                    const double *var_ub,
+                                    const double *objective_constant);
 
     // Set up initial primal and dual solution for an lp_problem_t
     void set_start_values(lp_problem_t *prob, const double *primal, const double *dual);
 
     // solve the LP problem using PDHG
-    cupdlpx_result_t *solve_lp_problem(
-        lp_problem_t *prob,
-        const pdhg_parameters_t *params);
+    cupdlpx_result_t *solve_lp_problem(lp_problem_t *prob, const pdhg_parameters_t *params);
 
     // parameter
     void set_default_parameters(pdhg_parameters_t *params);
@@ -50,4 +47,4 @@ extern "C"
 
 #ifdef __cplusplus
 } // extern "C"
-#endif
\ No newline at end of file
+#endif
diff --git a/include/cupdlpx_types.h b/include/cupdlpx_types.h
index b4efe1a..d874206 100644
--- a/include/cupdlpx_types.h
+++ b/include/cupdlpx_types.h
@@ -25,171 +25,171 @@ extern "C"
 {
 #endif
 
-	typedef enum
-	{
-		TERMINATION_REASON_UNSPECIFIED,
-		TERMINATION_REASON_OPTIMAL,
-		TERMINATION_REASON_PRIMAL_INFEASIBLE,
-		TERMINATION_REASON_DUAL_INFEASIBLE,
-		TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED,
-		TERMINATION_REASON_TIME_LIMIT,
-		TERMINATION_REASON_ITERATION_LIMIT,
-		TERMINATION_REASON_FEAS_POLISH_SUCCESS
-	} termination_reason_t;
-
-	typedef enum
+    typedef enum
+    {
+        TERMINATION_REASON_UNSPECIFIED,
+        TERMINATION_REASON_OPTIMAL,
+        TERMINATION_REASON_PRIMAL_INFEASIBLE,
+        TERMINATION_REASON_DUAL_INFEASIBLE,
+        TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED,
+        TERMINATION_REASON_TIME_LIMIT,
+        TERMINATION_REASON_ITERATION_LIMIT,
+        TERMINATION_REASON_FEAS_POLISH_SUCCESS
+    } termination_reason_t;
+
+    typedef enum
     {
         NORM_TYPE_L2 = 0,
         NORM_TYPE_L_INF = 1
     } norm_type_t;
 
-	typedef struct
-	{
-		int num_variables;
-		int num_constraints;
-		double *variable_lower_bound;
-		double *variable_upper_bound;
-		double *objective_vector;
-		double objective_constant;
-
-		int *constraint_matrix_row_pointers;
-		int *constraint_matrix_col_indices;
-		double *constraint_matrix_values;
-		int constraint_matrix_num_nonzeros;
-
-		double *constraint_lower_bound;
-		double *constraint_upper_bound;
-
-		double *primal_start;
-		double *dual_start;
-	} lp_problem_t;
-
-	typedef struct
-	{
-		double artificial_restart_threshold;
-		double sufficient_reduction_for_restart;
-		double necessary_reduction_for_restart;
-		double k_p;
-		double k_i;
-		double k_d;
-		double i_smooth;
-	} restart_parameters_t;
-
-	typedef struct
-	{
-		double eps_optimal_relative;
-		double eps_feasible_relative;
-		double eps_feas_polish_relative;
-		double time_sec_limit;
-		int iteration_limit;
-	} termination_criteria_t;
-
-	typedef struct
-	{
-		int l_inf_ruiz_iterations;
-		bool has_pock_chambolle_alpha;
-		double pock_chambolle_alpha;
-		bool bound_objective_rescaling;
-		bool verbose;
-		int termination_evaluation_frequency;
-		int sv_max_iter;
-		double sv_tol;
-		termination_criteria_t termination_criteria;
-		restart_parameters_t restart_params;
-		double reflection_coefficient;
-		bool feasibility_polishing;
-		norm_type_t optimality_norm;
-		bool presolve;
-		double matrix_zero_tol;
-	} pdhg_parameters_t;
-
-	typedef struct
-	{
-		int num_variables;
-		int num_constraints;
-		int num_nonzeros;
-
-		int num_reduced_variables;
-		int num_reduced_constraints;
-		int num_reduced_nonzeros;
-
-		double *primal_solution;
-		double *dual_solution;
-		double *reduced_cost;
-
-		int total_count;
-		double rescaling_time_sec;
-		double cumulative_time_sec;
-		double presolve_time;
-		int presolve_status;
-		// PresolveStats presolve_stats;
-
-		double absolute_primal_residual;
-		double relative_primal_residual;
-		double absolute_dual_residual;
-		double relative_dual_residual;
-		double primal_objective_value;
-		double dual_objective_value;
-		double objective_gap;
-		double relative_objective_gap;
-		double max_primal_ray_infeasibility;
-		double max_dual_ray_infeasibility;
-		double primal_ray_linear_objective;
-		double dual_ray_objective;
-		termination_reason_t termination_reason;
-		double feasibility_polishing_time;
-		int feasibility_iteration;
-	} cupdlpx_result_t;
-
-	// matrix formats
-	typedef enum
-	{
-		matrix_dense = 0,
-		matrix_csr = 1,
-		matrix_csc = 2,
-		matrix_coo = 3
-	} matrix_format_t;
-
-	// matrix descriptor
-	typedef struct
-	{
-		int m; // num_constraints
-		int n; // num_variables
-		matrix_format_t fmt;
-
-		union MatrixData
-		{
-			struct MatrixDense
-			{					 // Dense (row-major)
-				const double *A; // m*n
-			} dense;
-
-			struct MatrixCSR
-			{ // CSR
-				int nnz;
-				const int *row_ptr;
-				const int *col_ind;
-				const double *vals;
-			} csr;
-
-			struct MatrixCSC
-			{ // CSC
-				int nnz;
-				const int *col_ptr;
-				const int *row_ind;
-				const double *vals;
-			} csc;
-
-			struct MatrixCOO
-			{ // COO
-				int nnz;
-				const int *row_ind;
-				const int *col_ind;
-				const double *vals;
-			} coo;
-		} data;
-	} matrix_desc_t;
+    typedef struct
+    {
+        int num_variables;
+        int num_constraints;
+        double *variable_lower_bound;
+        double *variable_upper_bound;
+        double *objective_vector;
+        double objective_constant;
+
+        int *constraint_matrix_row_pointers;
+        int *constraint_matrix_col_indices;
+        double *constraint_matrix_values;
+        int constraint_matrix_num_nonzeros;
+
+        double *constraint_lower_bound;
+        double *constraint_upper_bound;
+
+        double *primal_start;
+        double *dual_start;
+    } lp_problem_t;
+
+    typedef struct
+    {
+        double artificial_restart_threshold;
+        double sufficient_reduction_for_restart;
+        double necessary_reduction_for_restart;
+        double k_p;
+        double k_i;
+        double k_d;
+        double i_smooth;
+    } restart_parameters_t;
+
+    typedef struct
+    {
+        double eps_optimal_relative;
+        double eps_feasible_relative;
+        double eps_feas_polish_relative;
+        double time_sec_limit;
+        int iteration_limit;
+    } termination_criteria_t;
+
+    typedef struct
+    {
+        int l_inf_ruiz_iterations;
+        bool has_pock_chambolle_alpha;
+        double pock_chambolle_alpha;
+        bool bound_objective_rescaling;
+        bool verbose;
+        int termination_evaluation_frequency;
+        int sv_max_iter;
+        double sv_tol;
+        termination_criteria_t termination_criteria;
+        restart_parameters_t restart_params;
+        double reflection_coefficient;
+        bool feasibility_polishing;
+        norm_type_t optimality_norm;
+        bool presolve;
+        double matrix_zero_tol;
+    } pdhg_parameters_t;
+
+    typedef struct
+    {
+        int num_variables;
+        int num_constraints;
+        int num_nonzeros;
+
+        int num_reduced_variables;
+        int num_reduced_constraints;
+        int num_reduced_nonzeros;
+
+        double *primal_solution;
+        double *dual_solution;
+        double *reduced_cost;
+
+        int total_count;
+        double rescaling_time_sec;
+        double cumulative_time_sec;
+        double presolve_time;
+        int presolve_status;
+        // PresolveStats presolve_stats;
+
+        double absolute_primal_residual;
+        double relative_primal_residual;
+        double absolute_dual_residual;
+        double relative_dual_residual;
+        double primal_objective_value;
+        double dual_objective_value;
+        double objective_gap;
+        double relative_objective_gap;
+        double max_primal_ray_infeasibility;
+        double max_dual_ray_infeasibility;
+        double primal_ray_linear_objective;
+        double dual_ray_objective;
+        termination_reason_t termination_reason;
+        double feasibility_polishing_time;
+        int feasibility_iteration;
+    } cupdlpx_result_t;
+
+    // matrix formats
+    typedef enum
+    {
+        matrix_dense = 0,
+        matrix_csr = 1,
+        matrix_csc = 2,
+        matrix_coo = 3
+    } matrix_format_t;
+
+    // matrix descriptor
+    typedef struct
+    {
+        int m; // num_constraints
+        int n; // num_variables
+        matrix_format_t fmt;
+
+        union MatrixData
+        {
+            struct MatrixDense
+            {                    // Dense (row-major)
+                const double *A; // m*n
+            } dense;
+
+            struct MatrixCSR
+            { // CSR
+                int nnz;
+                const int *row_ptr;
+                const int *col_ind;
+                const double *vals;
+            } csr;
+
+            struct MatrixCSC
+            { // CSC
+                int nnz;
+                const int *col_ptr;
+                const int *row_ind;
+                const double *vals;
+            } csc;
+
+            struct MatrixCOO
+            { // COO
+                int nnz;
+                const int *row_ind;
+                const int *col_ind;
+                const double *vals;
+            } coo;
+        } data;
+    } matrix_desc_t;
 
 #ifdef __cplusplus
 } // extern "C"
-#endif
\ No newline at end of file
+#endif
diff --git a/include/mps_parser.h b/include/mps_parser.h
index 84cbeef..a1eed4f 100644
--- a/include/mps_parser.h
+++ b/include/mps_parser.h
@@ -18,4 +18,4 @@ limitations under the License.
 
 #include "cupdlpx_types.h"
 
-lp_problem_t *read_mps_file(const char *filename);
\ No newline at end of file
+lp_problem_t *read_mps_file(const char *filename);
diff --git a/internal/internal_types.h b/internal/internal_types.h
index 9eec9f9..9894c93 100644
--- a/internal/internal_types.h
+++ b/internal/internal_types.h
@@ -23,124 +23,124 @@ limitations under the License.
 
 typedef struct
 {
-	int num_rows;
-	int num_cols;
-	int num_nonzeros;
-	int *row_ptr;
-	int *col_ind;
-	int *row_ind;
-	double *val;
-	int *transpose_map;
+    int num_rows;
+    int num_cols;
+    int num_nonzeros;
+    int *row_ptr;
+    int *col_ind;
+    int *row_ind;
+    double *val;
+    int *transpose_map;
 } cu_sparse_matrix_csr_t;
 
 typedef struct
 {
-	int num_variables;
-	int num_constraints;
-	double *variable_lower_bound;
-	double *variable_upper_bound;
-	double *objective_vector;
-	double objective_constant;
-	cu_sparse_matrix_csr_t *constraint_matrix;
-	cu_sparse_matrix_csr_t *constraint_matrix_t;
-	double *constraint_lower_bound;
-	double *constraint_upper_bound;
-	int num_blocks_primal;
-	int num_blocks_dual;
-	int num_blocks_primal_dual;
-	int num_blocks_nnz;
-	double objective_vector_norm;
-	double constraint_bound_norm;
-	double *constraint_lower_bound_finite_val;
-	double *constraint_upper_bound_finite_val;
-	double *variable_lower_bound_finite_val;
-	double *variable_upper_bound_finite_val;
-
-	double *initial_primal_solution;
-	double *current_primal_solution;
-	double *pdhg_primal_solution;
-	double *reflected_primal_solution;
-	double *dual_product;
-	double *initial_dual_solution;
-	double *current_dual_solution;
-	double *pdhg_dual_solution;
-	double *reflected_dual_solution;
-	double *primal_product;
-	double step_size;
-	double *d_primal_step_size;
-	double *d_dual_step_size;
-	double primal_weight;
-	int total_count;
-	bool is_this_major_iteration;
-	double primal_weight_error_sum;
-	double primal_weight_last_error;
-	double best_primal_weight;
-	double best_primal_dual_residual_gap;
-
-	double *constraint_rescaling;
-	double *variable_rescaling;
-	double constraint_bound_rescaling;
-	double objective_vector_rescaling;
-	double *primal_slack;
-	double *dual_slack;
-	double rescaling_time_sec;
-	clock_t start_time;
-	double cumulative_time_sec;
-
-	double *primal_residual;
-	double absolute_primal_residual;
-	double relative_primal_residual;
-	double *dual_residual;
-	double absolute_dual_residual;
-	double relative_dual_residual;
-	double primal_objective_value;
-	double dual_objective_value;
-	double objective_gap;
-	double relative_objective_gap;
-	double max_primal_ray_infeasibility;
-	double max_dual_ray_infeasibility;
-	double primal_ray_linear_objective;
-	double dual_ray_objective;
-	termination_reason_t termination_reason;
-
-	double *delta_primal_solution;
-	double *delta_dual_solution;
-	double fixed_point_error;
-	double initial_fixed_point_error;
-	double last_trial_fixed_point_error;
-	int inner_count;
-	int *d_inner_count;
-
-	cusparseHandle_t sparse_handle;
-	cublasHandle_t blas_handle;
-	size_t spmv_buffer_size;
-	size_t primal_spmv_buffer_size;
-	size_t dual_spmv_buffer_size;
-	void *primal_spmv_buffer;
-	void *dual_spmv_buffer;
-	void *spmv_buffer;
-
-	cusparseSpMatDescr_t matA;
-	cusparseSpMatDescr_t matAt;
-	cusparseDnVecDescr_t vec_primal_sol;
-	cusparseDnVecDescr_t vec_dual_sol;
-	cusparseDnVecDescr_t vec_primal_prod;
-	cusparseDnVecDescr_t vec_dual_prod;
-
-	double *ones_primal_d;
-	double *ones_dual_d;
-
-	double feasibility_polishing_time;
-	int feasibility_iteration;
+    int num_variables;
+    int num_constraints;
+    double *variable_lower_bound;
+    double *variable_upper_bound;
+    double *objective_vector;
+    double objective_constant;
+    cu_sparse_matrix_csr_t *constraint_matrix;
+    cu_sparse_matrix_csr_t *constraint_matrix_t;
+    double *constraint_lower_bound;
+    double *constraint_upper_bound;
+    int num_blocks_primal;
+    int num_blocks_dual;
+    int num_blocks_primal_dual;
+    int num_blocks_nnz;
+    double objective_vector_norm;
+    double constraint_bound_norm;
+    double *constraint_lower_bound_finite_val;
+    double *constraint_upper_bound_finite_val;
+    double *variable_lower_bound_finite_val;
+    double *variable_upper_bound_finite_val;
+
+    double *initial_primal_solution;
+    double *current_primal_solution;
+    double *pdhg_primal_solution;
+    double *reflected_primal_solution;
+    double *dual_product;
+    double *initial_dual_solution;
+    double *current_dual_solution;
+    double *pdhg_dual_solution;
+    double *reflected_dual_solution;
+    double *primal_product;
+    double step_size;
+    double *d_primal_step_size;
+    double *d_dual_step_size;
+    double primal_weight;
+    int total_count;
+    bool is_this_major_iteration;
+    double primal_weight_error_sum;
+    double primal_weight_last_error;
+    double best_primal_weight;
+    double best_primal_dual_residual_gap;
+
+    double *constraint_rescaling;
+    double *variable_rescaling;
+    double constraint_bound_rescaling;
+    double objective_vector_rescaling;
+    double *primal_slack;
+    double *dual_slack;
+    double rescaling_time_sec;
+    clock_t start_time;
+    double cumulative_time_sec;
+
+    double *primal_residual;
+    double absolute_primal_residual;
+    double relative_primal_residual;
+    double *dual_residual;
+    double absolute_dual_residual;
+    double relative_dual_residual;
+    double primal_objective_value;
+    double dual_objective_value;
+    double objective_gap;
+    double relative_objective_gap;
+    double max_primal_ray_infeasibility;
+    double max_dual_ray_infeasibility;
+    double primal_ray_linear_objective;
+    double dual_ray_objective;
+    termination_reason_t termination_reason;
+
+    double *delta_primal_solution;
+    double *delta_dual_solution;
+    double fixed_point_error;
+    double initial_fixed_point_error;
+    double last_trial_fixed_point_error;
+    int inner_count;
+    int *d_inner_count;
+
+    cusparseHandle_t sparse_handle;
+    cublasHandle_t blas_handle;
+    size_t spmv_buffer_size;
+    size_t primal_spmv_buffer_size;
+    size_t dual_spmv_buffer_size;
+    void *primal_spmv_buffer;
+    void *dual_spmv_buffer;
+    void *spmv_buffer;
+
+    cusparseSpMatDescr_t matA;
+    cusparseSpMatDescr_t matAt;
+    cusparseDnVecDescr_t vec_primal_sol;
+    cusparseDnVecDescr_t vec_dual_sol;
+    cusparseDnVecDescr_t vec_primal_prod;
+    cusparseDnVecDescr_t vec_dual_prod;
+
+    double *ones_primal_d;
+    double *ones_dual_d;
+
+    double feasibility_polishing_time;
+    int feasibility_iteration;
 
     cudaStream_t stream;
 } pdhg_solver_state_t;
 
 typedef struct
 {
-	double *con_rescale;
-	double *var_rescale;
-	double con_bound_rescale;
-	double obj_vec_rescale;
-	double rescaling_time_sec;
+    double *con_rescale;
+    double *var_rescale;
+    double con_bound_rescale;
+    double obj_vec_rescale;
+    double rescaling_time_sec;
 } rescale_info_t;
diff --git a/internal/preconditioner.h b/internal/preconditioner.h
index 71a5c2b..e19bc6c 100644
--- a/internal/preconditioner.h
+++ b/internal/preconditioner.h
@@ -23,9 +23,7 @@ extern "C"
 {
 #endif
 
-    rescale_info_t *rescale_problem(
-        const pdhg_parameters_t *params,
-        pdhg_solver_state_t *state);
+    rescale_info_t *rescale_problem(const pdhg_parameters_t *params, pdhg_solver_state_t *state);
 
 #ifdef __cplusplus
 }
diff --git a/internal/presolve.h b/internal/presolve.h
index a852cef..8937668 100644
--- a/internal/presolve.h
+++ b/internal/presolve.h
@@ -22,7 +22,8 @@ extern "C"
 
     cupdlpx_presolve_info_t *pslp_presolve(const lp_problem_t *original_prob, const pdhg_parameters_t *params);
 
-    cupdlpx_result_t *create_result_from_presolve(const cupdlpx_presolve_info_t *info, const lp_problem_t *original_prob);
+    cupdlpx_result_t *create_result_from_presolve(const cupdlpx_presolve_info_t *info,
+                                                  const lp_problem_t *original_prob);
 
     const char *get_presolve_status_str(enum PresolveStatus_ status);
 
@@ -36,4 +37,4 @@ extern "C"
 }
 #endif
 
-#endif // PRESOLVE_H
\ No newline at end of file
+#endif // PRESOLVE_H
diff --git a/internal/solver.h b/internal/solver.h
index f7b6f49..b07aba8 100644
--- a/internal/solver.h
+++ b/internal/solver.h
@@ -23,9 +23,7 @@ extern "C"
 {
 #endif
 
-    cupdlpx_result_t *optimize(
-        const pdhg_parameters_t *params,
-        lp_problem_t *original_problem);
+    cupdlpx_result_t *optimize(const pdhg_parameters_t *params, lp_problem_t *original_problem);
 
 #ifdef __cplusplus
 }
diff --git a/internal/utils.h b/internal/utils.h
index 186eeeb..ce8d674 100644
--- a/internal/utils.h
+++ b/internal/utils.h
@@ -28,40 +28,37 @@ extern "C"
 {
 #endif
 
-#define CUDA_CHECK(call)                                           \
-    do                                                             \
-    {                                                              \
-        cudaError_t err = call;                                    \
-        if (err != cudaSuccess)                                    \
-        {                                                          \
-            fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, \
-                    __LINE__, cudaGetErrorName(err));              \
-            exit(EXIT_FAILURE);                                    \
-        }                                                          \
+#define CUDA_CHECK(call)                                                                                               \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        cudaError_t err = call;                                                                                        \
+        if (err != cudaSuccess)                                                                                        \
+        {                                                                                                              \
+            fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorName(err));                   \
+            exit(EXIT_FAILURE);                                                                                        \
+        }                                                                                                              \
     } while (0)
 
-#define CUBLAS_CHECK(call)                                           \
-    do                                                               \
-    {                                                                \
-        cublasStatus_t status = call;                                \
-        if (status != CUBLAS_STATUS_SUCCESS)                         \
-        {                                                            \
-            fprintf(stderr, "cuBLAS Error at %s:%d: %s\n", __FILE__, \
-                    __LINE__, cublasGetStatusName(status));          \
-            exit(EXIT_FAILURE);                                      \
-        }                                                            \
+#define CUBLAS_CHECK(call)                                                                                             \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        cublasStatus_t status = call;                                                                                  \
+        if (status != CUBLAS_STATUS_SUCCESS)                                                                           \
+        {                                                                                                              \
+            fprintf(stderr, "cuBLAS Error at %s:%d: %s\n", __FILE__, __LINE__, cublasGetStatusName(status));           \
+            exit(EXIT_FAILURE);                                                                                        \
+        }                                                                                                              \
     } while (0)
 
-#define CUSPARSE_CHECK(call)                                           \
-    do                                                                 \
-    {                                                                  \
-        cusparseStatus_t status = call;                                \
-        if (status != CUSPARSE_STATUS_SUCCESS)                         \
-        {                                                              \
-            fprintf(stderr, "cuSPARSE Error at %s:%d: %s\n", __FILE__, \
-                    __LINE__, cusparseGetErrorName(status));           \
-            exit(EXIT_FAILURE);                                        \
-        }                                                              \
+#define CUSPARSE_CHECK(call)                                                                                           \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        cusparseStatus_t status = call;                                                                                \
+        if (status != CUSPARSE_STATUS_SUCCESS)                                                                         \
+        {                                                                                                              \
+            fprintf(stderr, "cuSPARSE Error at %s:%d: %s\n", __FILE__, __LINE__, cusparseGetErrorName(status));        \
+            exit(EXIT_FAILURE);                                                                                        \
+        }                                                                                                              \
     } while (0)
 
 #define THREADS_PER_BLOCK 256
@@ -75,27 +72,20 @@ extern "C"
 
     void *safe_realloc(void *ptr, size_t new_size);
 
-    double estimate_maximum_singular_value(
-        cusparseHandle_t sparse_handle,
-        cublasHandle_t blas_handle,
-        const cu_sparse_matrix_csr_t *A,
-        const cu_sparse_matrix_csr_t *AT,
-        int max_iterations,
-        double tolerance);
+    double estimate_maximum_singular_value(cusparseHandle_t sparse_handle,
+                                           cublasHandle_t blas_handle,
+                                           const cu_sparse_matrix_csr_t *A,
+                                           const cu_sparse_matrix_csr_t *AT,
+                                           int max_iterations,
+                                           double tolerance);
 
-    void compute_interaction_and_movement(
-        pdhg_solver_state_t *solver_state,
-        double *interaction,
-        double *movement);
+    void compute_interaction_and_movement(pdhg_solver_state_t *solver_state, double *interaction, double *movement);
 
-    bool should_do_adaptive_restart(
-        pdhg_solver_state_t *solver_state,
-        const restart_parameters_t *restart_params,
-        int termination_evaluation_frequency);
+    bool should_do_adaptive_restart(pdhg_solver_state_t *solver_state,
+                                    const restart_parameters_t *restart_params,
+                                    int termination_evaluation_frequency);
 
-    void check_termination_criteria(
-        pdhg_solver_state_t *solver_state,
-        const termination_criteria_t *criteria);
+    void check_termination_criteria(pdhg_solver_state_t *solver_state, const termination_criteria_t *criteria);
 
     void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem);
 
@@ -113,30 +103,32 @@ extern "C"
 
     void fill_or_copy(double **dest, int n, const double *src, double fill_value);
 
-    int dense_to_csr(const matrix_desc_t *desc,
-                     int **row_ptr, int **col_ind, double **vals, int *nnz_out);
+    int dense_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out);
 
-    int csc_to_csr(const matrix_desc_t *desc,
-                   int **row_ptr, int **col_ind, double **vals);
+    int csc_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals);
 
-    int coo_to_csr(const matrix_desc_t *desc,
-                   int **row_ptr, int **col_ind, double **vals);
+    int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals);
 
-    void check_feas_polishing_termination_criteria(
-        pdhg_solver_state_t *solver_state,
-        const pdhg_solver_state_t *ori_solver_state,
-        const termination_criteria_t *criteria,
-        bool is_primal_polish);
+    void check_feas_polishing_termination_criteria(pdhg_solver_state_t *solver_state,
+                                                   const pdhg_solver_state_t *ori_solver_state,
+                                                   const termination_criteria_t *criteria,
+                                                   bool is_primal_polish);
 
     void print_initial_feas_polish_info(bool is_primal_polish, const pdhg_parameters_t *params);
 
     void display_feas_polish_iteration_stats(const pdhg_solver_state_t *state, bool verbose, bool is_primal_polish);
 
-    void pdhg_feas_polish_final_log(const pdhg_solver_state_t *primal_state, const pdhg_solver_state_t *dual_state, bool verbose);
+    void pdhg_feas_polish_final_log(const pdhg_solver_state_t *primal_state,
+                                    const pdhg_solver_state_t *dual_state,
+                                    bool verbose);
 
-    void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm);
+    void compute_primal_feas_polish_residual(pdhg_solver_state_t *state,
+                                             const pdhg_solver_state_t *ori_state,
+                                             norm_type_t optimality_norm);
 
-    void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm);
+    void compute_dual_feas_polish_residual(pdhg_solver_state_t *state,
+                                           const pdhg_solver_state_t *ori_state,
+                                           norm_type_t optimality_norm);
 
     void set_default_parameters(pdhg_parameters_t *params);
 
diff --git a/python_bindings/_core_bindings.cpp b/python_bindings/_core_bindings.cpp
index c990fe2..0329d96 100644
--- a/python_bindings/_core_bindings.cpp
+++ b/python_bindings/_core_bindings.cpp
@@ -89,8 +89,8 @@ static const double *get_arr_ptr_f64_or_null(py::object obj, const char *name, M
 }
 
 // get int32 pointer to contiguous numpy array
-static const int32_t *get_index_ptr_i32(py::object obj, const char *name,
-                                        MatrixKeepalive &keep, std::vector<int32_t> &tmp_vec)
+static const int32_t *
+get_index_ptr_i32(py::object obj, const char *name, MatrixKeepalive &keep, std::vector<int32_t> &tmp_vec)
 {
     // nullptr if obj is None
     if (!obj || obj.is_none())
@@ -126,8 +126,9 @@ static const int32_t *get_index_ptr_i32(py::object obj, const char *name,
             int64_t v = p[i];
             if (v < 0 || v > I32_MAX)
             {
-                throw std::overflow_error(std::string(name) + " has value out of int32 range; "
-                                                              "backend currently supports only 32-bit indices.");
+                throw std::overflow_error(std::string(name) +
+                                          " has value out of int32 range; "
+                                          "backend currently supports only 32-bit indices.");
             }
             tmp_vec[static_cast<size_t>(i)] = static_cast<int32_t>(v);
         }
@@ -138,16 +139,21 @@ static const int32_t *get_index_ptr_i32(py::object obj, const char *name,
 }
 
 // helper function to convert norm string to enum
-static norm_type_t parse_norm_string(const std::string& s)
+static norm_type_t parse_norm_string(const std::string &s)
 {
     std::string lower = s;
     std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
-    
-    if (lower == "l2") {
+
+    if (lower == "l2")
+    {
         return NORM_TYPE_L2;
-    } else if (lower == "linf") {
+    }
+    else if (lower == "linf")
+    {
         return NORM_TYPE_L_INF;
-    } else {
+    }
+    else
+    {
         throw std::invalid_argument("Unknown norm type: " + s + ". Use 'l2' or 'linf'.");
     }
 }
@@ -170,7 +176,8 @@ static void ensure_len_or_null(py::object obj, const char *name, int expect_len)
     // check length
     if ((int)arr.size() != expect_len)
     {
-        throw std::invalid_argument(std::string(name) + " length mismatch: expect " + std::to_string(expect_len) + ", got " + std::to_string((int)arr.size()));
+        throw std::invalid_argument(std::string(name) + " length mismatch: expect " + std::to_string(expect_len) +
+                                    ", got " + std::to_string((int)arr.size()));
     }
 }
 
@@ -179,24 +186,24 @@ static const char *status_to_str(termination_reason_t r)
 {
     switch (r)
     {
-    case TERMINATION_REASON_OPTIMAL:
-        return "OPTIMAL";
-    case TERMINATION_REASON_PRIMAL_INFEASIBLE:
-        return "PRIMAL_INFEASIBLE";
-    case TERMINATION_REASON_DUAL_INFEASIBLE:
-        return "DUAL_INFEASIBLE";
-    case TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED:
-        return "INFEASIBLE_OR_UNBOUNDED";
-    case TERMINATION_REASON_TIME_LIMIT:
-        return "TIME_LIMIT";
-    case TERMINATION_REASON_ITERATION_LIMIT:
-        return "ITERATION_LIMIT";
-    case TERMINATION_REASON_FEAS_POLISH_SUCCESS:
-        return "FEAS_POLISH_SUCCESS";
-    case TERMINATION_REASON_UNSPECIFIED:
-        return "UNSPECIFIED";
-    default:
-        return "UNKNOWN";
+        case TERMINATION_REASON_OPTIMAL:
+            return "OPTIMAL";
+        case TERMINATION_REASON_PRIMAL_INFEASIBLE:
+            return "PRIMAL_INFEASIBLE";
+        case TERMINATION_REASON_DUAL_INFEASIBLE:
+            return "DUAL_INFEASIBLE";
+        case TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED:
+            return "INFEASIBLE_OR_UNBOUNDED";
+        case TERMINATION_REASON_TIME_LIMIT:
+            return "TIME_LIMIT";
+        case TERMINATION_REASON_ITERATION_LIMIT:
+            return "ITERATION_LIMIT";
+        case TERMINATION_REASON_FEAS_POLISH_SUCCESS:
+            return "FEAS_POLISH_SUCCESS";
+        case TERMINATION_REASON_UNSPECIFIED:
+            return "UNSPECIFIED";
+        default:
+            return "UNKNOWN";
     }
 }
 
@@ -205,21 +212,21 @@ static int status_to_code(termination_reason_t r)
 {
     switch (r)
     {
-    case TERMINATION_REASON_OPTIMAL:
-        return 0;
-    case TERMINATION_REASON_PRIMAL_INFEASIBLE:
-        return 1;
-    case TERMINATION_REASON_DUAL_INFEASIBLE:
-        return 2;
-    case TERMINATION_REASON_TIME_LIMIT:
-        return 3;
-    case TERMINATION_REASON_ITERATION_LIMIT:
-        return 4;
-    case TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED:
-        return 5;
-    case TERMINATION_REASON_UNSPECIFIED:
-    default:
-        return -1;
+        case TERMINATION_REASON_OPTIMAL:
+            return 0;
+        case TERMINATION_REASON_PRIMAL_INFEASIBLE:
+            return 1;
+        case TERMINATION_REASON_DUAL_INFEASIBLE:
+            return 2;
+        case TERMINATION_REASON_TIME_LIMIT:
+            return 3;
+        case TERMINATION_REASON_ITERATION_LIMIT:
+            return 4;
+        case TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED:
+            return 5;
+        case TERMINATION_REASON_UNSPECIFIED:
+        default:
+            return -1;
     }
 }
 
@@ -283,20 +290,32 @@ static void parse_params_from_python(py::object params_obj, pdhg_parameters_t *p
     py::dict d = params_obj.cast<py::dict>();
 
     auto getf = [&](const char *k, double &tgt)
-    { if (d.contains(k)) tgt = py::cast<double>(d[k]); };
+    {
+        if (d.contains(k))
+            tgt = py::cast<double>(d[k]);
+    };
     auto geti = [&](const char *k, int &tgt)
-    { if (d.contains(k)) tgt = py::cast<int>(d[k]); };
+    {
+        if (d.contains(k))
+            tgt = py::cast<int>(d[k]);
+    };
     auto getb = [&](const char *k, bool &tgt)
-    { if (d.contains(k)) tgt = py::cast<bool>(d[k]); };
+    {
+        if (d.contains(k))
+            tgt = py::cast<bool>(d[k]);
+    };
     auto get_norm = [&](const char *k, norm_type_t &tgt)
-    { 
-        if (d.contains(k)) {
+    {
+        if (d.contains(k))
+        {
             py::object val = d[k];
-            if (py::isinstance<py::str>(val)) {
+            if (py::isinstance<py::str>(val))
+            {
                 std::string sval = py::cast<std::string>(val);
                 tgt = parse_norm_string(sval);
             }
-            else {
+            else
+            {
                 throw std::invalid_argument("optimality_norm must be a string ('l2'/'linf')");
             }
         }
@@ -436,17 +455,16 @@ static PyMatrixView get_matrix_from_python(py::object A)
 }
 
 // solve function
-static py::dict solve_once(
-    py::object A,
-    py::object objective_vector,          // c
-    py::object objective_constant,        // c0 (optional → 0)
-    py::object variable_lower_bound,      // lb (optional → 0)
-    py::object variable_upper_bound,      // ub (optional → inf)
-    py::object constraint_lower_bound,    // l  (optional → -inf)
-    py::object constraint_upper_bound,    // u  (optional → inf)
-    py::object params = py::none(),       // PDHG parameters (optional → default)
-    py::object primal_start = py::none(), // warm start primal solution (optional)
-    py::object dual_start = py::none()    // warm start dual solution (optional)
+static py::dict solve_once(py::object A,
+                           py::object objective_vector,          // c
+                           py::object objective_constant,        // c0 (optional → 0)
+                           py::object variable_lower_bound,      // lb (optional → 0)
+                           py::object variable_upper_bound,      // ub (optional → inf)
+                           py::object constraint_lower_bound,    // l  (optional → -inf)
+                           py::object constraint_upper_bound,    // u  (optional → inf)
+                           py::object params = py::none(),       // PDHG parameters (optional → default)
+                           py::object primal_start = py::none(), // warm start primal solution (optional)
+                           py::object dual_start = py::none()    // warm start dual solution (optional)
 )
 {
     // parse matrix
@@ -564,10 +582,10 @@ PYBIND11_MODULE(_cupdlpx_core, m)
 {
     m.doc() = "cupdlpx core bindings (auto-detect dense/CSR/CSC/COO; initialize default params here)";
 
-    m.def("get_default_params", &get_default_params_py,
-          "Return default PDHG parameters as a dict");
+    m.def("get_default_params", &get_default_params_py, "Return default PDHG parameters as a dict");
 
-    m.def("solve_once", &solve_once,
+    m.def("solve_once",
+          &solve_once,
           py::arg("A"),
           py::arg("objective_vector"),
           py::arg("objective_constant") = py::none(),
@@ -578,4 +596,4 @@ PYBIND11_MODULE(_cupdlpx_core, m)
           py::arg("params") = py::none(),
           py::arg("primal_start") = py::none(),
           py::arg("dual_start") = py::none());
-}
\ No newline at end of file
+}
diff --git a/src/cli.c b/src/cli.c
index e8b7058..5bfa4a6 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -27,11 +27,9 @@ limitations under the License.
 #include <stdlib.h>
 #include <string.h>
 
-char *get_output_path(const char *output_dir, const char *instance_name,
-                      const char *suffix)
+char *get_output_path(const char *output_dir, const char *instance_name, const char *suffix)
 {
-    size_t path_len =
-        strlen(output_dir) + strlen(instance_name) + strlen(suffix) + 2;
+    size_t path_len = strlen(output_dir) + strlen(instance_name) + strlen(suffix) + 2;
     char *full_path = safe_malloc(path_len * sizeof(char));
     snprintf(full_path, path_len, "%s/%s%s", output_dir, instance_name, suffix);
     return full_path;
@@ -58,8 +56,7 @@ char *extract_instance_name(const char *filename)
     return instance_name;
 }
 
-void save_solution(const double *data, int size, const char *output_dir,
-                   const char *instance_name, const char *suffix)
+void save_solution(const double *data, int size, const char *output_dir, const char *instance_name, const char *suffix)
 {
     char *file_path = get_output_path(output_dir, instance_name, suffix);
     if (file_path == NULL || data == NULL)
@@ -84,8 +81,7 @@ void save_solution(const double *data, int size, const char *output_dir,
     free(file_path);
 }
 
-void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir,
-                         const char *instance_name)
+void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir, const char *instance_name)
 {
     char *file_path = get_output_path(output_dir, instance_name, "_summary.txt");
     if (file_path == NULL)
@@ -100,8 +96,7 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir,
         free(file_path);
         return;
     }
-    fprintf(outfile, "Termination Reason: %s\n",
-            termination_reason_to_string(result->termination_reason));
+    fprintf(outfile, "Termination Reason: %s\n", termination_reason_to_string(result->termination_reason));
     if (result->presolve_time > 0.0)
     {
         fprintf(outfile, "Presolve Status: %s\n", get_presolve_status_str(result->presolve_status));
@@ -116,7 +111,7 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir,
         //     fprintf(outfile, "NNZ Removed Primal Propagation: %d\n", result->presolve_stats.nnz_removed_primal_propagation);
         //     fprintf(outfile, "NNZ Removed Parallel Rows: %d\n", result->presolve_stats.nnz_removed_parallel_rows);
         //     fprintf(outfile, "NNZ Removed Parallel Cols: %d\n", result->presolve_stats.nnz_removed_parallel_cols);
-            
+
         //     fprintf(outfile, "Presolve Time Init (sec): %e\n", result->presolve_stats.time_init);
         //     fprintf(outfile, "Presolve Time Run (sec): %e\n", result->presolve_stats.time_presolve);
         //     fprintf(outfile, "Presolve Time Fast (sec): %e\n", result->presolve_stats.time_fast_reductions);
@@ -130,16 +125,12 @@ void save_solver_summary(const cupdlpx_result_t *result, const char *output_dir,
     fprintf(outfile, "Precondition time (sec): %e\n", result->rescaling_time_sec);
     fprintf(outfile, "Runtime (sec): %e\n", result->cumulative_time_sec);
     fprintf(outfile, "Iterations Count: %d\n", result->total_count);
-    fprintf(outfile, "Primal Objective Value: %e\n",
-            result->primal_objective_value);
+    fprintf(outfile, "Primal Objective Value: %e\n", result->primal_objective_value);
     fprintf(outfile, "Dual Objective Value: %e\n", result->dual_objective_value);
-    fprintf(outfile, "Relative Primal Residual: %e\n",
-            result->relative_primal_residual);
-    fprintf(outfile, "Relative Dual Residual: %e\n",
-            result->relative_dual_residual);
+    fprintf(outfile, "Relative Primal Residual: %e\n", result->relative_primal_residual);
+    fprintf(outfile, "Relative Dual Residual: %e\n", result->relative_dual_residual);
     fprintf(outfile, "Absolute Objective Gap: %e\n", result->objective_gap);
-    fprintf(outfile, "Relative Objective Gap: %e\n",
-            result->relative_objective_gap);
+    fprintf(outfile, "Relative Objective Gap: %e\n", result->relative_objective_gap);
     fprintf(outfile, "Rows: %d\n", result->num_constraints);
     fprintf(outfile, "Columns: %d\n", result->num_variables);
     fprintf(outfile, "Nonzeros: %d\n", result->num_nonzeros);
@@ -157,55 +148,67 @@ void print_usage(const char *prog_name)
     fprintf(stderr, "Usage: %s [OPTIONS] <mps_file> <output_dir>\n\n", prog_name);
 
     fprintf(stderr, "Arguments:\n");
-    fprintf(stderr, "  <mps_file>               Path to the input problem in MPS "
-                    "format (.mps or .mps.gz).\n");
-    fprintf(stderr, "  <output_dir>             Directory where output files "
-                    "will be saved. It will contain:\n");
-    fprintf(stderr, "                             - <basename>_summary.txt\n");
     fprintf(stderr,
-            "                             - <basename>_primal_solution.txt\n");
+            "  <mps_file>               Path to the input problem in MPS "
+            "format (.mps or .mps.gz).\n");
     fprintf(stderr,
-            "                             - <basename>_dual_solution.txt\n\n");
+            "  <output_dir>             Directory where output files "
+            "will be saved. It will contain:\n");
+    fprintf(stderr, "                             - <basename>_summary.txt\n");
+    fprintf(stderr, "                             - <basename>_primal_solution.txt\n");
+    fprintf(stderr, "                             - <basename>_dual_solution.txt\n\n");
 
     fprintf(stderr, "Options:\n");
+    fprintf(stderr, "  -h, --help                          Display this help message.\n");
+    fprintf(stderr,
+            "  -v, --verbose                       "
+            "Enable verbose logging (default: false).\n");
+    fprintf(stderr,
+            "      --time_limit <seconds>          "
+            "Time limit in seconds (default: 3600.0).\n");
+    fprintf(stderr, "      --iter_limit <iterations>       Iteration limit (default: %d).\n", INT32_MAX);
+    fprintf(stderr,
+            "      --eps_opt <tolerance>           "
+            "Relative optimality tolerance (default: 1e-4).\n");
+    fprintf(stderr,
+            "      --eps_feas <tolerance>          "
+            "Relative feasibility tolerance (default: 1e-4).\n");
+    fprintf(stderr,
+            "      --l_inf_ruiz_iter <int>         "
+            "Iterations for L-inf Ruiz rescaling (default: 10).\n");
+    fprintf(stderr,
+            "      --no_pock_chambolle             "
+            "Disable Pock-Chambolle rescaling (default: enabled).\n");
+    fprintf(stderr,
+            "      --pock_chambolle_alpha <float>  "
+            "Value for Pock-Chambolle alpha (default: 1.0).\n");
     fprintf(stderr,
-            "  -h, --help                          Display this help message.\n");
-    fprintf(stderr, "  -v, --verbose                       "
-                    "Enable verbose logging (default: false).\n");
-    fprintf(stderr, "      --time_limit <seconds>          "
-                    "Time limit in seconds (default: 3600.0).\n");
-    fprintf(
-        stderr,
-        "      --iter_limit <iterations>       Iteration limit (default: %d).\n",
-        INT32_MAX);
-    fprintf(stderr, "      --eps_opt <tolerance>           "
-                    "Relative optimality tolerance (default: 1e-4).\n");
-    fprintf(stderr, "      --eps_feas <tolerance>          "
-                    "Relative feasibility tolerance (default: 1e-4).\n");
-    fprintf(stderr, "      --l_inf_ruiz_iter <int>         "
-                    "Iterations for L-inf Ruiz rescaling (default: 10).\n");
-    fprintf(stderr, "      --no_pock_chambolle             "
-                    "Disable Pock-Chambolle rescaling (default: enabled).\n");
-    fprintf(stderr, "      --pock_chambolle_alpha <float>  "
-                    "Value for Pock-Chambolle alpha (default: 1.0).\n");
-    fprintf(stderr, "      --no_bound_obj_rescaling        "
-                    "Disable bound objective rescaling (default: enabled).\n");
-    fprintf(stderr, "      --eval_freq <int>               "
-                    "Termination evaluation frequency (default: 200).\n");
-    fprintf(stderr, "      --sv_max_iter <int>             "
-                    "Max iterations for singular value estimation (default: 5000).\n");
-    fprintf(stderr, "      --sv_tol <float>                "
-                    "Tolerance for singular value estimation (default: 1e-4).\n");
-    fprintf(stderr, "  -f  --feasibility_polishing         "
-                    "Enable feasibility use feasibility polishing (default: false).\n");
-    fprintf(stderr, "      --eps_feas_polish <tolerance>   Relative feasibility "
-                    "polish tolerance (default: 1e-6).\n");
-    fprintf(stderr, "      --opt_norm <norm_type>          "
-                    "Norm for optimality criteria: l2 or linf (default: l2).\n");
-    fprintf(stderr, "      --no_presolve                   "
-                    "Disable presolve (default: enabled).\n");
-    fprintf(stderr, "      --matrix_zero_tol <tolerance>.  "
-                    "Zero tolerance in constraint matrix.\n");
+            "      --no_bound_obj_rescaling        "
+            "Disable bound objective rescaling (default: enabled).\n");
+    fprintf(stderr,
+            "      --eval_freq <int>               "
+            "Termination evaluation frequency (default: 200).\n");
+    fprintf(stderr,
+            "      --sv_max_iter <int>             "
+            "Max iterations for singular value estimation (default: 5000).\n");
+    fprintf(stderr,
+            "      --sv_tol <float>                "
+            "Tolerance for singular value estimation (default: 1e-4).\n");
+    fprintf(stderr,
+            "  -f  --feasibility_polishing         "
+            "Enable feasibility use feasibility polishing (default: false).\n");
+    fprintf(stderr,
+            "      --eps_feas_polish <tolerance>   Relative feasibility "
+            "polish tolerance (default: 1e-6).\n");
+    fprintf(stderr,
+            "      --opt_norm <norm_type>          "
+            "Norm for optimality criteria: l2 or linf (default: l2).\n");
+    fprintf(stderr,
+            "      --no_presolve                   "
+            "Disable presolve (default: enabled).\n");
+    fprintf(stderr,
+            "      --matrix_zero_tol <tolerance>.  "
+            "Zero tolerance in constraint matrix.\n");
 }
 
 int main(int argc, char *argv[])
@@ -213,106 +216,108 @@ int main(int argc, char *argv[])
     pdhg_parameters_t params;
     set_default_parameters(&params);
 
-    static struct option long_options[] = {
-        {"help", no_argument, 0, 'h'},
-        {"verbose", no_argument, 0, 'v'},
-        {"time_limit", required_argument, 0, 1001},
-        {"iter_limit", required_argument, 0, 1002},
-        {"eps_opt", required_argument, 0, 1003},
-        {"eps_feas", required_argument, 0, 1004},
-        {"eps_feas_polish", required_argument, 0, 1006},
-        {"feasibility_polishing", no_argument, 0, 'f'},
-        {"l_inf_ruiz_iter", required_argument, 0, 1007},
-        {"pock_chambolle_alpha", required_argument, 0, 1008},
-        {"no_pock_chambolle", no_argument, 0, 1009},
-        {"no_bound_obj_rescaling", no_argument, 0, 1010},
-        {"sv_max_iter", required_argument, 0, 1011},
-        {"sv_tol", required_argument, 0, 1012},
-        {"eval_freq", required_argument, 0, 1013},
-        {"opt_norm", required_argument, 0, 1014},
-        {"no_presolve", no_argument, 0, 1015},
-        {"matrix_zero_tol", required_argument, 0, 1016},
-        {0, 0, 0, 0}};
+    static struct option long_options[] = {{"help", no_argument, 0, 'h'},
+                                           {"verbose", no_argument, 0, 'v'},
+                                           {"time_limit", required_argument, 0, 1001},
+                                           {"iter_limit", required_argument, 0, 1002},
+                                           {"eps_opt", required_argument, 0, 1003},
+                                           {"eps_feas", required_argument, 0, 1004},
+                                           {"eps_feas_polish", required_argument, 0, 1006},
+                                           {"feasibility_polishing", no_argument, 0, 'f'},
+                                           {"l_inf_ruiz_iter", required_argument, 0, 1007},
+                                           {"pock_chambolle_alpha", required_argument, 0, 1008},
+                                           {"no_pock_chambolle", no_argument, 0, 1009},
+                                           {"no_bound_obj_rescaling", no_argument, 0, 1010},
+                                           {"sv_max_iter", required_argument, 0, 1011},
+                                           {"sv_tol", required_argument, 0, 1012},
+                                           {"eval_freq", required_argument, 0, 1013},
+                                           {"opt_norm", required_argument, 0, 1014},
+                                           {"no_presolve", no_argument, 0, 1015},
+                                           {"matrix_zero_tol", required_argument, 0, 1016},
+                                           {0, 0, 0, 0}};
 
     int opt;
     while ((opt = getopt_long(argc, argv, "hvfp", long_options, NULL)) != -1)
     {
         switch (opt)
         {
-        case 'h':
-            print_usage(argv[0]);
-            return 0;
-        case 'v':
-            params.verbose = true;
-            break;
-        case 1001: // --time_limit
-            params.termination_criteria.time_sec_limit = atof(optarg);
-            break;
-        case 1002: // --iter_limit
-            params.termination_criteria.iteration_limit = atoi(optarg);
-            break;
-        case 1003: // --eps_optimal
-            params.termination_criteria.eps_optimal_relative = atof(optarg);
-            break;
-        case 1004: // --eps_feas
-            params.termination_criteria.eps_feasible_relative = atof(optarg);
-            break;
-        case 1006: // --eps_feas_polish_relative
-            params.termination_criteria.eps_feas_polish_relative = atof(optarg);
-            break;
-        case 'f': // --feasibility_polishing
-            params.feasibility_polishing = true;
-            break;
-        case 1007: // --l_inf_ruiz_iter
-            params.l_inf_ruiz_iterations = atoi(optarg);
-            break;
-        case 1008: // --pock_chambolle_alpha
-            params.pock_chambolle_alpha = atof(optarg);
-            break;
-        case 1009: // --no_pock_chambolle
-            params.has_pock_chambolle_alpha = false;
-            break;
-        case 1010: // --no_bound_obj_rescaling
-            params.bound_objective_rescaling = false;
-            break;
-        case 1011: // --sv_max_iter
-            params.sv_max_iter = atoi(optarg);
-            break;
-        case 1012: // --sv_tol
-            params.sv_tol = atof(optarg);
-            break;
-        case 1013: // --eval_freq
-            params.termination_evaluation_frequency = atoi(optarg);
-            break;
-        case 1014: // --opt_norm
+            case 'h':
+                print_usage(argv[0]);
+                return 0;
+            case 'v':
+                params.verbose = true;
+                break;
+            case 1001: // --time_limit
+                params.termination_criteria.time_sec_limit = atof(optarg);
+                break;
+            case 1002: // --iter_limit
+                params.termination_criteria.iteration_limit = atoi(optarg);
+                break;
+            case 1003: // --eps_optimal
+                params.termination_criteria.eps_optimal_relative = atof(optarg);
+                break;
+            case 1004: // --eps_feas
+                params.termination_criteria.eps_feasible_relative = atof(optarg);
+                break;
+            case 1006: // --eps_feas_polish_relative
+                params.termination_criteria.eps_feas_polish_relative = atof(optarg);
+                break;
+            case 'f': // --feasibility_polishing
+                params.feasibility_polishing = true;
+                break;
+            case 1007: // --l_inf_ruiz_iter
+                params.l_inf_ruiz_iterations = atoi(optarg);
+                break;
+            case 1008: // --pock_chambolle_alpha
+                params.pock_chambolle_alpha = atof(optarg);
+                break;
+            case 1009: // --no_pock_chambolle
+                params.has_pock_chambolle_alpha = false;
+                break;
+            case 1010: // --no_bound_obj_rescaling
+                params.bound_objective_rescaling = false;
+                break;
+            case 1011: // --sv_max_iter
+                params.sv_max_iter = atoi(optarg);
+                break;
+            case 1012: // --sv_tol
+                params.sv_tol = atof(optarg);
+                break;
+            case 1013: // --eval_freq
+                params.termination_evaluation_frequency = atoi(optarg);
+                break;
+            case 1014: // --opt_norm
             {
                 const char *norm_str = optarg;
-                if (strcmp(norm_str, "l2") == 0) {
+                if (strcmp(norm_str, "l2") == 0)
+                {
                     params.optimality_norm = NORM_TYPE_L2;
-                } else if (strcmp(norm_str, "linf") == 0) {
+                }
+                else if (strcmp(norm_str, "linf") == 0)
+                {
                     params.optimality_norm = NORM_TYPE_L_INF;
-                } else {
+                }
+                else
+                {
                     fprintf(stderr, "Error: opt_norm must be 'l2' or 'linf'\n");
                     return 1;
                 }
             }
             break;
-        case 1015: // --no_presolve
-            params.presolve = false;
-            break;
-        case 1016: // --matrix_zero_tol
-            params.matrix_zero_tol = atof(optarg);
-            break;
-        case '?': // Unknown option
-            return 1;
+            case 1015: // --no_presolve
+                params.presolve = false;
+                break;
+            case 1016: // --matrix_zero_tol
+                params.matrix_zero_tol = atof(optarg);
+                break;
+            case '?': // Unknown option
+                return 1;
         }
     }
 
     if (argc - optind != 2)
     {
-        fprintf(
-            stderr,
-            "Error: You must specify an input file and an output directory.\n\n");
+        fprintf(stderr, "Error: You must specify an input file and an output directory.\n\n");
         print_usage(argv[0]);
         return 1;
     }
@@ -344,10 +349,9 @@ int main(int argc, char *argv[])
     else
     {
         save_solver_summary(result, output_dir, instance_name);
-        save_solution(result->primal_solution, problem->num_variables, output_dir,
-                      instance_name, "_primal_solution.txt");
-        save_solution(result->dual_solution, problem->num_constraints, output_dir,
-                      instance_name, "_dual_solution.txt");
+        save_solution(
+            result->primal_solution, problem->num_variables, output_dir, instance_name, "_primal_solution.txt");
+        save_solution(result->dual_solution, problem->num_constraints, output_dir, instance_name, "_dual_solution.txt");
         cupdlpx_result_free(result);
     }
 
@@ -355,4 +359,4 @@ int main(int argc, char *argv[])
     free(instance_name);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/src/cupdlpx.c b/src/cupdlpx.c
index f6b6c9c..035e520 100644
--- a/src/cupdlpx.c
+++ b/src/cupdlpx.c
@@ -25,8 +25,10 @@ limitations under the License.
 // create an lp_problem_t from a matrix
 lp_problem_t *create_lp_problem(const double *objective_c,
                                 const matrix_desc_t *A_desc,
-                                const double *con_lb, const double *con_ub,
-                                const double *var_lb, const double *var_ub,
+                                const double *con_lb,
+                                const double *con_ub,
+                                const double *var_lb,
+                                const double *var_ub,
                                 const double *objective_constant)
 {
     lp_problem_t *prob = (lp_problem_t *)safe_malloc(sizeof(lp_problem_t));
@@ -39,82 +41,75 @@ lp_problem_t *create_lp_problem(const double *objective_c,
     // handle matrix by format
     switch (A_desc->fmt)
     {
-    case matrix_dense:
-        dense_to_csr(A_desc, &prob->constraint_matrix_row_pointers,
-                     &prob->constraint_matrix_col_indices,
-                     &prob->constraint_matrix_values,
-                     &prob->constraint_matrix_num_nonzeros);
-        break;
-
-    case matrix_csc:
-    {
-        int *row_ptr = NULL, *col_ind = NULL;
-        double *vals = NULL;
-        if (csc_to_csr(A_desc, &row_ptr, &col_ind, &vals) != 0)
+        case matrix_dense:
+            dense_to_csr(A_desc,
+                         &prob->constraint_matrix_row_pointers,
+                         &prob->constraint_matrix_col_indices,
+                         &prob->constraint_matrix_values,
+                         &prob->constraint_matrix_num_nonzeros);
+            break;
+
+        case matrix_csc:
         {
-            fprintf(stderr, "[interface] CSC->CSR failed.\n");
-            free(prob);
-            return NULL;
+            int *row_ptr = NULL, *col_ind = NULL;
+            double *vals = NULL;
+            if (csc_to_csr(A_desc, &row_ptr, &col_ind, &vals) != 0)
+            {
+                fprintf(stderr, "[interface] CSC->CSR failed.\n");
+                free(prob);
+                return NULL;
+            }
+            prob->constraint_matrix_num_nonzeros = A_desc->data.csc.nnz;
+            prob->constraint_matrix_row_pointers = row_ptr;
+            prob->constraint_matrix_col_indices = col_ind;
+            prob->constraint_matrix_values = vals;
+            break;
         }
-        prob->constraint_matrix_num_nonzeros = A_desc->data.csc.nnz;
-        prob->constraint_matrix_row_pointers = row_ptr;
-        prob->constraint_matrix_col_indices = col_ind;
-        prob->constraint_matrix_values = vals;
-        break;
-    }
 
-    case matrix_coo:
-    {
-        int *row_ptr = NULL, *col_ind = NULL;
-        double *vals = NULL;
-        if (coo_to_csr(A_desc, &row_ptr, &col_ind, &vals) != 0)
+        case matrix_coo:
         {
-            fprintf(stderr, "[interface] COO->CSR failed.\n");
-            free(prob);
-            return NULL;
+            int *row_ptr = NULL, *col_ind = NULL;
+            double *vals = NULL;
+            if (coo_to_csr(A_desc, &row_ptr, &col_ind, &vals) != 0)
+            {
+                fprintf(stderr, "[interface] COO->CSR failed.\n");
+                free(prob);
+                return NULL;
+            }
+            prob->constraint_matrix_num_nonzeros = A_desc->data.coo.nnz;
+            prob->constraint_matrix_row_pointers = row_ptr;
+            prob->constraint_matrix_col_indices = col_ind;
+            prob->constraint_matrix_values = vals;
+            break;
         }
-        prob->constraint_matrix_num_nonzeros = A_desc->data.coo.nnz;
-        prob->constraint_matrix_row_pointers = row_ptr;
-        prob->constraint_matrix_col_indices = col_ind;
-        prob->constraint_matrix_values = vals;
-        break;
-    }
 
-    case matrix_csr:
-        prob->constraint_matrix_num_nonzeros = A_desc->data.csr.nnz;
-        prob->constraint_matrix_row_pointers =
-            (int *)safe_malloc((size_t)(A_desc->m + 1) * sizeof(int));
-        prob->constraint_matrix_col_indices =
-            (int *)safe_malloc((size_t)A_desc->data.csr.nnz * sizeof(int));
-        prob->constraint_matrix_values =
-            (double *)safe_malloc((size_t)A_desc->data.csr.nnz * sizeof(double));
-        memcpy(prob->constraint_matrix_row_pointers, A_desc->data.csr.row_ptr,
-               (size_t)(A_desc->m + 1) * sizeof(int));
-        memcpy(prob->constraint_matrix_col_indices, A_desc->data.csr.col_ind,
-               (size_t)A_desc->data.csr.nnz * sizeof(int));
-        memcpy(prob->constraint_matrix_values, A_desc->data.csr.vals,
-               (size_t)A_desc->data.csr.nnz * sizeof(double));
-        break;
-
-    default:
-        fprintf(
-            stderr,
-            "[interface] make_problem_from_matrix: unsupported matrix format %d.\n",
-            A_desc->fmt);
-        free(prob);
-        return NULL;
+        case matrix_csr:
+            prob->constraint_matrix_num_nonzeros = A_desc->data.csr.nnz;
+            prob->constraint_matrix_row_pointers = (int *)safe_malloc((size_t)(A_desc->m + 1) * sizeof(int));
+            prob->constraint_matrix_col_indices = (int *)safe_malloc((size_t)A_desc->data.csr.nnz * sizeof(int));
+            prob->constraint_matrix_values = (double *)safe_malloc((size_t)A_desc->data.csr.nnz * sizeof(double));
+            memcpy(
+                prob->constraint_matrix_row_pointers, A_desc->data.csr.row_ptr, (size_t)(A_desc->m + 1) * sizeof(int));
+            memcpy(prob->constraint_matrix_col_indices,
+                   A_desc->data.csr.col_ind,
+                   (size_t)A_desc->data.csr.nnz * sizeof(int));
+            memcpy(
+                prob->constraint_matrix_values, A_desc->data.csr.vals, (size_t)A_desc->data.csr.nnz * sizeof(double));
+            break;
+
+        default:
+            fprintf(stderr, "[interface] make_problem_from_matrix: unsupported matrix format %d.\n", A_desc->fmt);
+            free(prob);
+            return NULL;
     }
 
     // default fill values
     prob->objective_constant = objective_constant ? *objective_constant : 0.0;
     fill_or_copy(&prob->objective_vector, prob->num_variables, objective_c, 0.0);
     fill_or_copy(&prob->variable_lower_bound, prob->num_variables, var_lb, -INFINITY);
-    fill_or_copy(&prob->variable_upper_bound, prob->num_variables, var_ub,
-                 INFINITY);
-    fill_or_copy(&prob->constraint_lower_bound, prob->num_constraints, con_lb,
-                 -INFINITY);
-    fill_or_copy(&prob->constraint_upper_bound, prob->num_constraints, con_ub,
-                 INFINITY);
+    fill_or_copy(&prob->variable_upper_bound, prob->num_variables, var_ub, INFINITY);
+    fill_or_copy(&prob->constraint_lower_bound, prob->num_constraints, con_lb, -INFINITY);
+    fill_or_copy(&prob->constraint_upper_bound, prob->num_constraints, con_ub, INFINITY);
 
     return prob;
 }
@@ -149,8 +144,7 @@ void lp_problem_free(lp_problem_t *prob)
     free(prob);
 }
 
-void set_start_values(lp_problem_t *prob, const double *primal,
-                      const double *dual)
+void set_start_values(lp_problem_t *prob, const double *primal, const double *dual)
 {
     if (!prob)
         return;
@@ -182,8 +176,7 @@ void set_start_values(lp_problem_t *prob, const double *primal,
     }
 }
 
-cupdlpx_result_t *solve_lp_problem(lp_problem_t *prob,
-                                   const pdhg_parameters_t *params)
+cupdlpx_result_t *solve_lp_problem(lp_problem_t *prob, const pdhg_parameters_t *params)
 {
     // argument checks
     if (!prob)
diff --git a/src/mps_parser.c b/src/mps_parser.c
index 756d741..c37687e 100644
--- a/src/mps_parser.c
+++ b/src/mps_parser.c
@@ -75,8 +75,7 @@ static void namemap_resize(NameMap *map)
         {
             NameNode *next = current->next;
 
-            unsigned long h =
-                hash_string(current->name) % (unsigned long)new_num_buckets;
+            unsigned long h = hash_string(current->name) % (unsigned long)new_num_buckets;
 
             current->next = map->buckets[h];
             map->buckets[h] = current;
@@ -174,8 +173,7 @@ static FastLineReader *fast_reader_open(const char *filename)
 
     reader->buffer = safe_malloc(READER_BUFFER_SIZE);
 
-    if (strlen(filename) > 3 &&
-        strcmp(filename + strlen(filename) - 3, ".gz") == 0)
+    if (strlen(filename) > 3 && strcmp(filename + strlen(filename) - 3, ".gz") == 0)
     {
         reader->is_gz = true;
         reader->handle.gz_f = gzopen(filename, "rb");
@@ -222,8 +220,7 @@ static void fast_reader_close(FastLineReader *reader)
     free(reader);
 }
 
-static char *fast_reader_gets(FastLineReader *reader, char *line_buf,
-                              int line_buf_size)
+static char *fast_reader_gets(FastLineReader *reader, char *line_buf, int line_buf_size)
 {
     int len = 0;
 
@@ -234,8 +231,7 @@ static char *fast_reader_gets(FastLineReader *reader, char *line_buf,
         {
             if (reader->is_gz)
             {
-                int bytes_read =
-                    gzread(reader->handle.gz_f, reader->buffer, READER_BUFFER_SIZE);
+                int bytes_read = gzread(reader->handle.gz_f, reader->buffer, READER_BUFFER_SIZE);
                 if (bytes_read <= 0)
                 {
 
@@ -245,8 +241,7 @@ static char *fast_reader_gets(FastLineReader *reader, char *line_buf,
             }
             else
             {
-                size_t bytes_read =
-                    fread(reader->buffer, 1, READER_BUFFER_SIZE, reader->handle.f);
+                size_t bytes_read = fread(reader->buffer, 1, READER_BUFFER_SIZE, reader->handle.f);
                 if (bytes_read <= 0)
                 {
                     return (len > 0) ? line_buf : NULL;
@@ -256,8 +251,7 @@ static char *fast_reader_gets(FastLineReader *reader, char *line_buf,
             reader->current_pos = reader->buffer;
         }
 
-        char *newline_pos = (char *)memchr(reader->current_pos, '\n',
-                                           reader->end_pos - reader->current_pos);
+        char *newline_pos = (char *)memchr(reader->current_pos, '\n', reader->end_pos - reader->current_pos);
 
         int bytes_to_copy;
         bool line_complete = (newline_pos != NULL);
@@ -342,12 +336,9 @@ static int add_coo_entry(CooMatrix *coo, int row, int col, double value)
     if (coo->nnz >= coo->capacity)
     {
         size_t new_capacity = (coo->capacity == 0) ? 1024 : coo->capacity * 2;
-        coo->row_indices =
-            (int *)safe_realloc(coo->row_indices, new_capacity * sizeof(int));
-        coo->col_indices =
-            (int *)safe_realloc(coo->col_indices, new_capacity * sizeof(int));
-        coo->values =
-            (double *)safe_realloc(coo->values, new_capacity * sizeof(double));
+        coo->row_indices = (int *)safe_realloc(coo->row_indices, new_capacity * sizeof(int));
+        coo->col_indices = (int *)safe_realloc(coo->col_indices, new_capacity * sizeof(int));
+        coo->values = (double *)safe_realloc(coo->values, new_capacity * sizeof(double));
         coo->capacity = new_capacity;
     }
     coo->row_indices[coo->nnz] = row;
@@ -371,12 +362,9 @@ static bool ensure_column_capacity(MpsParserState *state)
         return false;
     }
 
-    state->objective_coeffs =
-        (double *)safe_realloc(state->objective_coeffs, new_cap * sizeof(double));
-    state->var_lower_bounds =
-        (double *)safe_realloc(state->var_lower_bounds, new_cap * sizeof(double));
-    state->var_upper_bounds =
-        (double *)safe_realloc(state->var_upper_bounds, new_cap * sizeof(double));
+    state->objective_coeffs = (double *)safe_realloc(state->objective_coeffs, new_cap * sizeof(double));
+    state->var_lower_bounds = (double *)safe_realloc(state->var_lower_bounds, new_cap * sizeof(double));
+    state->var_upper_bounds = (double *)safe_realloc(state->var_upper_bounds, new_cap * sizeof(double));
 
     for (size_t i = state->col_capacity; i < new_cap; ++i)
     {
@@ -391,18 +379,12 @@ static bool ensure_column_capacity(MpsParserState *state)
 
 static void free_parser_state(MpsParserState *state);
 static int finalize_rows(MpsParserState *state);
-static int parse_rows_section(MpsParserState *state, char **tokens,
-                              int n_tokens);
-static int parse_columns_section(MpsParserState *state, char **tokens,
-                                 int n_tokens);
-static int parse_rhs_section(MpsParserState *state, char **tokens,
-                             int n_tokens);
-static int parse_ranges_section(MpsParserState *state, char **tokens,
-                                int n_tokens);
-static int parse_bounds_section(MpsParserState *state, char **tokens,
-                                int n_tokens);
-static int mps_coo_to_csr(lp_problem_t *prob, CooMatrix *coo,
-                          size_t num_constraints);
+static int parse_rows_section(MpsParserState *state, char **tokens, int n_tokens);
+static int parse_columns_section(MpsParserState *state, char **tokens, int n_tokens);
+static int parse_rhs_section(MpsParserState *state, char **tokens, int n_tokens);
+static int parse_ranges_section(MpsParserState *state, char **tokens, int n_tokens);
+static int parse_bounds_section(MpsParserState *state, char **tokens, int n_tokens);
+static int mps_coo_to_csr(lp_problem_t *prob, CooMatrix *coo, size_t num_constraints);
 
 typedef enum
 {
@@ -472,8 +454,7 @@ lp_problem_t *read_mps_file(const char *filename)
                 next_section = SEC_ENDATA;
             }
 
-            if (current_section == SEC_ROWS && next_section != SEC_ROWS &&
-                !rows_finalized)
+            if (current_section == SEC_ROWS && next_section != SEC_ROWS && !rows_finalized)
             {
                 if (finalize_rows(&state) != 0)
                     state.error_flag = 1;
@@ -488,36 +469,35 @@ lp_problem_t *read_mps_file(const char *filename)
 
         switch (current_section)
         {
-        case SEC_OBJSENSE:
-            if (n_tokens > 0 && (strcmp(tokens[0], "MAX") == 0 ||
-                                 strcmp(tokens[0], "MAXIMIZE") == 0))
-            {
-                state.is_maximize = true;
-            }
-            break;
-        case SEC_ROWS:
-            if (parse_rows_section(&state, tokens, n_tokens) != 0)
-                state.error_flag = 1;
-            break;
-        case SEC_COLUMNS:
-            if (parse_columns_section(&state, tokens, n_tokens) != 0)
-                state.error_flag = 1;
-            break;
-        case SEC_RHS:
-            if (parse_rhs_section(&state, tokens, n_tokens) != 0)
-                state.error_flag = 1;
-            break;
-        case SEC_RANGES:
-            if (parse_ranges_section(&state, tokens, n_tokens) != 0)
-                state.error_flag = 1;
-            break;
-        case SEC_BOUNDS:
-            if (parse_bounds_section(&state, tokens, n_tokens) != 0)
-                state.error_flag = 1;
-            break;
-        default:
+            case SEC_OBJSENSE:
+                if (n_tokens > 0 && (strcmp(tokens[0], "MAX") == 0 || strcmp(tokens[0], "MAXIMIZE") == 0))
+                {
+                    state.is_maximize = true;
+                }
+                break;
+            case SEC_ROWS:
+                if (parse_rows_section(&state, tokens, n_tokens) != 0)
+                    state.error_flag = 1;
+                break;
+            case SEC_COLUMNS:
+                if (parse_columns_section(&state, tokens, n_tokens) != 0)
+                    state.error_flag = 1;
+                break;
+            case SEC_RHS:
+                if (parse_rhs_section(&state, tokens, n_tokens) != 0)
+                    state.error_flag = 1;
+                break;
+            case SEC_RANGES:
+                if (parse_ranges_section(&state, tokens, n_tokens) != 0)
+                    state.error_flag = 1;
+                break;
+            case SEC_BOUNDS:
+                if (parse_bounds_section(&state, tokens, n_tokens) != 0)
+                    state.error_flag = 1;
+                break;
+            default:
 
-            break;
+                break;
         }
     }
 
@@ -535,8 +515,7 @@ lp_problem_t *read_mps_file(const char *filename)
     prob->num_variables = state.col_map.size;
     prob->num_constraints = state.row_map.size;
     prob->constraint_matrix_num_nonzeros = state.coo_matrix.nnz;
-    prob->objective_constant =
-        state.is_maximize ? -state.objective_constant : state.objective_constant;
+    prob->objective_constant = state.is_maximize ? -state.objective_constant : state.objective_constant;
 
     prob->objective_vector = state.objective_coeffs;
     prob->variable_lower_bound = state.var_lower_bounds;
@@ -573,20 +552,16 @@ lp_problem_t *read_mps_file(const char *filename)
     return prob;
 }
 
-static int parse_rows_section(MpsParserState *state, char **tokens,
-                              int n_tokens)
+static int parse_rows_section(MpsParserState *state, char **tokens, int n_tokens)
 {
     if (n_tokens < 2)
         return 0;
 
     if (state->num_buffered_rows >= state->buffered_rows_capacity)
     {
-        state->buffered_rows_capacity = (state->buffered_rows_capacity == 0)
-                                            ? 64
-                                            : state->buffered_rows_capacity * 2;
-        state->buffered_rows = (BufferedRow *)safe_realloc(
-            state->buffered_rows,
-            state->buffered_rows_capacity * sizeof(BufferedRow));
+        state->buffered_rows_capacity = (state->buffered_rows_capacity == 0) ? 64 : state->buffered_rows_capacity * 2;
+        state->buffered_rows =
+            (BufferedRow *)safe_realloc(state->buffered_rows, state->buffered_rows_capacity * sizeof(BufferedRow));
     }
 
     BufferedRow *new_row = &state->buffered_rows[state->num_buffered_rows];
@@ -635,11 +610,9 @@ static int finalize_rows(MpsParserState *state)
             size_t current_size = state->row_map.size;
             if (current_size >= state->constraint_capacity)
             {
-                state->constraint_capacity = (state->constraint_capacity == 0)
-                                                 ? 64
-                                                 : state->constraint_capacity * 2;
-                state->constraint_types = (char *)safe_realloc(
-                    state->constraint_types, state->constraint_capacity * sizeof(char));
+                state->constraint_capacity = (state->constraint_capacity == 0) ? 64 : state->constraint_capacity * 2;
+                state->constraint_types =
+                    (char *)safe_realloc(state->constraint_types, state->constraint_capacity * sizeof(char));
             }
             namemap_put(&state->row_map, state->buffered_rows[i].name);
             state->constraint_types[current_size] = type;
@@ -648,10 +621,8 @@ static int finalize_rows(MpsParserState *state)
     size_t num_constraints = state->row_map.size;
     if (num_constraints > 0)
     {
-        state->constraint_lower_bounds =
-            safe_malloc(num_constraints * sizeof(double));
-        state->constraint_upper_bounds =
-            safe_malloc(num_constraints * sizeof(double));
+        state->constraint_lower_bounds = safe_malloc(num_constraints * sizeof(double));
+        state->constraint_upper_bounds = safe_malloc(num_constraints * sizeof(double));
 
         for (size_t i = 0; i < num_constraints; ++i)
         {
@@ -676,8 +647,7 @@ static int finalize_rows(MpsParserState *state)
     return 0;
 }
 
-static int parse_columns_section(MpsParserState *state, char **tokens,
-                                 int n_tokens)
+static int parse_columns_section(MpsParserState *state, char **tokens, int n_tokens)
 {
     if (n_tokens < 2)
         return 0;
@@ -704,8 +674,7 @@ static int parse_columns_section(MpsParserState *state, char **tokens,
     {
         if (!state->current_col_name)
         {
-            fprintf(stderr,
-                    "ERROR: Column data found before any column name was defined.\n");
+            fprintf(stderr, "ERROR: Column data found before any column name was defined.\n");
             return -1;
         }
         col_name = state->current_col_name;
@@ -724,8 +693,7 @@ static int parse_columns_section(MpsParserState *state, char **tokens,
         const char *row_name = tokens[i];
         double value = atof(tokens[i + 1]);
 
-        if (state->objective_row_name &&
-            strcmp(row_name, state->objective_row_name) == 0)
+        if (state->objective_row_name && strcmp(row_name, state->objective_row_name) == 0)
         {
             state->objective_coeffs[col_idx] += value;
         }
@@ -744,8 +712,7 @@ static int parse_columns_section(MpsParserState *state, char **tokens,
     return 0;
 }
 
-static int parse_rhs_section(MpsParserState *state, char **tokens,
-                             int n_tokens)
+static int parse_rhs_section(MpsParserState *state, char **tokens, int n_tokens)
 {
 
     for (int i = 1; i + 1 < n_tokens; i += 2)
@@ -753,8 +720,7 @@ static int parse_rhs_section(MpsParserState *state, char **tokens,
         const char *row_name = tokens[i];
         double value = atof(tokens[i + 1]);
 
-        if (state->objective_row_name &&
-            strcmp(row_name, state->objective_row_name) == 0)
+        if (state->objective_row_name && strcmp(row_name, state->objective_row_name) == 0)
         {
             state->objective_constant = -value;
         }
@@ -779,8 +745,7 @@ static int parse_rhs_section(MpsParserState *state, char **tokens,
     return 0;
 }
 
-static int parse_ranges_section(MpsParserState *state, char **tokens,
-                                int n_tokens)
+static int parse_ranges_section(MpsParserState *state, char **tokens, int n_tokens)
 {
 
     for (int i = 1; i + 1 < n_tokens; i += 2)
@@ -792,8 +757,8 @@ static int parse_ranges_section(MpsParserState *state, char **tokens,
         if (row_idx != -1)
         {
             char type = state->constraint_types[row_idx];
-            double rhs = (type == 'L') ? state->constraint_upper_bounds[row_idx]
-                                       : state->constraint_lower_bounds[row_idx];
+            double rhs =
+                (type == 'L') ? state->constraint_upper_bounds[row_idx] : state->constraint_lower_bounds[row_idx];
 
             if (type == 'G')
             {
@@ -819,8 +784,7 @@ static int parse_ranges_section(MpsParserState *state, char **tokens,
     return 0;
 }
 
-static int parse_bounds_section(MpsParserState *state, char **tokens,
-                                int n_tokens)
+static int parse_bounds_section(MpsParserState *state, char **tokens, int n_tokens)
 {
     if (n_tokens < 3)
         return 0;
@@ -868,12 +832,10 @@ static int parse_bounds_section(MpsParserState *state, char **tokens,
     return 0;
 }
 
-static int mps_coo_to_csr(lp_problem_t *prob, CooMatrix *coo,
-                          size_t num_constraints)
+static int mps_coo_to_csr(lp_problem_t *prob, CooMatrix *coo, size_t num_constraints)
 {
 
-    prob->constraint_matrix_row_pointers =
-        safe_calloc(num_constraints + 1, sizeof(int));
+    prob->constraint_matrix_row_pointers = safe_calloc(num_constraints + 1, sizeof(int));
     prob->constraint_matrix_col_indices = safe_malloc(coo->nnz * sizeof(int));
     prob->constraint_matrix_values = safe_malloc(coo->nnz * sizeof(double));
 
@@ -884,13 +846,11 @@ static int mps_coo_to_csr(lp_problem_t *prob, CooMatrix *coo,
 
     for (size_t i = 1; i <= num_constraints; ++i)
     {
-        prob->constraint_matrix_row_pointers[i] +=
-            prob->constraint_matrix_row_pointers[i - 1];
+        prob->constraint_matrix_row_pointers[i] += prob->constraint_matrix_row_pointers[i - 1];
     }
 
     int *row_pos = safe_malloc((num_constraints + 1) * sizeof(int));
-    memcpy(row_pos, prob->constraint_matrix_row_pointers,
-           (num_constraints + 1) * sizeof(int));
+    memcpy(row_pos, prob->constraint_matrix_row_pointers, (num_constraints + 1) * sizeof(int));
 
     for (size_t i = 0; i < coo->nnz; ++i)
     {
@@ -936,4 +896,4 @@ static void free_parser_state(MpsParserState *state)
     free(state->constraint_upper_bounds);
     free(state->objective_row_name);
     free(state->current_col_name);
-}
\ No newline at end of file
+}
diff --git a/src/preconditioner.cu b/src/preconditioner.cu
index 11fc23b..8883c8c 100644
--- a/src/preconditioner.cu
+++ b/src/preconditioner.cu
@@ -16,12 +16,12 @@ limitations under the License.
 
 #include "preconditioner.h"
 #include "utils.h"
-#include <time.h>
-#include <stdio.h>
-#include <math.h>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
 #include <cub/device/device_reduce.cuh>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdio.h>
+#include <time.h>
 
 #define SCALING_EPSILON 1e-12
 
@@ -59,80 +59,86 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors
                                             double *__restrict__ inverse_scaling_factors,
                                             double *__restrict__ cumulative_rescaling,
                                             int num_variables);
-__global__ void compute_bound_contrib_kernel(
-    const double *__restrict__ constraint_lower_bound,
-    const double *__restrict__ constraint_upper_bound,
-    int num_constraints,
-    double *__restrict__ contrib);
-__global__ void scale_bounds_kernel(
-    double *__restrict__ constraint_lower_bound,
-    double *__restrict__ constraint_upper_bound,
-    double *__restrict__ initial_dual_solution,
-    int num_constraints,
-    double constraint_scale,
-    double objective_scale);
-__global__ void scale_objective_kernel(
-    double *__restrict__ objective_vector,
-    double *__restrict__ variable_lower_bound,
-    double *__restrict__ variable_upper_bound,
-    double *__restrict__ initial_primal_solution,
-    int num_variables,
-    double constraint_scale,
-    double objective_scale);
+__global__ void compute_bound_contrib_kernel(const double *__restrict__ constraint_lower_bound,
+                                             const double *__restrict__ constraint_upper_bound,
+                                             int num_constraints,
+                                             double *__restrict__ contrib);
+__global__ void scale_bounds_kernel(double *__restrict__ constraint_lower_bound,
+                                    double *__restrict__ constraint_upper_bound,
+                                    double *__restrict__ initial_dual_solution,
+                                    int num_constraints,
+                                    double constraint_scale,
+                                    double objective_scale);
+__global__ void scale_objective_kernel(double *__restrict__ objective_vector,
+                                       double *__restrict__ variable_lower_bound,
+                                       double *__restrict__ variable_upper_bound,
+                                       double *__restrict__ initial_primal_solution,
+                                       int num_variables,
+                                       double constraint_scale,
+                                       double objective_scale);
 __global__ void fill_ones_kernel(double *__restrict__ x, int num_variables);
-static void scale_problem(pdhg_solver_state_t *state, double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling);
-static void ruiz_rescaling(pdhg_solver_state_t *state, int num_iters, rescale_info_t *rescale_info,
-                           double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling);
-static void pock_chambolle_rescaling(pdhg_solver_state_t *state, const double alpha, rescale_info_t *rescale_info,
-                                     double *constraint_rescaling, double *variable_rescaling, double *inverse_constraint_rescaling, double *inverse_variable_rescaling);
+static void scale_problem(pdhg_solver_state_t *state,
+                          double *constraint_rescaling,
+                          double *variable_rescaling,
+                          double *inverse_constraint_rescaling,
+                          double *inverse_variable_rescaling);
+static void ruiz_rescaling(pdhg_solver_state_t *state,
+                           int num_iters,
+                           rescale_info_t *rescale_info,
+                           double *constraint_rescaling,
+                           double *variable_rescaling,
+                           double *inverse_constraint_rescaling,
+                           double *inverse_variable_rescaling);
+static void pock_chambolle_rescaling(pdhg_solver_state_t *state,
+                                     const double alpha,
+                                     rescale_info_t *rescale_info,
+                                     double *constraint_rescaling,
+                                     double *variable_rescaling,
+                                     double *inverse_constraint_rescaling,
+                                     double *inverse_variable_rescaling);
 static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info);
 
-static void scale_problem(
-    pdhg_solver_state_t *state,
-    double *constraint_rescaling,
-    double *variable_rescaling,
-    double *inverse_constraint_rescaling,
-    double *inverse_variable_rescaling)
+static void scale_problem(pdhg_solver_state_t *state,
+                          double *constraint_rescaling,
+                          double *variable_rescaling,
+                          double *inverse_constraint_rescaling,
+                          double *inverse_variable_rescaling)
 {
     int num_variables = state->num_variables;
     int num_constraints = state->num_constraints;
 
-    scale_variables_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-        state->objective_vector,
-        state->variable_lower_bound,
-        state->variable_upper_bound,
-        state->initial_primal_solution,
-        variable_rescaling,
-        inverse_variable_rescaling,
-        num_variables);
-
-    scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        state->constraint_lower_bound,
-        state->constraint_upper_bound,
-        state->initial_dual_solution,
-        constraint_rescaling,
-        inverse_constraint_rescaling,
-        num_constraints);
-
-    scale_csr_nnz_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
-        state->constraint_matrix->row_ind,
-        state->constraint_matrix->col_ind,
-        state->constraint_matrix->val,
-        state->constraint_matrix_t->val,
-        state->constraint_matrix->transpose_map,
-        inverse_variable_rescaling,
-        inverse_constraint_rescaling,
-        state->constraint_matrix->num_nonzeros);
+    scale_variables_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(state->objective_vector,
+                                                                            state->variable_lower_bound,
+                                                                            state->variable_upper_bound,
+                                                                            state->initial_primal_solution,
+                                                                            variable_rescaling,
+                                                                            inverse_variable_rescaling,
+                                                                            num_variables);
+
+    scale_constraints_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(state->constraint_lower_bound,
+                                                                            state->constraint_upper_bound,
+                                                                            state->initial_dual_solution,
+                                                                            constraint_rescaling,
+                                                                            inverse_constraint_rescaling,
+                                                                            num_constraints);
+
+    scale_csr_nnz_kernel<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(state->constraint_matrix->row_ind,
+                                                                       state->constraint_matrix->col_ind,
+                                                                       state->constraint_matrix->val,
+                                                                       state->constraint_matrix_t->val,
+                                                                       state->constraint_matrix->transpose_map,
+                                                                       inverse_variable_rescaling,
+                                                                       inverse_constraint_rescaling,
+                                                                       state->constraint_matrix->num_nonzeros);
 }
 
-static void ruiz_rescaling(
-    pdhg_solver_state_t *state,
-    int num_iterations,
-    rescale_info_t *rescale_info,
-    double *constraint_rescaling,
-    double *variable_rescaling,
-    double *inverse_constraint_rescaling,
-    double *inverse_variable_rescaling)
+static void ruiz_rescaling(pdhg_solver_state_t *state,
+                           int num_iterations,
+                           rescale_info_t *rescale_info,
+                           double *constraint_rescaling,
+                           double *variable_rescaling,
+                           double *inverse_constraint_rescaling,
+                           double *inverse_variable_rescaling)
 {
     const int num_constraints = state->num_constraints;
     const int num_variables = state->num_variables;
@@ -140,73 +146,49 @@ static void ruiz_rescaling(
     for (int iter = 0; iter < num_iterations; ++iter)
     {
         compute_csr_row_absmax_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-            state->constraint_matrix->row_ptr,
-            state->constraint_matrix->val,
-            num_constraints,
-            constraint_rescaling);
+            state->constraint_matrix->row_ptr, state->constraint_matrix->val, num_constraints, constraint_rescaling);
         clamp_sqrt_and_accum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-            constraint_rescaling,
-            inverse_constraint_rescaling,
-            rescale_info->con_rescale,
-            num_constraints);
+            constraint_rescaling, inverse_constraint_rescaling, rescale_info->con_rescale, num_constraints);
 
         compute_csr_row_absmax_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-            state->constraint_matrix_t->row_ptr,
-            state->constraint_matrix_t->val,
-            num_variables,
-            variable_rescaling);
+            state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->val, num_variables, variable_rescaling);
         clamp_sqrt_and_accum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-            variable_rescaling,
-            inverse_variable_rescaling,
-            rescale_info->var_rescale,
-            num_variables);
+            variable_rescaling, inverse_variable_rescaling, rescale_info->var_rescale, num_variables);
 
-        scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+        scale_problem(
+            state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
     }
 }
 
-static void pock_chambolle_rescaling(
-    pdhg_solver_state_t *state,
-    const double alpha,
-    rescale_info_t *rescale_info,
-    double *constraint_rescaling,
-    double *variable_rescaling,
-    double *inverse_constraint_rescaling,
-    double *inverse_variable_rescaling)
+static void pock_chambolle_rescaling(pdhg_solver_state_t *state,
+                                     const double alpha,
+                                     rescale_info_t *rescale_info,
+                                     double *constraint_rescaling,
+                                     double *variable_rescaling,
+                                     double *inverse_constraint_rescaling,
+                                     double *inverse_variable_rescaling)
 {
     const int num_constraints = state->num_constraints;
     const int num_variables = state->num_variables;
 
     compute_csr_row_powsum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        state->constraint_matrix->row_ptr,
-        state->constraint_matrix->val,
-        num_constraints,
-        alpha,
-        constraint_rescaling);
+        state->constraint_matrix->row_ptr, state->constraint_matrix->val, num_constraints, alpha, constraint_rescaling);
     clamp_sqrt_and_accum_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        constraint_rescaling,
-        inverse_constraint_rescaling,
-        rescale_info->con_rescale,
-        num_constraints);
-
-    compute_csr_row_powsum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-        state->constraint_matrix_t->row_ptr,
-        state->constraint_matrix_t->val,
-        num_variables,
-        2.0 - alpha,
-        variable_rescaling);
+        constraint_rescaling, inverse_constraint_rescaling, rescale_info->con_rescale, num_constraints);
+
+    compute_csr_row_powsum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(state->constraint_matrix_t->row_ptr,
+                                                                                   state->constraint_matrix_t->val,
+                                                                                   num_variables,
+                                                                                   2.0 - alpha,
+                                                                                   variable_rescaling);
     clamp_sqrt_and_accum_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-        variable_rescaling,
-        inverse_variable_rescaling,
-        rescale_info->var_rescale,
-        num_variables);
+        variable_rescaling, inverse_variable_rescaling, rescale_info->var_rescale, num_variables);
 
-    scale_problem(state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+    scale_problem(
+        state, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
 }
 
-static void bound_objective_rescaling(
-    pdhg_solver_state_t *state,
-    rescale_info_t *rescale_info)
+static void bound_objective_rescaling(pdhg_solver_state_t *state, rescale_info_t *rescale_info)
 {
     const int num_constraints = state->num_constraints;
     const int num_variables = state->num_variables;
@@ -214,10 +196,7 @@ static void bound_objective_rescaling(
     double *contrib_d = nullptr;
     CUDA_CHECK(cudaMalloc(&contrib_d, num_constraints * sizeof(double)));
     compute_bound_contrib_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        state->constraint_lower_bound,
-        state->constraint_upper_bound,
-        num_constraints,
-        contrib_d);
+        state->constraint_lower_bound, state->constraint_upper_bound, num_constraints, contrib_d);
 
     double *bnd_norm_sq_d = nullptr;
     CUDA_CHECK(cudaMalloc(&bnd_norm_sq_d, sizeof(double)));
@@ -235,40 +214,34 @@ static void bound_objective_rescaling(
     double bnd_norm = sqrt(bnd_norm_sq_h);
 
     double obj_norm = 0.0;
-    CUBLAS_CHECK(cublasDnrm2(state->blas_handle,
-                             state->num_variables,
-                             state->objective_vector, 1,
-                             &obj_norm));
+    CUBLAS_CHECK(cublasDnrm2(state->blas_handle, state->num_variables, state->objective_vector, 1, &obj_norm));
 
     double constraint_scale = 1.0 / (bnd_norm + 1.0);
     double objective_scale = 1.0 / (obj_norm + 1.0);
 
-    scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        state->constraint_lower_bound,
-        state->constraint_upper_bound,
-        state->initial_dual_solution,
-        num_constraints,
-        constraint_scale,
-        objective_scale);
-
-    scale_objective_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-        state->objective_vector,
-        state->variable_lower_bound,
-        state->variable_upper_bound,
-        state->initial_primal_solution,
-        num_variables,
-        constraint_scale,
-        objective_scale);
+    scale_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(state->constraint_lower_bound,
+                                                                       state->constraint_upper_bound,
+                                                                       state->initial_dual_solution,
+                                                                       num_constraints,
+                                                                       constraint_scale,
+                                                                       objective_scale);
+
+    scale_objective_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(state->objective_vector,
+                                                                            state->variable_lower_bound,
+                                                                            state->variable_upper_bound,
+                                                                            state->initial_primal_solution,
+                                                                            num_variables,
+                                                                            constraint_scale,
+                                                                            objective_scale);
 
     rescale_info->con_bound_rescale = constraint_scale;
     rescale_info->obj_vec_rescale = objective_scale;
 }
 
-rescale_info_t *rescale_problem(
-    const pdhg_parameters_t *params,
-    pdhg_solver_state_t *state)
+rescale_info_t *rescale_problem(const pdhg_parameters_t *params, pdhg_solver_state_t *state)
 {
-    if (params->verbose) {
+    if (params->verbose)
+    {
         printf("\nPreconditioning\n");
     }
 
@@ -279,12 +252,11 @@ rescale_info_t *rescale_problem(
     rescale_info_t *rescale_info = (rescale_info_t *)safe_calloc(1, sizeof(rescale_info_t));
     CUDA_CHECK(cudaMalloc(&rescale_info->con_rescale, num_constraints * sizeof(double)));
     CUDA_CHECK(cudaMalloc(&rescale_info->var_rescale, num_variables * sizeof(double)));
-    fill_ones_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        rescale_info->con_rescale, num_constraints);
-    fill_ones_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-        rescale_info->var_rescale, num_variables);
+    fill_ones_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(rescale_info->con_rescale, num_constraints);
+    fill_ones_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(rescale_info->var_rescale, num_variables);
 
-    double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL, *inverse_variable_rescaling = NULL;
+    double *constraint_rescaling = NULL, *variable_rescaling = NULL, *inverse_constraint_rescaling = NULL,
+           *inverse_variable_rescaling = NULL;
     CUDA_CHECK(cudaMalloc(&constraint_rescaling, num_constraints * sizeof(double)));
     CUDA_CHECK(cudaMalloc(&variable_rescaling, num_variables * sizeof(double)));
     CUDA_CHECK(cudaMalloc(&inverse_constraint_rescaling, num_constraints * sizeof(double)));
@@ -292,24 +264,39 @@ rescale_info_t *rescale_problem(
 
     if (params->l_inf_ruiz_iterations > 0)
     {
-        if (params->verbose) {
+        if (params->verbose)
+        {
             printf("  Ruiz scaling (%d iterations)\n", params->l_inf_ruiz_iterations);
         }
-        ruiz_rescaling(state, params->l_inf_ruiz_iterations, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+        ruiz_rescaling(state,
+                       params->l_inf_ruiz_iterations,
+                       rescale_info,
+                       constraint_rescaling,
+                       variable_rescaling,
+                       inverse_constraint_rescaling,
+                       inverse_variable_rescaling);
     }
     if (params->has_pock_chambolle_alpha)
     {
-        if (params->verbose) {
+        if (params->verbose)
+        {
             printf("  Pock-Chambolle scaling (alpha=%.4f)\n", params->pock_chambolle_alpha);
         }
-        pock_chambolle_rescaling(state, params->pock_chambolle_alpha, rescale_info, constraint_rescaling, variable_rescaling, inverse_constraint_rescaling, inverse_variable_rescaling);
+        pock_chambolle_rescaling(state,
+                                 params->pock_chambolle_alpha,
+                                 rescale_info,
+                                 constraint_rescaling,
+                                 variable_rescaling,
+                                 inverse_constraint_rescaling,
+                                 inverse_variable_rescaling);
     }
 
     rescale_info->con_bound_rescale = 1.0;
     rescale_info->obj_vec_rescale = 1.0;
     if (params->bound_objective_rescaling)
     {
-        if (params->verbose) {
+        if (params->verbose)
+        {
             printf("  Bound-objective scaling\n");
         }
         bound_objective_rescaling(state, rescale_info);
@@ -370,9 +357,7 @@ __global__ void scale_csr_nnz_kernel(const int *__restrict__ constraint_row_ind,
                                      const double *__restrict__ inverse_constraint_rescaling,
                                      int nnz)
 {
-    for (int k = blockIdx.x * blockDim.x + threadIdx.x;
-         k < nnz;
-         k += gridDim.x * blockDim.x)
+    for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += gridDim.x * blockDim.x)
     {
         int i = constraint_row_ind[k];
         int j = constraint_col_ind[k];
@@ -450,14 +435,14 @@ __global__ void clamp_sqrt_and_accum_kernel(double *__restrict__ scaling_factors
     }
 }
 
-__global__ void compute_bound_contrib_kernel(
-    const double *__restrict__ constraint_lower_bound,
-    const double *__restrict__ constraint_upper_bound,
-    int num_constraints,
-    double *__restrict__ contrib)
+__global__ void compute_bound_contrib_kernel(const double *__restrict__ constraint_lower_bound,
+                                             const double *__restrict__ constraint_upper_bound,
+                                             int num_constraints,
+                                             double *__restrict__ contrib)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= num_constraints) return;
+    if (i >= num_constraints)
+        return;
 
     double Li = constraint_lower_bound[i];
     double Ui = constraint_upper_bound[i];
@@ -475,13 +460,12 @@ __global__ void compute_bound_contrib_kernel(
     contrib[i] = acc;
 }
 
-__global__ void scale_bounds_kernel(
-    double *__restrict__ constraint_lower_bound,
-    double *__restrict__ constraint_upper_bound,
-    double *__restrict__ initial_dual_solution,
-    int num_constraints,
-    double constraint_scale,
-    double objective_scale)
+__global__ void scale_bounds_kernel(double *__restrict__ constraint_lower_bound,
+                                    double *__restrict__ constraint_upper_bound,
+                                    double *__restrict__ initial_dual_solution,
+                                    int num_constraints,
+                                    double constraint_scale,
+                                    double objective_scale)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= num_constraints)
@@ -491,14 +475,13 @@ __global__ void scale_bounds_kernel(
     initial_dual_solution[i] *= objective_scale;
 }
 
-__global__ void scale_objective_kernel(
-    double *__restrict__ objective_vector,
-    double *__restrict__ variable_lower_bound,
-    double *__restrict__ variable_upper_bound,
-    double *__restrict__ initial_primal_solution,
-    int num_variables,
-    double constraint_scale,
-    double objective_scale)
+__global__ void scale_objective_kernel(double *__restrict__ objective_vector,
+                                       double *__restrict__ variable_lower_bound,
+                                       double *__restrict__ variable_upper_bound,
+                                       double *__restrict__ initial_primal_solution,
+                                       int num_variables,
+                                       double constraint_scale,
+                                       double objective_scale)
 {
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     if (j >= num_variables)
diff --git a/src/presolve.c b/src/presolve.c
index f5933dc..c2f788f 100644
--- a/src/presolve.c
+++ b/src/presolve.c
@@ -17,16 +17,16 @@ const char *get_presolve_status_str(enum PresolveStatus_ status)
 {
     switch (status)
     {
-    case UNCHANGED:
-        return "UNCHANGED";
-    case REDUCED:
-        return "REDUCED";
-    case INFEASIBLE:
-        return "INFEASIBLE";
-    case UNBNDORINFEAS:
-        return "INFEASIBLE_OR_UNBOUNDED";
-    default:
-        return "UNKNOWN_STATUS";
+        case UNCHANGED:
+            return "UNCHANGED";
+        case REDUCED:
+            return "REDUCED";
+        case INFEASIBLE:
+            return "INFEASIBLE";
+        case UNBNDORINFEAS:
+            return "INFEASIBLE_OR_UNBOUNDED";
+        default:
+            return "UNKNOWN_STATUS";
     }
 }
 
@@ -77,19 +77,18 @@ cupdlpx_presolve_info_t *pslp_presolve(const lp_problem_t *original_prob, const
     info->settings->verbose = false;
 
     // 2. Init Presolver
-    info->presolver = new_presolver(
-        original_prob->constraint_matrix_values,
-        original_prob->constraint_matrix_col_indices,
-        original_prob->constraint_matrix_row_pointers,
-        original_prob->num_constraints,
-        original_prob->num_variables,
-        original_prob->constraint_matrix_num_nonzeros,
-        original_prob->constraint_lower_bound,
-        original_prob->constraint_upper_bound,
-        original_prob->variable_lower_bound,
-        original_prob->variable_upper_bound,
-        original_prob->objective_vector,
-        info->settings);
+    info->presolver = new_presolver(original_prob->constraint_matrix_values,
+                                    original_prob->constraint_matrix_col_indices,
+                                    original_prob->constraint_matrix_row_pointers,
+                                    original_prob->num_constraints,
+                                    original_prob->num_variables,
+                                    original_prob->constraint_matrix_num_nonzeros,
+                                    original_prob->constraint_lower_bound,
+                                    original_prob->constraint_upper_bound,
+                                    original_prob->variable_lower_bound,
+                                    original_prob->variable_upper_bound,
+                                    original_prob->objective_vector,
+                                    info->settings);
 
     // 3. Run Presolve
     PresolveStatus status = run_presolver(info->presolver);
@@ -101,10 +100,10 @@ cupdlpx_presolve_info_t *pslp_presolve(const lp_problem_t *original_prob, const
         printf("  %-15s : %s\n", "status", get_presolve_status_str(status));
         printf("  %-15s : %.3g sec\n", "presolve time", info->presolve_time);
         printf("  %-15s : %d rows, %d columns, %d nonzeros\n",
-                "reduced problem",
-                info->presolver->reduced_prob->m,
-                info->presolver->reduced_prob->n,
-                info->presolver->reduced_prob->nnz);
+               "reduced problem",
+               info->presolver->reduced_prob->m,
+               info->presolver->reduced_prob->n,
+               info->presolver->reduced_prob->nnz);
     }
 
     if (status & INFEASIBLE || status & UNBNDORINFEAS || info->presolver->reduced_prob->n == 0)
@@ -180,14 +179,9 @@ cupdlpx_result_t *create_result_from_presolve(const cupdlpx_presolve_info_t *inf
     return result;
 }
 
-void pslp_postsolve(const cupdlpx_presolve_info_t *info,
-                    cupdlpx_result_t *result,
-                    const lp_problem_t *original_prob)
+void pslp_postsolve(const cupdlpx_presolve_info_t *info, cupdlpx_result_t *result, const lp_problem_t *original_prob)
 {
-    postsolve(info->presolver,
-              result->primal_solution,
-              result->dual_solution,
-              result->reduced_cost);
+    postsolve(info->presolver, result->primal_solution, result->dual_solution, result->reduced_cost);
 
     result->num_reduced_variables = info->presolver->reduced_prob->n;
     result->num_reduced_constraints = info->presolver->reduced_prob->m;
@@ -202,7 +196,7 @@ void pslp_postsolve(const cupdlpx_presolve_info_t *info,
     memcpy(result->dual_solution, info->presolver->sol->y, original_prob->num_constraints * sizeof(double));
     memcpy(result->reduced_cost, info->presolver->sol->z, original_prob->num_variables * sizeof(double));
     // TODO: this can be removed if PSLP implements it.
-    for (int i=0; i <original_prob->num_variables; i++)
+    for (int i = 0; i < original_prob->num_variables; i++)
     {
         if (!isfinite(original_prob->variable_lower_bound[i]))
         {
@@ -241,4 +235,4 @@ void cupdlpx_presolve_info_free(cupdlpx_presolve_info_t *info)
     if (info->settings)
         free_settings(info->settings);
     free(info);
-}
\ No newline at end of file
+}
diff --git a/src/solver.cu b/src/solver.cu
index 6cb9648..896f19d 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -29,9 +29,7 @@ limitations under the License.
 #include <stdlib.h>
 #include <time.h>
 
-__global__ void build_row_ind(const int *__restrict__ row_ptr,
-                              int num_rows,
-                              int *__restrict__ row_ind);
+__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, int *__restrict__ row_ind);
 __global__ void build_transpose_map(const int *__restrict__ A_row_ind,
                                     const int *__restrict__ A_col_ind,
                                     const int *__restrict__ At_row_ptr,
@@ -49,82 +47,101 @@ __global__ void rescale_solution_kernel(double *__restrict__ primal_solution,
                                         const double *__restrict__ constraint_rescaling,
                                         const double objective_vector_rescaling,
                                         const double constraint_bound_rescaling,
-                                        int n_vars, int n_cons);
-__global__ void compute_delta_solution_kernel(
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ pdhg_primal, double *__restrict__ delta_primal,
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ pdhg_dual, double *__restrict__ delta_dual,
-    int n_vars, int n_cons);
+                                        int n_vars,
+                                        int n_cons);
+__global__ void compute_delta_solution_kernel(const double *__restrict__ initial_primal,
+                                              const double *__restrict__ pdhg_primal,
+                                              double *__restrict__ delta_primal,
+                                              const double *__restrict__ initial_dual,
+                                              const double *__restrict__ pdhg_dual,
+                                              double *__restrict__ delta_dual,
+                                              int n_vars,
+                                              int n_cons);
 static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, const lp_problem_t *original_problem);
 static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params);
 static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params);
-static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_problem, const pdhg_parameters_t* params);
+static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_problem,
+                                                    const pdhg_parameters_t *params);
 static void compute_fixed_point_error(pdhg_solver_state_t *state);
 void pdhg_solver_state_free(pdhg_solver_state_t *state);
 void rescale_info_free(rescale_info_t *info);
 
 static void perform_primal_restart(pdhg_solver_state_t *state);
 static void perform_dual_restart(pdhg_solver_state_t *state);
-void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state);
-void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state);
+void primal_feasibility_polish(const pdhg_parameters_t *params,
+                               pdhg_solver_state_t *state,
+                               const pdhg_solver_state_t *ori_state);
+void dual_feasibility_polish(const pdhg_parameters_t *params,
+                             pdhg_solver_state_t *state,
+                             const pdhg_solver_state_t *ori_state);
 void primal_feas_polish_state_free(pdhg_solver_state_t *state);
 void dual_feas_polish_state_free(pdhg_solver_state_t *state);
 void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state);
 static void compute_primal_fixed_point_error(pdhg_solver_state_t *state);
 static void compute_dual_fixed_point_error(pdhg_solver_state_t *state);
-static pdhg_solver_state_t *initialize_primal_feas_polish_state(
-    const pdhg_solver_state_t *original_state);
-static pdhg_solver_state_t *initialize_dual_feas_polish_state(
-    const pdhg_solver_state_t *original_state);
-__global__ void compute_next_primal_solution_kernel(
-    double *__restrict__ current_primal,
-    double *__restrict__ reflected_primal,
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ objective,
-    const double *__restrict__ var_lb, const double *__restrict__ var_ub,
-    int n, const double *__restrict__ d_step_size,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff);
-__global__ void compute_next_primal_solution_major_kernel(
-    double *__restrict__ current_primal, double *__restrict__ pdhg_primal,
-    double *__restrict__ reflected_primal,
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ objective,
-    const double *__restrict__ var_lb, const double *__restrict__ var_ub,
-    int n, const double *__restrict__ d_step_size,
-    double *__restrict__ dual_slack,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff);
-__global__ void compute_next_dual_solution_kernel(
-    double *__restrict__ current_dual,
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ primal_product,
-    const double *__restrict__ const_lb,
-    const double *__restrict__ const_ub, int n,
-    const double *__restrict__ d_step_size,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff);
-__global__ void compute_next_dual_solution_major_kernel(
-    double *__restrict__ current_dual, double *__restrict__ pdhg_dual,
-    double *__restrict__ reflected_dual,
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ primal_product,
-    const double *__restrict__ const_lb,
-    const double *__restrict__ const_ub, int n,
-    const double *__restrict__ d_step_size,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff);
-static void compute_next_primal_solution(pdhg_solver_state_t *state, const int k_offset, const double reflection_coefficient, bool is_major);
-static void compute_next_dual_solution(pdhg_solver_state_t *state,const int k_offset, const double reflection_coefficient, bool is_major);
+static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solver_state_t *original_state);
+static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_state_t *original_state);
+__global__ void compute_next_primal_solution_kernel(double *__restrict__ current_primal,
+                                                    double *__restrict__ reflected_primal,
+                                                    const double *__restrict__ initial_primal,
+                                                    const double *__restrict__ dual_product,
+                                                    const double *__restrict__ objective,
+                                                    const double *__restrict__ var_lb,
+                                                    const double *__restrict__ var_ub,
+                                                    int n,
+                                                    const double *__restrict__ d_step_size,
+                                                    const int *__restrict__ d_base_count,
+                                                    int k_offset,
+                                                    double reflection_coeff);
+__global__ void compute_next_primal_solution_major_kernel(double *__restrict__ current_primal,
+                                                          double *__restrict__ pdhg_primal,
+                                                          double *__restrict__ reflected_primal,
+                                                          const double *__restrict__ initial_primal,
+                                                          const double *__restrict__ dual_product,
+                                                          const double *__restrict__ objective,
+                                                          const double *__restrict__ var_lb,
+                                                          const double *__restrict__ var_ub,
+                                                          int n,
+                                                          const double *__restrict__ d_step_size,
+                                                          double *__restrict__ dual_slack,
+                                                          const int *__restrict__ d_base_count,
+                                                          int k_offset,
+                                                          double reflection_coeff);
+__global__ void compute_next_dual_solution_kernel(double *__restrict__ current_dual,
+                                                  const double *__restrict__ initial_dual,
+                                                  const double *__restrict__ primal_product,
+                                                  const double *__restrict__ const_lb,
+                                                  const double *__restrict__ const_ub,
+                                                  int n,
+                                                  const double *__restrict__ d_step_size,
+                                                  const int *__restrict__ d_base_count,
+                                                  int k_offset,
+                                                  double reflection_coeff);
+__global__ void compute_next_dual_solution_major_kernel(double *__restrict__ current_dual,
+                                                        double *__restrict__ pdhg_dual,
+                                                        double *__restrict__ reflected_dual,
+                                                        const double *__restrict__ initial_dual,
+                                                        const double *__restrict__ primal_product,
+                                                        const double *__restrict__ const_lb,
+                                                        const double *__restrict__ const_ub,
+                                                        int n,
+                                                        const double *__restrict__ d_step_size,
+                                                        const int *__restrict__ d_base_count,
+                                                        int k_offset,
+                                                        double reflection_coeff);
+static void compute_next_primal_solution(pdhg_solver_state_t *state,
+                                         const int k_offset,
+                                         const double reflection_coefficient,
+                                         bool is_major);
+static void compute_next_dual_solution(pdhg_solver_state_t *state,
+                                       const int k_offset,
+                                       const double reflection_coefficient,
+                                       bool is_major);
 static void sync_step_sizes_to_gpu(pdhg_solver_state_t *state);
 static void sync_inner_count_to_gpu(pdhg_solver_state_t *state);
 static void check_params_validity(const pdhg_parameters_t *params);
 
-cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
-                           lp_problem_t *original_problem)
+cupdlpx_result_t *optimize(const pdhg_parameters_t *params, lp_problem_t *original_problem)
 {
     check_params_validity(params);
     print_initial_info(params, original_problem);
@@ -144,7 +161,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
         }
         working_problem = presolve_info->reduced_problem;
     }
-    
+
     pdhg_solver_state_t *state = initialize_solver_state(working_problem, params);
     display_iteration_stats(state, params->verbose);
 
@@ -181,8 +198,10 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
                 compute_next_dual_solution(state, i, params->reflection_coefficient, false);
             }
 
-            compute_next_primal_solution(state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
-            compute_next_dual_solution(state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
+            compute_next_primal_solution(
+                state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
+            compute_next_dual_solution(
+                state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
             // end CUDA graph capture
 
             cudaGraph_t graph;
@@ -212,8 +231,8 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
         }
 
         // Check Adaptive Restart
-        do_restart = should_do_adaptive_restart(state, &params->restart_params,
-                                                params->termination_evaluation_frequency);
+        do_restart =
+            should_do_adaptive_restart(state, &params->restart_params, params->termination_evaluation_frequency);
         if (do_restart)
         {
             perform_restart(state, params);
@@ -233,8 +252,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
         display_iteration_stats(state, params->verbose);
     }
 
-    if (params->feasibility_polishing &&
-        state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE &&
+    if (params->feasibility_polishing && state->termination_reason != TERMINATION_REASON_DUAL_INFEASIBLE &&
         state->termination_reason != TERMINATION_REASON_PRIMAL_INFEASIBLE)
     {
         feasibility_polish(params, state);
@@ -259,16 +277,16 @@ static void sync_step_sizes_to_gpu(pdhg_solver_state_t *state)
     double current_primal_step = state->step_size / state->primal_weight;
     double current_dual_step = state->step_size * state->primal_weight;
 
-    CUDA_CHECK(cudaMemcpyAsync(state->d_primal_step_size, &current_primal_step,
-                               sizeof(double), cudaMemcpyHostToDevice, state->stream));
-    CUDA_CHECK(cudaMemcpyAsync(state->d_dual_step_size, &current_dual_step,
-                               sizeof(double), cudaMemcpyHostToDevice, state->stream));
+    CUDA_CHECK(cudaMemcpyAsync(
+        state->d_primal_step_size, &current_primal_step, sizeof(double), cudaMemcpyHostToDevice, state->stream));
+    CUDA_CHECK(cudaMemcpyAsync(
+        state->d_dual_step_size, &current_dual_step, sizeof(double), cudaMemcpyHostToDevice, state->stream));
 }
 
 static void sync_inner_count_to_gpu(pdhg_solver_state_t *state)
 {
-    CUDA_CHECK(cudaMemcpyAsync(state->d_inner_count, &state->inner_count,
-                               sizeof(int), cudaMemcpyHostToDevice, state->stream));
+    CUDA_CHECK(
+        cudaMemcpyAsync(state->d_inner_count, &state->inner_count, sizeof(int), cudaMemcpyHostToDevice, state->stream));
 }
 
 static void check_params_validity(const pdhg_parameters_t *params)
@@ -282,16 +300,15 @@ static void check_params_validity(const pdhg_parameters_t *params)
     }
 }
 
-__global__ void compute_and_rescale_reduced_cost_kernel(
-    double *__restrict__ reduced_cost,
-    const double *__restrict__ objective,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ variable_rescaling,
-    const double objective_vector_rescaling,
-    const double constraint_bound_rescaling,
-    const double *__restrict__ variable_lower_bound,
-    const double *__restrict__ variable_upper_bound,
-    int n_vars)
+__global__ void compute_and_rescale_reduced_cost_kernel(double *__restrict__ reduced_cost,
+                                                        const double *__restrict__ objective,
+                                                        const double *__restrict__ dual_product,
+                                                        const double *__restrict__ variable_rescaling,
+                                                        const double objective_vector_rescaling,
+                                                        const double constraint_bound_rescaling,
+                                                        const double *__restrict__ variable_lower_bound,
+                                                        const double *__restrict__ variable_upper_bound,
+                                                        int n_vars)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_vars)
@@ -310,9 +327,8 @@ __global__ void compute_and_rescale_reduced_cost_kernel(
     }
 }
 
-static pdhg_solver_state_t *initialize_solver_state(
-    const lp_problem_t *working_problem,
-    const pdhg_parameters_t* params)
+static pdhg_solver_state_t *initialize_solver_state(const lp_problem_t *working_problem,
+                                                    const pdhg_parameters_t *params)
 {
     pdhg_solver_state_t *state = (pdhg_solver_state_t *)safe_calloc(1, sizeof(pdhg_solver_state_t));
 
@@ -341,20 +357,26 @@ static pdhg_solver_state_t *initialize_solver_state(
 
     state->num_blocks_primal = (state->num_variables + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     state->num_blocks_dual = (state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    state->num_blocks_primal_dual = (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    state->num_blocks_primal_dual =
+        (state->num_variables + state->num_constraints + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     state->num_blocks_nnz = (state->constraint_matrix->num_nonzeros + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
 
     CUSPARSE_CHECK(cusparseCreate(&state->sparse_handle));
     CUBLAS_CHECK(cublasCreate(&state->blas_handle));
     CUBLAS_CHECK(cublasSetPointerMode(state->blas_handle, CUBLAS_POINTER_MODE_HOST));
 
-#define ALLOC_AND_COPY(dest, src, bytes)  \
-    CUDA_CHECK(cudaMalloc(&dest, bytes)); \
+#define ALLOC_AND_COPY(dest, src, bytes)                                                                               \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));                                                                              \
     CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice));
 
-    ALLOC_AND_COPY(state->constraint_matrix->row_ptr, working_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int));
-    ALLOC_AND_COPY(state->constraint_matrix->col_ind, working_problem->constraint_matrix_col_indices, working_problem->constraint_matrix_num_nonzeros * sizeof(int));
-    ALLOC_AND_COPY(state->constraint_matrix->val, working_problem->constraint_matrix_values, working_problem->constraint_matrix_num_nonzeros * sizeof(double));
+    ALLOC_AND_COPY(
+        state->constraint_matrix->row_ptr, working_problem->constraint_matrix_row_pointers, (n_cons + 1) * sizeof(int));
+    ALLOC_AND_COPY(state->constraint_matrix->col_ind,
+                   working_problem->constraint_matrix_col_indices,
+                   working_problem->constraint_matrix_num_nonzeros * sizeof(int));
+    ALLOC_AND_COPY(state->constraint_matrix->val,
+                   working_problem->constraint_matrix_values,
+                   working_problem->constraint_matrix_num_nonzeros * sizeof(double));
 
     CUDA_CHECK(cudaMalloc(&state->constraint_matrix->row_ind, nnz * sizeof(int)));
     build_row_ind<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
@@ -362,25 +384,45 @@ static pdhg_solver_state_t *initialize_solver_state(
     CUDA_CHECK(cudaGetLastError());
 
     CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->row_ptr, (n_vars + 1) * sizeof(int)));
-    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind, working_problem->constraint_matrix_num_nonzeros * sizeof(int)));
-    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->val, working_problem->constraint_matrix_num_nonzeros * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&state->constraint_matrix_t->col_ind,
+                          working_problem->constraint_matrix_num_nonzeros * sizeof(int)));
+    CUDA_CHECK(
+        cudaMalloc(&state->constraint_matrix_t->val, working_problem->constraint_matrix_num_nonzeros * sizeof(double)));
 
     size_t buffer_size = 0;
     void *buffer = nullptr;
-    CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize(
-        state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros,
-        state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind,
-        state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind,
-        CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
-        CUSPARSE_CSR2CSC_ALG_DEFAULT, &buffer_size));
+    CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize(state->sparse_handle,
+                                                 state->constraint_matrix->num_rows,
+                                                 state->constraint_matrix->num_cols,
+                                                 state->constraint_matrix->num_nonzeros,
+                                                 state->constraint_matrix->val,
+                                                 state->constraint_matrix->row_ptr,
+                                                 state->constraint_matrix->col_ind,
+                                                 state->constraint_matrix_t->val,
+                                                 state->constraint_matrix_t->row_ptr,
+                                                 state->constraint_matrix_t->col_ind,
+                                                 CUDA_R_64F,
+                                                 CUSPARSE_ACTION_NUMERIC,
+                                                 CUSPARSE_INDEX_BASE_ZERO,
+                                                 CUSPARSE_CSR2CSC_ALG_DEFAULT,
+                                                 &buffer_size));
     CUDA_CHECK(cudaMalloc(&buffer, buffer_size));
 
-    CUSPARSE_CHECK(cusparseCsr2cscEx2(
-        state->sparse_handle, state->constraint_matrix->num_rows, state->constraint_matrix->num_cols, state->constraint_matrix->num_nonzeros,
-        state->constraint_matrix->val, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind,
-        state->constraint_matrix_t->val, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind,
-        CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
-        CUSPARSE_CSR2CSC_ALG_DEFAULT, buffer));
+    CUSPARSE_CHECK(cusparseCsr2cscEx2(state->sparse_handle,
+                                      state->constraint_matrix->num_rows,
+                                      state->constraint_matrix->num_cols,
+                                      state->constraint_matrix->num_nonzeros,
+                                      state->constraint_matrix->val,
+                                      state->constraint_matrix->row_ptr,
+                                      state->constraint_matrix->col_ind,
+                                      state->constraint_matrix_t->val,
+                                      state->constraint_matrix_t->row_ptr,
+                                      state->constraint_matrix_t->col_ind,
+                                      CUDA_R_64F,
+                                      CUSPARSE_ACTION_NUMERIC,
+                                      CUSPARSE_INDEX_BASE_ZERO,
+                                      CUSPARSE_CSR2CSC_ALG_DEFAULT,
+                                      buffer));
 
     CUDA_CHECK(cudaFree(buffer));
 
@@ -391,13 +433,12 @@ static pdhg_solver_state_t *initialize_solver_state(
 
     CUDA_CHECK(cudaMalloc(&state->constraint_matrix->transpose_map, nnz * sizeof(int)));
     state->constraint_matrix_t->transpose_map = NULL;
-    build_transpose_map<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(
-        state->constraint_matrix->row_ind,
-        state->constraint_matrix->col_ind,
-        state->constraint_matrix_t->row_ptr,
-        state->constraint_matrix_t->col_ind,
-        nnz,
-        state->constraint_matrix->transpose_map);
+    build_transpose_map<<<state->num_blocks_nnz, THREADS_PER_BLOCK>>>(state->constraint_matrix->row_ind,
+                                                                      state->constraint_matrix->col_ind,
+                                                                      state->constraint_matrix_t->row_ptr,
+                                                                      state->constraint_matrix_t->col_ind,
+                                                                      nnz,
+                                                                      state->constraint_matrix->transpose_map);
     CUDA_CHECK(cudaGetLastError());
 
     ALLOC_AND_COPY(state->variable_lower_bound, working_problem->variable_lower_bound, var_bytes);
@@ -405,9 +446,9 @@ static pdhg_solver_state_t *initialize_solver_state(
     ALLOC_AND_COPY(state->objective_vector, working_problem->objective_vector, var_bytes);
     ALLOC_AND_COPY(state->constraint_lower_bound, working_problem->constraint_lower_bound, con_bytes);
     ALLOC_AND_COPY(state->constraint_upper_bound, working_problem->constraint_upper_bound, con_bytes);
-    
-#define ALLOC_ZERO(dest, bytes)           \
-    CUDA_CHECK(cudaMalloc(&dest, bytes)); \
+
+#define ALLOC_ZERO(dest, bytes)                                                                                        \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));                                                                              \
     CUDA_CHECK(cudaMemset(dest, 0, bytes));
 
     ALLOC_ZERO(state->initial_primal_solution, var_bytes);
@@ -430,15 +471,13 @@ static pdhg_solver_state_t *initialize_solver_state(
 
     if (working_problem->primal_start)
     {
-    CUDA_CHECK(cudaMemcpy(state->initial_primal_solution,
-                          working_problem->primal_start,
-                          var_bytes, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(
+            state->initial_primal_solution, working_problem->primal_start, var_bytes, cudaMemcpyHostToDevice));
     }
     if (working_problem->dual_start)
     {
-    CUDA_CHECK(cudaMemcpy(state->initial_dual_solution,
-                          working_problem->dual_start,
-                          con_bytes, cudaMemcpyHostToDevice));
+        CUDA_CHECK(
+            cudaMemcpy(state->initial_dual_solution, working_problem->dual_start, con_bytes, cudaMemcpyHostToDevice));
     }
 
     rescale_info_t *rescale_info = rescale_problem(params, state);
@@ -453,37 +492,31 @@ static pdhg_solver_state_t *initialize_solver_state(
     rescale_info->var_rescale = NULL;
     rescale_info_free(rescale_info);
 
-    CUDA_CHECK(cudaMemcpy(state->current_primal_solution,
-                          state->initial_primal_solution,
-                          var_bytes, cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->current_dual_solution,
-                          state->initial_dual_solution,
-                          con_bytes, cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->pdhg_primal_solution,
-                          state->initial_primal_solution,
-                          var_bytes, cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->pdhg_dual_solution,
-                          state->initial_dual_solution,
-                          con_bytes, cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(
+        state->current_primal_solution, state->initial_primal_solution, var_bytes, cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(state->current_dual_solution, state->initial_dual_solution, con_bytes, cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(state->pdhg_primal_solution, state->initial_primal_solution, var_bytes, cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(state->pdhg_dual_solution, state->initial_dual_solution, con_bytes, cudaMemcpyDeviceToDevice));
 
     CUDA_CHECK(cudaMalloc(&state->constraint_lower_bound_finite_val, con_bytes));
     CUDA_CHECK(cudaMalloc(&state->constraint_upper_bound_finite_val, con_bytes));
     CUDA_CHECK(cudaMalloc(&state->variable_lower_bound_finite_val, var_bytes));
     CUDA_CHECK(cudaMalloc(&state->variable_upper_bound_finite_val, var_bytes));
 
-    fill_finite_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
-        state->constraint_lower_bound,
-        state->constraint_upper_bound,
-        state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val,
-        n_cons);
+    fill_finite_bounds_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(state->constraint_lower_bound,
+                                                                             state->constraint_upper_bound,
+                                                                             state->constraint_lower_bound_finite_val,
+                                                                             state->constraint_upper_bound_finite_val,
+                                                                             n_cons);
 
-    fill_finite_bounds_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
-        state->variable_lower_bound,
-        state->variable_upper_bound,
-        state->variable_lower_bound_finite_val,
-        state->variable_upper_bound_finite_val,
-        n_vars);
+    fill_finite_bounds_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(state->variable_lower_bound,
+                                                                               state->variable_upper_bound,
+                                                                               state->variable_lower_bound_finite_val,
+                                                                               state->variable_upper_bound_finite_val,
+                                                                               n_vars);
 
     CUDA_CHECK(cudaFree(state->constraint_matrix->row_ind));
     state->constraint_matrix->row_ind = NULL;
@@ -497,17 +530,24 @@ static pdhg_solver_state_t *initialize_solver_state(
     double val = 0.0;
     for (int i = 0; i < n_vars; ++i)
     {
-        if (params->optimality_norm == NORM_TYPE_L_INF) {
+        if (params->optimality_norm == NORM_TYPE_L_INF)
+        {
             val = fabs(working_problem->objective_vector[i]);
-            if (val > max_val) max_val = val;
-        } else {
+            if (val > max_val)
+                max_val = val;
+        }
+        else
+        {
             sum_of_squares += working_problem->objective_vector[i] * working_problem->objective_vector[i];
         }
     }
 
-    if (params->optimality_norm == NORM_TYPE_L_INF) {
+    if (params->optimality_norm == NORM_TYPE_L_INF)
+    {
         state->objective_vector_norm = max_val;
-    } else {
+    }
+    else
+    {
         state->objective_vector_norm = sqrt(sum_of_squares);
     }
 
@@ -519,27 +559,39 @@ static pdhg_solver_state_t *initialize_solver_state(
         double lower = working_problem->constraint_lower_bound[i];
         double upper = working_problem->constraint_upper_bound[i];
 
-        if (params->optimality_norm == NORM_TYPE_L_INF) {
-            if (isfinite(lower) && (lower != upper)) {
+        if (params->optimality_norm == NORM_TYPE_L_INF)
+        {
+            if (isfinite(lower) && (lower != upper))
+            {
                 val = fabs(lower);
-                if (val > max_val) max_val = val;
+                if (val > max_val)
+                    max_val = val;
             }
-            if (isfinite(upper)) {
+            if (isfinite(upper))
+            {
                 val = fabs(upper);
-                if (val > max_val) max_val = val;
+                if (val > max_val)
+                    max_val = val;
             }
-        } else {
-            if (isfinite(lower) && (lower != upper)) {
+        }
+        else
+        {
+            if (isfinite(lower) && (lower != upper))
+            {
                 sum_of_squares += lower * lower;
             }
-            if (isfinite(upper)) {
+            if (isfinite(upper))
+            {
                 sum_of_squares += upper * upper;
             }
         }
     }
-    if (params->optimality_norm == NORM_TYPE_L_INF) {
+    if (params->optimality_norm == NORM_TYPE_L_INF)
+    {
         state->constraint_bound_norm = max_val;
-    } else {
+    }
+    else
+    {
         state->constraint_bound_norm = sqrt(sum_of_squares);
     }
 
@@ -551,28 +603,83 @@ static pdhg_solver_state_t *initialize_solver_state(
     size_t primal_spmv_buffer_size;
     size_t dual_spmv_buffer_size;
 
-    CUSPARSE_CHECK(cusparseCreateCsr(&state->matA, state->num_constraints, state->num_variables, state->constraint_matrix->num_nonzeros, state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateCsr(&state->matA,
+                                     state->num_constraints,
+                                     state->num_variables,
+                                     state->constraint_matrix->num_nonzeros,
+                                     state->constraint_matrix->row_ptr,
+                                     state->constraint_matrix->col_ind,
+                                     state->constraint_matrix->val,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_BASE_ZERO,
+                                     CUDA_R_64F));
     CUDA_CHECK(cudaGetLastError());
 
-    CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt, state->num_variables, state->num_constraints, state->constraint_matrix_t->num_nonzeros, state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateCsr(&state->matAt,
+                                     state->num_variables,
+                                     state->num_constraints,
+                                     state->constraint_matrix_t->num_nonzeros,
+                                     state->constraint_matrix_t->row_ptr,
+                                     state->constraint_matrix_t->col_ind,
+                                     state->constraint_matrix_t->val,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_BASE_ZERO,
+                                     CUDA_R_64F));
     CUDA_CHECK(cudaGetLastError());
 
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F));
+    CUSPARSE_CHECK(
+        cusparseCreateDnVec(&state->vec_primal_sol, state->num_variables, state->pdhg_primal_solution, CUDA_R_64F));
+    CUSPARSE_CHECK(
+        cusparseCreateDnVec(&state->vec_dual_sol, state->num_constraints, state->pdhg_dual_solution, CUDA_R_64F));
+    CUSPARSE_CHECK(
+        cusparseCreateDnVec(&state->vec_primal_prod, state->num_constraints, state->primal_product, CUDA_R_64F));
     CUSPARSE_CHECK(cusparseCreateDnVec(&state->vec_dual_prod, state->num_variables, state->dual_product, CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &primal_spmv_buffer_size));
-
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &dual_spmv_buffer_size));
+    CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle,
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &HOST_ONE,
+                                           state->matA,
+                                           state->vec_primal_sol,
+                                           &HOST_ZERO,
+                                           state->vec_primal_prod,
+                                           CUDA_R_64F,
+                                           CUSPARSE_SPMV_CSR_ALG2,
+                                           &primal_spmv_buffer_size));
+
+    CUSPARSE_CHECK(cusparseSpMV_bufferSize(state->sparse_handle,
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &HOST_ONE,
+                                           state->matAt,
+                                           state->vec_dual_sol,
+                                           &HOST_ZERO,
+                                           state->vec_dual_prod,
+                                           CUDA_R_64F,
+                                           CUSPARSE_SPMV_CSR_ALG2,
+                                           &dual_spmv_buffer_size));
     CUDA_CHECK(cudaMalloc(&state->primal_spmv_buffer, primal_spmv_buffer_size));
-    CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                           &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
-                                           CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle,
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &HOST_ONE,
+                                           state->matA,
+                                           state->vec_primal_sol,
+                                           &HOST_ZERO,
+                                           state->vec_primal_prod,
+                                           CUDA_R_64F,
+                                           CUSPARSE_SPMV_CSR_ALG2,
+                                           state->primal_spmv_buffer));
 
     CUDA_CHECK(cudaMalloc(&state->dual_spmv_buffer, dual_spmv_buffer_size));
-    CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                           &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-                                           CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV_preprocess(state->sparse_handle,
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &HOST_ONE,
+                                           state->matAt,
+                                           state->vec_dual_sol,
+                                           &HOST_ZERO,
+                                           state->vec_dual_prod,
+                                           CUDA_R_64F,
+                                           CUSPARSE_SPMV_CSR_ALG2,
+                                           state->dual_spmv_buffer));
 
     CUDA_CHECK(cudaMalloc(&state->d_primal_step_size, sizeof(double)));
     CUDA_CHECK(cudaMalloc(&state->d_dual_step_size, sizeof(double)));
@@ -582,23 +689,21 @@ static pdhg_solver_state_t *initialize_solver_state(
     CUDA_CHECK(cudaMemset(state->d_dual_step_size, 0, sizeof(double)));
     CUDA_CHECK(cudaMemset(state->d_inner_count, 0, sizeof(int)));
 
-    CUDA_CHECK(
-        cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double)));
-    CUDA_CHECK(
-        cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&state->ones_primal_d, state->num_variables * sizeof(double)));
+    CUDA_CHECK(cudaMalloc(&state->ones_dual_d, state->num_constraints * sizeof(double)));
 
     double *ones_primal_h = (double *)safe_malloc(state->num_variables * sizeof(double));
     for (int i = 0; i < state->num_variables; ++i)
         ones_primal_h[i] = 1.0;
-    CUDA_CHECK(cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(state->ones_primal_d, ones_primal_h, state->num_variables * sizeof(double), cudaMemcpyHostToDevice));
     free(ones_primal_h);
 
     double *ones_dual_h = (double *)safe_malloc(state->num_constraints * sizeof(double));
     for (int i = 0; i < state->num_constraints; ++i)
         ones_dual_h[i] = 1.0;
-    CUDA_CHECK(cudaMemcpy(state->ones_dual_d, ones_dual_h,
-                          state->num_constraints * sizeof(double),
-                          cudaMemcpyHostToDevice));
+    CUDA_CHECK(
+        cudaMemcpy(state->ones_dual_d, ones_dual_h, state->num_constraints * sizeof(double), cudaMemcpyHostToDevice));
 
     // --- CUDA Graph Initialization ---
     CUDA_CHECK(cudaStreamCreate(&state->stream));
@@ -610,11 +715,22 @@ static pdhg_solver_state_t *initialize_solver_state(
     {
         printf("---------------------------------------------------------------------"
                "------------------\n");
-        printf("%s | %s | %s | %s \n", "   runtime    ", "    objective     ",
-               "  absolute residuals   ", "  relative residuals   ");
-        printf("%s %s | %s %s | %s %s %s | %s %s %s \n", "  iter", "  time ",
-               " pr obj ", "  du obj ", " pr res", " du res", "  gap  ", " pr res",
-               " du res", "  gap  ");
+        printf("%s | %s | %s | %s \n",
+               "   runtime    ",
+               "    objective     ",
+               "  absolute residuals   ",
+               "  relative residuals   ");
+        printf("%s %s | %s %s | %s %s %s | %s %s %s \n",
+               "  iter",
+               "  time ",
+               " pr obj ",
+               "  du obj ",
+               " pr res",
+               " du res",
+               "  gap  ",
+               " pr res",
+               " du res",
+               "  gap  ");
         printf("---------------------------------------------------------------------"
                "------------------\n");
     }
@@ -622,9 +738,7 @@ static pdhg_solver_state_t *initialize_solver_state(
     return state;
 }
 
-__global__ void build_row_ind(const int *__restrict__ row_ptr,
-                              int num_rows,
-                              int *__restrict__ row_ind)
+__global__ void build_row_ind(const int *__restrict__ row_ptr, int num_rows, int *__restrict__ row_ind)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= num_rows)
@@ -639,13 +753,12 @@ __global__ void build_row_ind(const int *__restrict__ row_ptr,
     }
 }
 
-__global__ void build_transpose_map(
-    const int *__restrict__ A_row_ind,
-    const int *__restrict__ A_col_ind,
-    const int *__restrict__ At_row_ptr,
-    const int *__restrict__ At_col_ind,
-    int nnz,
-    int *__restrict__ A_to_At)
+__global__ void build_transpose_map(const int *__restrict__ A_row_ind,
+                                    const int *__restrict__ A_col_ind,
+                                    const int *__restrict__ At_row_ptr,
+                                    const int *__restrict__ At_col_ind,
+                                    int nnz,
+                                    int *__restrict__ A_to_At)
 {
     int k = blockIdx.x * blockDim.x + threadIdx.x;
     if (k >= nnz)
@@ -673,12 +786,11 @@ __global__ void build_transpose_map(
     A_to_At[k] = pos;
 }
 
-__global__ void fill_finite_bounds_kernel(
-    const double *__restrict__ lb,
-    const double *__restrict__ ub,
-    double *__restrict__ lb_finite,
-    double *__restrict__ ub_finite,
-    int num_elements)
+__global__ void fill_finite_bounds_kernel(const double *__restrict__ lb,
+                                          const double *__restrict__ ub,
+                                          double *__restrict__ lb_finite,
+                                          double *__restrict__ ub_finite,
+                                          int num_elements)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= num_elements)
@@ -691,16 +803,18 @@ __global__ void fill_finite_bounds_kernel(
     ub_finite[i] = isfinite(Ui) ? Ui : 0.0;
 }
 
-__global__ void compute_next_primal_solution_kernel(
-    double *__restrict__ current_primal,
-    double *__restrict__ reflected_primal,
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ objective,
-    const double *__restrict__ var_lb, const double *__restrict__ var_ub,
-    int n, const double *__restrict__ d_step_size,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff)
+__global__ void compute_next_primal_solution_kernel(double *__restrict__ current_primal,
+                                                    double *__restrict__ reflected_primal,
+                                                    const double *__restrict__ initial_primal,
+                                                    const double *__restrict__ dual_product,
+                                                    const double *__restrict__ objective,
+                                                    const double *__restrict__ var_lb,
+                                                    const double *__restrict__ var_ub,
+                                                    int n,
+                                                    const double *__restrict__ d_step_size,
+                                                    const int *__restrict__ d_base_count,
+                                                    int k_offset,
+                                                    double reflection_coeff)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     double step_size = *d_step_size;
@@ -708,27 +822,28 @@ __global__ void compute_next_primal_solution_kernel(
     double weight = (double)(current_k) / (double)(current_k + 1);
     if (i < n)
     {
-        double temp =
-            current_primal[i] - step_size * (objective[i] - dual_product[i]);
+        double temp = current_primal[i] - step_size * (objective[i] - dual_product[i]);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
         reflected_primal[i] = 2.0 * temp_proj - current_primal[i];
-        double reflected = reflection_coeff * reflected_primal[i] +
-                           (1.0 - reflection_coeff) * current_primal[i];
+        double reflected = reflection_coeff * reflected_primal[i] + (1.0 - reflection_coeff) * current_primal[i];
         current_primal[i] = weight * reflected + (1.0 - weight) * initial_primal[i];
     }
 }
 
-__global__ void compute_next_primal_solution_major_kernel(
-    double *__restrict__ current_primal, double *__restrict__ pdhg_primal,
-    double *__restrict__ reflected_primal,
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ objective,
-    const double *__restrict__ var_lb, const double *__restrict__ var_ub,
-    int n, const double *__restrict__ d_step_size,
-    double *__restrict__ dual_slack,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff)
+__global__ void compute_next_primal_solution_major_kernel(double *__restrict__ current_primal,
+                                                          double *__restrict__ pdhg_primal,
+                                                          double *__restrict__ reflected_primal,
+                                                          const double *__restrict__ initial_primal,
+                                                          const double *__restrict__ dual_product,
+                                                          const double *__restrict__ objective,
+                                                          const double *__restrict__ var_lb,
+                                                          const double *__restrict__ var_ub,
+                                                          int n,
+                                                          const double *__restrict__ d_step_size,
+                                                          double *__restrict__ dual_slack,
+                                                          const int *__restrict__ d_base_count,
+                                                          int k_offset,
+                                                          double reflection_coeff)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     double step_size = *d_step_size;
@@ -736,26 +851,25 @@ __global__ void compute_next_primal_solution_major_kernel(
     double weight = (double)(current_k) / (double)(current_k + 1);
     if (i < n)
     {
-        double temp =
-            current_primal[i] - step_size * (objective[i] - dual_product[i]);
+        double temp = current_primal[i] - step_size * (objective[i] - dual_product[i]);
         pdhg_primal[i] = fmax(var_lb[i], fmin(temp, var_ub[i]));
         dual_slack[i] = (pdhg_primal[i] - temp) / step_size;
         reflected_primal[i] = 2.0 * pdhg_primal[i] - current_primal[i];
-        double reflected = reflection_coeff * reflected_primal[i] +
-                           (1.0 - reflection_coeff) * current_primal[i];
+        double reflected = reflection_coeff * reflected_primal[i] + (1.0 - reflection_coeff) * current_primal[i];
         current_primal[i] = weight * reflected + (1.0 - weight) * initial_primal[i];
     }
 }
 
-__global__ void compute_next_dual_solution_kernel(
-    double *__restrict__ current_dual,
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ primal_product,
-    const double *__restrict__ const_lb,
-    const double *__restrict__ const_ub, int n,
-    const double *__restrict__ d_step_size,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff)
+__global__ void compute_next_dual_solution_kernel(double *__restrict__ current_dual,
+                                                  const double *__restrict__ initial_dual,
+                                                  const double *__restrict__ primal_product,
+                                                  const double *__restrict__ const_lb,
+                                                  const double *__restrict__ const_ub,
+                                                  int n,
+                                                  const double *__restrict__ d_step_size,
+                                                  const int *__restrict__ d_base_count,
+                                                  int k_offset,
+                                                  double reflection_coeff)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     double step_size = *d_step_size;
@@ -766,21 +880,23 @@ __global__ void compute_next_dual_solution_kernel(
         double temp = current_dual[i] / step_size - primal_product[i];
         double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
         double reflected = reflection_coeff * (2.0 * (temp - temp_proj) * step_size - current_dual[i]) +
-                    (1.0 - reflection_coeff) * current_dual[i];
+            (1.0 - reflection_coeff) * current_dual[i];
         current_dual[i] = weight * reflected + (1.0 - weight) * initial_dual[i];
     }
 }
 
-__global__ void compute_next_dual_solution_major_kernel(
-    double *__restrict__ current_dual, double *__restrict__ pdhg_dual,
-    double *__restrict__ reflected_dual,
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ primal_product,
-    const double *__restrict__ const_lb,
-    const double *__restrict__ const_ub, int n,
-    const double *__restrict__ d_step_size,
-    const int *__restrict__ d_base_count, int k_offset,
-    double reflection_coeff)
+__global__ void compute_next_dual_solution_major_kernel(double *__restrict__ current_dual,
+                                                        double *__restrict__ pdhg_dual,
+                                                        double *__restrict__ reflected_dual,
+                                                        const double *__restrict__ initial_dual,
+                                                        const double *__restrict__ primal_product,
+                                                        const double *__restrict__ const_lb,
+                                                        const double *__restrict__ const_ub,
+                                                        int n,
+                                                        const double *__restrict__ d_step_size,
+                                                        const int *__restrict__ d_base_count,
+                                                        int k_offset,
+                                                        double reflection_coeff)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     double step_size = *d_step_size;
@@ -792,41 +908,40 @@ __global__ void compute_next_dual_solution_major_kernel(
         double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
         pdhg_dual[i] = (temp - temp_proj) * step_size;
         reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i];
-        double reflected = reflection_coeff * reflected_dual[i] +
-            (1.0 - reflection_coeff) * current_dual[i];
+        double reflected = reflection_coeff * reflected_dual[i] + (1.0 - reflection_coeff) * current_dual[i];
         current_dual[i] = weight * reflected + (1.0 - weight) * initial_dual[i];
     }
 }
 
-
 __global__ void rescale_solution_kernel(double *__restrict__ primal_solution,
                                         double *__restrict__ dual_solution,
                                         const double *__restrict__ variable_rescaling,
                                         const double *__restrict__ constraint_rescaling,
                                         const double objective_vector_rescaling,
                                         const double constraint_bound_rescaling,
-                                        int n_vars, int n_cons)
+                                        int n_vars,
+                                        int n_cons)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_vars)
     {
-        primal_solution[i] =
-            primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling;
+        primal_solution[i] = primal_solution[i] / variable_rescaling[i] / constraint_bound_rescaling;
     }
     else if (i < n_vars + n_cons)
     {
         int idx = i - n_vars;
-        dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] /
-                             objective_vector_rescaling;
+        dual_solution[idx] = dual_solution[idx] / constraint_rescaling[idx] / objective_vector_rescaling;
     }
 }
 
-__global__ void compute_delta_solution_kernel(
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ pdhg_primal, double *__restrict__ delta_primal,
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ pdhg_dual, double *__restrict__ delta_dual,
-    int n_vars, int n_cons)
+__global__ void compute_delta_solution_kernel(const double *__restrict__ initial_primal,
+                                              const double *__restrict__ pdhg_primal,
+                                              double *__restrict__ delta_primal,
+                                              const double *__restrict__ initial_dual,
+                                              const double *__restrict__ pdhg_dual,
+                                              double *__restrict__ delta_dual,
+                                              int n_vars,
+                                              int n_cons)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_vars)
@@ -840,110 +955,145 @@ __global__ void compute_delta_solution_kernel(
     }
 }
 
-static void compute_next_primal_solution(pdhg_solver_state_t *state, const int k_offset, const double reflection_coefficient, bool is_major)
+static void compute_next_primal_solution(pdhg_solver_state_t *state,
+                                         const int k_offset,
+                                         const double reflection_coefficient,
+                                         bool is_major)
 {
-    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol,
-                                          state->current_dual_solution));
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
-
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->current_dual_solution));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
+
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matAt,
+                                state->vec_dual_sol,
+                                &HOST_ZERO,
+                                state->vec_dual_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->dual_spmv_buffer));
 
     double step = state->step_size / state->primal_weight;
 
     if (is_major)
     {
-        compute_next_primal_solution_major_kernel<<<state->num_blocks_primal,
-                                                         THREADS_PER_BLOCK, 0, state->stream>>>(
-            state->current_primal_solution, state->pdhg_primal_solution,
-            state->reflected_primal_solution, state->initial_primal_solution, state->dual_product,
-            state->objective_vector, state->variable_lower_bound,
-            state->variable_upper_bound, state->num_variables, state->d_primal_step_size,
-            state->dual_slack, state->d_inner_count, k_offset, reflection_coefficient);
+        compute_next_primal_solution_major_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>(
+            state->current_primal_solution,
+            state->pdhg_primal_solution,
+            state->reflected_primal_solution,
+            state->initial_primal_solution,
+            state->dual_product,
+            state->objective_vector,
+            state->variable_lower_bound,
+            state->variable_upper_bound,
+            state->num_variables,
+            state->d_primal_step_size,
+            state->dual_slack,
+            state->d_inner_count,
+            k_offset,
+            reflection_coefficient);
     }
     else
     {
-        compute_next_primal_solution_kernel<<<state->num_blocks_primal,
-                                                   THREADS_PER_BLOCK, 0, state->stream>>>(
-            state->current_primal_solution, state->reflected_primal_solution, state->initial_primal_solution,
-            state->dual_product, state->objective_vector,
-            state->variable_lower_bound, state->variable_upper_bound,
-            state->num_variables, state->d_primal_step_size,
-            state->d_inner_count, k_offset, reflection_coefficient
-        );
+        compute_next_primal_solution_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>(
+            state->current_primal_solution,
+            state->reflected_primal_solution,
+            state->initial_primal_solution,
+            state->dual_product,
+            state->objective_vector,
+            state->variable_lower_bound,
+            state->variable_upper_bound,
+            state->num_variables,
+            state->d_primal_step_size,
+            state->d_inner_count,
+            k_offset,
+            reflection_coefficient);
     }
 }
 
-static void compute_next_dual_solution(pdhg_solver_state_t *state,const int k_offset, const double reflection_coefficient, bool is_major)
+static void compute_next_dual_solution(pdhg_solver_state_t *state,
+                                       const int k_offset,
+                                       const double reflection_coefficient,
+                                       bool is_major)
 {
-    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol,
-                                          state->reflected_primal_solution));
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product));
-
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->reflected_primal_solution));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product));
+
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matA,
+                                state->vec_primal_sol,
+                                &HOST_ZERO,
+                                state->vec_primal_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->primal_spmv_buffer));
 
     double step = state->step_size * state->primal_weight;
 
     if (is_major)
     {
-        compute_next_dual_solution_major_kernel<<<state->num_blocks_dual,
-                                                       THREADS_PER_BLOCK, 0, state->stream>>>(
-            state->current_dual_solution, state->pdhg_dual_solution,
-            state->reflected_dual_solution, state->initial_dual_solution, state->primal_product,
-            state->constraint_lower_bound, state->constraint_upper_bound,
-            state->num_constraints, state->d_dual_step_size,
-            state->d_inner_count, k_offset, reflection_coefficient
-        );
+        compute_next_dual_solution_major_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+            state->current_dual_solution,
+            state->pdhg_dual_solution,
+            state->reflected_dual_solution,
+            state->initial_dual_solution,
+            state->primal_product,
+            state->constraint_lower_bound,
+            state->constraint_upper_bound,
+            state->num_constraints,
+            state->d_dual_step_size,
+            state->d_inner_count,
+            k_offset,
+            reflection_coefficient);
     }
     else
     {
-        compute_next_dual_solution_kernel<<<state->num_blocks_dual,
-                                                 THREADS_PER_BLOCK, 0, state->stream>>>(
-            state->current_dual_solution, state->initial_dual_solution,
-            state->primal_product, state->constraint_lower_bound,
-            state->constraint_upper_bound, state->num_constraints, state->d_dual_step_size,
-            state->d_inner_count, k_offset, reflection_coefficient);
+        compute_next_dual_solution_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+            state->current_dual_solution,
+            state->initial_dual_solution,
+            state->primal_product,
+            state->constraint_lower_bound,
+            state->constraint_upper_bound,
+            state->num_constraints,
+            state->d_dual_step_size,
+            state->d_inner_count,
+            k_offset,
+            reflection_coefficient);
     }
 }
 
-static void perform_restart(pdhg_solver_state_t *state,
-                            const pdhg_parameters_t *params)
+static void perform_restart(pdhg_solver_state_t *state, const pdhg_parameters_t *params)
 {
-    compute_delta_solution_kernel<<<state->num_blocks_primal_dual,
-                                    THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->initial_primal_solution, state->pdhg_primal_solution,
-        state->delta_primal_solution, state->initial_dual_solution,
-        state->pdhg_dual_solution, state->delta_dual_solution,
-        state->num_variables, state->num_constraints);
+    compute_delta_solution_kernel<<<state->num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->initial_primal_solution,
+        state->pdhg_primal_solution,
+        state->delta_primal_solution,
+        state->initial_dual_solution,
+        state->pdhg_dual_solution,
+        state->delta_dual_solution,
+        state->num_variables,
+        state->num_constraints);
 
     double primal_dist, dual_dist;
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables,
-                                   state->delta_primal_solution, 1,
-                                   &primal_dist));
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints,
-                                   state->delta_dual_solution, 1, &dual_dist));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->delta_primal_solution, 1, &primal_dist));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, state->delta_dual_solution, 1, &dual_dist));
 
-    double ratio_infeas =
-        state->relative_dual_residual / state->relative_primal_residual;
+    double ratio_infeas = state->relative_dual_residual / state->relative_primal_residual;
 
-    if (primal_dist > 1e-16 && dual_dist > 1e-16 && primal_dist < 1e12 &&
-        dual_dist < 1e12 && ratio_infeas > 1e-8 && ratio_infeas < 1e8)
+    if (primal_dist > 1e-16 && dual_dist > 1e-16 && primal_dist < 1e12 && dual_dist < 1e12 && ratio_infeas > 1e-8 &&
+        ratio_infeas < 1e8)
     {
-        double error =
-            log(dual_dist) - log(primal_dist) - log(state->primal_weight);
+        double error = log(dual_dist) - log(primal_dist) - log(state->primal_weight);
         state->primal_weight_error_sum *= params->restart_params.i_smooth;
         state->primal_weight_error_sum += error;
         double delta_error = error - state->primal_weight_last_error;
         state->primal_weight *=
-            exp(params->restart_params.k_p * error +
-                params->restart_params.k_i * state->primal_weight_error_sum +
+            exp(params->restart_params.k_p * error + params->restart_params.k_i * state->primal_weight_error_sum +
                 params->restart_params.k_d * delta_error);
         state->primal_weight_last_error = error;
     }
@@ -954,24 +1104,27 @@ static void perform_restart(pdhg_solver_state_t *state,
         state->primal_weight_last_error = 0.0;
     }
 
-    double primal_dual_residual_gap = fabs(
-        log10(state->relative_dual_residual / state->relative_primal_residual));
+    double primal_dual_residual_gap = fabs(log10(state->relative_dual_residual / state->relative_primal_residual));
     if (primal_dual_residual_gap < state->best_primal_dual_residual_gap)
     {
         state->best_primal_dual_residual_gap = primal_dual_residual_gap;
         state->best_primal_weight = state->primal_weight;
     }
 
-    CUDA_CHECK(cudaMemcpy(
-        state->initial_primal_solution, state->pdhg_primal_solution,
-        state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(
-        state->current_primal_solution, state->pdhg_primal_solution,
-        state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, state->pdhg_dual_solution,
+    CUDA_CHECK(cudaMemcpy(state->initial_primal_solution,
+                          state->pdhg_primal_solution,
+                          state->num_variables * sizeof(double),
+                          cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->current_primal_solution,
+                          state->pdhg_primal_solution,
+                          state->num_variables * sizeof(double),
+                          cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->initial_dual_solution,
+                          state->pdhg_dual_solution,
                           state->num_constraints * sizeof(double),
                           cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->current_dual_solution, state->pdhg_dual_solution,
+    CUDA_CHECK(cudaMemcpy(state->current_dual_solution,
+                          state->pdhg_dual_solution,
                           state->num_constraints * sizeof(double),
                           cudaMemcpyDeviceToDevice));
 
@@ -979,8 +1132,7 @@ static void perform_restart(pdhg_solver_state_t *state,
     state->last_trial_fixed_point_error = INFINITY;
 }
 
-static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state,
-                                                   const pdhg_parameters_t *params)
+static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state, const pdhg_parameters_t *params)
 {
     if (state->constraint_matrix->num_nonzeros == 0)
     {
@@ -988,9 +1140,12 @@ static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state,
     }
     else
     {
-        double max_sv = estimate_maximum_singular_value(
-            state->sparse_handle, state->blas_handle, state->constraint_matrix,
-            state->constraint_matrix_t, params->sv_max_iter, params->sv_tol);
+        double max_sv = estimate_maximum_singular_value(state->sparse_handle,
+                                                        state->blas_handle,
+                                                        state->constraint_matrix,
+                                                        state->constraint_matrix_t,
+                                                        params->sv_max_iter,
+                                                        params->sv_tol);
         state->step_size = 0.998 / max_sv;
     }
 
@@ -1000,30 +1155,36 @@ static void initialize_step_size_and_primal_weight(pdhg_solver_state_t *state,
     }
     else
     {
-        state->primal_weight = (state->objective_vector_norm + 1.0) /
-                               (state->constraint_bound_norm + 1.0);
+        state->primal_weight = (state->objective_vector_norm + 1.0) / (state->constraint_bound_norm + 1.0);
     }
     state->best_primal_weight = state->primal_weight;
 }
 
 static void compute_fixed_point_error(pdhg_solver_state_t *state)
 {
-    compute_delta_solution_kernel<<<state->num_blocks_primal_dual,
-                                    THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->pdhg_primal_solution, state->reflected_primal_solution,
-        state->delta_primal_solution, state->pdhg_dual_solution,
-        state->reflected_dual_solution, state->delta_dual_solution,
-        state->num_variables, state->num_constraints);
+    compute_delta_solution_kernel<<<state->num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->pdhg_primal_solution,
+        state->reflected_primal_solution,
+        state->delta_primal_solution,
+        state->pdhg_dual_solution,
+        state->reflected_dual_solution,
+        state->delta_dual_solution,
+        state->num_variables,
+        state->num_constraints);
 
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution));
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
 
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matAt,
+                                state->vec_dual_sol,
+                                &HOST_ZERO,
+                                state->vec_dual_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->dual_spmv_buffer));
 
     double interaction, movement;
 
@@ -1031,17 +1192,19 @@ static void compute_fixed_point_error(pdhg_solver_state_t *state)
     double dual_norm = 0.0;
     double cross_term = 0.0;
 
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints,
-                                   state->delta_dual_solution, 1, &dual_norm));
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables,
-                                   state->delta_primal_solution, 1,
-                                   &primal_norm));
-    movement = primal_norm * primal_norm * state->primal_weight +
-               dual_norm * dual_norm / state->primal_weight;
-
-    CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables,
-                            state->dual_product, 1, state->delta_primal_solution,
-                            1, &cross_term));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, state->delta_dual_solution, 1, &dual_norm));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->delta_primal_solution, 1, &primal_norm));
+    movement = primal_norm * primal_norm * state->primal_weight + dual_norm * dual_norm / state->primal_weight;
+
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            state->dual_product,
+                            1,
+                            state->delta_primal_solution,
+                            1,
+                            &cross_term));
     interaction = 2 * state->step_size * cross_term;
 
     state->fixed_point_error = sqrt(movement + interaction);
@@ -1183,19 +1346,22 @@ void rescale_info_free(rescale_info_t *info)
 
 static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, const lp_problem_t *original_problem)
 {
-    cupdlpx_result_t *results =
-        (cupdlpx_result_t *)safe_calloc(1, sizeof(cupdlpx_result_t));
+    cupdlpx_result_t *results = (cupdlpx_result_t *)safe_calloc(1, sizeof(cupdlpx_result_t));
 
     // Compute reduced cost
-    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol,
-                                          state->pdhg_dual_solution));
-    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod,
-                                          state->dual_product));
-
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
+
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matAt,
+                                state->vec_dual_sol,
+                                &HOST_ZERO,
+                                state->vec_dual_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->dual_spmv_buffer));
 
     compute_and_rescale_reduced_cost_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>(
         state->dual_slack,
@@ -1209,27 +1375,29 @@ static cupdlpx_result_t *create_result_from_state(pdhg_solver_state_t *state, co
         state->num_variables);
 
     rescale_solution_kernel<<<state->num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->pdhg_primal_solution, state->pdhg_dual_solution,
-        state->variable_rescaling, state->constraint_rescaling,
-        state->objective_vector_rescaling, state->constraint_bound_rescaling,
-        state->num_variables, state->num_constraints);
-
-    results->primal_solution =
-        (double *)safe_malloc(state->num_variables * sizeof(double));
-    results->dual_solution =
-        (double *)safe_malloc(state->num_constraints * sizeof(double));
-    results->reduced_cost =
-        (double *)safe_malloc(state->num_variables * sizeof(double));
-
-    CUDA_CHECK(cudaMemcpy(results->primal_solution, state->pdhg_primal_solution,
+        state->pdhg_primal_solution,
+        state->pdhg_dual_solution,
+        state->variable_rescaling,
+        state->constraint_rescaling,
+        state->objective_vector_rescaling,
+        state->constraint_bound_rescaling,
+        state->num_variables,
+        state->num_constraints);
+
+    results->primal_solution = (double *)safe_malloc(state->num_variables * sizeof(double));
+    results->dual_solution = (double *)safe_malloc(state->num_constraints * sizeof(double));
+    results->reduced_cost = (double *)safe_malloc(state->num_variables * sizeof(double));
+
+    CUDA_CHECK(cudaMemcpy(results->primal_solution,
+                          state->pdhg_primal_solution,
                           state->num_variables * sizeof(double),
                           cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(results->dual_solution, state->pdhg_dual_solution,
+    CUDA_CHECK(cudaMemcpy(results->dual_solution,
+                          state->pdhg_dual_solution,
                           state->num_constraints * sizeof(double),
                           cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(results->reduced_cost, state->dual_slack,
-                          state->num_variables * sizeof(double),
-                          cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpy(
+        results->reduced_cost, state->dual_slack, state->num_variables * sizeof(double), cudaMemcpyDeviceToHost));
 
     results->num_variables = original_problem->num_variables;
     results->num_constraints = original_problem->num_constraints;
@@ -1287,9 +1455,10 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
 
     if (primal_state->termination_reason == TERMINATION_REASON_FEAS_POLISH_SUCCESS)
     {
-        CUDA_CHECK(cudaMemcpy(
-            state->pdhg_primal_solution, primal_state->pdhg_primal_solution,
-            state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpy(state->pdhg_primal_solution,
+                              primal_state->pdhg_primal_solution,
+                              state->num_variables * sizeof(double),
+                              cudaMemcpyDeviceToDevice));
         state->absolute_primal_residual = primal_state->absolute_primal_residual;
         state->relative_primal_residual = primal_state->relative_primal_residual;
         state->primal_objective_value = primal_state->primal_objective_value;
@@ -1304,9 +1473,10 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
 
     if (dual_state->termination_reason == TERMINATION_REASON_FEAS_POLISH_SUCCESS)
     {
-        CUDA_CHECK(cudaMemcpy(
-            state->pdhg_dual_solution, dual_state->pdhg_dual_solution,
-            state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpy(state->pdhg_dual_solution,
+                              dual_state->pdhg_dual_solution,
+                              state->num_constraints * sizeof(double),
+                              cudaMemcpyDeviceToDevice));
         state->absolute_dual_residual = dual_state->absolute_dual_residual;
         state->relative_dual_residual = dual_state->relative_dual_residual;
         state->dual_objective_value = dual_state->dual_objective_value;
@@ -1314,7 +1484,8 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
     state->feasibility_iteration += dual_state->total_count - 1;
 
     state->objective_gap = fabs(state->primal_objective_value - state->dual_objective_value);
-    state->relative_objective_gap = state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value));
+    state->relative_objective_gap =
+        state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value));
 
     // FINAL LOGGING
     pdhg_feas_polish_final_log(primal_state, dual_state, params->verbose);
@@ -1325,7 +1496,9 @@ void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *st
     return;
 }
 
-void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state)
+void primal_feasibility_polish(const pdhg_parameters_t *params,
+                               pdhg_solver_state_t *state,
+                               const pdhg_solver_state_t *ori_state)
 {
     print_initial_feas_polish_info(true, params);
     bool do_restart = false;
@@ -1356,8 +1529,10 @@ void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_stat
                 compute_next_dual_solution(state, i, params->reflection_coefficient, false);
             }
 
-            compute_next_primal_solution(state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
-            compute_next_dual_solution(state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
+            compute_next_primal_solution(
+                state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
+            compute_next_dual_solution(
+                state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
             // end CUDA graph capture
 
             cudaGraph_t graph;
@@ -1378,10 +1553,10 @@ void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_stat
         {
             display_feas_polish_iteration_stats(state, params->verbose, true);
         }
-        
+
         // Check Adaptive Restart
-        do_restart = should_do_adaptive_restart(state, &params->restart_params,
-                                                params->termination_evaluation_frequency);
+        do_restart =
+            should_do_adaptive_restart(state, &params->restart_params, params->termination_evaluation_frequency);
         if (do_restart)
         {
             perform_primal_restart(state);
@@ -1396,7 +1571,9 @@ void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_stat
     return;
 }
 
-void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state)
+void dual_feasibility_polish(const pdhg_parameters_t *params,
+                             pdhg_solver_state_t *state,
+                             const pdhg_solver_state_t *ori_state)
 {
     print_initial_feas_polish_info(false, params);
     bool do_restart = false;
@@ -1427,8 +1604,10 @@ void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_
                 compute_next_dual_solution(state, i, params->reflection_coefficient, false);
             }
 
-            compute_next_primal_solution(state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
-            compute_next_dual_solution(state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
+            compute_next_primal_solution(
+                state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
+            compute_next_dual_solution(
+                state, params->termination_evaluation_frequency, params->reflection_coefficient, true);
             // end CUDA graph capture
 
             cudaGraph_t graph;
@@ -1449,10 +1628,10 @@ void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_
         {
             display_feas_polish_iteration_stats(state, params->verbose, false);
         }
-        
+
         // Check Adaptive Restart
-        do_restart = should_do_adaptive_restart(state, &params->restart_params,
-                                                params->termination_evaluation_frequency);
+        do_restart =
+            should_do_adaptive_restart(state, &params->restart_params, params->termination_evaluation_frequency);
         if (do_restart)
         {
             perform_dual_restart(state);
@@ -1467,31 +1646,34 @@ void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_
     return;
 }
 
-static pdhg_solver_state_t *initialize_primal_feas_polish_state(
-    const pdhg_solver_state_t *original_state)
+static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solver_state_t *original_state)
 {
     pdhg_solver_state_t *primal_state = (pdhg_solver_state_t *)safe_malloc(sizeof(pdhg_solver_state_t));
     *primal_state = *original_state;
     int num_var = original_state->num_variables;
     int num_cons = original_state->num_constraints;
 
-#define ALLOC_ZERO(dest, bytes)           \
-    CUDA_CHECK(cudaMalloc(&dest, bytes)); \
+#define ALLOC_ZERO(dest, bytes)                                                                                        \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));                                                                              \
     CUDA_CHECK(cudaMemset(dest, 0, bytes));
 
     // RESET PROBLEM TO FEASIBILITY PROBLEM
     ALLOC_ZERO(primal_state->objective_vector, num_var * sizeof(double));
     primal_state->objective_constant = 0.0;
 
-#define ALLOC_AND_COPY_DEV(dest, src, bytes) \
-    CUDA_CHECK(cudaMalloc(&dest, bytes));    \
+#define ALLOC_AND_COPY_DEV(dest, src, bytes)                                                                           \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));                                                                              \
     CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice));
 
     // ALLOCATE AND COPY SOLUTION VECTORS
-    ALLOC_AND_COPY_DEV(primal_state->initial_primal_solution, original_state->initial_primal_solution, num_var * sizeof(double));
-    ALLOC_AND_COPY_DEV(primal_state->current_primal_solution, original_state->current_primal_solution, num_var * sizeof(double));
-    ALLOC_AND_COPY_DEV(primal_state->pdhg_primal_solution, original_state->pdhg_primal_solution, num_var * sizeof(double));
-    ALLOC_AND_COPY_DEV(primal_state->reflected_primal_solution, original_state->reflected_primal_solution, num_var * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        primal_state->initial_primal_solution, original_state->initial_primal_solution, num_var * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        primal_state->current_primal_solution, original_state->current_primal_solution, num_var * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        primal_state->pdhg_primal_solution, original_state->pdhg_primal_solution, num_var * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        primal_state->reflected_primal_solution, original_state->reflected_primal_solution, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(primal_state->primal_product, original_state->primal_product, num_cons * sizeof(double));
 
     // ALLOC ZERO FOR OTHERS
@@ -1536,11 +1718,11 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(
 
 void primal_feas_polish_state_free(pdhg_solver_state_t *state)
 {
-#define SAFE_CUDA_FREE(p)        \
-    if ((p) != NULL)             \
-    {                            \
-        CUDA_CHECK(cudaFree(p)); \
-        (p) = NULL;              \
+#define SAFE_CUDA_FREE(p)                                                                                              \
+    if ((p) != NULL)                                                                                                   \
+    {                                                                                                                  \
+        CUDA_CHECK(cudaFree(p));                                                                                       \
+        (p) = NULL;                                                                                                    \
     }
 
     if (!state)
@@ -1565,8 +1747,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state)
     free(state);
 }
 
-__global__ void zero_finite_value_vectors_kernel(
-    double *__restrict__ vec, int n)
+__global__ void zero_finite_value_vectors_kernel(double *__restrict__ vec, int n)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < n)
@@ -1576,39 +1757,42 @@ __global__ void zero_finite_value_vectors_kernel(
     }
 }
 
-static pdhg_solver_state_t *initialize_dual_feas_polish_state(
-    const pdhg_solver_state_t *original_state)
+static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_state_t *original_state)
 {
     pdhg_solver_state_t *dual_state = (pdhg_solver_state_t *)safe_malloc(sizeof(pdhg_solver_state_t));
     *dual_state = *original_state;
     int num_var = original_state->num_variables;
     int num_cons = original_state->num_constraints;
 
-#define ALLOC_AND_COPY_DEV(dest, src, bytes) \
-    CUDA_CHECK(cudaMalloc(&dest, bytes));    \
+#define ALLOC_AND_COPY_DEV(dest, src, bytes)                                                                           \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));                                                                              \
     CUDA_CHECK(cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice));
 
 // RESET PROBLEM TO DUAL FEASIBILITY PROBLEM
-#define SET_FINITE_TO_ZERO(vec, n)                                     \
-    {                                                                  \
-        int threads = 256;                                             \
-        int blocks = (n + threads - 1) / threads;                      \
-        zero_finite_value_vectors_kernel<<<blocks, threads>>>(vec, n); \
-        CUDA_CHECK(cudaDeviceSynchronize());                           \
+#define SET_FINITE_TO_ZERO(vec, n)                                                                                     \
+    {                                                                                                                  \
+        int threads = 256;                                                                                             \
+        int blocks = (n + threads - 1) / threads;                                                                      \
+        zero_finite_value_vectors_kernel<<<blocks, threads>>>(vec, n);                                                 \
+        CUDA_CHECK(cudaDeviceSynchronize());                                                                           \
     }
 
-    ALLOC_AND_COPY_DEV(dual_state->constraint_lower_bound, original_state->constraint_lower_bound, num_cons * sizeof(double));
-    ALLOC_AND_COPY_DEV(dual_state->constraint_upper_bound, original_state->constraint_upper_bound, num_cons * sizeof(double));
-    ALLOC_AND_COPY_DEV(dual_state->variable_lower_bound, original_state->variable_lower_bound, num_var * sizeof(double));
-    ALLOC_AND_COPY_DEV(dual_state->variable_upper_bound, original_state->variable_upper_bound, num_var * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->constraint_lower_bound, original_state->constraint_lower_bound, num_cons * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->constraint_upper_bound, original_state->constraint_upper_bound, num_cons * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->variable_lower_bound, original_state->variable_lower_bound, num_var * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->variable_upper_bound, original_state->variable_upper_bound, num_var * sizeof(double));
 
     SET_FINITE_TO_ZERO(dual_state->constraint_lower_bound, num_cons);
     SET_FINITE_TO_ZERO(dual_state->constraint_upper_bound, num_cons);
     SET_FINITE_TO_ZERO(dual_state->variable_lower_bound, num_var);
     SET_FINITE_TO_ZERO(dual_state->variable_upper_bound, num_var);
 
-#define ALLOC_ZERO(dest, bytes)           \
-    CUDA_CHECK(cudaMalloc(&dest, bytes)); \
+#define ALLOC_ZERO(dest, bytes)                                                                                        \
+    CUDA_CHECK(cudaMalloc(&dest, bytes));                                                                              \
     CUDA_CHECK(cudaMemset(dest, 0, bytes));
 
     ALLOC_ZERO(dual_state->constraint_lower_bound_finite_val, num_cons * sizeof(double));
@@ -1617,10 +1801,13 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
     ALLOC_ZERO(dual_state->variable_upper_bound_finite_val, num_var * sizeof(double));
 
     // ALLOCATE AND COPY SOLUTION VECTORS
-    ALLOC_AND_COPY_DEV(dual_state->initial_dual_solution, original_state->initial_dual_solution, num_cons * sizeof(double));
-    ALLOC_AND_COPY_DEV(dual_state->current_dual_solution, original_state->current_dual_solution, num_cons * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->initial_dual_solution, original_state->initial_dual_solution, num_cons * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->current_dual_solution, original_state->current_dual_solution, num_cons * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->pdhg_dual_solution, original_state->pdhg_dual_solution, num_cons * sizeof(double));
-    ALLOC_AND_COPY_DEV(dual_state->reflected_dual_solution, original_state->reflected_dual_solution, num_cons * sizeof(double));
+    ALLOC_AND_COPY_DEV(
+        dual_state->reflected_dual_solution, original_state->reflected_dual_solution, num_cons * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->dual_product, original_state->dual_product, num_var * sizeof(double));
     ALLOC_AND_COPY_DEV(dual_state->dual_slack, original_state->dual_slack, num_var * sizeof(double));
 
@@ -1663,11 +1850,11 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(
 
 void dual_feas_polish_state_free(pdhg_solver_state_t *state)
 {
-#define SAFE_CUDA_FREE(p)        \
-    if ((p) != NULL)             \
-    {                            \
-        CUDA_CHECK(cudaFree(p)); \
-        (p) = NULL;              \
+#define SAFE_CUDA_FREE(p)                                                                                              \
+    if ((p) != NULL)                                                                                                   \
+    {                                                                                                                  \
+        CUDA_CHECK(cudaFree(p));                                                                                       \
+        (p) = NULL;                                                                                                    \
     }
 
     if (!state)
@@ -1704,24 +1891,36 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state)
 
 static void perform_primal_restart(pdhg_solver_state_t *state)
 {
-    CUDA_CHECK(cudaMemcpy(state->initial_primal_solution, state->pdhg_primal_solution, state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->current_primal_solution, state->pdhg_primal_solution, state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->initial_primal_solution,
+                          state->pdhg_primal_solution,
+                          state->num_variables * sizeof(double),
+                          cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->current_primal_solution,
+                          state->pdhg_primal_solution,
+                          state->num_variables * sizeof(double),
+                          cudaMemcpyDeviceToDevice));
     state->inner_count = 0;
     state->last_trial_fixed_point_error = INFINITY;
 }
 
 static void perform_dual_restart(pdhg_solver_state_t *state)
 {
-    CUDA_CHECK(cudaMemcpy(state->initial_dual_solution, state->pdhg_dual_solution, state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(state->current_dual_solution, state->pdhg_dual_solution, state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->initial_dual_solution,
+                          state->pdhg_dual_solution,
+                          state->num_constraints * sizeof(double),
+                          cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->current_dual_solution,
+                          state->pdhg_dual_solution,
+                          state->num_constraints * sizeof(double),
+                          cudaMemcpyDeviceToDevice));
     state->inner_count = 0;
     state->last_trial_fixed_point_error = INFINITY;
 }
 
-__global__ void compute_delta_primal_solution_kernel(
-    const double *__restrict__ initial_primal,
-    const double *__restrict__ pdhg_primal,
-    double *__restrict__ delta_primal, int n_vars)
+__global__ void compute_delta_primal_solution_kernel(const double *__restrict__ initial_primal,
+                                                     const double *__restrict__ pdhg_primal,
+                                                     double *__restrict__ delta_primal,
+                                                     int n_vars)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_vars)
@@ -1730,10 +1929,10 @@ __global__ void compute_delta_primal_solution_kernel(
     }
 }
 
-__global__ void compute_delta_dual_solution_kernel(
-    const double *__restrict__ initial_dual,
-    const double *__restrict__ pdhg_dual,
-    double *__restrict__ delta_dual, int n_cons)
+__global__ void compute_delta_dual_solution_kernel(const double *__restrict__ initial_dual,
+                                                   const double *__restrict__ pdhg_dual,
+                                                   double *__restrict__ delta_dual,
+                                                   int n_cons)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_cons)
@@ -1750,26 +1949,17 @@ static void compute_primal_fixed_point_error(pdhg_solver_state_t *state)
         state->delta_primal_solution,
         state->num_variables);
     double primal_norm = 0.0;
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle,
-                                   state->num_variables,
-                                   state->delta_primal_solution,
-                                   1,
-                                   &primal_norm));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->delta_primal_solution, 1, &primal_norm));
     state->fixed_point_error = primal_norm * primal_norm * state->primal_weight;
 }
 
 static void compute_dual_fixed_point_error(pdhg_solver_state_t *state)
 {
     compute_delta_dual_solution_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->pdhg_dual_solution,
-        state->reflected_dual_solution,
-        state->delta_dual_solution,
-        state->num_constraints);
+        state->pdhg_dual_solution, state->reflected_dual_solution, state->delta_dual_solution, state->num_constraints);
     double dual_norm = 0.0;
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle,
-                                   state->num_constraints,
-                                   state->delta_dual_solution,
-                                   1,
-                                   &dual_norm));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, state->delta_dual_solution, 1, &dual_norm));
     state->fixed_point_error = dual_norm * dual_norm / state->primal_weight;
 }
diff --git a/src/utils.cu b/src/utils.cu
index 0f7e0f7..c6e8fe4 100644
--- a/src/utils.cu
+++ b/src/utils.cu
@@ -70,7 +70,8 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle,
                                        cublasHandle_t blas_handle,
                                        const cu_sparse_matrix_csr_t *A,
                                        const cu_sparse_matrix_csr_t *AT,
-                                       int max_iterations, double tolerance)
+                                       int max_iterations,
+                                       double tolerance)
 {
     int m = A->num_rows;
     int n = A->num_cols;
@@ -86,8 +87,7 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle,
         eigenvector_h[i] = dist(gen);
     }
 
-    CUDA_CHECK(cudaMemcpy(eigenvector_d, eigenvector_h, m * sizeof(double),
-                          cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(eigenvector_d, eigenvector_h, m * sizeof(double), cudaMemcpyHostToDevice));
     free(eigenvector_h);
 
     double sigma_max_sq = 1.0;
@@ -95,31 +95,57 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle,
     const double zero = 0.0;
 
     cusparseSpMatDescr_t matA, matAT;
-    CUSPARSE_CHECK(cusparseCreateCsr(
-        &matA, A->num_rows, A->num_cols, A->num_nonzeros, A->row_ptr, A->col_ind,
-        A->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
-        CUDA_R_64F));
-    CUSPARSE_CHECK(cusparseCreateCsr(
-        &matAT, AT->num_rows, AT->num_cols, AT->num_nonzeros, AT->row_ptr,
-        AT->col_ind, AT->val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateCsr(&matA,
+                                     A->num_rows,
+                                     A->num_cols,
+                                     A->num_nonzeros,
+                                     A->row_ptr,
+                                     A->col_ind,
+                                     A->val,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_BASE_ZERO,
+                                     CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateCsr(&matAT,
+                                     AT->num_rows,
+                                     AT->num_cols,
+                                     AT->num_nonzeros,
+                                     AT->row_ptr,
+                                     AT->col_ind,
+                                     AT->val,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_32I,
+                                     CUSPARSE_INDEX_BASE_ZERO,
+                                     CUDA_R_64F));
 
     cusparseDnVecDescr_t vecEigen, vecNextEigen, vecDual;
     CUSPARSE_CHECK(cusparseCreateDnVec(&vecEigen, m, eigenvector_d, CUDA_R_64F));
-    CUSPARSE_CHECK(
-        cusparseCreateDnVec(&vecNextEigen, m, next_eigenvector_d, CUDA_R_64F));
+    CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, m, next_eigenvector_d, CUDA_R_64F));
     CUSPARSE_CHECK(cusparseCreateDnVec(&vecDual, n, dual_product_d, CUDA_R_64F));
 
     void *dBufferAT = NULL;
     void *dBufferA = NULL;
     size_t bufferSizeAT = 0, bufferSizeA = 0;
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-        sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, matAT,
-        vecNextEigen, &zero, vecDual, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
-        &bufferSizeAT));
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-        sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, matA, vecDual,
-        &zero, vecEigen, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, &bufferSizeA));
+    CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle,
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &one,
+                                           matAT,
+                                           vecNextEigen,
+                                           &zero,
+                                           vecDual,
+                                           CUDA_R_64F,
+                                           CUSPARSE_SPMV_CSR_ALG2,
+                                           &bufferSizeAT));
+    CUSPARSE_CHECK(cusparseSpMV_bufferSize(sparse_handle,
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           &one,
+                                           matA,
+                                           vecDual,
+                                           &zero,
+                                           vecEigen,
+                                           CUDA_R_64F,
+                                           CUSPARSE_SPMV_CSR_ALG2,
+                                           &bufferSizeA));
 
     CUDA_CHECK(cudaMalloc(&dBufferAT, bufferSizeAT));
     CUDA_CHECK(cudaMalloc(&dBufferA, bufferSizeA));
@@ -127,36 +153,43 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle,
     for (int i = 0; i < max_iterations; ++i)
     {
 
-        CUDA_CHECK(cudaMemcpy(next_eigenvector_d, eigenvector_d, m * sizeof(double),
-                              cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpy(next_eigenvector_d, eigenvector_d, m * sizeof(double), cudaMemcpyDeviceToDevice));
         double eigenvector_norm;
-        CUBLAS_CHECK(cublasDnrm2_v2_64(blas_handle, m, next_eigenvector_d, 1,
-                                       &eigenvector_norm));
+        CUBLAS_CHECK(cublasDnrm2_v2_64(blas_handle, m, next_eigenvector_d, 1, &eigenvector_norm));
 
         double inv_eigenvector_norm = 1.0 / eigenvector_norm;
-        CUBLAS_CHECK(cublasDscal(blas_handle, m, &inv_eigenvector_norm,
-                                 next_eigenvector_d, 1));
-
-        CUSPARSE_CHECK(cusparseSpMV(sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                    &one, matAT, vecNextEigen, &zero, vecDual,
-                                    CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, dBufferAT));
-
-        CUSPARSE_CHECK(cusparseSpMV(sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                    &one, matA, vecDual, &zero, vecEigen,
-                                    CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, dBufferA));
-
-        CUBLAS_CHECK(cublasDdot(blas_handle, m, next_eigenvector_d, 1,
-                                eigenvector_d, 1, &sigma_max_sq));
+        CUBLAS_CHECK(cublasDscal(blas_handle, m, &inv_eigenvector_norm, next_eigenvector_d, 1));
+
+        CUSPARSE_CHECK(cusparseSpMV(sparse_handle,
+                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                    &one,
+                                    matAT,
+                                    vecNextEigen,
+                                    &zero,
+                                    vecDual,
+                                    CUDA_R_64F,
+                                    CUSPARSE_SPMV_CSR_ALG2,
+                                    dBufferAT));
+
+        CUSPARSE_CHECK(cusparseSpMV(sparse_handle,
+                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                    &one,
+                                    matA,
+                                    vecDual,
+                                    &zero,
+                                    vecEigen,
+                                    CUDA_R_64F,
+                                    CUSPARSE_SPMV_CSR_ALG2,
+                                    dBufferA));
+
+        CUBLAS_CHECK(cublasDdot(blas_handle, m, next_eigenvector_d, 1, eigenvector_d, 1, &sigma_max_sq));
 
         double neg_sigma_sq = -sigma_max_sq;
-        CUBLAS_CHECK(
-            cublasDscal(blas_handle, m, &neg_sigma_sq, next_eigenvector_d, 1));
-        CUBLAS_CHECK(cublasDaxpy(blas_handle, m, &one, eigenvector_d, 1,
-                                 next_eigenvector_d, 1));
+        CUBLAS_CHECK(cublasDscal(blas_handle, m, &neg_sigma_sq, next_eigenvector_d, 1));
+        CUBLAS_CHECK(cublasDaxpy(blas_handle, m, &one, eigenvector_d, 1, next_eigenvector_d, 1));
 
         double residual_norm;
-        CUBLAS_CHECK(cublasDnrm2_v2_64(blas_handle, m, next_eigenvector_d, 1,
-                                       &residual_norm));
+        CUBLAS_CHECK(cublasDnrm2_v2_64(blas_handle, m, next_eigenvector_d, 1, &residual_norm));
 
         if (residual_norm < tolerance)
             break;
@@ -176,22 +209,23 @@ double estimate_maximum_singular_value(cusparseHandle_t sparse_handle,
     return sqrt(sigma_max_sq);
 }
 
-void compute_interaction_and_movement(pdhg_solver_state_t *state,
-                                      double *interaction, double *movement)
+void compute_interaction_and_movement(pdhg_solver_state_t *state, double *interaction, double *movement)
 {
     double dual_norm, primal_norm, cross_term;
 
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints,
-                                   state->delta_dual_solution, 1, &dual_norm));
-    CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables,
-                                   state->delta_primal_solution, 1,
-                                   &primal_norm));
-    *movement = 0.5 * (primal_norm * primal_norm * state->primal_weight +
-                       dual_norm * dual_norm / state->primal_weight);
-
-    CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables,
-                            state->dual_product, 1, state->delta_primal_solution,
-                            1, &cross_term));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, state->delta_dual_solution, 1, &dual_norm));
+    CUBLAS_CHECK(
+        cublasDnrm2_v2_64(state->blas_handle, state->num_variables, state->delta_primal_solution, 1, &primal_norm));
+    *movement = 0.5 * (primal_norm * primal_norm * state->primal_weight + dual_norm * dual_norm / state->primal_weight);
+
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            state->dual_product,
+                            1,
+                            state->delta_primal_solution,
+                            1,
+                            &cross_term));
     *interaction = fabs(cross_term);
 }
 
@@ -199,37 +233,34 @@ const char *termination_reason_to_string(termination_reason_t reason)
 {
     switch (reason)
     {
-    case TERMINATION_REASON_OPTIMAL:
-        return "OPTIMAL";
-    case TERMINATION_REASON_PRIMAL_INFEASIBLE:
-        return "PRIMAL_INFEASIBLE";
-    case TERMINATION_REASON_DUAL_INFEASIBLE:
-        return "DUAL_INFEASIBLE";
-    case TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED:
-        return "INFEASIBLE_OR_UNBOUNDED";
-    case TERMINATION_REASON_TIME_LIMIT:
-        return "TIME_LIMIT";
-    case TERMINATION_REASON_ITERATION_LIMIT:
-        return "ITERATION_LIMIT";
-    case TERMINATION_REASON_UNSPECIFIED:
-        return "UNSPECIFIED";
-    case TERMINATION_REASON_FEAS_POLISH_SUCCESS:
-        return "FEAS_POLISH_SUCCESS";
-    default:
-        return "UNKNOWN";
+        case TERMINATION_REASON_OPTIMAL:
+            return "OPTIMAL";
+        case TERMINATION_REASON_PRIMAL_INFEASIBLE:
+            return "PRIMAL_INFEASIBLE";
+        case TERMINATION_REASON_DUAL_INFEASIBLE:
+            return "DUAL_INFEASIBLE";
+        case TERMINATION_REASON_INFEASIBLE_OR_UNBOUNDED:
+            return "INFEASIBLE_OR_UNBOUNDED";
+        case TERMINATION_REASON_TIME_LIMIT:
+            return "TIME_LIMIT";
+        case TERMINATION_REASON_ITERATION_LIMIT:
+            return "ITERATION_LIMIT";
+        case TERMINATION_REASON_UNSPECIFIED:
+            return "UNSPECIFIED";
+        case TERMINATION_REASON_FEAS_POLISH_SUCCESS:
+            return "FEAS_POLISH_SUCCESS";
+        default:
+            return "UNKNOWN";
     }
 }
 
-bool optimality_criteria_met(const pdhg_solver_state_t *state,
-                             double rel_opt_tol, double rel_feas_tol)
+bool optimality_criteria_met(const pdhg_solver_state_t *state, double rel_opt_tol, double rel_feas_tol)
 {
-    return state->relative_dual_residual < rel_feas_tol &&
-            state->relative_primal_residual < rel_feas_tol &&
-            state->relative_objective_gap < rel_opt_tol;
+    return state->relative_dual_residual < rel_feas_tol && state->relative_primal_residual < rel_feas_tol &&
+        state->relative_objective_gap < rel_opt_tol;
 }
 
-bool primal_infeasibility_criteria_met(const pdhg_solver_state_t *state,
-                                       double eps)
+bool primal_infeasibility_criteria_met(const pdhg_solver_state_t *state, double eps)
 {
     if (state->dual_ray_objective <= 0.0)
     {
@@ -238,24 +269,19 @@ bool primal_infeasibility_criteria_met(const pdhg_solver_state_t *state,
     return state->max_dual_ray_infeasibility / state->dual_ray_objective <= eps;
 }
 
-bool dual_infeasibility_criteria_met(const pdhg_solver_state_t *state,
-                                     double eps)
+bool dual_infeasibility_criteria_met(const pdhg_solver_state_t *state, double eps)
 {
     if (state->primal_ray_linear_objective >= 0.0)
     {
         return false;
     }
-    return state->max_primal_ray_infeasibility /
-               (-state->primal_ray_linear_objective) <=
-           eps;
+    return state->max_primal_ray_infeasibility / (-state->primal_ray_linear_objective) <= eps;
 }
 
-void check_termination_criteria(pdhg_solver_state_t *solver_state,
-                                const termination_criteria_t *criteria)
+void check_termination_criteria(pdhg_solver_state_t *solver_state, const termination_criteria_t *criteria)
 {
     solver_state->cumulative_time_sec = (double)(clock() - solver_state->start_time) / CLOCKS_PER_SEC;
-    if (optimality_criteria_met(solver_state, criteria->eps_optimal_relative,
-                                criteria->eps_feasible_relative))
+    if (optimality_criteria_met(solver_state, criteria->eps_optimal_relative, criteria->eps_feasible_relative))
     {
         solver_state->termination_reason = TERMINATION_REASON_OPTIMAL;
         return;
@@ -284,24 +310,19 @@ bool should_do_adaptive_restart(pdhg_solver_state_t *solver_state,
     else if (solver_state->total_count > termination_evaluation_frequency)
     {
         if (solver_state->fixed_point_error <=
-            restart_params->sufficient_reduction_for_restart *
-                solver_state->initial_fixed_point_error)
+            restart_params->sufficient_reduction_for_restart * solver_state->initial_fixed_point_error)
         {
             do_restart = true;
         }
         if (solver_state->fixed_point_error <=
-            restart_params->necessary_reduction_for_restart *
-                solver_state->initial_fixed_point_error)
+            restart_params->necessary_reduction_for_restart * solver_state->initial_fixed_point_error)
         {
-            if (solver_state->fixed_point_error >
-                solver_state->last_trial_fixed_point_error)
+            if (solver_state->fixed_point_error > solver_state->last_trial_fixed_point_error)
             {
                 do_restart = true;
             }
         }
-        if (solver_state->inner_count >=
-            restart_params->artificial_restart_threshold *
-                solver_state->total_count)
+        if (solver_state->inner_count >= restart_params->artificial_restart_threshold * solver_state->total_count)
         {
             do_restart = true;
         }
@@ -421,33 +442,40 @@ static void filter_constraint_matrix_entries(lp_problem_t *problem, const pdhg_p
     if (((nnz - filtered_nnz) > 0) && params->verbose)
     {
         printf("Dropped %d near-zero %s (|value| <= %.1e) from the constraint matrix\n",
-                nnz - filtered_nnz, (nnz - filtered_nnz == 1 ? "entry" : "entries"), params->matrix_zero_tol);
+               nnz - filtered_nnz,
+               (nnz - filtered_nnz == 1 ? "entry" : "entries"),
+               params->matrix_zero_tol);
     }
 }
 
-#define PRINT_DIFF_INT(name, current, default_val) \
-    do { \
-        if ((current) != (default_val)) { \
-            printf("  %-18s : %d\n", name, current); \
-        } \
-    } while(0)
-
-#define PRINT_DIFF_DBL(name, current, default_val) \
-    do { \
-        if (fabs((current) - (default_val)) > 1e-9) { \
-            printf("  %-18s : %.1e\n", name, (double)(current)); \
-        } \
-    } while(0)
-
-#define PRINT_DIFF_BOOL(name, current, default_val) \
-    do { \
-        if ((current) != (default_val)) { \
-            printf("  %-18s : %s\n", name, (current) ? "on" : "off"); \
-        } \
-    } while(0)
-
-void print_initial_info(const pdhg_parameters_t *params,
-                        lp_problem_t *problem)
+#define PRINT_DIFF_INT(name, current, default_val)                                                                     \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        if ((current) != (default_val))                                                                                \
+        {                                                                                                              \
+            printf("  %-18s : %d\n", name, current);                                                                   \
+        }                                                                                                              \
+    } while (0)
+
+#define PRINT_DIFF_DBL(name, current, default_val)                                                                     \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        if (fabs((current) - (default_val)) > 1e-9)                                                                    \
+        {                                                                                                              \
+            printf("  %-18s : %.1e\n", name, (double)(current));                                                       \
+        }                                                                                                              \
+    } while (0)
+
+#define PRINT_DIFF_BOOL(name, current, default_val)                                                                    \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        if ((current) != (default_val))                                                                                \
+        {                                                                                                              \
+            printf("  %-18s : %s\n", name, (current) ? "on" : "off");                                                  \
+        }                                                                                                              \
+    } while (0)
+
+void print_initial_info(const pdhg_parameters_t *params, lp_problem_t *problem)
 {
     pdhg_parameters_t default_params;
     set_default_parameters(&default_params);
@@ -468,64 +496,43 @@ void print_initial_info(const pdhg_parameters_t *params,
            "------------------\n");
 
     filter_constraint_matrix_entries(problem, params);
-    printf("Problem: %d rows, %d columns, %d nonzeros\n", problem->num_constraints, problem->num_variables, problem->constraint_matrix_num_nonzeros);
+    printf("Problem: %d rows, %d columns, %d nonzeros\n",
+           problem->num_constraints,
+           problem->num_variables,
+           problem->constraint_matrix_num_nonzeros);
 
     printf("Settings:\n");
-    printf("  iter_limit         : %d\n",
-           params->termination_criteria.iteration_limit);
-    printf("  time_limit         : %.2f sec\n",
-           params->termination_criteria.time_sec_limit);
-    printf("  eps_opt            : %.1e\n",
-           params->termination_criteria.eps_optimal_relative);
-    printf("  eps_feas           : %.1e\n",
-           params->termination_criteria.eps_feasible_relative);
-    if (params->optimality_norm != default_params.optimality_norm) {
-        printf("  optimality_norm    : %s\n",
-               params->optimality_norm == NORM_TYPE_L_INF ? "L_inf" : "L2");
-    }
-
-    PRINT_DIFF_INT("l_inf_ruiz_iter",
-                   params->l_inf_ruiz_iterations, 
-                   default_params.l_inf_ruiz_iterations);
-    PRINT_DIFF_DBL("pock_chambolle_alpha",
-                   params->pock_chambolle_alpha, 
-                   default_params.pock_chambolle_alpha);
-    PRINT_DIFF_BOOL("has_pock_chambolle_alpha",
-                    params->has_pock_chambolle_alpha, 
-                    default_params.has_pock_chambolle_alpha);
-    PRINT_DIFF_BOOL("bound_obj_rescaling",
-                    params->bound_objective_rescaling, 
-                    default_params.bound_objective_rescaling);
-    PRINT_DIFF_INT("sv_max_iter",
-                   params->sv_max_iter, 
-                   default_params.sv_max_iter);
-    PRINT_DIFF_DBL("sv_tol",
-                   params->sv_tol, 
-                   default_params.sv_tol);
-    PRINT_DIFF_INT("evaluation_freq", 
-                   params->termination_evaluation_frequency, 
-                   default_params.termination_evaluation_frequency);
-    PRINT_DIFF_BOOL("feasibility_polishing",
-                    params->feasibility_polishing, 
-                    default_params.feasibility_polishing);
+    printf("  iter_limit         : %d\n", params->termination_criteria.iteration_limit);
+    printf("  time_limit         : %.2f sec\n", params->termination_criteria.time_sec_limit);
+    printf("  eps_opt            : %.1e\n", params->termination_criteria.eps_optimal_relative);
+    printf("  eps_feas           : %.1e\n", params->termination_criteria.eps_feasible_relative);
+    if (params->optimality_norm != default_params.optimality_norm)
+    {
+        printf("  optimality_norm    : %s\n", params->optimality_norm == NORM_TYPE_L_INF ? "L_inf" : "L2");
+    }
+
+    PRINT_DIFF_INT("l_inf_ruiz_iter", params->l_inf_ruiz_iterations, default_params.l_inf_ruiz_iterations);
+    PRINT_DIFF_DBL("pock_chambolle_alpha", params->pock_chambolle_alpha, default_params.pock_chambolle_alpha);
+    PRINT_DIFF_BOOL(
+        "has_pock_chambolle_alpha", params->has_pock_chambolle_alpha, default_params.has_pock_chambolle_alpha);
+    PRINT_DIFF_BOOL("bound_obj_rescaling", params->bound_objective_rescaling, default_params.bound_objective_rescaling);
+    PRINT_DIFF_INT("sv_max_iter", params->sv_max_iter, default_params.sv_max_iter);
+    PRINT_DIFF_DBL("sv_tol", params->sv_tol, default_params.sv_tol);
+    PRINT_DIFF_INT(
+        "evaluation_freq", params->termination_evaluation_frequency, default_params.termination_evaluation_frequency);
+    PRINT_DIFF_BOOL("feasibility_polishing", params->feasibility_polishing, default_params.feasibility_polishing);
     PRINT_DIFF_DBL("eps_feas_polish_relative",
                    params->termination_criteria.eps_feas_polish_relative,
                    default_params.termination_criteria.eps_feas_polish_relative);
-    PRINT_DIFF_BOOL("presolve",
-                    params->presolve,
-                    default_params.presolve);
-    PRINT_DIFF_DBL("matrix_zero_tol",
-                   params->matrix_zero_tol,
-                   default_params.matrix_zero_tol);
+    PRINT_DIFF_BOOL("presolve", params->presolve, default_params.presolve);
+    PRINT_DIFF_DBL("matrix_zero_tol", params->matrix_zero_tol, default_params.matrix_zero_tol);
 }
 
 #undef PRINT_DIFF_INT
 #undef PRINT_DIFF_DBL
 #undef PRINT_DIFF_BOOL
 
-void pdhg_final_log(
-    const cupdlpx_result_t *result,
-    const pdhg_parameters_t *params)
+void pdhg_final_log(const cupdlpx_result_t *result, const pdhg_parameters_t *params)
 {
     if (params->verbose)
     {
@@ -583,11 +590,16 @@ void display_iteration_stats(const pdhg_solver_state_t *state, bool verbose)
     if (state->total_count % get_print_frequency(state->total_count) == 0)
     {
         printf("%6d %.1e | %8.1e  %8.1e | %.1e %.1e %.1e | %.1e %.1e %.1e \n",
-               state->total_count, state->cumulative_time_sec,
-               state->primal_objective_value, state->dual_objective_value,
-               state->absolute_primal_residual, state->absolute_dual_residual,
-               state->objective_gap, state->relative_primal_residual,
-               state->relative_dual_residual, state->relative_objective_gap);
+               state->total_count,
+               state->cumulative_time_sec,
+               state->primal_objective_value,
+               state->dual_objective_value,
+               state->absolute_primal_residual,
+               state->absolute_dual_residual,
+               state->objective_gap,
+               state->relative_primal_residual,
+               state->relative_dual_residual,
+               state->relative_objective_gap);
     }
 }
 
@@ -604,51 +616,45 @@ int get_print_frequency(int iter)
     return step;
 }
 
-__global__ void compute_residual_kernel(
-    double *__restrict__ primal_residual,
-    const double *__restrict__ primal_product,
-    const double *__restrict__ constraint_lower_bound,
-    const double *__restrict__ constraint_upper_bound,
-    const double *__restrict__ dual_solution,
-    double *__restrict__ dual_residual,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ dual_slack,
-    const double *__restrict__ objective_vector,
-    const double *__restrict__ constraint_rescaling,
-    const double *__restrict__ variable_rescaling,
-    double *__restrict__ dual_obj_contribution,
-    const double *__restrict__ const_lb_finite,
-    const double *__restrict__ const_ub_finite,
-    int num_constraints, int num_variables)
+__global__ void compute_residual_kernel(double *__restrict__ primal_residual,
+                                        const double *__restrict__ primal_product,
+                                        const double *__restrict__ constraint_lower_bound,
+                                        const double *__restrict__ constraint_upper_bound,
+                                        const double *__restrict__ dual_solution,
+                                        double *__restrict__ dual_residual,
+                                        const double *__restrict__ dual_product,
+                                        const double *__restrict__ dual_slack,
+                                        const double *__restrict__ objective_vector,
+                                        const double *__restrict__ constraint_rescaling,
+                                        const double *__restrict__ variable_rescaling,
+                                        double *__restrict__ dual_obj_contribution,
+                                        const double *__restrict__ const_lb_finite,
+                                        const double *__restrict__ const_ub_finite,
+                                        int num_constraints,
+                                        int num_variables)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i < num_constraints)
     {
 
-        double clamped_val =
-            fmax(constraint_lower_bound[i],
-                 fmin(primal_product[i], constraint_upper_bound[i]));
-        primal_residual[i] =
-            (primal_product[i] - clamped_val) * constraint_rescaling[i];
+        double clamped_val = fmax(constraint_lower_bound[i], fmin(primal_product[i], constraint_upper_bound[i]));
+        primal_residual[i] = (primal_product[i] - clamped_val) * constraint_rescaling[i];
 
         dual_obj_contribution[i] =
-            fmax(dual_solution[i], 0.0) * const_lb_finite[i] +
-            fmin(dual_solution[i], 0.0) * const_ub_finite[i];
+            fmax(dual_solution[i], 0.0) * const_lb_finite[i] + fmin(dual_solution[i], 0.0) * const_ub_finite[i];
     }
     else if (i < num_constraints + num_variables)
     {
         int idx = i - num_constraints;
-        dual_residual[idx] =
-            (objective_vector[idx] - dual_product[idx] - dual_slack[idx]) *
-            variable_rescaling[idx];
+        dual_residual[idx] = (objective_vector[idx] - dual_product[idx] - dual_slack[idx]) * variable_rescaling[idx];
     }
 }
 
-__global__ void primal_infeasibility_project_kernel(
-    double *__restrict__ primal_ray_estimate,
-    const double *__restrict__ variable_lower_bound,
-    const double *__restrict__ variable_upper_bound, int num_variables)
+__global__ void primal_infeasibility_project_kernel(double *__restrict__ primal_ray_estimate,
+                                                    const double *__restrict__ variable_lower_bound,
+                                                    const double *__restrict__ variable_upper_bound,
+                                                    int num_variables)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < num_variables)
@@ -664,10 +670,10 @@ __global__ void primal_infeasibility_project_kernel(
     }
 }
 
-__global__ void dual_infeasibility_project_kernel(
-    double *__restrict__ dual_ray_estimate,
-    const double *__restrict__ constraint_lower_bound,
-    const double *__restrict__ constraint_upper_bound, int num_constraints)
+__global__ void dual_infeasibility_project_kernel(double *__restrict__ dual_ray_estimate,
+                                                  const double *__restrict__ constraint_lower_bound,
+                                                  const double *__restrict__ constraint_upper_bound,
+                                                  int num_constraints)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < num_constraints)
@@ -683,46 +689,45 @@ __global__ void dual_infeasibility_project_kernel(
     }
 }
 
-__global__ void compute_primal_infeasibility_kernel(
-    const double *__restrict__ primal_product,
-    const double *__restrict__ const_lb,
-    const double *__restrict__ const_ub, int num_constraints,
-    double *__restrict__ primal_infeasibility,
-    const double *__restrict__ constraint_rescaling)
+__global__ void compute_primal_infeasibility_kernel(const double *__restrict__ primal_product,
+                                                    const double *__restrict__ const_lb,
+                                                    const double *__restrict__ const_ub,
+                                                    int num_constraints,
+                                                    double *__restrict__ primal_infeasibility,
+                                                    const double *__restrict__ constraint_rescaling)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < num_constraints)
     {
         double pp_val = primal_product[i];
-        primal_infeasibility[i] = (fmax(0.0, -pp_val) * isfinite(const_lb[i]) +
-                                   fmax(0.0, pp_val) * isfinite(const_ub[i])) *
-                                  constraint_rescaling[i];
+        primal_infeasibility[i] =
+            (fmax(0.0, -pp_val) * isfinite(const_lb[i]) + fmax(0.0, pp_val) * isfinite(const_ub[i])) *
+            constraint_rescaling[i];
     }
 }
 
-__global__ void
-compute_dual_infeasibility_kernel(const double *dual_product,
-                                  const double *__restrict__ var_lb,
-                                  const double *__restrict__ var_ub,
-                                  int num_variables,
-                                  double *__restrict__ dual_infeasibility,
-                                  const double *__restrict__ variable_rescaling)
+__global__ void compute_dual_infeasibility_kernel(const double *dual_product,
+                                                  const double *__restrict__ var_lb,
+                                                  const double *__restrict__ var_ub,
+                                                  int num_variables,
+                                                  double *__restrict__ dual_infeasibility,
+                                                  const double *__restrict__ variable_rescaling)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < num_variables)
     {
         double dp_val = -dual_product[i];
-        dual_infeasibility[i] = (fmax(0.0, dp_val) * !isfinite(var_lb[i]) -
-                                 fmin(0.0, dp_val) * !isfinite(var_ub[i])) *
-                                variable_rescaling[i];
+        dual_infeasibility[i] = (fmax(0.0, dp_val) * !isfinite(var_lb[i]) - fmin(0.0, dp_val) * !isfinite(var_ub[i])) *
+            variable_rescaling[i];
     }
 }
 
-__global__ void dual_solution_dual_objective_contribution_kernel(
-    const double *__restrict__ constraint_lower_bound_finite_val,
-    const double *__restrict__ constraint_upper_bound_finite_val,
-    const double *__restrict__ dual_solution, int num_constraints,
-    double *__restrict__ dual_objective_dual_solution_contribution_array)
+__global__ void
+dual_solution_dual_objective_contribution_kernel(const double *__restrict__ constraint_lower_bound_finite_val,
+                                                 const double *__restrict__ constraint_upper_bound_finite_val,
+                                                 const double *__restrict__ dual_solution,
+                                                 int num_constraints,
+                                                 double *__restrict__ dual_objective_dual_solution_contribution_array)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -734,12 +739,12 @@ __global__ void dual_solution_dual_objective_contribution_kernel(
     }
 }
 
-__global__ void dual_objective_dual_slack_contribution_array_kernel(
-    const double *__restrict__ dual_slack,
-    double *__restrict__ dual_objective_dual_slack_contribution_array,
-    const double *__restrict__ variable_lower_bound_finite_val,
-    const double *__restrict__ variable_upper_bound_finite_val,
-    int num_variables)
+__global__ void
+dual_objective_dual_slack_contribution_array_kernel(const double *__restrict__ dual_slack,
+                                                    double *__restrict__ dual_objective_dual_slack_contribution_array,
+                                                    const double *__restrict__ variable_lower_bound_finite_val,
+                                                    const double *__restrict__ variable_upper_bound_finite_val,
+                                                    int num_variables)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -751,8 +756,7 @@ __global__ void dual_objective_dual_slack_contribution_array_kernel(
     }
 }
 
-static double get_vector_inf_norm(cublasHandle_t handle, int n,
-                                  const double *x_d)
+static double get_vector_inf_norm(cublasHandle_t handle, int n, const double *x_d)
 {
     if (n <= 0)
         return 0.0;
@@ -761,13 +765,11 @@ static double get_vector_inf_norm(cublasHandle_t handle, int n,
     cublasIdamax(handle, n, x_d, 1, &index);
     double max_val;
 
-    CUDA_CHECK(cudaMemcpy(&max_val, x_d + (index - 1), sizeof(double),
-                          cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpy(&max_val, x_d + (index - 1), sizeof(double), cudaMemcpyDeviceToHost));
     return fabs(max_val);
 }
 
-static double get_vector_sum(cublasHandle_t handle, int n, double *ones_d,
-                             const double *x_d)
+static double get_vector_sum(cublasHandle_t handle, int n, double *ones_d, const double *x_d)
 {
     if (n <= 0)
         return 0.0;
@@ -784,166 +786,203 @@ void compute_residual(pdhg_solver_state_t *state, norm_type_t optimality_norm)
     cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product);
     cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product);
 
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
-
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matA,
+                                state->vec_primal_sol,
+                                &HOST_ZERO,
+                                state->vec_primal_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->primal_spmv_buffer));
+
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matAt,
+                                state->vec_dual_sol,
+                                &HOST_ZERO,
+                                state->vec_dual_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->dual_spmv_buffer));
 
     compute_residual_kernel<<<state->num_blocks_primal_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->primal_residual, state->primal_product,
-        state->constraint_lower_bound, state->constraint_upper_bound,
-        state->pdhg_dual_solution, state->dual_residual, state->dual_product,
-        state->dual_slack, state->objective_vector, state->constraint_rescaling,
-        state->variable_rescaling, state->primal_slack,
+        state->primal_residual,
+        state->primal_product,
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->pdhg_dual_solution,
+        state->dual_residual,
+        state->dual_product,
+        state->dual_slack,
+        state->objective_vector,
+        state->constraint_rescaling,
+        state->variable_rescaling,
+        state->primal_slack,
         state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val, state->num_constraints,
+        state->constraint_upper_bound_finite_val,
+        state->num_constraints,
         state->num_variables);
 
-    if (optimality_norm == NORM_TYPE_L_INF) {
-        state->absolute_primal_residual = get_vector_inf_norm(state->blas_handle, 
-                                          state->num_constraints, state->primal_residual);
-    } else {
-        CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, 
-                                       state->primal_residual, 1, 
-                                       &state->absolute_primal_residual));
+    if (optimality_norm == NORM_TYPE_L_INF)
+    {
+        state->absolute_primal_residual =
+            get_vector_inf_norm(state->blas_handle, state->num_constraints, state->primal_residual);
+    }
+    else
+    {
+        CUBLAS_CHECK(cublasDnrm2_v2_64(
+            state->blas_handle, state->num_constraints, state->primal_residual, 1, &state->absolute_primal_residual));
     }
 
-
-    if (optimality_norm == NORM_TYPE_L_INF) {
-        state->absolute_dual_residual = get_vector_inf_norm(state->blas_handle, 
-                                        state->num_variables, state->dual_residual);
-    } else {
+    if (optimality_norm == NORM_TYPE_L_INF)
+    {
+        state->absolute_dual_residual =
+            get_vector_inf_norm(state->blas_handle, state->num_variables, state->dual_residual);
+    }
+    else
+    {
         state->absolute_primal_residual /= state->constraint_bound_rescaling;
-        CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables,
-                                       state->dual_residual, 1,
-                                       &state->absolute_dual_residual));
+        CUBLAS_CHECK(cublasDnrm2_v2_64(
+            state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual));
     }
 
     state->absolute_dual_residual /= state->objective_vector_rescaling;
 
-    CUBLAS_CHECK(cublasDdot(
-        state->blas_handle, state->num_variables, state->objective_vector, 1,
-        state->pdhg_primal_solution, 1, &state->primal_objective_value));
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            state->objective_vector,
+                            1,
+                            state->pdhg_primal_solution,
+                            1,
+                            &state->primal_objective_value));
     state->primal_objective_value =
-        state->primal_objective_value / (state->constraint_bound_rescaling *
-                                         state->objective_vector_rescaling) +
+        state->primal_objective_value / (state->constraint_bound_rescaling * state->objective_vector_rescaling) +
         state->objective_constant;
 
     double base_dual_objective;
-    CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables,
-                            state->dual_slack, 1, state->pdhg_primal_solution, 1,
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            state->dual_slack,
+                            1,
+                            state->pdhg_primal_solution,
+                            1,
                             &base_dual_objective));
     double dual_slack_sum =
-        get_vector_sum(state->blas_handle, state->num_constraints,
-                       state->ones_dual_d, state->primal_slack);
+        get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack);
     state->dual_objective_value = (base_dual_objective + dual_slack_sum) /
-                                      (state->constraint_bound_rescaling *
-                                       state->objective_vector_rescaling) +
-                                  state->objective_constant;
+            (state->constraint_bound_rescaling * state->objective_vector_rescaling) +
+        state->objective_constant;
+
+    state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm);
 
-    state->relative_primal_residual = 
-        state->absolute_primal_residual / (1.0 + state->constraint_bound_norm);
-    
-    state->relative_dual_residual =
-        state->absolute_dual_residual / (1.0 + state->objective_vector_norm);
+    state->relative_dual_residual = state->absolute_dual_residual / (1.0 + state->objective_vector_norm);
 
-    state->objective_gap =
-        fabs(state->primal_objective_value - state->dual_objective_value);
+    state->objective_gap = fabs(state->primal_objective_value - state->dual_objective_value);
 
     state->relative_objective_gap =
-        state->objective_gap / (1.0 + fabs(state->primal_objective_value) +
-                                fabs(state->dual_objective_value));
+        state->objective_gap / (1.0 + fabs(state->primal_objective_value) + fabs(state->dual_objective_value));
 }
 
 void compute_infeasibility_information(pdhg_solver_state_t *state)
 {
-    primal_infeasibility_project_kernel<<<state->num_blocks_primal,
-                                          THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->delta_primal_solution, state->variable_lower_bound,
-        state->variable_upper_bound, state->num_variables);
-    dual_infeasibility_project_kernel<<<state->num_blocks_dual,
-                                        THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->delta_dual_solution, state->constraint_lower_bound,
-        state->constraint_upper_bound, state->num_constraints);
-
-    double primal_ray_inf_norm = get_vector_inf_norm(
-        state->blas_handle, state->num_variables, state->delta_primal_solution);
+    primal_infeasibility_project_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->delta_primal_solution, state->variable_lower_bound, state->variable_upper_bound, state->num_variables);
+    dual_infeasibility_project_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->delta_dual_solution,
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->num_constraints);
+
+    double primal_ray_inf_norm =
+        get_vector_inf_norm(state->blas_handle, state->num_variables, state->delta_primal_solution);
     if (primal_ray_inf_norm > 0.0)
     {
         double scale = 1.0 / primal_ray_inf_norm;
-        cublasDscal(state->blas_handle, state->num_variables, &scale,
-                    state->delta_primal_solution, 1);
-    }
-    double dual_ray_inf_norm = get_vector_inf_norm(
-        state->blas_handle, state->num_constraints, state->delta_dual_solution);
-
-    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol,
-                                          state->delta_primal_solution));
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution));
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product));
-    CUSPARSE_CHECK(
-        cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
-
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
-
-    CUSPARSE_CHECK(cusparseSpMV(
-        state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE,
-        state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod,
-        CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
-
-    CUBLAS_CHECK(cublasDdot(
-        state->blas_handle, state->num_variables, state->objective_vector, 1,
-        state->delta_primal_solution, 1, &state->primal_ray_linear_objective));
-    state->primal_ray_linear_objective /=
-        (state->constraint_bound_rescaling * state->objective_vector_rescaling);
-
-    dual_solution_dual_objective_contribution_kernel<<<state->num_blocks_dual,
-                                                       THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->constraint_lower_bound_finite_val,
-        state->constraint_upper_bound_finite_val, state->delta_dual_solution,
-        state->num_constraints, state->primal_slack);
+        cublasDscal(state->blas_handle, state->num_variables, &scale, state->delta_primal_solution, 1);
+    }
+    double dual_ray_inf_norm =
+        get_vector_inf_norm(state->blas_handle, state->num_constraints, state->delta_dual_solution);
+
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->delta_primal_solution));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->delta_dual_solution));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product));
+    CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
 
-    dual_objective_dual_slack_contribution_array_kernel<<<
-        state->num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->dual_product, state->dual_slack,
-        state->variable_lower_bound_finite_val,
-        state->variable_upper_bound_finite_val, state->num_variables);
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matA,
+                                state->vec_primal_sol,
+                                &HOST_ZERO,
+                                state->vec_primal_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->primal_spmv_buffer));
+
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matAt,
+                                state->vec_dual_sol,
+                                &HOST_ZERO,
+                                state->vec_dual_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->dual_spmv_buffer));
+
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            state->objective_vector,
+                            1,
+                            state->delta_primal_solution,
+                            1,
+                            &state->primal_ray_linear_objective));
+    state->primal_ray_linear_objective /= (state->constraint_bound_rescaling * state->objective_vector_rescaling);
+
+    dual_solution_dual_objective_contribution_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->constraint_lower_bound_finite_val,
+        state->constraint_upper_bound_finite_val,
+        state->delta_dual_solution,
+        state->num_constraints,
+        state->primal_slack);
+
+    dual_objective_dual_slack_contribution_array_kernel<<<state->num_blocks_primal,
+                                                          THREADS_PER_BLOCK,
+                                                          0,
+                                                          state->stream>>>(state->dual_product,
+                                                                           state->dual_slack,
+                                                                           state->variable_lower_bound_finite_val,
+                                                                           state->variable_upper_bound_finite_val,
+                                                                           state->num_variables);
 
     double sum_primal_slack =
-        get_vector_sum(state->blas_handle, state->num_constraints,
-                       state->ones_dual_d, state->primal_slack);
+        get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack);
     double sum_dual_slack =
-        get_vector_sum(state->blas_handle, state->num_variables,
-                       state->ones_primal_d, state->dual_slack);
+        get_vector_sum(state->blas_handle, state->num_variables, state->ones_primal_d, state->dual_slack);
     state->dual_ray_objective =
-        (sum_primal_slack + sum_dual_slack) /
-        (state->constraint_bound_rescaling * state->objective_vector_rescaling);
-
-    compute_primal_infeasibility_kernel<<<state->num_blocks_dual,
-                                          THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->primal_product, state->constraint_lower_bound,
-        state->constraint_upper_bound, state->num_constraints,
-        state->primal_slack, state->constraint_rescaling);
-    compute_dual_infeasibility_kernel<<<state->num_blocks_primal,
-                                        THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->dual_product, state->variable_lower_bound,
-        state->variable_upper_bound, state->num_variables, state->dual_slack,
+        (sum_primal_slack + sum_dual_slack) / (state->constraint_bound_rescaling * state->objective_vector_rescaling);
+
+    compute_primal_infeasibility_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->primal_product,
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->num_constraints,
+        state->primal_slack,
+        state->constraint_rescaling);
+    compute_dual_infeasibility_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK, 0, state->stream>>>(
+        state->dual_product,
+        state->variable_lower_bound,
+        state->variable_upper_bound,
+        state->num_variables,
+        state->dual_slack,
         state->variable_rescaling);
 
-    state->max_primal_ray_infeasibility = get_vector_inf_norm(
-        state->blas_handle, state->num_constraints, state->primal_slack);
-    double dual_slack_norm = get_vector_inf_norm(
-        state->blas_handle, state->num_variables, state->dual_slack);
+    state->max_primal_ray_infeasibility =
+        get_vector_inf_norm(state->blas_handle, state->num_constraints, state->primal_slack);
+    double dual_slack_norm = get_vector_inf_norm(state->blas_handle, state->num_variables, state->dual_slack);
     state->max_dual_ray_infeasibility = dual_slack_norm;
 
     double scaling_factor = fmax(dual_ray_inf_norm, dual_slack_norm);
@@ -971,8 +1010,7 @@ void fill_or_copy(double **dst, int n, const double *src, double fill_val)
 }
 
 // convert dense → CSR
-int dense_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
-                 double **vals, int *nnz_out)
+int dense_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out)
 {
     int m = desc->m, n = desc->n;
 
@@ -1011,8 +1049,7 @@ int dense_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
 }
 
 // convert CSC → CSR
-int csc_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
-               double **vals)
+int csc_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals)
 {
     const int m = desc->m, n = desc->n;
     const int *col_ptr = desc->data.csc.col_ptr;
@@ -1069,8 +1106,7 @@ int csc_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
 }
 
 // convert COO → CSR
-int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
-               double **vals)
+int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals)
 {
     const int m = desc->m, n = desc->n;
     const int nnz = desc->data.coo.nnz;
@@ -1125,11 +1161,10 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
     return 0;
 }
 
-void check_feas_polishing_termination_criteria(
-    pdhg_solver_state_t *solver_state,
-    const pdhg_solver_state_t *ori_solver_state,
-    const termination_criteria_t *criteria,
-    bool is_primal_polish)
+void check_feas_polishing_termination_criteria(pdhg_solver_state_t *solver_state,
+                                               const pdhg_solver_state_t *ori_solver_state,
+                                               const termination_criteria_t *criteria,
+                                               bool is_primal_polish)
 {
     solver_state->cumulative_time_sec = (double)(clock() - solver_state->start_time) / CLOCKS_PER_SEC;
     if (is_primal_polish)
@@ -1179,7 +1214,9 @@ void print_initial_feas_polish_info(bool is_primal_polish, const pdhg_parameters
     printf("---------------------------------------------------------------------------------------\n");
 }
 
-void pdhg_feas_polish_final_log(const pdhg_solver_state_t *primal_state, const pdhg_solver_state_t *dual_state, bool verbose)
+void pdhg_feas_polish_final_log(const pdhg_solver_state_t *primal_state,
+                                const pdhg_solver_state_t *dual_state,
+                                bool verbose)
 {
     if (verbose)
     {
@@ -1194,7 +1231,9 @@ void pdhg_feas_polish_final_log(const pdhg_solver_state_t *primal_state, const p
     printf("  Dual Time Usage      : %.3g sec\n", dual_state->cumulative_time_sec);
     printf("  Primal Residual      : %.3e\n", primal_state->relative_primal_residual);
     printf("  Dual Residual        : %.3e\n", dual_state->relative_dual_residual);
-    printf("  Primal Dual Gap      : %.3e\n", fabs(primal_state->primal_objective_value - dual_state->dual_objective_value) / (1.0 + fabs(primal_state->primal_objective_value) + fabs(dual_state->dual_objective_value)));
+    printf("  Primal Dual Gap      : %.3e\n",
+           fabs(primal_state->primal_objective_value - dual_state->dual_objective_value) /
+               (1.0 + fabs(primal_state->primal_objective_value) + fabs(dual_state->dual_objective_value)));
 }
 
 void display_feas_polish_iteration_stats(const pdhg_solver_state_t *state, bool verbose, bool is_primal_polish)
@@ -1226,13 +1265,12 @@ void display_feas_polish_iteration_stats(const pdhg_solver_state_t *state, bool
     }
 }
 
-__global__ void compute_primal_feas_polish_residual_kernel(
-    double *__restrict__ primal_residual,
-    const double *__restrict__ primal_product,
-    const double *__restrict__ constraint_lower_bound,
-    const double *__restrict__ constraint_upper_bound,
-    const double *__restrict__ constraint_rescaling,
-    int num_constraints)
+__global__ void compute_primal_feas_polish_residual_kernel(double *__restrict__ primal_residual,
+                                                           const double *__restrict__ primal_product,
+                                                           const double *__restrict__ constraint_lower_bound,
+                                                           const double *__restrict__ constraint_upper_bound,
+                                                           const double *__restrict__ constraint_rescaling,
+                                                           int num_constraints)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -1244,18 +1282,17 @@ __global__ void compute_primal_feas_polish_residual_kernel(
     }
 }
 
-__global__ void compute_dual_feas_polish_residual_kernel(
-    double *__restrict__ dual_residual,
-    const double *__restrict__ dual_solution,
-    const double *__restrict__ dual_product,
-    const double *__restrict__ dual_slack,
-    const double *__restrict__ objective_vector,
-    const double *__restrict__ variable_rescaling,
-    double *__restrict__ dual_obj_contribution,
-    const double *__restrict__ const_lb_finite,
-    const double *__restrict__ const_ub_finite,
-    int num_variables,
-    int num_constraints)
+__global__ void compute_dual_feas_polish_residual_kernel(double *__restrict__ dual_residual,
+                                                         const double *__restrict__ dual_solution,
+                                                         const double *__restrict__ dual_product,
+                                                         const double *__restrict__ dual_slack,
+                                                         const double *__restrict__ objective_vector,
+                                                         const double *__restrict__ variable_rescaling,
+                                                         double *__restrict__ dual_obj_contribution,
+                                                         const double *__restrict__ const_lb_finite,
+                                                         const double *__restrict__ const_ub_finite,
+                                                         int num_variables,
+                                                         int num_constraints)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -1266,63 +1303,104 @@ __global__ void compute_dual_feas_polish_residual_kernel(
     else if (i < num_constraints + num_variables)
     {
         int idx = i - num_variables;
-        dual_obj_contribution[idx] = fmax(dual_solution[idx], 0.0) * const_lb_finite[idx] + fmin(dual_solution[idx], 0.0) * const_ub_finite[idx];
+        dual_obj_contribution[idx] =
+            fmax(dual_solution[idx], 0.0) * const_lb_finite[idx] + fmin(dual_solution[idx], 0.0) * const_ub_finite[idx];
     }
 }
 
-void compute_primal_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm)
+void compute_primal_feas_polish_residual(pdhg_solver_state_t *state,
+                                         const pdhg_solver_state_t *ori_state,
+                                         norm_type_t optimality_norm)
 {
     CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->pdhg_primal_solution));
     CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_prod, state->primal_product));
 
-    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matA, state->vec_primal_sol, &HOST_ZERO, state->vec_primal_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->primal_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matA,
+                                state->vec_primal_sol,
+                                &HOST_ZERO,
+                                state->vec_primal_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->primal_spmv_buffer));
 
     compute_primal_feas_polish_residual_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK, 0, state->stream>>>(
-        state->primal_residual, state->primal_product, state->constraint_lower_bound,
-        state->constraint_upper_bound, state->constraint_rescaling,
+        state->primal_residual,
+        state->primal_product,
+        state->constraint_lower_bound,
+        state->constraint_upper_bound,
+        state->constraint_rescaling,
         state->num_constraints);
 
-    if (optimality_norm == NORM_TYPE_L_INF) {
-        state->absolute_primal_residual = get_vector_inf_norm(state->blas_handle, 
-                                          state->num_constraints, state->primal_residual);
-    } else {
-        CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_constraints, 
-                                       state->primal_residual, 1, 
-                                       &state->absolute_primal_residual));
+    if (optimality_norm == NORM_TYPE_L_INF)
+    {
+        state->absolute_primal_residual =
+            get_vector_inf_norm(state->blas_handle, state->num_constraints, state->primal_residual);
+    }
+    else
+    {
+        CUBLAS_CHECK(cublasDnrm2_v2_64(
+            state->blas_handle, state->num_constraints, state->primal_residual, 1, &state->absolute_primal_residual));
     }
 
     state->absolute_primal_residual /= state->constraint_bound_rescaling;
 
     state->relative_primal_residual = state->absolute_primal_residual / (1.0 + state->constraint_bound_norm);
 
-    CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, ori_state->objective_vector, 1, state->pdhg_primal_solution, 1, &state->primal_objective_value));
-    state->primal_objective_value = state->primal_objective_value / (state->constraint_bound_rescaling * state->objective_vector_rescaling) + state->objective_constant;
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            ori_state->objective_vector,
+                            1,
+                            state->pdhg_primal_solution,
+                            1,
+                            &state->primal_objective_value));
+    state->primal_objective_value =
+        state->primal_objective_value / (state->constraint_bound_rescaling * state->objective_vector_rescaling) +
+        state->objective_constant;
 }
 
-void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state, norm_type_t optimality_norm)
+void compute_dual_feas_polish_residual(pdhg_solver_state_t *state,
+                                       const pdhg_solver_state_t *ori_state,
+                                       norm_type_t optimality_norm)
 {
     CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->pdhg_dual_solution));
     CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_prod, state->dual_product));
 
-    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &HOST_ONE, state->matAt, state->vec_dual_sol, &HOST_ZERO, state->vec_dual_prod, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, state->dual_spmv_buffer));
+    CUSPARSE_CHECK(cusparseSpMV(state->sparse_handle,
+                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                &HOST_ONE,
+                                state->matAt,
+                                state->vec_dual_sol,
+                                &HOST_ZERO,
+                                state->vec_dual_prod,
+                                CUDA_R_64F,
+                                CUSPARSE_SPMV_CSR_ALG2,
+                                state->dual_spmv_buffer));
 
     compute_dual_feas_polish_residual_kernel<<<state->num_blocks_primal_dual, THREADS_PER_BLOCK>>>(
         state->dual_residual,
         state->pdhg_dual_solution,
         state->dual_product,
-        state->dual_slack, state->objective_vector,
+        state->dual_slack,
+        state->objective_vector,
         state->variable_rescaling,
         state->primal_slack,
         ori_state->constraint_lower_bound_finite_val,
         ori_state->constraint_upper_bound_finite_val,
-        state->num_variables, state->num_constraints);
+        state->num_variables,
+        state->num_constraints);
 
-    if (optimality_norm == NORM_TYPE_L_INF) {
-        state->absolute_dual_residual = get_vector_inf_norm(state->blas_handle, 
-                                        state->num_variables, state->dual_residual);
-    } else {
-        CUBLAS_CHECK(cublasDnrm2_v2_64(state->blas_handle, state->num_variables, 
-                                       state->dual_residual, 1, &state->absolute_dual_residual));
+    if (optimality_norm == NORM_TYPE_L_INF)
+    {
+        state->absolute_dual_residual =
+            get_vector_inf_norm(state->blas_handle, state->num_variables, state->dual_residual);
+    }
+    else
+    {
+        CUBLAS_CHECK(cublasDnrm2_v2_64(
+            state->blas_handle, state->num_variables, state->dual_residual, 1, &state->absolute_dual_residual));
     }
 
     state->absolute_dual_residual /= state->objective_vector_rescaling;
@@ -1330,7 +1408,16 @@ void compute_dual_feas_polish_residual(pdhg_solver_state_t *state, const pdhg_so
     state->relative_dual_residual = state->absolute_dual_residual / (1.0 + state->objective_vector_norm);
 
     double base_dual_objective;
-    CUBLAS_CHECK(cublasDdot(state->blas_handle, state->num_variables, state->dual_slack, 1, ori_state->pdhg_primal_solution, 1, &base_dual_objective));
-    double dual_slack_sum = get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack);
-    state->dual_objective_value = (base_dual_objective + dual_slack_sum) / (state->constraint_bound_rescaling * state->objective_vector_rescaling) + state->objective_constant;
+    CUBLAS_CHECK(cublasDdot(state->blas_handle,
+                            state->num_variables,
+                            state->dual_slack,
+                            1,
+                            ori_state->pdhg_primal_solution,
+                            1,
+                            &base_dual_objective));
+    double dual_slack_sum =
+        get_vector_sum(state->blas_handle, state->num_constraints, state->ones_dual_d, state->primal_slack);
+    state->dual_objective_value = (base_dual_objective + dual_slack_sum) /
+            (state->constraint_bound_rescaling * state->objective_vector_rescaling) +
+        state->objective_constant;
 }
diff --git a/test/test_interface.c b/test/test_interface.c
index ea71345..5a23077 100644
--- a/test/test_interface.c
+++ b/test/test_interface.c
@@ -15,40 +15,41 @@ limitations under the License.
 */
 
 #include "cupdlpx.h"
+#include <math.h>
 #include <stdio.h>
-#include <math.h> 
 
-static void print_vec(const char* name, const double* v, int n) {
+static void print_vec(const char *name, const double *v, int n)
+{
     printf("%s:", name);
-    for (int i = 0; i < n; ++i) printf(" % .6g", v[i]);
+    for (int i = 0; i < n; ++i)
+        printf(" % .6g", v[i]);
     printf("\n");
 }
 
-static void run_once(const char* tag,
-                     const matrix_desc_t* A_desc,
-                     const double* c, const double* l, const double* u)
+static void run_once(const char *tag, const matrix_desc_t *A_desc, const double *c, const double *l, const double *u)
 {
     printf("\n=== %s ===\n", tag);
-    
+
     // build problem
-    lp_problem_t* prob = create_lp_problem(
-        c,       // objective_c
-        A_desc,  // constraint matrix A
-        l,       // constraint lower bound con_lb
-        u,       // constraint upper bound con_ub
-        NULL,    // variable lower bound var_lb (defaults to 0)
-        NULL,    // variable upper bound var_ub (defaults to +inf)
-        NULL     // objective constant c0 (defaults to 0.0)
+    lp_problem_t *prob = create_lp_problem(c,      // objective_c
+                                           A_desc, // constraint matrix A
+                                           l,      // constraint lower bound con_lb
+                                           u,      // constraint upper bound con_ub
+                                           NULL,   // variable lower bound var_lb (defaults to 0)
+                                           NULL,   // variable upper bound var_ub (defaults to +inf)
+                                           NULL    // objective constant c0 (defaults to 0.0)
     );
-    if (!prob) {
+    if (!prob)
+    {
         fprintf(stderr, "[test] create_lp_problem failed for %s.\n", tag);
         return;
     }
 
     // solve
-    cupdlpx_result_t* res = solve_lp_problem(prob, NULL);
+    cupdlpx_result_t *res = solve_lp_problem(prob, NULL);
     lp_problem_free(prob);
-    if (!res) {
+    if (!res)
+    {
         fprintf(stderr, "[test] solve_lp_problem failed for %s.\n", tag);
         return;
     }
@@ -56,38 +57,39 @@ static void run_once(const char* tag,
     // print results
     print_vec("x", res->primal_solution, res->num_variables);
     print_vec("y", res->dual_solution, res->num_constraints);
-    
+
     // free
     cupdlpx_result_free(res);
 }
 
-static void test_warm_start(const char* tag,
-                                      const matrix_desc_t* A_desc,
-                                      const double* c, const double* l, const double* u)
+static void
+test_warm_start(const char *tag, const matrix_desc_t *A_desc, const double *c, const double *l, const double *u)
 {
     printf("\n=== %s (with initial solution) ===\n", tag);
 
     int n = A_desc->n;
     int m = A_desc->m;
 
-    lp_problem_t* prob = create_lp_problem(
-        c, A_desc, l, u, NULL, NULL, NULL
-    );
-    if (!prob) {
+    lp_problem_t *prob = create_lp_problem(c, A_desc, l, u, NULL, NULL, NULL);
+    if (!prob)
+    {
         fprintf(stderr, "[test] create_lp_problem failed for %s.\n", tag);
         return;
     }
 
     // Allocate and set initial solutions (e.g., zeros)
-    double* primal = (double*)malloc(n * sizeof(double));
-    double* dual   = (double*)malloc(m * sizeof(double));
-    for (int i = 0; i < n; ++i) primal[i] = 1.0;
-    for (int i = 0; i < m; ++i) dual[i] = 1.0;
+    double *primal = (double *)malloc(n * sizeof(double));
+    double *dual = (double *)malloc(m * sizeof(double));
+    for (int i = 0; i < n; ++i)
+        primal[i] = 1.0;
+    for (int i = 0; i < m; ++i)
+        dual[i] = 1.0;
 
     set_start_values(prob, primal, dual);
 
-    cupdlpx_result_t* res = solve_lp_problem(prob, NULL);
-    if (!res) {
+    cupdlpx_result_t *res = solve_lp_problem(prob, NULL);
+    if (!res)
+    {
         fprintf(stderr, "[test] solve_lp_problem failed for %s.\n", tag);
         lp_problem_free(prob);
         return;
@@ -96,14 +98,14 @@ static void test_warm_start(const char* tag,
     print_vec("x", res->primal_solution, res->num_variables);
     print_vec("y", res->dual_solution, res->num_constraints);
 
-    
     free(primal);
     free(dual);
     cupdlpx_result_free(res);
     lp_problem_free(prob);
 }
 
-int main() {
+int main()
+{
     // Example: min c^T x
     // s.t. l <= A x <= u, x >= 0
 
@@ -111,11 +113,7 @@ int main() {
     int n = 2; // number of variables
 
     // A as a dense matrix
-    double A[3][2] = {
-        {1.0, 2.0},
-        {0.0, 1.0},
-        {3.0, 2.0}
-    };
+    double A[3][2] = {{1.0, 2.0}, {0.0, 1.0}, {3.0, 2.0}};
 
     // describe A using matrix_desc_t
     matrix_desc_t A_dense;
@@ -132,7 +130,8 @@ int main() {
 
     // describe A using matrix_desc_t
     matrix_desc_t A_csr;
-    A_csr.m = m; A_csr.n = n;
+    A_csr.m = m;
+    A_csr.n = n;
     A_csr.fmt = matrix_csr;
     A_csr.zero_tolerance = 0.0;
     A_csr.data.csr.nnz = 5;
@@ -147,7 +146,8 @@ int main() {
 
     // describe A using matrix_desc_t
     matrix_desc_t A_csc;
-    A_csc.m = m; A_csc.n = n;
+    A_csc.m = m;
+    A_csc.n = n;
     A_csc.fmt = matrix_csc;
     A_csc.zero_tolerance = 0.0;
     A_csc.data.csc.nnz = 5;
@@ -162,7 +162,8 @@ int main() {
 
     // describe A using matrix_desc_t
     matrix_desc_t A_coo;
-    A_coo.m = m; A_coo.n = n;
+    A_coo.m = m;
+    A_coo.n = n;
     A_coo.fmt = matrix_coo;
     A_coo.zero_tolerance = 0.0;
     A_coo.data.coo.nnz = 5;
@@ -174,47 +175,51 @@ int main() {
     double c[2] = {1.0, 1.0};
 
     // l&u: constraintbounds
-    double l[3] = {5.0, -INFINITY, -INFINITY};  // lower bounds
-    double u[3] = {5.0, 2.0, 8.0}; // upper bounds
+    double l[3] = {5.0, -INFINITY, -INFINITY}; // lower bounds
+    double u[3] = {5.0, 2.0, 8.0};             // upper bounds
 
     printf("Objective c = [");
-    for (int j = 0; j < n; j++) {
+    for (int j = 0; j < n; j++)
+    {
         printf(" %g", c[j]);
     }
     printf(" ]\n");
 
     printf("Matrix A:\n");
-    for (int i = 0; i < m; i++) {
+    for (int i = 0; i < m; i++)
+    {
         printf("  [");
-        for (int j = 0; j < n; j++) {
+        for (int j = 0; j < n; j++)
+        {
             printf(" %g", A[i][j]);
         }
         printf(" ]\n");
     }
 
     printf("Constraint bounds (l <= A x <= b):\n");
-    for (int i = 0; i < m; i++) {
+    for (int i = 0; i < m; i++)
+    {
         printf("  row %d: l = %g, u = %g\n", i, l[i], u[i]);
     }
 
-     lp_problem_t* prob = create_lp_problem(
-        c,              // c
-        &A_dense,       // A
-        l,              // con_lb
-        u,              // con_ub
-        NULL,           // var_lb
-        NULL,           // var_ub
-        NULL            // objective_constant
+    lp_problem_t *prob = create_lp_problem(c,        // c
+                                           &A_dense, // A
+                                           l,        // con_lb
+                                           u,        // con_ub
+                                           NULL,     // var_lb
+                                           NULL,     // var_ub
+                                           NULL      // objective_constant
     );
-    if (!prob) {
+    if (!prob)
+    {
         fprintf(stderr, "[test] create_lp_problem failed.\n");
         return 1;
     }
 
     run_once("Test 1: Dense Matrix", &A_dense, c, l, u);
-    run_once("Test 2: CSR Matrix",   &A_csr,   c, l, u);
-    run_once("Test 3: CSC Matrix",   &A_csc,   c, l, u);
-    run_once("Test 4: COO Matrix",   &A_coo,   c, l, u);
+    run_once("Test 2: CSR Matrix", &A_csr, c, l, u);
+    run_once("Test 3: CSC Matrix", &A_csc, c, l, u);
+    run_once("Test 4: COO Matrix", &A_coo, c, l, u);
 
     test_warm_start("Test 5: Dense Matrix", &A_dense, c, l, u);
     test_warm_start("Test 6: CSR Matrix", &A_csr, c, l, u);
@@ -222,4 +227,4 @@ int main() {
     test_warm_start("Test 8: COO Matrix", &A_coo, c, l, u);
 
     return 0;
-}
\ No newline at end of file
+}