From 434f8f0d69212f7a7826d1f69bf702ab587e1145 Mon Sep 17 00:00:00 2001
From: Jonathan Rampersad <rampersad.jonathan@gmail.com>
Date: Wed, 18 Jun 2025 09:19:30 -0400
Subject: [PATCH] FEAT: Dynamically select coefficient dtype and CUDA kernel

---
 src/polysolve/__init__.py | 50 +++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/src/polysolve/__init__.py b/src/polysolve/__init__.py
index 931b2b1..436f56c 100644
--- a/src/polysolve/__init__.py
+++ b/src/polysolve/__init__.py
@@ -12,8 +12,8 @@ try:
 except ImportError:
     _CUPY_AVAILABLE = False
 
-# The CUDA kernel for the fitness function
-_FITNESS_KERNEL = """
+# The CUDA kernels for the fitness function
+_FITNESS_KERNEL_FLOAT = """
 extern "C" __global__ void fitness_kernel(
     const double* coefficients, 
     int num_coefficients, 
@@ -37,6 +37,31 @@ extern "C" __global__ void fitness_kernel(
     }
 }
 """
+_FITNESS_KERNEL_INT = """
+extern "C" __global__ void fitness_kernel(  // one thread per candidate: ranks[idx] = |1 / (p(x_vals[idx]) - y_val)|
+    const long long* coefficients,  // integer polynomial coefficients, highest-degree term first
+    int num_coefficients,  // number of entries read from coefficients
+    const double* x_vals,  // candidate x values, one per thread index
+    double* ranks,  // output: one score written per candidate
+    int size,  // number of candidates; threads with idx >= size write nothing
+    double y_val)  // target value subtracted from the evaluated polynomial
+{
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;  // global 1-D thread index
+    if (idx < size)  // guard threads past the last candidate
+    {
+        double ans = 0;  // running value of the polynomial at x_vals[idx]
+        int lrgst_expo = num_coefficients - 1;  // exponent paired with coefficients[0]
+        for (int i = 0; i < num_coefficients; ++i)
+        {
+            ans += coefficients[i] * pow(x_vals[idx], (double)(lrgst_expo - i));  // long long coefficient is promoted to double here
+        }
+
+        ans -= y_val;  // signed distance from the target value
+        ranks[idx] = (ans == 0) ? 1.7976931348623157e+308 : fabs(1.0 / ans);  // exact hit gets the largest finite double; otherwise closer => larger
+    }
+}
+"""
+
 
 @dataclass
 class GA_Options:
@@ -96,8 +121,14 @@ class Function:
             )
         if coefficients[0] == 0 and self._largest_exponent > 0:
             raise ValueError("The first constant (for the largest exponent) cannot be 0.")
+
+        # int64 only for true integers; np.float32, Fraction, etc. must not truncate.
+        all_int = all(isinstance(c, (int, np.integer)) for c in coefficients)
 
-        self.coefficients = np.array(coefficients, dtype=np.float64)
+        # Integers outside the int64 range fall back to float64 (no OverflowError).
+        fits = all_int and all(-2**63 <= int(c) < 2**63 for c in coefficients)
+        target_dtype = np.int64 if fits else np.float64
+        self.coefficients = np.array(coefficients, dtype=target_dtype)
         self._initialized = True
 
     def _check_initialized(self):
@@ -276,11 +307,16 @@ class Function:
 
     def _solve_x_cuda(self, y_val: float, options: GA_Options) -> np.ndarray:
         """Genetic algorithm implementation using CuPy (GPU/CUDA)."""
-        # Load the raw CUDA kernel
-        fitness_gpu = cupy.RawKernel(_FITNESS_KERNEL, 'fitness_kernel')
         
-        # Move coefficients to GPU
-        d_coefficients = cupy.array(self.coefficients, dtype=cupy.float64)
+        # Check the dtype of our coefficients array
+        if self.coefficients.dtype == np.float64:
+            fitness_gpu = cupy.RawKernel(_FITNESS_KERNEL_FLOAT, 'fitness_kernel')
+            d_coefficients = cupy.array(self.coefficients, dtype=cupy.float64)
+        elif self.coefficients.dtype == np.int64:
+            fitness_gpu = cupy.RawKernel(_FITNESS_KERNEL_INT, 'fitness_kernel')
+            d_coefficients = cupy.array(self.coefficients, dtype=cupy.int64)
+        else:
+            raise TypeError(f"Unsupported dtype for CUDA solver: {self.coefficients.dtype}")
         
         # Create initial random solutions on the GPU
         d_solutions = cupy.random.uniform(