2025-06-18 13:20:18 +00:00
1 changed files with 43 additions and 7 deletions
--- a/src/polysolve/init.py
+++ b/src/polysolve/init.py
@@ -12,8 +12,8 @@ try:
 except ImportError:
    _CUPY_AVAILABLE = False
-# The CUDA kernel for the fitness function
+# The CUDA kernels for the fitness function
-_FITNESS_KERNEL = """
+_FITNESS_KERNEL_FLOAT = """
 extern "C" __global__ void fitness_kernel(
    const double* coefficients, 
    int num_coefficients, 
@@ -37,6 +37,31 @@ extern "C" __global__ void fitness_kernel(
    }
 }
 """
 # CUDA source for the fitness kernel variant that reads its polynomial
 # coefficients as `long long` (64-bit integer) values. Each thread whose
 # index is below `size` evaluates the polynomial at x_vals[idx] (coefficients
 # ordered from the largest exponent down to the constant term), subtracts
 # y_val, and stores fabs(1 / result) in ranks[idx]. When the result is
 # exactly 0 it stores 1.7976931348623157e+308 instead of dividing by zero.
 # NOTE: the text below is compiled at runtime; editing it changes behavior.
 _FITNESS_KERNEL_INT = """
 extern "C" __global__ void fitness_kernel(
    const long long* coefficients, 
    int num_coefficients, 
    const double* x_vals, 
    double* ranks, 
    int size, 
    double y_val)
 {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < size)
    {
        double ans = 0;
        int lrgst_expo = num_coefficients - 1;
        for (int i = 0; i < num_coefficients; ++i)
        {
            ans += coefficients[i] * pow(x_vals[idx], (double)(lrgst_expo - i));
        }
        ans -= y_val;
        ranks[idx] = (ans == 0) ? 1.7976931348623157e+308 : fabs(1.0 / ans);
    }
 }
 """
@dataclass
 class GA_Options:
@@ -96,8 +121,14 @@ class Function:
            )
        if coefficients[0] == 0 and self._largest_exponent > 0:
            raise ValueError("The first constant (for the largest exponent) cannot be 0.")
        # Check if any coefficient is a float
        is_float = any(isinstance(c, float) for c in coefficients)
-        self.coefficients = np.array(coefficients, dtype=np.float64)
+        # Choose the dtype based on the input
        target_dtype = np.float64 if is_float else np.int64
        self.coefficients = np.array(coefficients, dtype=target_dtype)
        self._initialized = True
    def _check_initialized(self):
@@ -276,11 +307,16 @@ class Function:
    def _solve_x_cuda(self, y_val: float, options: GA_Options) -> np.ndarray:
        """Genetic algorithm implementation using CuPy (GPU/CUDA)."""
        # Load the raw CUDA kernel
        fitness_gpu = cupy.RawKernel(_FITNESS_KERNEL, 'fitness_kernel')
-        # Move coefficients to GPU
+        # Check the dtype of our coefficients array
-        d_coefficients = cupy.array(self.coefficients, dtype=cupy.float64)
+        if self.coefficients.dtype == np.float64:
            fitness_gpu = cupy.RawKernel(_FITNESS_KERNEL_FLOAT, 'fitness_kernel')
            d_coefficients = cupy.array(self.coefficients, dtype=cupy.float64)
        elif self.coefficients.dtype == np.int64:
            fitness_gpu = cupy.RawKernel(_FITNESS_KERNEL_INT, 'fitness_kernel')
            d_coefficients = cupy.array(self.coefficients, dtype=cupy.int64)
        else:
            raise TypeError(f"Unsupported dtype for CUDA solver: {self.coefficients.dtype}")
        # Create initial random solutions on the GPU
        d_solutions = cupy.random.uniform(