feat(ga): Implement quality filtering and precision-based clustering (#19)

The previous GA logic returned the "top N" solutions, which led to test failures when the algorithm correctly converged on only one of the possible roots (e.g., returning 1000 variations of -1.0).

This commit fixes the root-finding logic to correctly identify and return *all* unique, high-quality roots:

1. **feat(api):** Adds `root_precision` to `GA_Options`. This new parameter (default: 5) allows the user to control the number of decimal places used when clustering unique roots.
2. **fix(ga):** Replaces the flawed "top N" logic in both `_solve_x_numpy` and `_solve_x_cuda`. The new process is:
   * Dynamically sets a `quality_threshold` based on the user's `root_precision` (e.g., `precision=5` requires a rank > `1e6`, i.e. an absolute error below `1e-6`).
   * Filters the *entire* final population for all solutions that meet this quality threshold.
   * Rounds these high-quality solutions to `root_precision` decimal places.
   * Returns only the `np.unique()` results.

This ensures the solver returns all distinct roots that meet the accuracy requirements, rather than just the top N variations of a single root.

Reviewed-on: #19
Co-authored-by: Jonathan Rampersad <rampersad.jonathan@gmail.com>
Co-committed-by: Jonathan Rampersad <rampersad.jonathan@gmail.com>
2025-10-27 19:26:50 +00:00
parent 962eab5af7
commit 4e46c11f83
3 changed files with 46 additions and 21 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 # --- Core Metadata ---
 name = "polysolve"
-version = "0.4.2"
+version = "0.5.0"
 authors = [
  { name="Jonathan Rampersad", email="jonathan@jono-rams.work" },
 ]
--- a/src/polysolve/init.py
+++ b/src/polysolve/init.py
@@ -49,8 +49,6 @@ class GA_Options:
                           Default: 100.0
        num_of_generations (int): The number of iterations the algorithm will run.
                                  Default: 10
-        sample_size (int): The number of top solutions to *return* at the end.
-                           Default: 1000
        data_size (int): The total number of solutions (population size)
                         generated in each generation. Default: 100000
        mutation_strength (float): The percentage (e.g., 0.01 for 1%) by which
@@ -64,16 +62,21 @@ class GA_Options:
        mutation_ratio (float): The percentage (e.g., 0.40 for 40%) of the next
                                generation to be created by mutating solutions
                                from the parent pool. Default: 0.40
+        root_precision (int): The number of decimal places to round roots to
+                              when clustering. A smaller number (e.g., 3)
+                              groups roots more aggressively. A larger number
+                              (e.g., 7) is more precise but may return
+                              multiple near-identical roots. Default: 5
    """
    min_range: float = -100.0
    max_range: float = 100.0
    num_of_generations: int = 10
-    sample_size: int = 1000
    data_size: int = 100000
    mutation_strength: float = 0.01
    elite_ratio: float = 0.05
    crossover_ratio: float = 0.45
    mutation_ratio: float = 0.40
+    root_precision: int = 5

    def __post_init__(self):
        """Validates the GA options after initialization."""
@@ -84,11 +87,6 @@ class GA_Options:
            )
        if any(r < 0 for r in [self.elite_ratio, self.crossover_ratio, self.mutation_ratio]):
            raise ValueError("GA ratios cannot be negative.")
-        if self.data_size < self.sample_size:
-            warnings.warn(
-                f"data_size ({self.data_size}) is less than sample_size ({self.sample_size}). "
-                "The number of returned solutions will be limited to data_size."
-            )

 def _get_cauchy_bound(coeffs: np.ndarray) -> float:
    """
@@ -380,12 +378,26 @@ class Function:
        error = y_calculated - y_val
        with np.errstate(divide='ignore'):
            ranks = np.where(error == 0, np.finfo(float).max, np.abs(1.0 / error))
-        sorted_indices = np.argsort(-ranks)
        
-        # Get the top 'sample_size' solutions the user asked for
-        best_solutions = solutions[sorted_indices][:options.sample_size]
+        # 1. Define quality based on the user's desired precision
+        #    (e.g., precision=5 -> rank > 1e6, precision=8 -> rank > 1e9)
+        #    We add +1 for a buffer, ensuring we only get high-quality roots.
+        quality_threshold = 10**(options.root_precision + 1)
+
+        # 2. Get all solutions that meet this quality threshold
+        high_quality_solutions = solutions[ranks > quality_threshold]
+
+        if high_quality_solutions.size == 0:
+            # No roots found that meet the quality, return empty
+            return np.array([])
        
-        return np.sort(best_solutions)
+        # 3. Cluster these high-quality solutions by rounding
+        rounded_solutions = np.round(high_quality_solutions, options.root_precision)
+
+        # 4. Return only the unique roots
+        unique_roots = np.unique(rounded_solutions)
+        
+        return np.sort(unique_roots)

    def _solve_x_cuda(self, y_val: float, options: GA_Options) -> np.ndarray:
        """Genetic algorithm implementation using CuPy (GPU/CUDA)."""
@@ -490,13 +502,26 @@ class Function:
            (blocks_per_grid,), (threads_per_block,),
            (d_coefficients, d_coefficients.size, d_solutions, d_ranks, d_solutions.size, y_val)
        )
-        sorted_indices = cupy.argsort(-d_ranks)
        
-        # Get the top 'sample_size' solutions
-        d_best_solutions = d_solutions[sorted_indices][:options.sample_size]
+        # 1. Define quality based on the user's desired precision
+        #    (e.g., precision=5 -> rank > 1e6, precision=8 -> rank > 1e9)
+        #    We add +1 for a buffer, ensuring we only get high-quality roots.
+        quality_threshold = 10**(options.root_precision + 1)
+        
+        # 2. Get all solutions that meet this quality threshold
+        d_high_quality_solutions = d_solutions[d_ranks > quality_threshold]

-        # Get the final sample, sort it, and copy back to CPU
-        final_solutions_gpu = cupy.sort(d_best_solutions)
+        if d_high_quality_solutions.size == 0:
+            return np.array([])
+            
+        # 3. Cluster these high-quality solutions on the GPU by rounding
+        d_rounded_solutions = cupy.round(d_high_quality_solutions, options.root_precision)
+        
+        # 4. Get only the unique roots
+        d_unique_roots = cupy.unique(d_rounded_solutions)
+
+        # Sort the unique roots and copy back to CPU
+        final_solutions_gpu = cupy.sort(d_unique_roots)
        return final_solutions_gpu.get()


@@ -692,7 +717,7 @@ if __name__ == '__main__':
    print(f"Analytic roots of f1: {roots_analytic}") # Expected: -1, 2.5

    # 2. Genetic algorithm solution
-    ga_opts = GA_Options(num_of_generations=20, data_size=50000, sample_size=10)
+    ga_opts = GA_Options(num_of_generations=20, data_size=50000)
    print("\nFinding roots with Genetic Algorithm (CPU)...")
    roots_ga_cpu = f1.get_real_roots(ga_opts)
    print(f"Approximate roots from GA (CPU): {roots_ga_cpu}")
--- a/tests/test_polysolve.py
+++ b/tests/test_polysolve.py
@@ -101,7 +101,7 @@ def test_get_real_roots_numpy(quadratic_func):
    Tests that the NumPy-based genetic algorithm approximates the roots correctly.
    """
    # Using more generations for higher accuracy in testing
-    ga_opts = GA_Options(num_of_generations=25, data_size=50000)
+    ga_opts = GA_Options(num_of_generations=50, data_size=200000, root_precision=3)
    
    roots = quadratic_func.get_real_roots(ga_opts, use_cuda=False)
    
@@ -124,7 +124,7 @@ def test_get_real_roots_cuda(quadratic_func):
    It will be skipped automatically if CuPy is not available.
    """
    
-    ga_opts = GA_Options(num_of_generations=25, data_size=50000)
+    ga_opts = GA_Options(num_of_generations=50, data_size=200000, root_precision=3)
    
    roots = quadratic_func.get_real_roots(ga_opts, use_cuda=True)