From 4e46c11f83dbf56ca247e1aff833e65ce3ce19c6 Mon Sep 17 00:00:00 2001
From: Jonathan Rampersad <rampersad.jonathan@gmail.com>
Date: Mon, 27 Oct 2025 19:26:50 +0000
Subject: [PATCH] feat(ga): Implement quality filtering and precision-based
 clustering (#19)

The previous GA logic returned the "top N" solutions by rank. When the population converged on only one of the possible roots, every returned value was a near-duplicate of that single root (e.g., 1000 variations of -1.0), which led to test failures.

This commit fixes the root-finding logic to correctly identify and return *all* unique, high-quality roots:

1.  **feat(api):** Adds `root_precision` to `GA_Options`. This new parameter (default: 5) allows the user to control the number of decimal places for clustering unique roots.

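2.  **fix(ga):** Replaces the flawed "top N" logic in both `_solve_x_numpy` and `_solve_x_cuda`, and removes the now-unused `sample_size` option from `GA_Options` (callers that still pass `sample_size=` must drop it). The new process is: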
    * Sets a `quality_threshold` of `10**(root_precision + 1)` based on the user's `root_precision`, where a solution's rank is `1/|error|` (e.g., `root_precision=5` requires a rank > `1e6`).
    * Filters the *entire* final population for all solutions that meet this quality threshold.
    * Rounds these high-quality solutions to `root_precision`.
    * Returns only the `np.unique()` results.

As a result, the solver returns every distinct root found in the final population that meets the accuracy requirement, rather than just the top N variations of a single root.

Reviewed-on: https://gitea.jono-rams.work/jono/PolySolve/pulls/19
Co-authored-by: Jonathan Rampersad <rampersad.jonathan@gmail.com>
Co-committed-by: Jonathan Rampersad <rampersad.jonathan@gmail.com>
---
 pyproject.toml            |  2 +-
 src/polysolve/__init__.py | 61 +++++++++++++++++++++++++++------------
 tests/test_polysolve.py   |  4 +--
 3 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 27ddb68..d078e9a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 # --- Core Metadata ---
 name = "polysolve"
-version = "0.4.2"
+version = "0.5.0"
 authors = [
   { name="Jonathan Rampersad", email="jonathan@jono-rams.work" },
 ]
diff --git a/src/polysolve/__init__.py b/src/polysolve/__init__.py
index d7c22b9..733090c 100644
--- a/src/polysolve/__init__.py
+++ b/src/polysolve/__init__.py
@@ -49,8 +49,6 @@ class GA_Options:
                            Default: 100.0
         num_of_generations (int): The number of iterations the algorithm will run.
                                   Default: 10
-        sample_size (int): The number of top solutions to *return* at the end.
-                           Default: 1000
         data_size (int): The total number of solutions (population size)
                          generated in each generation. Default: 100000
         mutation_strength (float): The percentage (e.g., 0.01 for 1%) by which
@@ -64,16 +62,21 @@ class GA_Options:
         mutation_ratio (float): The percentage (e.g., 0.40 for 40%) of the next
                                 generation to be created by mutating solutions
                                 from the parent pool. Default: 0.40
+        root_precision (int): The number of decimal places to round roots to
+                              when clustering. A smaller number (e.g., 3)
+                              groups roots more aggressively. A larger number
+                              (e.g., 7) is more precise but may return
+                              multiple near-identical roots. Default: 5
     """
     min_range: float = -100.0
     max_range: float = 100.0
     num_of_generations: int = 10
-    sample_size: int = 1000
     data_size: int = 100000
     mutation_strength: float = 0.01
     elite_ratio: float = 0.05
     crossover_ratio: float = 0.45
     mutation_ratio: float = 0.40
+    root_precision: int = 5
 
     def __post_init__(self):
         """Validates the GA options after initialization."""
@@ -84,11 +87,6 @@ class GA_Options:
             )
         if any(r < 0 for r in [self.elite_ratio, self.crossover_ratio, self.mutation_ratio]):
             raise ValueError("GA ratios cannot be negative.")
-        if self.data_size < self.sample_size:
-            warnings.warn(
-                f"data_size ({self.data_size}) is less than sample_size ({self.sample_size}). "
-                "The number of returned solutions will be limited to data_size."
-            )
 
 def _get_cauchy_bound(coeffs: np.ndarray) -> float:
     """
@@ -380,12 +378,26 @@ class Function:
         error = y_calculated - y_val
         with np.errstate(divide='ignore'):
             ranks = np.where(error == 0, np.finfo(float).max, np.abs(1.0 / error))
-        sorted_indices = np.argsort(-ranks)
         
-        # Get the top 'sample_size' solutions the user asked for
-        best_solutions = solutions[sorted_indices][:options.sample_size]
+        # 1. Define quality based on the user's desired precision
+        #    (e.g., precision=5 -> rank > 1e6, precision=8 -> rank > 1e9)
+        #    We add +1 for a buffer, ensuring we only get high-quality roots.
+        quality_threshold = 10**(options.root_precision + 1)
+
+        # 2. Get all solutions that meet this quality threshold
+        high_quality_solutions = solutions[ranks > quality_threshold]
+
+        if high_quality_solutions.size == 0:
+            # No roots found that meet the quality, return empty
+            return np.array([])
         
-        return np.sort(best_solutions)
+        # 3. Cluster these high-quality solutions by rounding
+        rounded_solutions = np.round(high_quality_solutions, options.root_precision)
+
+        # 4. Return only the unique roots
+        unique_roots = np.unique(rounded_solutions)
+        
+        return np.sort(unique_roots)
 
     def _solve_x_cuda(self, y_val: float, options: GA_Options) -> np.ndarray:
         """Genetic algorithm implementation using CuPy (GPU/CUDA)."""
@@ -490,13 +502,26 @@ class Function:
             (blocks_per_grid,), (threads_per_block,),
             (d_coefficients, d_coefficients.size, d_solutions, d_ranks, d_solutions.size, y_val)
         )
-        sorted_indices = cupy.argsort(-d_ranks)
         
-        # Get the top 'sample_size' solutions
-        d_best_solutions = d_solutions[sorted_indices][:options.sample_size]
+        # 1. Define quality based on the user's desired precision
+        #    (e.g., precision=5 -> rank > 1e6, precision=8 -> rank > 1e9)
+        #    We add +1 for a buffer, ensuring we only get high-quality roots.
+        quality_threshold = 10**(options.root_precision + 1)
+        
+        # 2. Get all solutions that meet this quality threshold
+        d_high_quality_solutions = d_solutions[d_ranks > quality_threshold]
 
-        # Get the final sample, sort it, and copy back to CPU
-        final_solutions_gpu = cupy.sort(d_best_solutions)
+        if d_high_quality_solutions.size == 0:
+            return np.array([])
+            
+        # 3. Cluster these high-quality solutions on the GPU by rounding
+        d_rounded_solutions = cupy.round(d_high_quality_solutions, options.root_precision)
+        
+        # 4. Get only the unique roots
+        d_unique_roots = cupy.unique(d_rounded_solutions)
+
+        # Sort the unique roots and copy back to CPU
+        final_solutions_gpu = cupy.sort(d_unique_roots)
         return final_solutions_gpu.get()
 
 
@@ -692,7 +717,7 @@ if __name__ == '__main__':
     print(f"Analytic roots of f1: {roots_analytic}") # Expected: -1, 2.5
 
     # 2. Genetic algorithm solution
-    ga_opts = GA_Options(num_of_generations=20, data_size=50000, sample_size=10)
+    ga_opts = GA_Options(num_of_generations=20, data_size=50000)
     print("\nFinding roots with Genetic Algorithm (CPU)...")
     roots_ga_cpu = f1.get_real_roots(ga_opts)
     print(f"Approximate roots from GA (CPU): {roots_ga_cpu}")
diff --git a/tests/test_polysolve.py b/tests/test_polysolve.py
index 432d8a2..1f9a8cb 100644
--- a/tests/test_polysolve.py
+++ b/tests/test_polysolve.py
@@ -101,7 +101,7 @@ def test_get_real_roots_numpy(quadratic_func):
     Tests that the NumPy-based genetic algorithm approximates the roots correctly.
     """
     # Using more generations for higher accuracy in testing
-    ga_opts = GA_Options(num_of_generations=25, data_size=50000)
+    ga_opts = GA_Options(num_of_generations=50, data_size=200000, root_precision=3)
     
     roots = quadratic_func.get_real_roots(ga_opts, use_cuda=False)
     
@@ -124,7 +124,7 @@ def test_get_real_roots_cuda(quadratic_func):
     It will be skipped automatically if CuPy is not available.
     """
     
-    ga_opts = GA_Options(num_of_generations=25, data_size=50000)
+    ga_opts = GA_Options(num_of_generations=50, data_size=200000, root_precision=3)
     
     roots = quadratic_func.get_real_roots(ga_opts, use_cuda=True)