Browse Source

xds testing: add failover test based on load

Menghan Li 4 years ago
parent
commit
ead3014550
2 changed files with 91 additions and 2 deletions
  1. 33 0
      doc/xds-test-descriptions.md
  2. 58 2
      tools/run_tests/run_xds_tests.py

+ 33 - 0
doc/xds-test-descriptions.md

@@ -321,3 +321,36 @@ Assert:
 1. Once all backends receive at least one RPC, the following 1000 RPCs are
 distributed across the 2 backends as a: 20, b: 80.
 
+### load_report_based_failover
+
+This test verifies that traffic is partially diverted to a secondary locality
+when the client's QPS is greater than the configured RPS (`maxRate`) in the
+primary locality.
+
+Client parameters:
+
+1.  --num_channels=1
+1.  --qps=100
+
+Load balancer configuration:
+
+1.  The primary MIG with 2 backends in the same zone as the client
+1.  The secondary MIG with 2 backends in a different zone
+
+Test driver asserts:
+
+1.  All backends in the primary locality receive at least 1 RPC.
+1.  No backends in the secondary locality receive RPCs.
+
+The test driver sets `balancingMode` to `RATE` and `maxRate` to 20 in the primary locality.
+
+Test driver asserts:
+
+1.  All backends in the primary locality receive at least 1 RPC.
+1.  All backends in the secondary locality receive at least 1 RPC.
+
+The test driver sets `maxRate` to 120 in the primary locality.
+
+Test driver asserts:
+
+1.  All backends in the primary locality receive at least 1 RPC.
+1.  No backends in the secondary locality receive RPCs.

+ 58 - 2
tools/run_tests/run_xds_tests.py

@@ -50,6 +50,7 @@ _TEST_CASES = [
     'backends_restart',
     'change_backend_service',
     'gentle_failover',
+    'load_report_based_failover',
     'ping_pong',
     'remove_instance_group',
     'round_robin',
@@ -488,6 +489,56 @@ def test_gentle_failover(gcp,
                                                  _WAIT_FOR_BACKEND_SEC)
 
 
+def test_load_report_based_failover(gcp, backend_service,
+                                    primary_instance_group,
+                                    secondary_instance_group):
+    logger.info('Running test_load_report_based_failover')
+    try:
+        patch_backend_instances(
+            gcp, backend_service,
+            [primary_instance_group, secondary_instance_group])
+        primary_instance_names = get_instance_names(gcp, primary_instance_group)
+        secondary_instance_names = get_instance_names(gcp,
+                                                      secondary_instance_group)
+        wait_for_healthy_backends(gcp, backend_service, primary_instance_group)
+        wait_for_healthy_backends(gcp, backend_service,
+                                  secondary_instance_group)
+        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
+                                                 _WAIT_FOR_STATS_SEC)
+        # Set primary locality's balance mode to RATE, and RPS to 20% of the
+        # client's QPS. The secondary locality will be used.
+        max_rate = int(args.qps * 1 / 5)
+        logger.info('Patching backend service to RATE with %d max_rate',
+                    max_rate)
+        patch_backend_instances(
+            gcp,
+            backend_service, [primary_instance_group, secondary_instance_group],
+            balancing_mode='RATE',
+            max_rate=max_rate)
+        wait_until_all_rpcs_go_to_given_backends(
+            primary_instance_names + secondary_instance_names,
+            _WAIT_FOR_BACKEND_SEC)
+
+        # Set primary locality's balance mode to RATE, and RPS to 120% of the
+        # client's QPS. Only the primary locality will be used.
+        max_rate = int(args.qps * 6 / 5)
+        logger.info('Patching backend service to RATE with %d max_rate',
+                    max_rate)
+        patch_backend_instances(
+            gcp,
+            backend_service, [primary_instance_group, secondary_instance_group],
+            balancing_mode='RATE',
+            max_rate=max_rate)
+        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
+                                                 _WAIT_FOR_BACKEND_SEC)
+        logger.info("success")
+    finally:
+        patch_backend_instances(gcp, backend_service, [primary_instance_group])
+        instance_names = get_instance_names(gcp, primary_instance_group)
+        wait_until_all_rpcs_go_to_given_backends(instance_names,
+                                                 _WAIT_FOR_BACKEND_SEC)
+
+
 def test_ping_pong(gcp, backend_service, instance_group):
     logger.info('Running test_ping_pong')
     wait_for_healthy_backends(gcp, backend_service, instance_group)
@@ -1460,7 +1511,8 @@ def delete_instance_template(gcp):
 def patch_backend_instances(gcp,
                             backend_service,
                             instance_groups,
-                            balancing_mode='UTILIZATION'):
+                            balancing_mode='UTILIZATION',
+                            max_rate=1):
     if gcp.alpha_compute:
         compute_to_use = gcp.alpha_compute
     else:
@@ -1469,7 +1521,7 @@ def patch_backend_instances(gcp,
         'backends': [{
             'group': instance_group.url,
             'balancingMode': balancing_mode,
-            'maxRate': 1 if balancing_mode == 'RATE' else None
+            'maxRate': max_rate if balancing_mode == 'RATE' else None
         } for instance_group in instance_groups],
     }
     logger.debug('Sending GCP request with body=%s', config)
@@ -1873,6 +1925,10 @@ try:
                 elif test_case == 'gentle_failover':
                     test_gentle_failover(gcp, backend_service, instance_group,
                                          secondary_zone_instance_group)
+                elif test_case == 'load_report_based_failover':
+                    test_load_report_based_failover(
+                        gcp, backend_service, instance_group,
+                        secondary_zone_instance_group)
                 elif test_case == 'ping_pong':
                     test_ping_pong(gcp, backend_service, instance_group)
                 elif test_case == 'remove_instance_group':