5 years ago · aafecef6d4
--- a/tools/internal_ci/linux/grpc_xds.cfg
+++ b/tools/internal_ci/linux/grpc_xds.cfg
@@ -16,7 +16,7 @@
 
															 # Location of the continuous shell script in repository.
														
 
															 build_file: "grpc/tools/internal_ci/linux/grpc_xds.sh"
														
 
															-timeout_mins: 90
														
 
															+timeout_mins: 120
														
 
															 env_vars {
														
 
															   key: "BAZEL_SCRIPT"
														
 
															   value: "tools/internal_ci/linux/grpc_xds_bazel_test_in_docker.sh"
														
--- a/tools/run_tests/run_xds_tests.py
+++ b/tools/run_tests/run_xds_tests.py
@@ -48,7 +48,6 @@ _TEST_CASES = [
 
															     'backends_restart',
														
 
															     'change_backend_service',
														
 
															     'gentle_failover',
														
 
															-    'new_instance_group_receives_traffic',
														
 
															     'ping_pong',
														
 
															     'remove_instance_group',
														
 
															     'round_robin',
														
@@ -234,9 +233,7 @@ _BOOTSTRAP_TEMPLATE = """
 
															 # TODO(ericgribkoff) Add change_backend_service to this list once TD no longer
														
 
															 # sends an update with no localities when adding the MIG to the backend service
														
 
															 # can race with the URL map patch.
														
 
															-_TESTS_TO_FAIL_ON_RPC_FAILURE = [
														
 
															-    'new_instance_group_receives_traffic', 'ping_pong', 'round_robin'
														
 
															-]
														
 
															+_TESTS_TO_FAIL_ON_RPC_FAILURE = ['ping_pong', 'round_robin']
														
 
															 _PATH_MATCHER_NAME = 'path-matcher'
														
 
															 _BASE_TEMPLATE_NAME = 'test-template'
														
 
															 _BASE_INSTANCE_GROUP_NAME = 'test-ig'
														
@@ -456,32 +453,6 @@ def test_gentle_failover(gcp,
 
															                                                  _WAIT_FOR_BACKEND_SEC)
														
 
															-def test_new_instance_group_receives_traffic(gcp, backend_service,
														
 
															-                                             instance_group,
														
 
															-                                             same_zone_instance_group):
														
 
															-    logger.info('Running test_new_instance_group_receives_traffic')
														
 
															-    instance_names = get_instance_names(gcp, instance_group)
														
 
															-    # TODO(ericgribkoff) Reduce this timeout. When running sequentially, this
														
 
															-    # occurs after patching the url map in test_change_backend_service, so we
														
 
															-    # need the extended timeout here as well.
														
 
															-    wait_until_all_rpcs_go_to_given_backends(instance_names,
														
 
															-                                             _WAIT_FOR_URL_MAP_PATCH_SEC)
														
 
															-    try:
														
 
															-        patch_backend_instances(gcp,
														
 
															-                                backend_service,
														
 
															-                                [instance_group, same_zone_instance_group],
														
 
															-                                balancing_mode='RATE')
														
 
															-        wait_for_healthy_backends(gcp, backend_service, instance_group)
														
 
															-        wait_for_healthy_backends(gcp, backend_service,
														
 
															-                                  same_zone_instance_group)
														
 
															-        combined_instance_names = instance_names + get_instance_names(
														
 
															-            gcp, same_zone_instance_group)
														
 
															-        wait_until_all_rpcs_go_to_given_backends(combined_instance_names,
														
 
															-                                                 _WAIT_FOR_BACKEND_SEC)
														
 
															-    finally:
														
 
															-        patch_backend_instances(gcp, backend_service, [instance_group])
														
 
															-
														
 
															-
														
 
															 def test_ping_pong(gcp, backend_service, instance_group):
														
 
															     logger.info('Running test_ping_pong')
														
 
															     wait_for_healthy_backends(gcp, backend_service, instance_group)
														
@@ -504,12 +475,30 @@ def test_remove_instance_group(gcp, backend_service, instance_group,
 
															         instance_names = get_instance_names(gcp, instance_group)
														
 
															         same_zone_instance_names = get_instance_names(gcp,
														
 
															                                                       same_zone_instance_group)
														
 
															-        wait_until_all_rpcs_go_to_given_backends(
														
 
															-            instance_names + same_zone_instance_names, _WAIT_FOR_BACKEND_SEC)
														
 
															+        try:
														
 
															+            wait_until_all_rpcs_go_to_given_backends(
														
 
															+                instance_names + same_zone_instance_names,
														
 
															+                _WAIT_FOR_OPERATION_SEC)
														
 
															+            remaining_instance_group = same_zone_instance_group
														
 
															+            remaining_instance_names = same_zone_instance_names
														
 
															+        except RpcDistributionError as e:
														
 
															+            # If connected to TD in a different zone, we may route traffic to
														
 
															+            # only one instance group. Determine which group that is to continue
														
 
															+            # with the remainder of the test case.
														
 
															+            try:
														
 
															+                wait_until_all_rpcs_go_to_given_backends(
														
 
															+                    instance_names, _WAIT_FOR_STATS_SEC)
														
 
															+                remaining_instance_group = same_zone_instance_group
														
 
															+                remaining_instance_names = same_zone_instance_names
														
 
															+            except RpcDistributionError as e:
														
 
															+                wait_until_all_rpcs_go_to_given_backends(
														
 
															+                    same_zone_instance_names, _WAIT_FOR_STATS_SEC)
														
 
															+                remaining_instance_group = instance_group
														
 
															+                remaining_instance_names = instance_names
														
 
															         patch_backend_instances(gcp,
														
 
															-                                backend_service, [same_zone_instance_group],
														
 
															+                                backend_service, [remaining_instance_group],
														
 
															                                 balancing_mode='RATE')
														
 
															-        wait_until_all_rpcs_go_to_given_backends(same_zone_instance_names,
														
 
															+        wait_until_all_rpcs_go_to_given_backends(remaining_instance_names,
														
 
															                                                  _WAIT_FOR_BACKEND_SEC)
														
 
															     finally:
														
 
															         patch_backend_instances(gcp, backend_service, [instance_group])
														
@@ -524,17 +513,27 @@ def test_round_robin(gcp, backend_service, instance_group):
 
															     threshold = 1
														
 
															     wait_until_all_rpcs_go_to_given_backends(instance_names,
														
 
															                                              _WAIT_FOR_STATS_SEC)
														
 
															-    stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
														
 
															-    requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
														
 
															-    total_requests_received = sum(requests_received)
														
 
															-    if total_requests_received != _NUM_TEST_RPCS:
														
 
															-        raise Exception('Unexpected RPC failures', stats)
														
 
															-    expected_requests = total_requests_received / len(instance_names)
														
 
															-    for instance in instance_names:
														
 
															-        if abs(stats.rpcs_by_peer[instance] - expected_requests) > threshold:
														
 
															-            raise Exception(
														
 
															-                'RPC peer distribution differs from expected by more than %d '
														
 
															-                'for instance %s (%s)', threshold, instance, stats)
														
 
															+    # TODO(ericgribkoff) Delayed config propagation from earlier tests
														
 
															+    # may result in briefly receiving an empty EDS update, resulting in failed
														
 
															+    # RPCs. Retry distribution validation if this occurs; long-term fix is
														
 
															+    # creating new backend resources for each individual test case.
														
 
															+    max_attempts = 10
														
 
															+    for i in range(max_attempts):
														
 
															+        stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
														
 
															+        requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
														
 
															+        total_requests_received = sum(requests_received)
														
 
															+        if total_requests_received != _NUM_TEST_RPCS:
														
 
															+            logger.info('Unexpected RPC failures, retrying: %s', stats)
														
 
															+            continue
														
 
															+        expected_requests = total_requests_received / len(instance_names)
														
 
															+        for instance in instance_names:
														
 
															+            if abs(stats.rpcs_by_peer[instance] -
														
 
															+                   expected_requests) > threshold:
														
 
															+                raise Exception(
														
 
															+                    'RPC peer distribution differs from expected by more than %d '
														
 
															+                    'for instance %s (%s)' % (threshold, instance, stats))
														
 
															+        return
														
 
															+    raise Exception('RPC failures persisted through %d retries' % max_attempts)
														
 
															 def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
														
@@ -1504,17 +1503,20 @@ try:
 
															             test_log_filename = os.path.join(log_dir, _SPONGE_LOG_NAME)
														
 
															             test_log_file = open(test_log_filename, 'w+')
														
 
															             client_process = None
														
 
															-            if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE:
														
 
															-                wait_for_config_propagation(
														
 
															-                    gcp, instance_group,
														
 
															-                    args.client_cmd.format(server_uri=server_uri,
														
 
															-                                           stats_port=args.stats_port,
														
 
															-                                           qps=args.qps,
														
 
															-                                           fail_on_failed_rpc=False),
														
 
															-                    client_env)
														
 
															-                fail_on_failed_rpc = '--fail_on_failed_rpc=true'
														
 
															-            else:
														
 
															-                fail_on_failed_rpc = '--fail_on_failed_rpc=false'
														
 
															+            # TODO(ericgribkoff) Temporarily disable fail_on_failed_rpc checks
														
 
															+            # in the client. This means we will ignore intermittent RPC
														
 
															+            # failures (but this framework still checks that the final result
														
 
															+            # is as expected).
														
 
															+            #
														
 
															+            # Reason for disabling this is, the resources are shared by
														
 
															+            # multiple tests, and a change in previous test could be delayed
														
 
															+            # until the second test starts. The second test may see
														
 
															+            # intermittent failures because of that.
														
 
															+            #
														
 
															+            # A fix is to not share resources between tests (though that does
														
 
															+            # mean the tests will be significantly slower due to creating new
														
 
															+            # resources).
														
 
															+            fail_on_failed_rpc = ''
														
 
															             client_cmd = shlex.split(
														
 
															                 args.client_cmd.format(server_uri=server_uri,
														
 
															                                        stats_port=args.stats_port,
														
@@ -1535,10 +1537,6 @@ try:
 
															                 elif test_case == 'gentle_failover':
														
 
															                     test_gentle_failover(gcp, backend_service, instance_group,
														
 
															                                          secondary_zone_instance_group)
														
 
															-                elif test_case == 'new_instance_group_receives_traffic':
														
 
															-                    test_new_instance_group_receives_traffic(
														
 
															-                        gcp, backend_service, instance_group,
														
 
															-                        same_zone_instance_group)
														
 
															                 elif test_case == 'ping_pong':
														
 
															                     test_ping_pong(gcp, backend_service, instance_group)
														
 
															                 elif test_case == 'remove_instance_group':