|
@@ -48,7 +48,6 @@ _TEST_CASES = [
|
|
'backends_restart',
|
|
'backends_restart',
|
|
'change_backend_service',
|
|
'change_backend_service',
|
|
'gentle_failover',
|
|
'gentle_failover',
|
|
- 'new_instance_group_receives_traffic',
|
|
|
|
'ping_pong',
|
|
'ping_pong',
|
|
'remove_instance_group',
|
|
'remove_instance_group',
|
|
'round_robin',
|
|
'round_robin',
|
|
@@ -234,9 +233,7 @@ _BOOTSTRAP_TEMPLATE = """
|
|
# TODO(ericgribkoff) Add change_backend_service to this list once TD no longer
|
|
# TODO(ericgribkoff) Add change_backend_service to this list once TD no longer
|
|
# sends an update with no localities when adding the MIG to the backend service
|
|
# sends an update with no localities when adding the MIG to the backend service
|
|
# can race with the URL map patch.
|
|
# can race with the URL map patch.
|
|
-_TESTS_TO_FAIL_ON_RPC_FAILURE = [
|
|
|
|
- 'new_instance_group_receives_traffic', 'ping_pong', 'round_robin'
|
|
|
|
-]
|
|
|
|
|
|
+_TESTS_TO_FAIL_ON_RPC_FAILURE = ['ping_pong', 'round_robin']
|
|
_PATH_MATCHER_NAME = 'path-matcher'
|
|
_PATH_MATCHER_NAME = 'path-matcher'
|
|
_BASE_TEMPLATE_NAME = 'test-template'
|
|
_BASE_TEMPLATE_NAME = 'test-template'
|
|
_BASE_INSTANCE_GROUP_NAME = 'test-ig'
|
|
_BASE_INSTANCE_GROUP_NAME = 'test-ig'
|
|
@@ -456,32 +453,6 @@ def test_gentle_failover(gcp,
|
|
_WAIT_FOR_BACKEND_SEC)
|
|
_WAIT_FOR_BACKEND_SEC)
|
|
|
|
|
|
|
|
|
|
-def test_new_instance_group_receives_traffic(gcp, backend_service,
|
|
|
|
- instance_group,
|
|
|
|
- same_zone_instance_group):
|
|
|
|
- logger.info('Running test_new_instance_group_receives_traffic')
|
|
|
|
- instance_names = get_instance_names(gcp, instance_group)
|
|
|
|
- # TODO(ericgribkoff) Reduce this timeout. When running sequentially, this
|
|
|
|
- # occurs after patching the url map in test_change_backend_service, so we
|
|
|
|
- # need the extended timeout here as well.
|
|
|
|
- wait_until_all_rpcs_go_to_given_backends(instance_names,
|
|
|
|
- _WAIT_FOR_URL_MAP_PATCH_SEC)
|
|
|
|
- try:
|
|
|
|
- patch_backend_instances(gcp,
|
|
|
|
- backend_service,
|
|
|
|
- [instance_group, same_zone_instance_group],
|
|
|
|
- balancing_mode='RATE')
|
|
|
|
- wait_for_healthy_backends(gcp, backend_service, instance_group)
|
|
|
|
- wait_for_healthy_backends(gcp, backend_service,
|
|
|
|
- same_zone_instance_group)
|
|
|
|
- combined_instance_names = instance_names + get_instance_names(
|
|
|
|
- gcp, same_zone_instance_group)
|
|
|
|
- wait_until_all_rpcs_go_to_given_backends(combined_instance_names,
|
|
|
|
- _WAIT_FOR_BACKEND_SEC)
|
|
|
|
- finally:
|
|
|
|
- patch_backend_instances(gcp, backend_service, [instance_group])
|
|
|
|
-
|
|
|
|
-
|
|
|
|
def test_ping_pong(gcp, backend_service, instance_group):
|
|
def test_ping_pong(gcp, backend_service, instance_group):
|
|
logger.info('Running test_ping_pong')
|
|
logger.info('Running test_ping_pong')
|
|
wait_for_healthy_backends(gcp, backend_service, instance_group)
|
|
wait_for_healthy_backends(gcp, backend_service, instance_group)
|
|
@@ -504,12 +475,30 @@ def test_remove_instance_group(gcp, backend_service, instance_group,
|
|
instance_names = get_instance_names(gcp, instance_group)
|
|
instance_names = get_instance_names(gcp, instance_group)
|
|
same_zone_instance_names = get_instance_names(gcp,
|
|
same_zone_instance_names = get_instance_names(gcp,
|
|
same_zone_instance_group)
|
|
same_zone_instance_group)
|
|
- wait_until_all_rpcs_go_to_given_backends(
|
|
|
|
- instance_names + same_zone_instance_names, _WAIT_FOR_BACKEND_SEC)
|
|
|
|
|
|
+ try:
|
|
|
|
+ wait_until_all_rpcs_go_to_given_backends(
|
|
|
|
+ instance_names + same_zone_instance_names,
|
|
|
|
+ _WAIT_FOR_OPERATION_SEC)
|
|
|
|
+ remaining_instance_group = same_zone_instance_group
|
|
|
|
+ remaining_instance_names = same_zone_instance_names
|
|
|
|
+ except RpcDistributionError as e:
|
|
|
|
+ # If connected to TD in a different zone, we may route traffic to
|
|
|
|
+ # only one instance group. Determine which group that is to continue
|
|
|
|
+ # with the remainder of the test case.
|
|
|
|
+ try:
|
|
|
|
+ wait_until_all_rpcs_go_to_given_backends(
|
|
|
|
+ instance_names, _WAIT_FOR_STATS_SEC)
|
|
|
|
+ remaining_instance_group = same_zone_instance_group
|
|
|
|
+ remaining_instance_names = same_zone_instance_names
|
|
|
|
+ except RpcDistributionError as e:
|
|
|
|
+ wait_until_all_rpcs_go_to_given_backends(
|
|
|
|
+ same_zone_instance_names, _WAIT_FOR_STATS_SEC)
|
|
|
|
+ remaining_instance_group = instance_group
|
|
|
|
+ remaining_instance_names = instance_names
|
|
patch_backend_instances(gcp,
|
|
patch_backend_instances(gcp,
|
|
- backend_service, [same_zone_instance_group],
|
|
|
|
|
|
+ backend_service, [remaining_instance_group],
|
|
balancing_mode='RATE')
|
|
balancing_mode='RATE')
|
|
- wait_until_all_rpcs_go_to_given_backends(same_zone_instance_names,
|
|
|
|
|
|
+ wait_until_all_rpcs_go_to_given_backends(remaining_instance_names,
|
|
_WAIT_FOR_BACKEND_SEC)
|
|
_WAIT_FOR_BACKEND_SEC)
|
|
finally:
|
|
finally:
|
|
patch_backend_instances(gcp, backend_service, [instance_group])
|
|
patch_backend_instances(gcp, backend_service, [instance_group])
|
|
@@ -524,17 +513,27 @@ def test_round_robin(gcp, backend_service, instance_group):
|
|
threshold = 1
|
|
threshold = 1
|
|
wait_until_all_rpcs_go_to_given_backends(instance_names,
|
|
wait_until_all_rpcs_go_to_given_backends(instance_names,
|
|
_WAIT_FOR_STATS_SEC)
|
|
_WAIT_FOR_STATS_SEC)
|
|
- stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
|
|
|
|
- requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
|
|
|
|
- total_requests_received = sum(requests_received)
|
|
|
|
- if total_requests_received != _NUM_TEST_RPCS:
|
|
|
|
- raise Exception('Unexpected RPC failures', stats)
|
|
|
|
- expected_requests = total_requests_received / len(instance_names)
|
|
|
|
- for instance in instance_names:
|
|
|
|
- if abs(stats.rpcs_by_peer[instance] - expected_requests) > threshold:
|
|
|
|
- raise Exception(
|
|
|
|
- 'RPC peer distribution differs from expected by more than %d '
|
|
|
|
- 'for instance %s (%s)', threshold, instance, stats)
|
|
|
|
|
|
+ # TODO(ericgribkoff) Delayed config propagation from earlier tests
|
|
|
|
+ # may result in briefly receiving an empty EDS update, resulting in failed
|
|
|
|
+ # RPCs. Retry distribution validation if this occurs; long-term fix is
|
|
|
|
+ # creating new backend resources for each individual test case.
|
|
|
|
+ max_attempts = 10
|
|
|
|
+ for i in range(max_attempts):
|
|
|
|
+ stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
|
|
|
|
+ requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
|
|
|
|
+ total_requests_received = sum(requests_received)
|
|
|
|
+ if total_requests_received != _NUM_TEST_RPCS:
|
|
|
|
+ logger.info('Unexpected RPC failures, retrying: %s', stats)
|
|
|
|
+ continue
|
|
|
|
+ expected_requests = total_requests_received / len(instance_names)
|
|
|
|
+ for instance in instance_names:
|
|
|
|
+ if abs(stats.rpcs_by_peer[instance] -
|
|
|
|
+ expected_requests) > threshold:
|
|
|
|
+ raise Exception(
|
|
|
|
+ 'RPC peer distribution differs from expected by more than %d '
|
|
|
|
+ 'for instance %s (%s)' % (threshold, instance, stats))
|
|
|
|
+ return
|
|
|
|
+ raise Exception('RPC failures persisted through %d retries' % max_attempts)
|
|
|
|
|
|
|
|
|
|
def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
|
|
def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
|
|
@@ -1504,17 +1503,20 @@ try:
|
|
test_log_filename = os.path.join(log_dir, _SPONGE_LOG_NAME)
|
|
test_log_filename = os.path.join(log_dir, _SPONGE_LOG_NAME)
|
|
test_log_file = open(test_log_filename, 'w+')
|
|
test_log_file = open(test_log_filename, 'w+')
|
|
client_process = None
|
|
client_process = None
|
|
- if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE:
|
|
|
|
- wait_for_config_propagation(
|
|
|
|
- gcp, instance_group,
|
|
|
|
- args.client_cmd.format(server_uri=server_uri,
|
|
|
|
- stats_port=args.stats_port,
|
|
|
|
- qps=args.qps,
|
|
|
|
- fail_on_failed_rpc=False),
|
|
|
|
- client_env)
|
|
|
|
- fail_on_failed_rpc = '--fail_on_failed_rpc=true'
|
|
|
|
- else:
|
|
|
|
- fail_on_failed_rpc = '--fail_on_failed_rpc=false'
|
|
|
|
|
|
+ # TODO(ericgribkoff) Temporarily disable fail_on_failed_rpc checks
|
|
|
|
+ # in the client. This means we will ignore intermittent RPC
|
|
|
|
+ # failures (but this framework still checks that the final result
|
|
|
|
+ # is as expected).
|
|
|
|
+ #
|
|
|
|
+ # Reason for disabling this is, the resources are shared by
|
|
|
|
+ # multiple tests, and a change in previous test could be delayed
|
|
|
|
+ # until the second test starts. The second test may see
|
|
|
|
+ # intermittent failures because of that.
|
|
|
|
+ #
|
|
|
|
+ # A fix is to not share resources between tests (though that does
|
|
|
|
+ # mean the tests will be significantly slower due to creating new
|
|
|
|
+ # resources).
|
|
|
|
+ fail_on_failed_rpc = ''
|
|
client_cmd = shlex.split(
|
|
client_cmd = shlex.split(
|
|
args.client_cmd.format(server_uri=server_uri,
|
|
args.client_cmd.format(server_uri=server_uri,
|
|
stats_port=args.stats_port,
|
|
stats_port=args.stats_port,
|
|
@@ -1535,10 +1537,6 @@ try:
|
|
elif test_case == 'gentle_failover':
|
|
elif test_case == 'gentle_failover':
|
|
test_gentle_failover(gcp, backend_service, instance_group,
|
|
test_gentle_failover(gcp, backend_service, instance_group,
|
|
secondary_zone_instance_group)
|
|
secondary_zone_instance_group)
|
|
- elif test_case == 'new_instance_group_receives_traffic':
|
|
|
|
- test_new_instance_group_receives_traffic(
|
|
|
|
- gcp, backend_service, instance_group,
|
|
|
|
- same_zone_instance_group)
|
|
|
|
elif test_case == 'ping_pong':
|
|
elif test_case == 'ping_pong':
|
|
test_ping_pong(gcp, backend_service, instance_group)
|
|
test_ping_pong(gcp, backend_service, instance_group)
|
|
elif test_case == 'remove_instance_group':
|
|
elif test_case == 'remove_instance_group':
|