@@ -48,7 +48,6 @@ _TEST_CASES = [
     'backends_restart',
     'change_backend_service',
     'gentle_failover',
-    'new_instance_group_receives_traffic',
     'ping_pong',
     'remove_instance_group',
     'round_robin',
@@ -247,9 +246,7 @@ _BOOTSTRAP_TEMPLATE = """
 # TODO(ericgribkoff) Add change_backend_service to this list once TD no longer
 # sends an update with no localities when adding the MIG to the backend service
 # can race with the URL map patch.
-_TESTS_TO_FAIL_ON_RPC_FAILURE = [
-    'new_instance_group_receives_traffic', 'ping_pong', 'round_robin'
-]
+_TESTS_TO_FAIL_ON_RPC_FAILURE = ['ping_pong', 'round_robin']
 # Tests that run UnaryCall and EmptyCall.
 _TESTS_TO_RUN_MULTIPLE_RPCS = ['path_matching', 'header_matching']
 # Tests that make UnaryCall with test metadata.
@@ -498,32 +495,6 @@ def test_gentle_failover(gcp,
                                                  _WAIT_FOR_BACKEND_SEC)
 
 
-def test_new_instance_group_receives_traffic(gcp, backend_service,
-                                             instance_group,
-                                             same_zone_instance_group):
-    logger.info('Running test_new_instance_group_receives_traffic')
-    instance_names = get_instance_names(gcp, instance_group)
-    # TODO(ericgribkoff) Reduce this timeout. When running sequentially, this
-    # occurs after patching the url map in test_change_backend_service, so we
-    # need the extended timeout here as well.
-    wait_until_all_rpcs_go_to_given_backends(instance_names,
-                                             _WAIT_FOR_URL_MAP_PATCH_SEC)
-    try:
-        patch_backend_instances(gcp,
-                                backend_service,
-                                [instance_group, same_zone_instance_group],
-                                balancing_mode='RATE')
-        wait_for_healthy_backends(gcp, backend_service, instance_group)
-        wait_for_healthy_backends(gcp, backend_service,
-                                  same_zone_instance_group)
-        combined_instance_names = instance_names + get_instance_names(
-            gcp, same_zone_instance_group)
-        wait_until_all_rpcs_go_to_given_backends(combined_instance_names,
-                                                 _WAIT_FOR_BACKEND_SEC)
-    finally:
-        patch_backend_instances(gcp, backend_service, [instance_group])
-
-
 def test_ping_pong(gcp, backend_service, instance_group):
     logger.info('Running test_ping_pong')
     wait_for_healthy_backends(gcp, backend_service, instance_group)
@@ -546,12 +517,30 @@ def test_remove_instance_group(gcp, backend_service, instance_group,
         instance_names = get_instance_names(gcp, instance_group)
         same_zone_instance_names = get_instance_names(gcp,
                                                       same_zone_instance_group)
-        wait_until_all_rpcs_go_to_given_backends(
-            instance_names + same_zone_instance_names, _WAIT_FOR_BACKEND_SEC)
+        try:
+            wait_until_all_rpcs_go_to_given_backends(
+                instance_names + same_zone_instance_names,
+                _WAIT_FOR_OPERATION_SEC)
+            remaining_instance_group = same_zone_instance_group
+            remaining_instance_names = same_zone_instance_names
+        except RpcDistributionError as e:
+            # If connected to TD in a different zone, we may route traffic to
+            # only one instance group. Determine which group that is to continue
+            # with the remainder of the test case.
+            try:
+                wait_until_all_rpcs_go_to_given_backends(
+                    instance_names, _WAIT_FOR_STATS_SEC)
+                remaining_instance_group = same_zone_instance_group
+                remaining_instance_names = same_zone_instance_names
+            except RpcDistributionError as e:
+                wait_until_all_rpcs_go_to_given_backends(
+                    same_zone_instance_names, _WAIT_FOR_STATS_SEC)
+                remaining_instance_group = instance_group
+                remaining_instance_names = instance_names
         patch_backend_instances(gcp,
-                                backend_service, [same_zone_instance_group],
+                                backend_service, [remaining_instance_group],
                                 balancing_mode='RATE')
-        wait_until_all_rpcs_go_to_given_backends(same_zone_instance_names,
+        wait_until_all_rpcs_go_to_given_backends(remaining_instance_names,
                                                  _WAIT_FOR_BACKEND_SEC)
     finally:
         patch_backend_instances(gcp, backend_service, [instance_group])
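The nested try/except added above handles the case where Traffic Director is still routing all traffic to a single zone: it first waits for RPCs to reach both groups and, only on failure, probes each group individually to learn which one is serving, so the test can still remove a group and verify that traffic shifts. Below is a minimal sketch of that decision factored into a standalone helper; the helper name is hypothetical and not part of this change, and it assumes the script's existing wait_until_all_rpcs_go_to_given_backends and RpcDistributionError.

```python
# Hypothetical helper (illustration only, not in the patch): pick the instance
# group that should remain in the backend service, mirroring the fallback
# order used in test_remove_instance_group above.
def choose_remaining_group(instance_group, instance_names,
                           same_zone_instance_group, same_zone_instance_names,
                           timeout_sec):
    try:
        # Normal case: both groups already receive traffic, so drop the
        # original group and keep the same-zone one.
        wait_until_all_rpcs_go_to_given_backends(
            instance_names + same_zone_instance_names, timeout_sec)
        return same_zone_instance_group, same_zone_instance_names
    except RpcDistributionError:
        try:
            # Only the original group is serving; keep the same-zone group so
            # that removal still forces a traffic shift.
            wait_until_all_rpcs_go_to_given_backends(instance_names,
                                                     timeout_sec)
            return same_zone_instance_group, same_zone_instance_names
        except RpcDistributionError:
            # Otherwise all traffic is on the same-zone group; keep the
            # original group instead.
            wait_until_all_rpcs_go_to_given_backends(same_zone_instance_names,
                                                     timeout_sec)
            return instance_group, instance_names
```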
@@ -566,17 +555,27 @@ def test_round_robin(gcp, backend_service, instance_group):
     threshold = 1
     wait_until_all_rpcs_go_to_given_backends(instance_names,
                                              _WAIT_FOR_STATS_SEC)
-    stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
-    requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
-    total_requests_received = sum(requests_received)
-    if total_requests_received != _NUM_TEST_RPCS:
-        raise Exception('Unexpected RPC failures', stats)
-    expected_requests = total_requests_received / len(instance_names)
-    for instance in instance_names:
-        if abs(stats.rpcs_by_peer[instance] - expected_requests) > threshold:
-            raise Exception(
-                'RPC peer distribution differs from expected by more than %d '
-                'for instance %s (%s)', threshold, instance, stats)
+    # TODO(ericgribkoff) Delayed config propagation from earlier tests
+    # may result in briefly receiving an empty EDS update, resulting in failed
+    # RPCs. Retry distribution validation if this occurs; long-term fix is
+    # creating new backend resources for each individual test case.
+    max_attempts = 10
+    for i in range(max_attempts):
+        stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
+        requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
+        total_requests_received = sum(requests_received)
+        if total_requests_received != _NUM_TEST_RPCS:
+            logger.info('Unexpected RPC failures, retrying: %s', stats)
+            continue
+        expected_requests = total_requests_received / len(instance_names)
+        for instance in instance_names:
+            if abs(stats.rpcs_by_peer[instance] -
+                   expected_requests) > threshold:
+                raise Exception(
+                    'RPC peer distribution differs from expected by more than %d '
+                    'for instance %s (%s)' % (threshold, instance, stats))
+        return
+    raise Exception('RPC failures persisted through %d retries' % max_attempts)
 
 
 def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
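The retry loop above tolerates brief windows where an empty EDS update causes some RPCs to fail, then validates that the successful RPCs are spread evenly, allowing each backend to deviate from the mean by at most the threshold of 1. A standalone sketch of that arithmetic with assumed numbers (the peer names and RPC count below are made up; the real test reads them from client stats):

```python
# Illustration only: hypothetical values standing in for _NUM_TEST_RPCS and
# the rpcs_by_peer map returned by get_client_stats().
num_rpcs = 99
rpcs_by_peer = {'vm-a': 34, 'vm-b': 33, 'vm-c': 32}
threshold = 1

total_received = sum(rpcs_by_peer.values())
assert total_received == num_rpcs  # any shortfall means failed RPCs; the test retries
expected = total_received / len(rpcs_by_peer)  # 33.0 RPCs per backend
for peer, got in rpcs_by_peer.items():
    # Deviations here are 1, 0, and 1, all within the threshold, so this
    # distribution passes; a peer with 36 RPCs would fail the check.
    assert abs(got - expected) <= threshold
```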
@@ -1750,25 +1749,20 @@ try:
                 # metadata arg is not specified.
                 metadata_to_send = ''
 
-            if test_case in _TESTS_TO_FAIL_ON_RPC_FAILURE:
-                # TODO(ericgribkoff) Unconditional wait is recommended by TD
-                # team when reusing backend resources after config changes
-                # between test cases, as we are doing here. This should address
-                # flakiness issues with these tests; other attempts to deflake
-                # (such as waiting for the first successful RPC before failing
-                # on any subsequent failures) were insufficient because, due to
-                # propagation delays, we may initially see an RPC succeed to the
-                # expected backends but due to a stale configuration: e.g., test
-                # A (1) routes traffic to MIG A, then (2) switches to MIG B,
-                # then (3) back to MIG A. Test B begins running and sees RPCs
-                # going to MIG A, as expected. However, due to propagation
-                # delays, Test B is actually seeing the stale config from step
-                # (1), and then fails when it gets update (2) unexpectedly
-                # switching to MIG B.
-                time.sleep(200)
-                fail_on_failed_rpc = '--fail_on_failed_rpc=true'
-            else:
-                fail_on_failed_rpc = '--fail_on_failed_rpc=false'
+            # TODO(ericgribkoff) Temporarily disable fail_on_failed_rpc checks
+            # in the client. This means we will ignore intermittent RPC
+            # failures (but this framework still checks that the final result
+            # is as expected).
+            #
+            # Reason for disabling this is, the resources are shared by
+            # multiple tests, and a change in previous test could be delayed
+            # until the second test starts. The second test may see
+            # intermittent failures because of that.
+            #
+            # A fix is to not share resources between tests (though that does
+            # mean the tests will be significantly slower due to creating new
+            # resources).
+            fail_on_failed_rpc = ''
 
             client_cmd_formatted = args.client_cmd.format(
                 server_uri=server_uri,
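With the per-test conditional removed above, fail_on_failed_rpc is always an empty string when the client command is formatted, so the flag is simply not passed to the client and intermittent RPC failures no longer abort a run; the framework's own distribution checks remain the backstop. A hypothetical illustration of how an empty placeholder behaves during formatting (the template text and --qps flag are invented; only the server_uri and fail_on_failed_rpc names come from the script):

```python
# Illustration only: a made-up client command template showing that an empty
# fail_on_failed_rpc value drops the flag from the final command line.
client_cmd_template = ('./xds_interop_client --server={server_uri} '
                       '{fail_on_failed_rpc} --qps={qps}')
print(client_cmd_template.format(server_uri='xds:///my-test-service',
                                 fail_on_failed_rpc='',  # disabled by this change
                                 qps=100))
# -> ./xds_interop_client --server=xds:///my-test-service  --qps=100
```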
@@ -1794,10 +1788,6 @@ try:
                 elif test_case == 'gentle_failover':
                     test_gentle_failover(gcp, backend_service, instance_group,
                                          secondary_zone_instance_group)
-                elif test_case == 'new_instance_group_receives_traffic':
-                    test_new_instance_group_receives_traffic(
-                        gcp, backend_service, instance_group,
-                        same_zone_instance_group)
                 elif test_case == 'ping_pong':
                     test_ping_pong(gcp, backend_service, instance_group)
                 elif test_case == 'remove_instance_group':