// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

// Measures the cost of acquiring and releasing a Mutex around an empty
// critical section, both single-threaded and with one thread per CPU.
void BM_Mutex(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    absl::MutexLock lock(mu);
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

// Busy-waits for approximately `ns` nanoseconds, converting the duration to
// cycles via the cycle clock and touching `*data` so the loop isn't
// optimized away.
static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

// Generic RAII lock/unlock wrapper for lock types that spell the operations
// Lock()/Unlock() (absl::Mutex, absl::base_internal::SpinLock).
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }

 private:
  MutexType* mu_;
};

// Specialization for std::mutex, which spells them lock()/unlock().
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }

 private:
  std::mutex* mu_;
};

// RAII object to change the Mutex priority of the running thread.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};
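
// Illustrative usage sketch (not one of the registered benchmarks below):
// the helper is meant to live on the stack for the duration of a benchmark
// body, so the override applies to the whole measurement and is undone on
// scope exit.
//
//   void BM_Example(benchmark::State& state) {     // hypothetical benchmark
//     ScopedThreadMutexPriority p(/*priority=*/1); // elevated Mutex priority
//     for (auto _ : state) {
//       // ... contended Mutex work ...
//     }
//   }  // destructor re-enables OS priority reads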
void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated
  // priority. This benchmarks the performance impact of the presence of a
  // low priority waiter when a higher priority waiter adds itself to the
  // queue (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index != 0) ? 1 : 0);

  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};
    std::atomic<int> blocked_threads{0};
    std::atomic<bool> thread_has_mutex{false};
  };
  static Shared* shared = new Shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex,
  // not Abseil synchronization primitives. If at some point the benchmark
  // library merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the
    // threads complete their benchmark loop together before any of the
    // threads exit the loop. So, we need to remove ourselves from the
    // "looping threads" counter here before potentially blocking on that
    // barrier. Otherwise, another thread spinning above might wait forever
    // for this thread to block on the mutex while we in fact are waiting
    // to exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}
BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);
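
// The registration above expands into one run per (threads,
// multiple_priorities) combination. With the usual Google Benchmark naming
// convention (the exact format can vary by library version), the reported
// names look roughly like:
//
//   BM_MutexEnqueue/multiple_priorities:0/threads:4
//   BM_MutexEnqueue/multiple_priorities:1/threads:512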
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  int priority = state.thread_index % state.range(1);
  ScopedThreadMutexPriority priority_setter(priority);

  struct Shared {
    MutexType mu;
    int data = 0;
  };
  static auto* shared = new Shared;
  int local = 0;
  for (auto _ : state) {
    // Here we model both local work outside of the critical section as well
    // as some work inside of the critical section. The idea is to capture
    // some more or less realistic contention levels.
    // If contention is too low, the benchmark won't measure anything useful.
    // If contention is unrealistically high, the benchmark will favor
    // bad mutex implementations that block and otherwise distract threads
    // from the mutex and shared state as much as possible.
    // To achieve this, the amount of local work is multiplied by the number
    // of threads to keep the ratio between local work and the critical
    // section approximately constant regardless of the number of threads.
    DelayNs(100 * state.threads, &local);
    RaiiLocker<MutexType> locker(&shared->mu);
    DelayNs(state.range(0), &shared->data);
  }
}

void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
                        bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  bm->UseRealTime()
      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
      ->Threads(1)
      ->Threads(2)
      ->Threads(4)
      ->Threads(6)
      ->Threads(8)
      ->Threads(12)
      ->Threads(16)
      ->Threads(24)
      ->Threads(32)
      ->Threads(48)
      ->Threads(64)
      ->Threads(96)
      ->Threads(128)
      ->Threads(192)
      ->Threads(256)
      ->ArgNames({"cs_ns", "num_prios"});
  // Some empirically chosen amounts of work in the critical section:
  // 1 is low contention, 2000 is high contention, with a few values in
  // between.
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      bm->ArgPair(critical_section_ns, num_priorities);
    }
  }
}

BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });
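
// Adding another lock type to this comparison should, in principle, only
// require one more registration: the generic RaiiLocker handles any type
// with Lock()/Unlock(), and other interfaces need a specialization like the
// std::mutex one above. Hypothetical sketch (MyLock is not a real type in
// this codebase):
//
//   BENCHMARK_TEMPLATE(BM_Contended, MyLock)
//       ->Apply([](benchmark::internal::Benchmark* bm) {
//         SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
//       });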
// Measure the overhead of conditions on mutex release (when they must be
// evaluated). Mutex has (some) support for equivalence classes allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  // Satisfy every waiter's Condition so the thread pool can drain before it
  // is destroyed.
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}

// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);
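
// RangePair sweeps both arguments: num_classes over the small range from 0
// (the all-distinct special case above) to 2, and num_waiters from 1 up to
// kMaxConditionWaiters in multiplicative steps. The exact set of generated
// pairs depends on the benchmark library's default range multiplier, so this
// is a rough description rather than an exhaustive list.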

}  // namespace