Explorar el Código

Merge pull request #1464 from google/benchmarks

Added framework for generating/consuming benchmarking data sets.
Joshua Haberman hace 9 años
padre
commit
07bcf21a9c

+ 5 - 1
Makefile.am

@@ -9,7 +9,7 @@ AUTOMAKE_OPTIONS = foreign
 SUBDIRS = . src
 
 # Always include gmock in distributions.
-DIST_SUBDIRS = $(subdirs) src conformance
+DIST_SUBDIRS = $(subdirs) src conformance benchmarks
 
 # Build gmock before we build protobuf tests.  We don't add gmock to SUBDIRS
 # because then "make check" would also build and run all of gmock's own tests,
@@ -36,6 +36,10 @@ clean-local:
 	  echo "Making clean in conformance"; \
 	  cd conformance && $(MAKE) $(AM_MAKEFLAGS) clean; \
 	fi; \
+	if test -e benchmarks/Makefile; then \
+	  echo "Making clean in benchmarks"; \
+	  cd benchmarks && $(MAKE) $(AM_MAKEFLAGS) clean; \
+	fi; \
 	if test -e objectivec/DevTools; then \
 	  echo "Cleaning any ObjC pyc files"; \
 	  rm -f objectivec/DevTools/*.pyc; \

+ 66 - 0
benchmarks/Makefile.am

@@ -0,0 +1,66 @@
+
+# .proto files compiled by the protoc_middleman rule.
+benchmarks_protoc_inputs =                                     \
+  benchmarks.proto                                             \
+  benchmark_messages_proto3.proto
+
+# .proto files compiled by the protoc_middleman2 rule.
+benchmarks_protoc_inputs_proto2 =                              \
+  benchmark_messages_proto2.proto
+
+# C++ files generated from $(benchmarks_protoc_inputs).  They are listed as
+# nodist sources of generate-datasets and removed by "make clean".
+benchmarks_protoc_outputs =                                    \
+  benchmarks.pb.cc                                             \
+  benchmarks.pb.h                                              \
+  benchmark_messages_proto3.pb.cc                              \
+  benchmark_messages_proto3.pb.h
+
+# C++ files generated from $(benchmarks_protoc_inputs_proto2).
+benchmarks_protoc_outputs_proto2 =                             \
+  benchmark_messages_proto2.pb.cc                              \
+  benchmark_messages_proto2.pb.h
+
+# generate-datasets is built from generate_datasets.cc plus the generated
+# protobuf sources for the benchmark schemas.
+bin_PROGRAMS = generate-datasets
+
+# libprotobuf.la is produced by the build, so it lives in the build tree
+# ($(top_builddir)), not the source tree.  Using $(top_srcdir) here only works
+# when building in-tree, where the two directories coincide.
+generate_datasets_LDADD = $(top_builddir)/src/libprotobuf.la
+generate_datasets_SOURCES = generate_datasets.cc
+generate_datasets_CPPFLAGS = -I$(top_srcdir)/src -I$(srcdir)
+# Generated files must not be shipped in the distribution tarball.
+nodist_generate_datasets_SOURCES =                             \
+  $(benchmarks_protoc_outputs)                                 \
+  $(benchmarks_protoc_outputs_proto2)
+
+# Explicit deps: generate_datasets.cc includes the generated benchmarks.pb.h,
+# so the header must exist before the object file is compiled.  BUILT_SOURCES
+# are only built before a "make all/check", so building the object directly
+# (or with enough parallelism) could otherwise fail.
+# See: https://www.gnu.org/software/automake/manual/html_node/Built-Sources-Example.html#Recording-Dependencies-manually
+generate_datasets-generate_datasets.$(OBJEXT): benchmarks.pb.h
+
+# The generated sources are produced as a side effect of the "middleman" stamp
+# rules defined further down, one per group of .proto inputs.
+$(benchmarks_protoc_outputs): protoc_middleman
+$(benchmarks_protoc_outputs_proto2): protoc_middleman2
+
+# Removed by "make clean": the generated protobuf sources, the middleman stamp
+# files, and the dataset.* files written by generate-datasets.
+CLEANFILES =                                                   \
+  $(benchmarks_protoc_outputs)                                 \
+  $(benchmarks_protoc_outputs_proto2)                          \
+  protoc_middleman                                             \
+  protoc_middleman2                                            \
+  dataset.*
+
+if USE_EXTERNAL_PROTOC
+
+# External protoc: generate the C++ sources into the current (build)
+# directory, then refresh the stamp file named by the target.
+protoc_middleman: $(benchmarks_protoc_inputs)
+	$(PROTOC) -I$(srcdir) -I$(top_srcdir) --cpp_out=. $(benchmarks_protoc_inputs)
+	touch $@
+
+# Same as above, for the proto2 benchmark messages.
+protoc_middleman2: $(benchmarks_protoc_inputs_proto2)
+	$(PROTOC) -I$(srcdir) -I$(top_srcdir) --cpp_out=. $(benchmarks_protoc_inputs_proto2)
+	touch $@
+
+else
+
+# protoc is run from $(srcdir) because $(benchmarks_protoc_inputs) names files
+# relative to srcdir, which differs from the current directory in an
+# out-of-tree build.  The original directory is remembered in $$oldpwd so the
+# freshly built protoc can be located and the output written back there.
+protoc_middleman: $(top_srcdir)/src/protoc$(EXEEXT) $(benchmarks_protoc_inputs) $(well_known_type_protoc_inputs)
+	oldpwd=`pwd` && ( cd $(srcdir) && \
+	  $$oldpwd/../src/protoc$(EXEEXT) -I. -I$(top_srcdir)/src \
+	    --cpp_out=$$oldpwd $(benchmarks_protoc_inputs) )
+	touch $@
+
+# Generates the proto2 benchmark message sources with the in-tree protoc.
+# The stamp touched must be this rule's own target ($@ = protoc_middleman2);
+# touching protoc_middleman instead would leave this target permanently
+# missing, so protoc would be re-run on every build.
+protoc_middleman2: $(top_srcdir)/src/protoc$(EXEEXT) $(benchmarks_protoc_inputs_proto2) $(well_known_type_protoc_inputs)
+	oldpwd=`pwd` && ( cd $(srcdir) && $$oldpwd/../src/protoc$(EXEEXT) -I. -I$(top_srcdir)/src --cpp_out=$$oldpwd $(benchmarks_protoc_inputs_proto2) )
+	touch $@
+
+endif

+ 28 - 0
benchmarks/README.md

@@ -0,0 +1,28 @@
+
+# Protocol Buffers Benchmarks
+
+This directory contains benchmarking schemas and data sets that you
+can use to test a variety of performance scenarios against your
+protobuf language runtime.
+
+The schema for the datasets is described in `benchmarks.proto`.
+
+Generate the data sets like so:
+
+```
+$ make
+$ ./generate-datasets
+Wrote dataset: dataset.google_message1_proto3.pb
+Wrote dataset: dataset.google_message1_proto2.pb
+Wrote dataset: dataset.google_message2.pb
+$
+```
+
+Each data set will be written to its own file.  Benchmark runners will
+likely want to run several benchmarks against each data set (parse,
+serialize, possibly JSON, possibly using different APIs, etc.).
+
+We would like to add more data sets.  In general we will favor data sets
+that make the overall suite diverse without being too large or having
+too many similar tests.  Ideally everyone can run through the entire
+suite without the test run getting too long.

+ 11 - 8
benchmarks/google_speed.proto → benchmarks/benchmark_messages_proto2.proto

@@ -1,11 +1,14 @@
+// Benchmark messages for proto2.
+
 syntax = "proto2";
 
-package benchmarks;
+package benchmarks.proto2;
+option java_package = "com.google.protobuf.benchmarks";
 
-option java_outer_classname = "GoogleSpeed";
+// This is the default, but we specify it here explicitly.
 option optimize_for = SPEED;
 
-message SpeedMessage1 {
+message GoogleMessage1 {
   required string field1 = 1;
   optional string field9 = 9;
   optional string field18 = 18;
@@ -40,7 +43,7 @@ message SpeedMessage1 {
   optional int32 field23 = 23 [default=0];
   optional bool field24 = 24 [default=false];
   optional int32 field25 = 25 [default=0];
-  optional SpeedMessage1SubMessage field15 = 15;
+  optional GoogleMessage1SubMessage field15 = 15;
   optional bool field78 = 78;
   optional int32 field67 = 67 [default=0];
   optional int32 field68 = 68;
@@ -49,7 +52,7 @@ message SpeedMessage1 {
   optional int32 field131 = 131 [default=0];
 }
 
-message SpeedMessage1SubMessage {
+message GoogleMessage1SubMessage {
   optional int32 field1 = 1 [default=0];
   optional int32 field2 = 2 [default=0];
   optional int32 field3 = 3 [default=0];
@@ -72,7 +75,7 @@ message SpeedMessage1SubMessage {
   optional uint64 field300 = 300;
 }
 
-message SpeedMessage2 {
+message GoogleMessage2 {
   optional string field1 = 1;
   optional int64 field3 = 3;
   optional int64 field4 = 4;
@@ -112,7 +115,7 @@ message SpeedMessage2 {
     repeated int32 field73 = 73;
     optional int32 field20 = 20 [default=0];
     optional string field24 = 24;
-    optional SpeedMessage2GroupedMessage field31 = 31;
+    optional GoogleMessage2GroupedMessage field31 = 31;
   }
   repeated string field128 = 128;
   optional int64 field131 = 131;
@@ -123,7 +126,7 @@ message SpeedMessage2 {
   optional bool field206 = 206 [default=false];
 }
 
-message SpeedMessage2GroupedMessage {
+message GoogleMessage2GroupedMessage {
   optional float field1 = 1;
   optional float field2 = 2;
   optional float field3 = 3 [default=0.0];

+ 76 - 0
benchmarks/benchmark_messages_proto3.proto

@@ -0,0 +1,76 @@
+// Benchmark messages for proto3.
+
+syntax = "proto3";
+
+package benchmarks.proto3;
+option java_package = "com.google.protobuf.benchmarks";
+
+// This is the default, but we specify it here explicitly.
+option optimize_for = SPEED;
+
+// proto3 benchmark message.  generate_datasets.cc parses google_message1.dat
+// as benchmarks.proto3.GoogleMessage1 to build the google_message1_proto3
+// dataset.  All fields are singular scalars or strings, except for the
+// repeated fixed64 field5 and the submessage field15.
+message GoogleMessage1 {
+  string field1 = 1;
+  string field9 = 9;
+  string field18 = 18;
+  bool field80 = 80;
+  bool field81 = 81;
+  int32 field2 = 2;
+  int32 field3 = 3;
+  int32 field280 = 280;
+  int32 field6 = 6;
+  int64 field22 = 22;
+  string field4 = 4;
+  repeated fixed64 field5 = 5;
+  bool field59 = 59;
+  string field7 = 7;
+  int32 field16 = 16;
+  int32 field130 = 130;
+  bool field12 = 12;
+  bool field17 = 17;
+  bool field13 = 13;
+  bool field14 = 14;
+  int32 field104 = 104;
+  int32 field100 = 100;
+  int32 field101 = 101;
+  string field102 = 102;
+  string field103 = 103;
+  int32 field29 = 29;
+  bool field30 = 30;
+  int32 field60 = 60;
+  int32 field271 = 271;
+  int32 field272 = 272;
+  int32 field150 = 150;
+  int32 field23 = 23;
+  bool field24 = 24;
+  int32 field25 = 25;
+  GoogleMessage1SubMessage field15 = 15;
+  bool field78 = 78;
+  int32 field67 = 67;
+  int32 field68 = 68;
+  int32 field128 = 128;
+  string field129 = 129;
+  int32 field131 = 131;
+}
+
+// Submessage type used by GoogleMessage1.field15.
+message GoogleMessage1SubMessage {
+  int32 field1 = 1;
+  int32 field2 = 2;
+  int32 field3 = 3;
+  string field15 = 15;
+  bool field12 = 12;
+  int64 field13 = 13;
+  int64 field14 = 14;
+  int32 field16 = 16;
+  int32 field19 = 19;
+  bool field20  = 20;
+  bool field28 = 28;
+  fixed64 field21 = 21;
+  int32 field22 = 22;
+  bool field23 = 23;
+  bool field206 = 206;
+  fixed32 field203 = 203;
+  int32 field204 = 204;
+  string field205 = 205;
+  uint64 field207 = 207;
+  uint64 field300 = 300;
+}

+ 63 - 0
benchmarks/benchmarks.proto

@@ -0,0 +1,63 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+syntax = "proto3";
+package benchmarks;
+option java_package = "com.google.protobuf.benchmarks";
+
+// Container for one benchmark dataset: a name, the message type the payloads
+// belong to, and the serialized payloads themselves.
+message BenchmarkDataset {
+  // Name of the benchmark dataset.  This should be unique across all datasets.
+  // Should only contain word characters: [a-zA-Z0-9_]
+  string name = 1;
+
+  // Fully-qualified name of the protobuf message for this dataset.
+  // It will be one of the messages defined in benchmark_messages_proto2.proto
+  // or benchmark_messages_proto3.proto.
+  //
+  // Implementations that do not support reflection can implement this with
+  // an explicit "if/else" chain that lists every known message defined
+  // in those files.
+  string message_name = 2;
+
+  // The payload(s) for this dataset.  They should be parsed or serialized
+  // in sequence, in a loop, i.e.
+  //
+  //  while (!benchmarkDone) {  // Benchmark runner decides when to exit.
+  //    for (i = 0; i < benchmark.payload.length; i++) {
+  //      parse(benchmark.payload[i])
+  //    }
+  //  }
+  //
+  // This is intended to let datasets include a variety of data to provide
+  // potentially more realistic results than just parsing the same message
+  // over and over.  A single message parsed repeatedly could yield unusually
+  // good branch prediction performance.
+  repeated bytes payload = 3;
+}

+ 117 - 0
benchmarks/generate_datasets.cc

@@ -0,0 +1,117 @@
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <set>
+#include <string>
+#include <vector>
+#include "benchmarks.pb.h"
+
+using benchmarks::BenchmarkDataset;
+using google::protobuf::Descriptor;
+using google::protobuf::DescriptorPool;
+using google::protobuf::Message;
+using google::protobuf::MessageFactory;
+
+// Dataset names written so far; WriteFileWithPayloads() inserts into this set
+// and aborts when a name is seen twice.
+std::set<std::string> names;
+
+// Output file names are built as file_prefix + <dataset name> + file_suffix.
+const char *file_prefix = "dataset.";
+const char *file_suffix = ".pb";
+
+// Validates |payload| against the message type |message_name| and writes the
+// resulting BenchmarkDataset to "dataset.<name>.pb" in the current directory.
+//
+// name:         dataset name; must not have been used by an earlier call.
+// message_name: fully-qualified name of a message linked into this binary.
+// payload:      serialized messages of type |message_name|.
+//
+// Aborts the process if the name is a duplicate, the message type is unknown,
+// any payload fails to parse, or the output file cannot be written.
+void WriteFileWithPayloads(const std::string& name,
+                           const std::string& message_name,
+                           const std::vector<std::string>& payload) {
+  if (!names.insert(name).second) {
+    std::cerr << "Duplicate test name: " << name << "\n";
+    abort();
+  }
+
+  // First verify that this message name exists in our set of benchmark messages
+  // and that these payloads are valid for the given message.
+  const Descriptor* d =
+      DescriptorPool::generated_pool()->FindMessageTypeByName(message_name);
+
+  if (!d) {
+    std::cerr << "For dataset " << name << ", no such message: "
+              << message_name << "\n";
+    abort();
+  }
+
+  Message* m = MessageFactory::generated_factory()->GetPrototype(d)->New();
+
+  for (size_t i = 0; i < payload.size(); i++) {
+    if (!m->ParseFromString(payload[i])) {
+      std::cerr << "For dataset " << name << ", payload[" << i << "] fails "
+                << "to parse\n";
+      abort();
+    }
+  }
+
+  // New() returned an owned object that was only needed for validation.
+  delete m;
+
+  BenchmarkDataset dataset;
+  dataset.set_name(name);
+  dataset.set_message_name(message_name);
+  for (size_t i = 0; i < payload.size(); i++) {
+    dataset.add_payload()->assign(payload[i]);
+  }
+
+  std::string fname = file_prefix + name + file_suffix;
+
+  // Serialized protobufs are binary data: open in binary mode so the bytes are
+  // written verbatim on platforms that translate newlines in text mode.
+  std::ofstream writer(fname.c_str(), std::ios::out | std::ios::binary);
+  bool ok = writer.is_open() && dataset.SerializeToOstream(&writer);
+  if (ok) {
+    // close() flushes; a failure here means the data did not reach the file.
+    writer.close();
+    ok = !writer.fail();
+  }
+  if (!ok) {
+    std::cerr << "For dataset " << name << ", failed to write file: "
+              << fname << "\n";
+    abort();
+  }
+
+  std::cerr << "Wrote dataset: " << fname << "\n";
+}
+
+// Convenience wrapper around WriteFileWithPayloads() for a dataset that
+// consists of exactly one payload.
+void WriteFile(const std::string& name, const std::string& message_name,
+               const std::string& payload) {
+  const std::vector<std::string> single_payload(1, payload);
+  WriteFileWithPayloads(name, message_name, single_payload);
+}
+
+// Returns the entire contents of the file |name|.
+//
+// The file is opened in binary mode because callers pass the contents on as
+// serialized protobuf payloads; text mode could alter the bytes on platforms
+// that translate newlines.  Fails a GOOGLE_CHECK if the file cannot be opened.
+std::string ReadFile(const std::string& name) {
+  std::ifstream file(name.c_str(), std::ios::in | std::ios::binary);
+  GOOGLE_CHECK(file.is_open()) << "Couldn't find file '" << name <<
+                                  "', please make sure you are running "
+                                  "this command from the benchmarks/ "
+                                  "directory.\n";
+  return std::string((std::istreambuf_iterator<char>(file)),
+                     std::istreambuf_iterator<char>());
+}
+
+// Writes one dataset file per (dataset name, message type) pair, reading each
+// payload from a .dat file in the current directory.
+int main() {
+  // The same payload file is used for the proto3 and proto2 variants of
+  // GoogleMessage1.
+  const std::string message1_for_proto3 = ReadFile("google_message1.dat");
+  WriteFile("google_message1_proto3", "benchmarks.proto3.GoogleMessage1",
+            message1_for_proto3);
+
+  const std::string message1_for_proto2 = ReadFile("google_message1.dat");
+  WriteFile("google_message1_proto2", "benchmarks.proto2.GoogleMessage1",
+            message1_for_proto2);
+
+  // Not in proto3 because it has a group, which is not supported.
+  const std::string message2 = ReadFile("google_message2.dat");
+  WriteFile("google_message2", "benchmarks.proto2.GoogleMessage2", message2);
+
+  return 0;
+}

+ 1 - 1
configure.ac

@@ -180,5 +180,5 @@ export CFLAGS
 export CXXFLAGS
 AC_CONFIG_SUBDIRS([gmock])
 
-AC_CONFIG_FILES([Makefile src/Makefile conformance/Makefile protobuf.pc protobuf-lite.pc])
+AC_CONFIG_FILES([Makefile src/Makefile benchmarks/Makefile conformance/Makefile protobuf.pc protobuf-lite.pc])
 AC_OUTPUT

+ 3 - 0
tests.sh

@@ -36,6 +36,9 @@ build_cpp() {
   internal_build_cpp
   make check -j2
   cd conformance && make test_cpp && cd ..
+
+  # Verify benchmarking code can build successfully.
+  cd benchmarks && make && ./generate-datasets && cd ..
 }
 
 build_cpp_distcheck() {