TextTokenizer.cs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. // Protocol Buffers - Google's data interchange format
  2. // Copyright 2008 Google Inc.
  3. // http://code.google.com/p/protobuf/
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License");
  6. // you may not use this file except in compliance with the License.
  7. // You may obtain a copy of the License at
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. using System;
  17. using System.Globalization;
  18. using System.Text.RegularExpressions;
  19. namespace Google.ProtocolBuffers {
  20. /// <summary>
  21. /// Represents a stream of tokens parsed from a string.
  22. /// </summary>
  23. internal sealed class TextTokenizer {
  /// <summary>
  /// The complete input text; assigned once by the constructor.
  /// </summary>
  private readonly string text;

  /// <summary>
  /// The token the tokenizer is currently positioned on. NextToken() sets it
  /// to the empty string once the end of the text has been reached.
  /// </summary>
  private string currentToken;

  /// <summary>
  /// Index into the text at which the next regex match will be attempted.
  /// </summary>
  private int matchPos;

  /// <summary>
  /// Index into the text up to which line/column counting has progressed;
  /// after NextToken() this is where the current token begins.
  /// </summary>
  private int pos;

  /// <summary>
  /// Zero-based line number of the current token.
  /// </summary>
  private int line;

  /// <summary>
  /// Zero-based column number of the current token.
  /// </summary>
  private int column;

  /// <summary>
  /// Zero-based line number of the token that preceded the current one.
  /// </summary>
  private int previousLine;

  /// <summary>
  /// Zero-based column number of the token that preceded the current one.
  /// </summary>
  private int previousColumn;
  /// <summary>
  /// Matches a run of whitespace and '#' comments beginning exactly at the
  /// position the match is started from (\G anchor). A comment extends from
  /// '#' up to, but not including, the next newline; the newline itself is
  /// then consumed as ordinary whitespace. Not requiring the newline lets a
  /// comment on the final, unterminated line of the input be skipped as well.
  /// </summary>
  private static readonly Regex WhitespaceAndCommentPattern =
      new Regex("\\G(\\s|#[^\\n]*)+", RegexOptions.Compiled);

  /// <summary>
  /// Matches a single token beginning exactly at the position the match is
  /// started from (\G anchor): an identifier, a number, or a quoted string.
  /// A string body may contain any character other than its own quote
  /// character, a newline or a backslash, plus backslash escapes of any
  /// non-newline character. A string is also matched when, instead of the
  /// closing quote, it runs to the end of the input (optionally ending in a
  /// lone backslash) so that the caller can report the missing quote.
  /// </summary>
  private static readonly Regex TokenPattern = new Regex(
      "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +                      // an identifier
      "\\G[0-9+-][0-9a-zA-Z_.+-]*|" +                       // a number
      "\\G\"([^\"\\n\\\\]|\\\\[^\\n])*(\"|\\\\?$)|" +       // a double-quoted string
      // The body of a single-quoted string must exclude the single quote
      // (not the double quote); otherwise the match runs past the closing
      // quote and rejects embedded double quotes.
      "\\G'([^'\\n\\\\]|\\\\[^\\n])*('|\\\\?$)",            // a single-quoted string
      RegexOptions.Compiled);
  /// <summary>
  /// Constructs a tokenizer over the given text, skips any leading
  /// whitespace and comments, and reads the first token so that the
  /// tokenizer is immediately positioned on it.
  /// </summary>
  /// <param name="text">The text to tokenize; must not be null.</param>
  /// <exception cref="ArgumentNullException">
  /// Thrown when <paramref name="text"/> is null.
  /// </exception>
  public TextTokenizer(string text) {
    // Fail here, naming the constructor's own parameter, rather than from
    // inside the first regex match performed by SkipWhitespace().
    if (text == null) {
      throw new ArgumentNullException("text");
    }
    this.text = text;
    SkipWhitespace();
    NextToken();
  }
  /// <summary>
  /// True when the current token is empty, which is how NextToken() marks
  /// that the whole input has been consumed.
  /// </summary>
  public bool AtEnd {
    get {
      bool hasToken = currentToken.Length > 0;
      return !hasToken;
    }
  }
  /// <summary>
  /// Moves on to the next token. The position of the token being left is
  /// saved as the "previous" position, the line and column counters are
  /// advanced to the start of the new token, and the new token is read.
  /// At the end of the input the current token becomes the empty string.
  /// </summary>
  public void NextToken() {
    // Remember where the outgoing token was located.
    previousLine = line;
    previousColumn = column;

    // Account for every character consumed since the previous call so that
    // line/column describe the start of the token about to be read.
    for (; pos < matchPos; pos++) {
      if (text[pos] != '\n') {
        column++;
        continue;
      }
      line++;
      column = 0;
    }

    if (matchPos == text.Length) {
      // Nothing left to read.
      currentToken = "";
      return;
    }

    // Use the matched token, or fall back to the single character at
    // matchPos when no token pattern applies.
    Match tokenMatch = TokenPattern.Match(text, matchPos);
    currentToken = tokenMatch.Success ? tokenMatch.Value : text[matchPos].ToString();
    matchPos += currentToken.Length;

    SkipWhitespace();
  }
  /// <summary>
  /// Advances matchPos past whatever WhitespaceAndCommentPattern matches at
  /// the current position, leaving it at the start of the next token.
  /// Does nothing when the pattern does not match there.
  /// </summary>
  private void SkipWhitespace() {
    Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
    if (!skipped.Success) {
      return;
    }
    matchPos += skipped.Length;
  }
  /// <summary>
  /// Consumes the current token if it is exactly equal to
  /// <paramref name="token"/>.
  /// </summary>
  /// <param name="token">The token text to compare against.</param>
  /// <returns>
  /// true if the token matched and the tokenizer advanced; false if it did
  /// not match, in which case the tokenizer is left unchanged.
  /// </returns>
  public bool TryConsume(string token) {
    if (currentToken != token) {
      return false;
    }
    NextToken();
    return true;
  }
  /// <summary>
  /// Consumes the current token, which must be exactly equal to
  /// <paramref name="token"/>.
  /// </summary>
  /// <param name="token">The token text that is required next.</param>
  /// <exception cref="FormatException">
  /// Thrown, with the current line and column in the message, when the
  /// current token differs from <paramref name="token"/>.
  /// </exception>
  public void Consume(string token) {
    bool consumed = TryConsume(token);
    if (consumed) {
      return;
    }
    throw CreateFormatException("Expected \"" + token + "\".");
  }
  /// <summary>
  /// Reports whether the current token looks like an integer, judged only
  /// by its first character: an ASCII digit, '-' or '+'. The token is not
  /// consumed. An empty token (end of input) yields false.
  /// </summary>
  public bool LookingAtInteger() {
    if (currentToken.Length > 0) {
      char first = currentToken[0];
      switch (first) {
        case '-':
        case '+':
          return true;
        default:
          return first >= '0' && first <= '9';
      }
    }
    return false;
  }
  /// <summary>
  /// Consumes the current token as an identifier and returns its text.
  /// Every character must be an ASCII letter, an ASCII digit, '_' or '.'.
  /// Because only the characters present are checked, an empty current
  /// token (end of input) passes the check and is returned as "".
  /// </summary>
  /// <returns>The text of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown when the token contains any other character.
  /// </exception>
  public string ConsumeIdentifier() {
    for (int i = 0; i < currentToken.Length; i++) {
      char c = currentToken[i];
      bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
      bool isDigit = c >= '0' && c <= '9';
      if (!isLetter && !isDigit && c != '_' && c != '.') {
        throw CreateFormatException("Expected identifier.");
      }
    }

    string identifier = currentToken;
    NextToken();
    return identifier;
  }
  /// <summary>
  /// Parses the current token with TextFormat.ParseInt32, consumes it and
  /// returns the parsed value.
  /// </summary>
  /// <returns>The 32-bit signed value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when parsing reports a
  /// FormatException; the token is not consumed in that case.
  /// </exception>
  public int ConsumeInt32() {
    int value;
    try {
      value = TextFormat.ParseInt32(currentToken);
    } catch (FormatException parseFailure) {
      throw CreateIntegerParseException(parseFailure);
    }
    NextToken();
    return value;
  }
  /// <summary>
  /// Parses the current token with TextFormat.ParseUInt32, consumes it and
  /// returns the parsed value.
  /// </summary>
  /// <returns>The 32-bit unsigned value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when parsing reports a
  /// FormatException; the token is not consumed in that case.
  /// </exception>
  public uint ConsumeUInt32() {
    uint value;
    try {
      value = TextFormat.ParseUInt32(currentToken);
    } catch (FormatException parseFailure) {
      throw CreateIntegerParseException(parseFailure);
    }
    NextToken();
    return value;
  }
  /// <summary>
  /// Parses the current token with TextFormat.ParseInt64, consumes it and
  /// returns the parsed value.
  /// </summary>
  /// <returns>The 64-bit signed value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when parsing reports a
  /// FormatException; the token is not consumed in that case.
  /// </exception>
  public long ConsumeInt64() {
    long value;
    try {
      value = TextFormat.ParseInt64(currentToken);
    } catch (FormatException parseFailure) {
      throw CreateIntegerParseException(parseFailure);
    }
    NextToken();
    return value;
  }
  /// <summary>
  /// Parses the current token with TextFormat.ParseUInt64, consumes it and
  /// returns the parsed value.
  /// </summary>
  /// <returns>The 64-bit unsigned value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when parsing reports a
  /// FormatException; the token is not consumed in that case.
  /// </exception>
  public ulong ConsumeUInt64() {
    ulong value;
    try {
      value = TextFormat.ParseUInt64(currentToken);
    } catch (FormatException parseFailure) {
      throw CreateIntegerParseException(parseFailure);
    }
    NextToken();
    return value;
  }
  /// <summary>
  /// Parses the current token as a double using the invariant culture,
  /// consumes it and returns the parsed value.
  /// </summary>
  /// <returns>The double value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when double.Parse reports a
  /// FormatException or an OverflowException; the token is not consumed in
  /// that case.
  /// </exception>
  public double ConsumeDouble() {
    double value;
    try {
      value = double.Parse(currentToken, CultureInfo.InvariantCulture);
    } catch (FormatException badFormat) {
      throw CreateFloatParseException(badFormat);
    } catch (OverflowException outOfRange) {
      throw CreateFloatParseException(outOfRange);
    }
    NextToken();
    return value;
  }
  /// <summary>
  /// Parses the current token as a float using the invariant culture,
  /// consumes it and returns the parsed value.
  /// </summary>
  /// <returns>The float value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when float.Parse reports a
  /// FormatException or an OverflowException; the token is not consumed in
  /// that case.
  /// </exception>
  public float ConsumeFloat() {
    try {
      float result = float.Parse(currentToken, CultureInfo.InvariantCulture);
      NextToken();
      return result;
    } catch (FormatException e) {
      throw CreateFloatParseException(e);
    } catch (OverflowException e) {
      throw CreateFloatParseException(e);
    }
  }

  /// <summary>
  /// Original lower-camel-case spelling of <see cref="ConsumeFloat"/>,
  /// retained so that existing callers keep compiling. Behaves identically.
  /// </summary>
  /// <returns>The float value of the consumed token.</returns>
  /// <exception cref="FormatException">
  /// Thrown under the same conditions as <see cref="ConsumeFloat"/>.
  /// </exception>
  public float consumeFloat() {
    return ConsumeFloat();
  }
  /// <summary>
  /// Consumes the current token as a Boolean literal. Only the exact texts
  /// "true" and "false" are accepted.
  /// </summary>
  /// <returns>The value of the consumed literal.</returns>
  /// <exception cref="FormatException">
  /// Thrown when the current token is neither "true" nor "false"; the token
  /// is not consumed in that case.
  /// </exception>
  public bool ConsumeBoolean() {
    switch (currentToken) {
      case "true":
        NextToken();
        return true;
      case "false":
        NextToken();
        return false;
      default:
        throw CreateFormatException("Expected \"true\" or \"false\".");
    }
  }
  /// <summary>
  /// Consumes the current token as a quoted string and returns its
  /// unescaped content as text. This is ConsumeByteString() followed by
  /// ToStringUtf8() on the result.
  /// </summary>
  /// <returns>The unescaped string value.</returns>
  /// <exception cref="FormatException">
  /// Thrown under the same conditions as ConsumeByteString().
  /// </exception>
  public string ConsumeString() {
    ByteString unescaped = ConsumeByteString();
    return unescaped.ToStringUtf8();
  }
  /// <summary>
  /// Consumes the current token as a quoted string and returns the content
  /// between the quotes after passing it through TextFormat.UnescapeBytes.
  /// The token must start with a double or single quote and end with the
  /// same quote character.
  /// </summary>
  /// <returns>The unescaped content of the string token.</returns>
  /// <exception cref="FormatException">
  /// Thrown, with position information, when the token does not start with
  /// a quote, when the closing quote is missing, or when unescaping reports
  /// a FormatException. The token is not consumed in any of these cases.
  /// </exception>
  public ByteString ConsumeByteString() {
    int length = currentToken.Length;

    // '\0' stands in for "no first character" when the token is empty.
    char quote = '\0';
    if (length > 0) {
      quote = currentToken[0];
    }

    bool startsWithQuote = quote == '"' || quote == '\'';
    if (!startsWithQuote) {
      throw CreateFormatException("Expected string.");
    }

    // A lone quote character is both the first and the last character, so
    // at least two characters are required for a terminated string.
    bool terminated = length >= 2 && currentToken[length - 1] == quote;
    if (!terminated) {
      throw CreateFormatException("String missing ending quote.");
    }

    ByteString unescaped;
    try {
      unescaped = TextFormat.UnescapeBytes(currentToken.Substring(1, length - 2));
    } catch (FormatException unescapeFailure) {
      throw CreateFormatException(unescapeFailure.Message);
    }
    NextToken();
    return unescaped;
  }
  /// <summary>
  /// Builds (but does not throw) a FormatException whose message is
  /// "line:column: description" for the current token.
  /// </summary>
  /// <param name="description">The text to append after the position.</param>
  /// <returns>The exception, ready to be thrown by the caller.</returns>
  public FormatException CreateFormatException(string description) {
    // The counters are zero-based; report one-based positions to the user.
    int displayLine = line + 1;
    int displayColumn = column + 1;
    string message = displayLine + ":" + displayColumn + ": " + description;
    return new FormatException(message);
  }
  /// <summary>
  /// Builds (but does not throw) a FormatException whose message is
  /// "line:column: description" for the token before the current one.
  /// </summary>
  /// <param name="description">The text to append after the position.</param>
  /// <returns>The exception, ready to be thrown by the caller.</returns>
  public FormatException CreateFormatExceptionPreviousToken(string description) {
    // The counters are zero-based; report one-based positions to the user.
    int displayLine = previousLine + 1;
    int displayColumn = previousColumn + 1;
    string message = displayLine + ":" + displayColumn + ": " + description;
    return new FormatException(message);
  }
  /// <summary>
  /// Wraps the message of an integer-parsing failure in a FormatException
  /// that carries the current token's position.
  /// </summary>
  /// <param name="e">The failure reported while parsing the integer.</param>
  /// <returns>The exception, ready to be thrown by the caller.</returns>
  private FormatException CreateIntegerParseException(FormatException e) {
    string description = "Couldn't parse integer: " + e.Message;
    return CreateFormatException(description);
  }
  /// <summary>
  /// Wraps the message of a float or double parsing failure in a
  /// FormatException that carries the current token's position.
  /// </summary>
  /// <param name="e">The failure reported while parsing the number.</param>
  /// <returns>The exception, ready to be thrown by the caller.</returns>
  private FormatException CreateFloatParseException(Exception e) {
    string description = "Couldn't parse number: " + e.Message;
    return CreateFormatException(description);
  }
  321. }
  322. }