generate_unicode_test.c 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. /*
  2. * Generates a Unicode test for xxhsum without using Unicode in the source files.
  3. *
  4. * Copyright (C) 2020 Devin Hussey (easyaspi314)
  5. *
  6. * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions are
  10. * met:
  11. *
  12. * * Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * * Redistributions in binary form must reproduce the above
  15. * copyright notice, this list of conditions and the following disclaimer
  16. * in the documentation and/or other materials provided with the
  17. * distribution.
  18. *
  19. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30. */
  /*
   * Certain terminals don't properly handle UTF-8 (e.g. rxvt, and the Windows
   * command prompt in its default codepage), and that can cause issues when
   * editing text.
   *
   * We use this C file to generate a file with a Unicode filename, a file with
   * a checksum of said file, and both a Windows batch script and a Unix shell
   * script to test the file.
   */
  39. #define _CRT_SECURE_NO_WARNINGS /* Silence warnings on MSVC */
  40. #include <stdio.h>
  /*
   * The test filename, encoded as UTF-8: five Katakana characters reading
   * "yuniko-do" ("unicode") followed by the ASCII extension ".unicode".
   * Japanese text is used because it cannot be represented in an ANSI codepage.
   *
   * Every non-ASCII byte is spelled as a hex escape so that this source file
   * stays pure ASCII (and C90 compliant) while the output is well-formed UTF-8.
   * Each character sits in its own literal so that no hex escape can absorb
   * the characters that follow it.
   */
  static const char FILENAME[] =
      "\xe3\x83\xa6" /* U+30e6: Katakana letter yu */
      "\xe3\x83\x8b" /* U+30cb: Katakana letter ni */
      "\xe3\x82\xb3" /* U+30b3: Katakana letter ko */
      "\xe3\x83\xbc" /* U+30fc: Katakana-Hiragana prolonged sound mark (dash) */
      "\xe3\x83\x89" /* U+30c9: Katakana letter do */
      ".unicode";    /* Extension, so we can glob in make clean and .gitignore */
  #ifdef _WIN32
  /*
   * The same filename as the UTF-8 FILENAME, expressed as UTF-16 code units
   * in a wchar_t array. Only compiled on Windows.
   */
  static const wchar_t WFILENAME[] = {
      0x30e6, /* U+30e6: Katakana letter yu */
      0x30cb, /* U+30cb: Katakana letter ni */
      0x30b3, /* U+30b3: Katakana letter ko */
      0x30fc, /* U+30fc: Katakana-Hiragana prolonged sound mark (dash) */
      0x30c9, /* U+30c9: Katakana letter do */
      L'.', L'u', L'n', L'i', L'c', L'o', L'd', L'e', /* ".unicode" */
      L'\0'
  };
  #endif
  56. int main(void)
  57. {
  58. FILE *f, *script, *checksum;
  59. /* Create our Unicode file. Use _wfopen on Windows as fopen doesn't support Unicode filenames. */
  60. #ifdef _WIN32
  61. if (!(f = _wfopen(WFILENAME, L"wb"))) return 1;
  62. #else
  63. if (!(f = fopen(FILENAME, "wb"))) return 1;
  64. #endif
  65. fprintf(f, "test\n");
  66. fclose(f);
  67. /* XXH64 checksum file with the precalculated checksum for said file. */
  68. if (!(checksum = fopen("unicode_test.xxh64", "wb")))
  69. return 1;
  70. fprintf(checksum, "2d7f1808da1fa63c %s\n", FILENAME);
  71. fclose(checksum);
  72. /* Create two scripts for both Windows and Unix. */
  73. /* Generate a Windows batch script. Always insert CRLF manually. */
  74. if (!(script = fopen("unicode_test.bat", "wb")))
  75. return 1;
  76. /* Disable echoing the commands. We do that ourselves the naive way. */
  77. fprintf(script, "@echo off\r\n");
  78. /* Change to codepage 65001 to enable UTF-8 support. */
  79. fprintf(script, "chcp 65001 >NUL 2>&1\r\n");
  80. /* First test a Unicode filename */
  81. fprintf(script, "echo Testing filename provided on command line...\r\n");
  82. fprintf(script, "echo xxhsum.exe \"%s\"\r\n", FILENAME);
  83. fprintf(script, "xxhsum.exe \"%s\"\r\n", FILENAME);
  84. /* Bail on error */
  85. fprintf(script, "if %%ERRORLEVEL%% neq 0 (\r\n");
  86. fprintf(script, " exit /B %%ERRORLEVEL%%\r\n");
  87. fprintf(script, ")\r\n");
  88. /* Then test a checksum file. */
  89. fprintf(script, "echo Testing a checksum file...\r\n");
  90. fprintf(script, "echo xxhsum.exe -c unicode_test.xxh64\r\n");
  91. fprintf(script, "xxhsum.exe -c unicode_test.xxh64\r\n");
  92. fprintf(script, "exit /B %%ERRORLEVEL%%\r\n");
  93. fclose(script);
  94. /* Generate a Unix shell script */
  95. if (!(script = fopen("unicode_test.sh", "wb")))
  96. return 1;
  97. fprintf(script, "#!/bin/sh\n");
  98. /*
  99. * Some versions of MSYS, MinGW and Cygwin do not support UTF-8, and the ones that
  100. * don't may error with something like this:
  101. *
  102. * Error: Could not open '<mojibake>.unicode': No such file or directory.
  103. *
  104. * which is an internal error that happens when it tries to convert MinGW/Cygwin
  105. * paths to Windows paths.
  106. *
  107. * In that case, we bail to cmd.exe and the batch script, which supports UTF-8
  108. * on Windows 7 and later.
  109. */
  110. fprintf(script, "case $(uname) in\n");
  111. /* MinGW/MSYS converts /c to C:\ unless you have a double slash,
  112. * Cygwin does not. */
  113. fprintf(script, " *CYGWIN*)\n");
  114. fprintf(script, " exec cmd.exe /c unicode_test.bat\n");
  115. fprintf(script, " ;;\n");
  116. fprintf(script, " *MINGW*|*MSYS*)\n");
  117. fprintf(script, " exec cmd.exe //c unicode_test.bat\n");
  118. fprintf(script, " ;;\n");
  119. fprintf(script, "esac\n");
  120. /* First test a Unicode filename */
  121. fprintf(script, "echo Testing filename provided on command line...\n");
  122. fprintf(script, "echo './xxhsum \"%s\" || exit $?'\n", FILENAME);
  123. fprintf(script, "./xxhsum \"%s\" || exit $?\n", FILENAME);
  124. /* Then test a checksum file. */
  125. fprintf(script, "echo Testing a checksum file...\n");
  126. fprintf(script, "echo './xxhsum -c unicode_test.xxh64 || exit $?'\n");
  127. fprintf(script, "./xxhsum -c unicode_test.xxh64 || exit $?\n");
  128. fclose(script);
  129. return 0;
  130. }