aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorchriseth <chris@ethereum.org>2017-06-27 20:38:03 +0800
committerGitHub <noreply@github.com>2017-06-27 20:38:03 +0800
commit36044c8c95890bfc25a199510e32a0481e8082d0 (patch)
treeb39a53434983fe7a89051697872b646101d725fe
parentbc31d4969ccdea8804f573bcf5104c154df9aff6 (diff)
parente715dd0b7e382b71abf50c974f943423048d138e (diff)
downloaddexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.gz
dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.tar.zst
dexon-solidity-36044c8c95890bfc25a199510e32a0481e8082d0.zip
Merge pull request #2413 from ethereum/utf8-strict-parser
Implement strict UTF-8 validation
-rw-r--r--Changelog.md2
-rw-r--r--libdevcore/UTF8.cpp84
-rw-r--r--test/libdevcore/UTF8.cpp216
3 files changed, 291 insertions, 11 deletions
diff --git a/Changelog.md b/Changelog.md
index 6d9fe477..3d8701ca 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -14,6 +14,7 @@ Features:
* Type Checker: Warn about copies in storage that might overwrite unexpectedly.
* Code Generator: Added the Whiskers template system.
* Remove obsolete Why3 output.
+ * Type Checker: Enforce strict UTF-8 validation.
Bugfixes:
* Code generator: Use ``REVERT`` instead of ``INVALID`` for generated input validation routines.
@@ -22,6 +23,7 @@ Bugfixes:
* Type Checker: Make UTF8-validation a bit more sloppy to include more valid sequences.
* Type Checker: Disallow comparisons between mapping and non-internal function types.
* Type Checker: Do not treat strings that look like addresses as addresses.
+ * Type Checker: Support valid, but incorrectly rejected UTF-8 sequences.
* Fixed crash concerning non-callable types.
* Unused variable warnings no longer issued for variables used inside inline assembly.
* Code Generator: Fix ABI encoding of empty literal string.
diff --git a/libdevcore/UTF8.cpp b/libdevcore/UTF8.cpp
index 449ccc5d..2ae720ec 100644
--- a/libdevcore/UTF8.cpp
+++ b/libdevcore/UTF8.cpp
@@ -27,25 +27,74 @@
namespace dev
{
+namespace
+{
-bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
+/// Validate the first two bytes of a multi-byte sequence against Unicode chapter 3 Table 3-7: byte1 is the lead byte, byte2 the byte that follows it; returns true if the pair is allowed. Only the Table 3-7 restriction on byte2 is tested here; presumably the caller has already checked that byte2 is a continuation byte (that check is not visible in this hunk).
+bool isWellFormed(unsigned char byte1, unsigned char byte2)
+{
+ if (byte1 == 0xc0 || byte1 == 0xc1) // rejected regardless of byte2
+ return false;
+ else if (byte1 >= 0xc2 && byte1 <= 0xdf) // two-byte lead bytes: no extra restriction on byte2
+ return true;
+ else if (byte1 == 0xe0)
+ {
+ if (byte2 < 0xa0) // e0 80..9f would be an overlong form of a code point below U+0800
+ return false;
+ else
+ return true;
+ }
+ else if (byte1 >= 0xe1 && byte1 <= 0xec) // no extra restriction on byte2
+ return true;
+ else if (byte1 == 0xed)
+ {
+ if (byte2 > 0x9f) // ed a0..bf would encode the surrogate range U+D800..U+DFFF
+ return false;
+ else
+ return true;
+ }
+ else if (byte1 == 0xee || byte1 == 0xef) // no extra restriction on byte2
+ return true;
+ else if (byte1 == 0xf0)
+ {
+ if (byte2 < 0x90) // f0 80..8f would be an overlong form of a code point below U+10000
+ return false;
+ else
+ return true;
+ }
+ else if (byte1 >= 0xf1 && byte1 <= 0xf3) // no extra restriction on byte2
+ return true;
+ else if (byte1 == 0xf4)
+ {
+ if (byte2 > 0x8f) // f4 90..bf would encode a code point above U+10FFFF
+ return false;
+ else
+ return true;
+ }
+ /// Lead bytes 0xf5 .. 0xf7 are disallowed by Table 3-7.
+ /// Anything below 0xc0 or above 0xf7 also ends up here, although such
+ /// bytes cannot start a multi-byte sequence according to Table 3-6 anyway.
+ return false;
+}
+
+bool validateUTF8(const unsigned char *_input, size_t _length, size_t& _invalidPosition)
{
- const size_t length = _input.length();
bool valid = true;
size_t i = 0;
- for (; i < length; i++)
+ for (; i < _length; i++)
{
- if ((unsigned char)_input[i] < 0x80)
+ // Check for Unicode Chapter 3 Table 3-6 conformity.
+ if (_input[i] < 0x80)
continue;
size_t count = 0;
- switch(_input[i] & 0xf0) {
- case 0xc0: count = 1; break;
- case 0xe0: count = 2; break;
- case 0xf0: count = 3; break;
- default: break;
- }
+ if (_input[i] >= 0xc0 && _input[i] <= 0xdf)
+ count = 1;
+ else if (_input[i] >= 0xe0 && _input[i] <= 0xef)
+ count = 2;
+ else if (_input[i] >= 0xf0 && _input[i] <= 0xf7)
+ count = 3;
if (count == 0)
{
@@ -53,7 +102,7 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
break;
}
- if ((i + count) >= length)
+ if ((i + count) >= _length)
{
valid = false;
break;
@@ -67,6 +116,13 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
valid = false;
break;
}
+
+ // Check for Unicode Chapter 3 Table 3-7 conformity.
+ if ((j == 0) && !isWellFormed(_input[i - 1], _input[i]))
+ {
+ valid = false;
+ break;
+ }
}
}
@@ -77,5 +133,11 @@ bool validateUTF8(std::string const& _input, size_t& _invalidPosition)
return false;
}
+}
+
+bool validateUTF8(std::string const& _input, size_t& _invalidPosition) // std::string entry point: forwards the string's bytes and length to the pointer-based overload and returns its result; _invalidPosition is passed through by reference.
+{
+ return validateUTF8(reinterpret_cast<unsigned char const*>(_input.c_str()), _input.length(), _invalidPosition);
+}
}
diff --git a/test/libdevcore/UTF8.cpp b/test/libdevcore/UTF8.cpp
new file mode 100644
index 00000000..719ada72
--- /dev/null
+++ b/test/libdevcore/UTF8.cpp
@@ -0,0 +1,216 @@
+/*
+ This file is part of solidity.
+
+ solidity is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ solidity is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with solidity. If not, see <http://www.gnu.org/licenses/>.
+*/
+/**
+ * Unit tests for UTF-8 validation.
+ */
+
+#include <libdevcore/CommonData.h>
+#include <libdevcore/UTF8.h>
+
+#include "../TestHelper.h"
+
+using namespace std;
+
+namespace dev
+{
+namespace test
+{
+
+BOOST_AUTO_TEST_SUITE(UTF8)
+
+namespace {
+
+bool isValidUTF8(string const& _value) // Test helper: _value is hex text (e.g. "c281"); returns whether validateUTF8 accepts the result of asString(fromHex(_value)). Presumably fromHex/asString turn the hex text into raw bytes - they are declared outside this diff.
+{
+ size_t pos; // receives the invalid position; ignored by this helper
+ return validateUTF8(asString(fromHex(_value)), pos);
+}
+
+bool isInvalidUTF8(string const& _value, size_t _expectedPos) // Test helper: returns true only if validateUTF8 rejects the input built from the hex text _value AND the position it reports equals _expectedPos.
+{
+ size_t pos;
+ if (validateUTF8(asString(fromHex(_value)), pos)) // input was accepted, so it is not invalid
+ return false;
+ if (pos != _expectedPos) // rejected, but at a different position than expected
+ return false;
+ return true;
+}
+
+}
+
+BOOST_AUTO_TEST_CASE(valid) // Hex-encoded byte sequences that validateUTF8 must accept: ASCII plus at least one sample for every lead-byte range that isWellFormed distinguishes.
+{
+ BOOST_CHECK(isValidUTF8("00")); // single-byte (ASCII) range
+ BOOST_CHECK(isValidUTF8("20"));
+ BOOST_CHECK(isValidUTF8("7f"));
+ BOOST_CHECK(isValidUTF8("c281")); // two-byte lead bytes c2..df
+ BOOST_CHECK(isValidUTF8("df81"));
+ BOOST_CHECK(isValidUTF8("e0a081")); // e0 with the smallest allowed second byte
+ BOOST_CHECK(isValidUTF8("e18081")); // three-byte lead bytes e1..ec
+ BOOST_CHECK(isValidUTF8("ec8081"));
+ BOOST_CHECK(isValidUTF8("ed8081")); // ed with a second byte <= 9f
+ BOOST_CHECK(isValidUTF8("ee8081"));
+ BOOST_CHECK(isValidUTF8("ef8081"));
+ BOOST_CHECK(isValidUTF8("f0908081")); // f0 with the smallest allowed second byte
+ BOOST_CHECK(isValidUTF8("f1808081")); // lower end of the f1..f3 lead-byte range
+ BOOST_CHECK(isValidUTF8("f2808081"));
+ BOOST_CHECK(isValidUTF8("f3808081")); // upper end of the f1..f3 lead-byte range
+ BOOST_CHECK(isValidUTF8("f48e8081")); // f4 with a second byte <= 8f
+}
+
+BOOST_AUTO_TEST_CASE(invalid) // Hex-encoded byte sequences that validateUTF8 must reject, together with the invalid position it is expected to report.
+{
+ // bytes 0x80 .. 0xbf are not allowed as the first byte of a sequence
+ BOOST_CHECK(isInvalidUTF8("80", 0)); // invalid per table 3.6
+ BOOST_CHECK(isInvalidUTF8("a0", 0)); // invalid per table 3.6
+ BOOST_CHECK(isInvalidUTF8("c0", 0)); // invalid per table 3.7 (as a lone byte it is also too short)
+ BOOST_CHECK(isInvalidUTF8("c1", 0)); // invalid per table 3.7 (as a lone byte it is also too short)
+ BOOST_CHECK(isInvalidUTF8("c2", 0)); // too short (position is reported as the first byte)
+ BOOST_CHECK(isInvalidUTF8("e08081", 2)); // e0 must be followed by >= a0; NOTE(review): expected position is 2 although the offending byte is at index 1 - confirm this is intended
+ BOOST_CHECK(isInvalidUTF8("e180", 0)); // too short
+ BOOST_CHECK(isInvalidUTF8("ec80", 0)); // too short
+ BOOST_CHECK(isInvalidUTF8("f08f8001", 2)); // f0 must be followed by >= 90
+ BOOST_CHECK(isInvalidUTF8("f18080", 0)); // too short
+ BOOST_CHECK(isInvalidUTF8("f4908081", 2)); // f4 must be followed by < 90
+ // lead bytes above 0xf7 are disallowed
+ BOOST_CHECK(isInvalidUTF8("f8", 0)); // invalid per table 3.6 (not a recognised lead byte)
+ BOOST_CHECK(isInvalidUTF8("f9", 0)); // invalid per table 3.6 (not a recognised lead byte)
+}
+
+BOOST_AUTO_TEST_CASE(corpus)
+{
+ string source = R"(
+κόσμε
+
+hélló
+
+Ā ā Ă ă Ą ą
+
+ƀ Ɓ Ƃ ƃ Ƅ ƅ
+
+ɐ ɑ ɒ ɓ ɔ ɕ
+
+ʰ ʱ ʲ ʳ ʴ ʵ
+
+̀ ́ ̂ ̃ ̄ ̅
+
+ϩ Ϫ ϫ Ϭ ϭ Ϯ
+
+Ё Ђ Ѓ Є Ѕ І
+
+Ա Բ Գ Դ Ե Զ
+
+ ק ר ש ת װ ױ
+
+ځ ڂ ڃ ڄ څ چ
+
+ऑ ऒ ओ औ क ख
+
+ও ঔ ক খ গ ঘ
+
+ਘ ਙ ਚ ਛ ਜ ਝ
+
+ઓ ઔ ક ખ ગ ઘ
+
+ଗ ଘ ଙ ଚ ଛ ଜ
+
+ஔ க ங ச ஜ ஞ
+
+ఎ ఏ ఐ ఒ ఓ ఔ
+
+ಓ ಔ ಕ ಖ ಗ ಘ
+
+ഐ ഒ ഓ ഔ ക
+
+ฒ ณ ด ต ถ ท
+
+ມ ຢ ຣ ລ ວ ສ
+
+༄ ༅ ༆ ༇ ༈ ༉
+
+Ⴑ Ⴒ Ⴓ Ⴔ Ⴕ Ⴖ
+
+ᄌ ᄍ ᄎ ᄏ ᄐ
+
+Ḕ ḕ Ḗ ḗ Ḙ ḙ Ḛ
+
+ἐ ἑ ἒ ἓ ἔ ἕ
+
+₠ ₡ ₢ ₣ ₤ ₥
+
+⃐ ⃑ ⃒ ⃓ ⃔ ⃕ ⃖ ⃗ ⃘ ⃙ ⃚
+
+ℋ ℌ ℍ ℎ ℏ ℐ ℑ
+
+⅓ ⅔ ⅕ ⅖ ⅗
+
+∬ ∭ ∮ ∯ ∰
+
+⌖ ⌗ ⌘ ⌙ ⌚ ⌛
+
+␀ ␁ ␂ ␃ ␄ ␅
+
+⑀ ⑁ ⑂ ⑃ ⑄
+
+① ② ③ ④ ⑤
+
+╘ ╙ ╚ ╛ ╜ ╝
+
+▁ ▂ ▃ ▄ ▅ ▆
+
+▤ ▥ ▦ ▧ ▨
+
+♔ ♕ ♖ ♗ ♘ ♙
+
+✈ ✉ ✌ ✍ ✎
+
+ぁ あ ぃ い ぅ
+
+ァ ア ィ イ ゥ
+
+ㄅ ㄆ ㄇ ㄈ ㄉ
+
+ㄱ ㄲ ㄳ ㄴ ㄵ
+
+㆚ ㆛ ㆜ ㆝ ㆞
+
+㈀ ㈁ ㈂ ㈃ ㈄
+
+㌀ ㌁ ㌂ ㌃ ㌄
+
+乺 乻 乼 乽 乾
+
+걺 걻 걼 걽 걾
+
+豈 更 車 賈 滑
+
+שּׁ שּׂ אַ אָ אּ
+
+ﮄ ﮅ ﮆ ﮇ ﮈ ﮉ
+
+ ﺵ ﺶ ﺷ ﺸ
+
+「 」 、 ・ ヲ ァ ィ ゥ
+ )";
+ size_t pos;
+ BOOST_CHECK(validateUTF8(source, pos));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+}
+}