This fixes several bugs with regard to line breaks and comments:

- any unicode line break (line feed, vertical tab, form feed, carriage return, NEL, LS and PS) is considered to terminate a single-line comment. The line break itself is considered to be the next token after the comment, leading to a parser error if it is not an ASCII character (i.e. for NEL, LS and PS).
- unterminated multiline comments are considered illegal tokens
- '/** /' is considered an unterminated multiline comment (previously, whitespace was allowed before the last '/')
author: chriseth <chris@ethereum.org> 2018-09-06 17:05:35 +0800
committer: chriseth <chris@ethereum.org> 2018-09-06 22:42:59 +0800
commit: 0b7b8162cab67d58915d4561a52d70e7208233c1 (patch)
tree: 7d5a9c0447d0c6654219b85c2375adae6751e002 /libsolidity/parsing
parent: 977ac9c390d034232afdec195ffa069b6a1df21b (diff)
download: dexon-solidity-0b7b8162cab67d58915d4561a52d70e7208233c1.tar.gz
dexon-solidity-0b7b8162cab67d58915d4561a52d70e7208233c1.tar.zst
dexon-solidity-0b7b8162cab67d58915d4561a52d70e7208233c1.zip
2 files changed, 58 insertions, 28 deletions
diff --git a/libsolidity/parsing/Scanner.cpp b/libsolidity/parsing/Scanner.cpp
index a334846f..c9d5b969 100644
--- a/libsolidity/parsing/Scanner.cpp
+++ b/libsolidity/parsing/Scanner.cpp
@@ -243,22 +243,17 @@ bool Scanner::skipWhitespace()
 	return sourcePos() != startPosition;
 }
 
-bool Scanner::skipWhitespaceExceptLF()
+void Scanner::skipWhitespaceExceptUnicodeLinebreak()
 {
-	int const startPosition = sourcePos();
-	while (isWhiteSpace(m_char) && !isLineTerminator(m_char))
+	while (isWhiteSpace(m_char) && !isUnicodeLinebreak())
 		advance();
-	// Return whether or not we skipped any characters.
-	return sourcePos() != startPosition;
 }
 
 Token::Value Scanner::skipSingleLineComment()
 {
-	// The line terminator at the end of the line is not considered
-	// to be part of the single-line comment; it is recognized
-	// separately by the lexical grammar and becomes part of the
-	// stream of input elements for the syntactic grammar
-	while (!isLineTerminator(m_char))
+	// Line terminator is not part of the comment. If it is a
+	// non-ascii line terminator, it will result in a parser error.
+	while (!isUnicodeLinebreak())
 		if (!advance()) break;
 
 	return Token::Whitespace;
@@ -268,7 +263,9 @@ Token::Value Scanner::scanSingleLineDocComment()
 {
 	LiteralScope literal(this, LITERAL_TYPE_COMMENT);
 	advance(); //consume the last '/' at ///
-	skipWhitespaceExceptLF();
+
+	skipWhitespaceExceptUnicodeLinebreak();
+
 	while (!isSourcePastEndOfInput())
 	{
 		if (isLineTerminator(m_char))
@@ -287,6 +284,10 @@ Token::Value Scanner::scanSingleLineDocComment()
 				break; // next line is not a documentation comment, we are done
 
 		}
+		else if (isUnicodeLinebreak())
+			// Any line terminator that is not '\n' is considered to end the
+			// comment.
+			break;
 		addCommentLiteralChar(m_char);
 		advance();
 	}
@@ -321,6 +322,9 @@ Token::Value Scanner::scanMultiLineDocComment()
 	bool endFound = false;
 	bool charsAdded = false;
 
+	while (isWhiteSpace(m_char) && !isLineTerminator(m_char))
+		advance();
+
 	while (!isSourcePastEndOfInput())
 	{
 		//handle newlines in multline comments
@@ -372,7 +376,7 @@ Token::Value Scanner::scanSlash()
 	if (m_char == '/')
 	{
 		if (!advance()) /* double slash comment directly before EOS */
-			return  Token::Whitespace;
+			return Token::Whitespace;
 		else if (m_char == '/')
 		{
 			// doxygen style /// comment
@@ -390,24 +394,27 @@ Token::Value Scanner::scanSlash()
 	{
 		// doxygen style /** natspec comment
 		if (!advance()) /* slash star comment before EOS */
-			return Token::Whitespace;
+			return Token::Illegal;
 		else if (m_char == '*')
 		{
 			advance(); //consume the last '*' at /**
-			skipWhitespaceExceptLF();
 
-			// special case of a closed normal multiline comment
-			if (!m_source.isPastEndOfInput() && m_source.get(0) == '/')
-				advance(); //skip the closing slash
-			else // we actually have a multiline documentation comment
+			// "/**/"
+			if (m_char == '/')
 			{
-				Token::Value comment;
-				m_nextSkippedComment.location.start = firstSlashPosition;
-				comment = scanMultiLineDocComment();
-				m_nextSkippedComment.location.end = sourcePos();
-				m_nextSkippedComment.token = comment;
+				advance(); //skip the closing slash
+				return Token::Whitespace;
 			}
-			return Token::Whitespace;
+			// we actually have a multiline documentation comment
+			Token::Value comment;
+			m_nextSkippedComment.location.start = firstSlashPosition;
+			comment = scanMultiLineDocComment();
+			m_nextSkippedComment.location.end = sourcePos();
+			m_nextSkippedComment.token = comment;
+			if (comment == Token::Illegal)
+				return Token::Illegal;
+			else
+				return Token::Whitespace;
 		}
 		else
 			return skipMultiLineComment();
@@ -670,18 +677,38 @@ bool Scanner::scanEscape()
 		if (!scanHexByte(c))
 			return false;
 		break;
+	default:
+		return false;
 	}
 
 	addLiteralChar(c);
 	return true;
 }
 
+bool Scanner::isUnicodeLinebreak()
+{
+	if (0x0a <= m_char && m_char <= 0x0d)
+		// line feed, vertical tab, form feed, carriage return
+		return true;
+	else if (!m_source.isPastEndOfInput(1) && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85)
+		// NEL - U+0085, C2 85 in utf8
+		return true;
+	else if (!m_source.isPastEndOfInput(2) && uint8_t(m_source.get(0)) == 0xe2 && uint8_t(m_source.get(1)) == 0x80 && (
+		uint8_t(m_source.get(2)) == 0xa8 || uint8_t(m_source.get(2)) == 0xa9
+	))
+		// LS - U+2028, E2 80 A8  in utf8
+		// PS - U+2029, E2 80 A9  in utf8
+		return true;
+	else
+		return false;
+}
+
 Token::Value Scanner::scanString()
 {
 	char const quote = m_char;
 	advance();  // consume quote
 	LiteralScope literal(this, LITERAL_TYPE_STRING);
-	while (m_char != quote && !isSourcePastEndOfInput() && !isLineTerminator(m_char))
+	while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
 	{
 		char c = m_char;
 		advance();
@@ -705,7 +732,7 @@ Token::Value Scanner::scanHexString()
 	char const quote = m_char;
 	advance();  // consume quote
 	LiteralScope literal(this, LITERAL_TYPE_STRING);
-	while (m_char != quote && !isSourcePastEndOfInput() && !isLineTerminator(m_char))
+	while (m_char != quote && !isSourcePastEndOfInput())
 	{
 		char c = m_char;
 		if (!scanHexByte(c))
diff --git a/libsolidity/parsing/Scanner.h b/libsolidity/parsing/Scanner.h
index 8a3011eb..7564c788 100644
--- a/libsolidity/parsing/Scanner.h
+++ b/libsolidity/parsing/Scanner.h
@@ -197,8 +197,8 @@ private:
 
 	/// Skips all whitespace and @returns true if something was skipped.
 	bool skipWhitespace();
-	/// Skips all whitespace except Line feeds and returns true if something was skipped
-	bool skipWhitespaceExceptLF();
+	/// Skips all whitespace that are neither '\r' nor '\n'.
+	void skipWhitespaceExceptUnicodeLinebreak();
 	Token::Value skipSingleLineComment();
 	Token::Value skipMultiLineComment();
 
@@ -218,6 +218,9 @@ private:
 	/// is scanned.
 	bool scanEscape();
 
+	/// @returns true iff we are currently positioned at a unicode line break.
+	bool isUnicodeLinebreak();
+
 	/// Return the current source position.
 	int sourcePos() const { return m_source.position(); }
 	bool isSourcePastEndOfInput() const { return m_source.isPastEndOfInput(); }
author	chriseth <chris@ethereum.org>	2018-09-06 17:05:35 +0800
committer	chriseth <chris@ethereum.org>	2018-09-06 22:42:59 +0800
commit	0b7b8162cab67d58915d4561a52d70e7208233c1 (patch)
tree	7d5a9c0447d0c6654219b85c2375adae6751e002 /libsolidity/parsing
parent	977ac9c390d034232afdec195ffa069b6a1df21b (diff)
download	dexon-solidity-0b7b8162cab67d58915d4561a52d70e7208233c1.tar.gz dexon-solidity-0b7b8162cab67d58915d4561a52d70e7208233c1.tar.zst dexon-solidity-0b7b8162cab67d58915d4561a52d70e7208233c1.zip