Reject invalid UTF-8 and weird characters in text files

Reject "weird" characters in text files, especially control characters that
might be escape sequences or that might cause other text to appear garbled
(as in https://trojansource.codes/).

Also reject byte sequences that aren't valid UTF-8.

Accept only ASCII (except most control characters), letters, some non-ASCII
punctuation and some mathematical and technical symbols. This covers
everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥).

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
This commit is contained in:
Gilles Peskine 2023-01-05 20:28:57 +01:00
parent b389743ace
commit d11bb47fe0

View file

@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker):
self.files_with_issues[filepath] = None
class UnicodeIssueTracker(LineIssueTracker):
"""Track lines with invalid characters or invalid text encoding."""
heading = "Invalid UTF-8 or forbidden character:"
# Only allow valid UTF-8, and only white-listed characters.
# We deliberately exclude all characters that aren't a simple non-blank,
# non-zero-width glyph, apart from a very small set (tab, ordinary space,
# line breaks, "basic" no-break space and soft hyphen). In particular,
# non-ASCII control characters, combinig characters, and Unicode state
# changes (e.g. right-to-left text) are forbidden.
# Note that we do allow some characters with a risk of visual confusion,
# for example '-' (U+002D HYPHEN-MINUS) vs '­' (U+00AD SOFT HYPHEN) vs
# '' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
# 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
GOOD_CHARACTERS = ''.join([
'\t\n\r -~', # ASCII (tabs and line endings are checked separately)
'\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
'\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
'\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
'\u2190-\u21FF', # Arrows
'\u2200-\u22FF', # Mathematical Symbols
])
# Allow any of the characters and ranges above, and anything classified
# as a word constituent.
GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))
def issue_with_line(self, line, _filepath, line_number):
try:
text = line.decode('utf-8')
except UnicodeDecodeError:
return True
if line_number == 1 and text.startswith('\uFEFF'):
# Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
# Which files are allowed to have a BOM is handled in
# Utf8BomIssueTracker.
text = text[1:]
return not self.GOOD_CHARACTERS_RE.match(text)
class UnixLineEndingIssueTracker(LineIssueTracker):
"""Track files with non-Unix line endings (i.e. files with CR)."""
@ -350,6 +389,7 @@ class IntegrityChecker:
ShebangIssueTracker(),
EndOfFileNewlineIssueTracker(),
Utf8BomIssueTracker(),
UnicodeIssueTracker(),
UnixLineEndingIssueTracker(),
WindowsLineEndingIssueTracker(),
TrailingWhitespaceIssueTracker(),