Reject invalid UTF-8 and weird characters in text files

Reject "weird" characters in text files, especially control characters that might be escape sequences or that might cause other text to appear garbled (as in https://trojansource.codes/). Also reject byte sequences that aren't valid UTF-8. Accept only ASCII (except most control characters), letters, some non-ASCII punctuation and some mathematical and technical symbols. This covers everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥). Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
2023-01-05 20:28:57 +01:00 · 2023-01-05 20:28:57 +01:00 · d11bb47fe0
commit d11bb47fe0
parent b389743ace
1 changed files with 40 additions and 0 deletions
--- a/tests/scripts/check_files.py
+++ b/tests/scripts/check_files.py
@ -263,6 +263,45 @@ class Utf8BomIssueTracker(FileIssueTracker):
                self.files_with_issues[filepath] = None


+class UnicodeIssueTracker(LineIssueTracker):
+    """Track lines with invalid characters or invalid text encoding."""
+
+    heading = "Invalid UTF-8 or forbidden character:"
+
+    # Only allow valid UTF-8, and only white-listed characters.
+    # We deliberately exclude all characters that aren't a simple non-blank,
+    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
+    # line breaks, "basic" no-break space and soft hyphen). In particular,
+    # non-ASCII control characters, combinig characters, and Unicode state
+    # changes (e.g. right-to-left text) are forbidden.
+    # Note that we do allow some characters with a risk of visual confusion,
+    # for example '-' (U+002D HYPHEN-MINUS) vs '' (U+00AD SOFT HYPHEN) vs
+    # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
+    # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
+    GOOD_CHARACTERS = ''.join([
+        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
+        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
+        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
+        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
+        '\u2190-\u21FF', # Arrows
+        '\u2200-\u22FF', # Mathematical Symbols
+    ])
+    # Allow any of the characters and ranges above, and anything classified
+    # as a word constituent.
+    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))
+
+    def issue_with_line(self, line, _filepath, line_number):
+        try:
+            text = line.decode('utf-8')
+        except UnicodeDecodeError:
+            return True
+        if line_number == 1 and text.startswith('\uFEFF'):
+            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
+            # Which files are allowed to have a BOM is handled in
+            # Utf8BomIssueTracker.
+            text = text[1:]
+        return not self.GOOD_CHARACTERS_RE.match(text)
+
 class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

@ -350,6 +389,7 @@ class IntegrityChecker:
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
+            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),