mbedtls/tests/scripts/check_files.py

#!/usr/bin/env python3

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

"""
This script checks the current state of the source code for minor issues,
including incorrect file permissions, presence of tabs, non-Unix line endings,
trailing whitespace, and presence of UTF-8 BOM.
Note: requires python 3, must be run from Mbed TLS root.
"""

import argparse
import codecs
import logging
import os
import re
import subprocess
import sys
try:
    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
except ImportError:
    pass

import scripts_path # pylint: disable=unused-import
from mbedtls_dev import build_tree


class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class, implement ``check_file_for_issue`` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
    will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        # Maps a file path to the list of line numbers where the issue was
        # found. A false value (empty list or None) is reported as a
        # file-wide issue by output_file_issues().
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator individually: splitting on the concatenation
        # of os.path.sep and os.path.altsep would look for that two-character
        # sequence and never convert anything.
        for separator in (os.path.sep, os.path.altsep):
            if separator is not None and separator != '/':
                filepath = filepath.replace(separator, '/')
        return filepath

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        """
        # str.endswith accepts a tuple of suffixes; an empty tuple never
        # matches.
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one line per file (with the line numbers if any),
        then an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("{}: {}".format(
                    filename, ", ".join(str(x) for x in lines)
                ))
            else:
                logger.info(filename)
        logger.info("")

# Regular expressions for the paths of files that are treated as binary.
# Paths use / as the directory separator. Each alternative ends with \Z,
# and the trackers apply the combined pattern with re.match, so an
# alternative must match a whole path from its beginning to its end.
BINARY_FILE_PATH_RE_LIST = [
    r'docs/.*\.pdf\Z',
    r'programs/fuzz/corpuses/[^.]+\Z',
    r'tests/data_files/[^.]+\Z',
    r'tests/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'tests/data_files/.*\.req\.[^/]+\Z',
    r'tests/data_files/.*malformed[^/]+\Z',
    r'tests/data_files/format_pkcs12\.fmt\Z',
    r'tests/data_files/.*\.bin\Z',
]
# Single compiled pattern matching any of the alternatives above.
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))

class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement ``issue_with_line``.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Check the specified line for the issue that this class is for.

        ``line`` is the raw content of the line as bytes, including its
        line terminator if any. ``line_number`` starts at 1.
        Return a true value if the line has the issue.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Record an issue at ``line_number`` if ``line`` has the issue."""
        if self.issue_with_line(line, filepath, line_number):
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check the lines of the specified file.

        The file is read in binary mode, so each line is passed to
        ``issue_with_line`` as bytes.

        Subclasses must implement the ``issue_with_line`` method.
        """
        with open(filepath, "rb") as f:
            for line_number, line in enumerate(f, start=1):
                self.check_file_line(filepath, line, line_number)


def is_windows_file(filepath):
    """Whether ``filepath`` has one of the Windows-specific file extensions.

    The comparison is case-sensitive and only looks at the extension as
    determined by ``os.path.splitext``.
    """
    windows_extensions = ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')
    extension = os.path.splitext(filepath)[1]
    return extension in windows_extensions


class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    An executable file must start with a valid shebang (#!) line, and a
    file that starts with a shebang line must be executable.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Map each accepted interpreter to the file extension that a script
    # for this interpreter must have.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    def is_valid_shebang(self, first_line, filepath):
        """Whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must match ``_shebang_re``, name a known interpreter, and
        the file name must have the extension expected for that interpreter.
        """
        found = self._shebang_re.match(first_line)
        if found is None:
            return False
        # Group 1 is set for /bin/sh and /bin/bash, group 2 for /usr/bin/env.
        interpreter = found.group(1) or found.group(2)
        expected_extension = self._extensions.get(interpreter)
        if expected_extension is None:
            return False
        return filepath.endswith('.' + expected_extension)

    def check_file_for_issue(self, filepath):
        """Check that the shebang line and the executable bit are consistent."""
        executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as f:
            first_line = f.readline()
        has_shebang = first_line.startswith(b'#!')
        if has_shebang and executable:
            if not self.is_valid_shebang(first_line, filepath):
                # The issue is located on the shebang line itself.
                self.files_with_issues[filepath] = [1]
        elif has_shebang or executable:
            # Either a shebang on a non-executable file, or an executable
            # without a shebang: this is a file-wide issue.
            self.files_with_issues[filepath] = None


class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files whose last line is incomplete, i.e. files that do not
    end with a newline character."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Report ``filepath`` if its last byte is not a line feed.

        Empty files are not reported.
        """
        with open(filepath, "rb") as f:
            try:
                # Position the stream on the last byte of the file.
                f.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            last_byte = f.read(1)
        if last_byte != b"\n":
            self.files_with_issues[filepath] = None


class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Report ``filepath`` if it starts with the UTF-8 byte order mark."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: read just as many bytes as
            # the BOM has rather than loading the whole file in memory.
            prefix = f.read(len(codecs.BOM_UTF8))
        if prefix == codecs.BOM_UTF8:
            self.files_with_issues[filepath] = None


class UnicodeIssueTracker(LineIssueTracker):
    """Track lines that are not valid UTF-8 or that contain a character
    outside the allowed set."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example U+002D HYPHEN-MINUS vs U+00AD SOFT HYPHEN vs U+2010 HYPHEN,
    # or U+0041 LATIN CAPITAL LETTER A vs U+0391 GREEK CAPITAL LETTER ALPHA.
    GOOD_CHARACTERS = (
        '\t\n\r -~' # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF' # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E' # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C' # Superscripts and Subscripts
        '\u2190-\u21FF' # Arrows
        '\u2200-\u22FF' # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    )
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Whether ``line`` fails to decode as UTF-8 or, once decoded, is
        not made exclusively of allowed characters."""
        try:
            decoded = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        if line_number == 1 and decoded.startswith('\uFEFF'):
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            decoded = decoded[1:]
        return self.GOOD_CHARACTERS_RE.match(decoded) is None

class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check every file that the parent class checks, except the files
        that are expected to have Windows line endings."""
        return (super().should_check_file(filepath) and
                not is_windows_file(filepath))

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` contains a carriage return anywhere."""
        return line.find(b"\r") != -1


class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only the files that are expected to have Windows line
        endings, among the files that the parent class checks."""
        return (super().should_check_file(filepath) and
                is_windows_file(filepath))

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` lacks a final CRLF or has a stray CR before it."""
        if not line.endswith(b"\r\n"):
            return True
        # The line ends with CRLF: look for a CR in what precedes it.
        return b"\r" in line[:-2]


class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` has whitespace just before its line terminator
        (or at its end if it has no line terminator)."""
        without_eol = line.rstrip(b"\r\n")
        # Anything that a further whitespace strip removes is trailing
        # whitespace that is not part of the line terminator.
        return without_eol != without_eol.rstrip()


class TabIssueTracker(LineIssueTracker):
    """Track lines that contain a tab character."""

    heading = "Tabs present:"
    suffix_exemptions = frozenset([
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` contains a tab anywhere."""
        return line.find(b"\t") != -1


class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` looks like a leftover git conflict marker."""
        # Start and end markers, plus the common-ancestor marker produced
        # with merge.conflictStyle=diff3.
        if line.startswith((b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')):
            return True
        # A line consisting only of ======= separates the two sides of a
        # conflict. Such a line is not reported in .md files.
        is_separator = line.rstrip(b'\r\n') == b'======='
        return is_separator and not _filepath.endswith('.md')


class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        build_tree.check_repo_path()
        self.logger = None
        self.setup_logger(log_file)
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
        ]

    def setup_logger(self, log_file, level=logging.INFO):
        """Configure the root logger to emit the report.

        The report is written to ``log_file`` if it is a non-empty path,
        otherwise to a ``logging.StreamHandler`` created with its default
        stream.
        """
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files that are checked into Git.

        File names are expected to be ASCII: decoding raises
        ``UnicodeDecodeError`` otherwise.
        """
        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        # The output is a sequence of NUL-terminated names, so the last
        # element of the split is an empty string to discard.
        bytes_filepaths = bytes_output.split(b'\0')[:-1]
        ascii_filepaths = [fp.decode('ascii') for fp in bytes_filepaths]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in ascii_filepaths]

    def check_files(self):
        """Run every issue tracker on every file that it applies to."""
        # List the files once rather than once per tracker: each call to
        # collect_files() runs a git subprocess.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found by all trackers.

        Return 1 if any tracker recorded an issue, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code


def run_main():
    """Command line entry point.

    Parse the command line, check all files and exit with the status
    returned by ``IntegrityChecker.output_issues``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    options = parser.parse_args()
    checker = IntegrityChecker(options.log_file)
    checker.check_files()
    sys.exit(checker.output_issues())


# Run the checks only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_main()
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								#!/usr/bin/env python3
-												Update copyright notices to use Linux Foundation guidance

As a result, the copyright of contributors other than Arm is now
acknowledged, and the years of publishing are no longer tracked in the
source files.

Also remove the now-redundant lines declaring that the files are part of
MbedTLS.

This commit was generated using the following script:

# ========================
#!/bin/sh

# Find files
find '(' -path './.git' -o -path './3rdparty' ')' -prune -o -type f -print | xargs sed -bi '

# Replace copyright attribution line
s/Copyright.*Arm.*/Copyright The Mbed TLS Contributors/I

# Remove redundant declaration and the preceding line
$!N
/This file is part of Mbed TLS/Id
P
D
'
# ========================

Signed-off-by: Bence Szépkúti <bence.szepkuti@arm.com>

											
										
										
											2020-08-07 13:07:28 +02:00
+								# Copyright The Mbed TLS Contributors
-												update headers

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>

											
										
										
											2023-11-02 20:47:20 +01:00
+								# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												Use the docstring in the command line help

											
										
										
											2019-07-04 19:31:02 +02:00
+								"""
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								This script checks the current state of the source code for minor issues,
 								including incorrect file permissions, presence of tabs, non-Unix line endings,
-												Allow TODO in code

Don't reject TODO in code. Fix #2587

											
										
										
											2019-07-04 19:31:33 +02:00
+								trailing whitespace, and presence of UTF-8 BOM.
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								Note: requires python 3, must be run from Mbed TLS root.
 								"""
 								import argparse
 								import codecs
-												Sort imports

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2023-11-03 13:55:00 +01:00
+								import logging
 								import os
-												Regex mechanism for check-specific exemptions

Suffixes are convenient but not always sufficient.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:16 +02:00
+								import re
-												Check only files checked into Git

We're only interested in files that are committed and pushed to be
included in Mbed TLS, not in any other files that may be lying around.
So ask git for the list of file names.

This script is primarily intended to run on the CI, and there it runs
on a fresh Git checkout plus potentially some other checkouts or
leftovers from a previous part of the CI job. It should also run
reasonably well on developer machines, where there may be various
additional files. In both cases, git is available.

Ad hoc directory exclusions are no longer needed.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 17:18:06 +02:00
+								import subprocess
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								import sys
-												check_files.py: pass mypy

Add enough type annotations to pass mypy 0.782 with Python 3.5. The
source code will still run normally under older Python versions.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-08-11 15:11:50 +02:00
+								try:
 								    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
 								except ImportError:
 								    pass
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												Unify check_repo_path

We had 4 identical copies of the check_repo_path function. Replace them by a
single copy in the build_tree module where it naturally belongs.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2022-09-18 21:17:09 +02:00
+								import scripts_path # pylint: disable=unused-import
 								from mbedtls_dev import build_tree
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												Pylint: abide by useless-object-inheritance warnings

Inheriting from object is a remainder of Python 2 habits and is just
clutter in Python 3.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 18:25:17 +01:00
+								class FileIssueTracker:
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
+								    """Base class for file-wide issue tracking.
 								    To implement a checker that processes a file as a whole, inherit from
-												check-files.py: use class fields for class-wide constants

In an issue tracker, heading and files_exemptions are class-wide
constants, so make them so instead of being per-instance fields.

											
										
										
											2019-02-25 21:10:04 +01:00
+								    this class and implement `check_file_for_issue` and define ``heading``.
-												More accurate variable name

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:52:44 +02:00
+								    ``suffix_exemptions``: files whose name ends with a string in this set
-												check-files.py: use class fields for class-wide constants

In an issue tracker, heading and files_exemptions are class-wide
constants, so make them so instead of being per-instance fields.

											
										
										
											2019-02-25 21:10:04 +01:00
+								     will not be checked.
-												Regex mechanism for check-specific exemptions

Suffixes are convenient but not always sufficient.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:16 +02:00
+								    ``path_exemptions``: files whose path (relative to the root of the source
 								    tree) matches this regular expression will not be checked. This can be
 								    ``None`` to match no path. Paths are normalized and converted to ``/``
 								    separators before matching.
-												check-files.py: use class fields for class-wide constants

In an issue tracker, heading and files_exemptions are class-wide
constants, so make them so instead of being per-instance fields.

											
										
										
											2019-02-25 21:10:04 +01:00
+								    ``heading``: human-readable description of the issue
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
+								    """
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												check_files.py: pass mypy

Add enough type annotations to pass mypy 0.782 with Python 3.5. The
source code will still run normally under older Python versions.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-08-11 15:11:50 +02:00
+								    suffix_exemptions = frozenset() #type: FrozenSet[str]
 								    path_exemptions = None #type: Optional[Pattern[str]]
-												check-files.py: use class fields for class-wide constants

In an issue tracker, heading and files_exemptions are class-wide
constants, so make them so instead of being per-instance fields.

											
										
										
											2019-02-25 21:10:04 +01:00
+								    # heading must be defined in derived classes.
 								    # pylint: disable=no-member
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								    def __init__(self):
 								        self.files_with_issues = {}
-												Regex mechanism for check-specific exemptions

Suffixes are convenient but not always sufficient.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:16 +02:00
+								    @staticmethod
 								    def normalize_path(filepath):
-												Finish the documentation of normalize_path

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-28 18:19:20 +02:00
+								        """Normalize ``filepath`` with / as the directory separator."""
-												Regex mechanism for check-specific exemptions

Suffixes are convenient but not always sufficient.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:16 +02:00
+								        filepath = os.path.normpath(filepath)
-												Finish the documentation of normalize_path

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-28 18:19:20 +02:00
+								        # On Windows, we may have backslashes to separate directories.
 								        # We need slashes to match exemption lists.
-												Regex mechanism for check-specific exemptions

Suffixes are convenient but not always sufficient.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:16 +02:00
+								        seps = os.path.sep
 								        if os.path.altsep is not None:
 								            seps += os.path.altsep
 								        return '/'.join(filepath.split(seps))
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								    def should_check_file(self, filepath):
-												Document more methods in Python scripts

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 16:49:21 +01:00
+								        """Whether the given file name should be checked.
-												More accurate variable name

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:52:44 +02:00
+								        Files whose name ends with a string listed in ``self.suffix_exemptions``
 								        or whose path matches ``self.path_exemptions`` will not be checked.
-												Document more methods in Python scripts

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 16:49:21 +01:00
+								        """
-												More accurate variable name

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:52:44 +02:00
+								        for files_exemption in self.suffix_exemptions:
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								            if filepath.endswith(files_exemption):
 								                return False
-												Regex mechanism for check-specific exemptions

Suffixes are convenient but not always sufficient.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:16 +02:00
+								        if self.path_exemptions and \
 								           re.match(self.path_exemptions, self.normalize_path(filepath)):
 								            return False
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								        return True
 								    def check_file_for_issue(self, filepath):
-												Document more methods in Python scripts

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 16:49:21 +01:00
+								        """Check the specified file for the issue that this class is for.
 								        Subclasses must implement this method.
 								        """
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
+								        raise NotImplementedError
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												Factor record_issue into its own method

											
										
										
											2018-11-23 21:11:30 +01:00
+								    def record_issue(self, filepath, line_number):
-												Document more methods in Python scripts

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 16:49:21 +01:00
+								        """Record that an issue was found at the specified location."""
-												Factor record_issue into its own method

											
										
										
											2018-11-23 21:11:30 +01:00
+								        if filepath not in self.files_with_issues.keys():
 								            self.files_with_issues[filepath] = []
 								        self.files_with_issues[filepath].append(line_number)
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								    def output_file_issues(self, logger):
-												Document more methods in Python scripts

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 16:49:21 +01:00
+								        """Log all the locations where the issue was found."""
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								        if self.files_with_issues.values():
 								            logger.info(self.heading)
 								            for filename, lines in sorted(self.files_with_issues.items()):
 								                if lines:
 								                    logger.info("{}: {}".format(
 								                        filename, ", ".join(str(x) for x in lines)
 								                    ))
 								                else:
 								                    logger.info(filename)
 								            logger.info("")
-												Exclude binary files from text checks

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:59 +02:00
+								BINARY_FILE_PATH_RE_LIST = [
 								    r'docs/.*\.pdf\Z',
 								    r'programs/fuzz/corpuses/[^.]+\Z',
 								    r'tests/data_files/[^.]+\Z',
 								    r'tests/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
 								    r'tests/data_files/.*\.req\.[^/]+\Z',
 								    r'tests/data_files/.*malformed[^/]+\Z',
 								    r'tests/data_files/format_pkcs12\.fmt\Z',
-												Treat more *.bin files as binary

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2023-01-05 20:27:18 +01:00
+								    r'tests/data_files/.*\.bin\Z',
-												Exclude binary files from text checks

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 16:57:59 +02:00
+								]
 								BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
class LineIssueTracker(FileIssueTracker):
    """Base class for checkers that examine a file one line at a time.

    Derived classes implement ``issue_with_line``. This class takes care of
    reading the file and of recording the lines that have the issue.
    """

    # Binary files are not made of text lines: do not check them.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether ``line`` has the issue that this class tracks.

        ``line`` is a line of ``filepath`` as bytes (the file is read in
        binary mode, so the line terminator, if any, is included).
        ``line_number`` is the 1-based position of the line in the file.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Record ``line_number`` if ``line`` has the tracked issue."""
        has_issue = self.issue_with_line(line, filepath, line_number)
        if has_issue:
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Run the line check on every line of the specified file.

        Subclasses must implement the ``issue_with_line`` method.
        """
        with open(filepath, "rb") as stream:
            # Iterating over a binary file yields the same lines as calling
            # readline() until it returns an empty bytes object.
            for line_number, line in enumerate(stream, start=1):
                self.check_file_line(filepath, line, line_number)
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												Also check Windows files

Check Windows files for some issues, including permissions. Omit the
checks related to special characters (whitespace, line endings,
encoding) as appropriate.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:26:01 +01:00
 								def is_windows_file(filepath):
 								    _root, ext = os.path.splitext(filepath)
-												.dsw files are Visual Studio stuff

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 17:36:51 +02:00
+								    return ext in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')
-												Also check Windows files

Check Windows files for some issues, including permissions. Omit the
checks related to special characters (whitespace, line endings,
encoding) as appropriate.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:26:01 +01:00
-												Test shebang lines

Executable scripts must have shebang (#!) line to be effectively
executable on most Unix-like systems. Enforce this, and conversely
enforce that files with a shebang line are executable.

Check that the specified interpreter is consistent with the file
extension.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-08-08 23:15:18 +02:00
class ShebangIssueTracker(FileIssueTracker):
    """Track files whose shebang line is bad, missing or extraneous.

    An executable script must start with a valid shebang (#!) line, and
    a file that starts with a shebang line must be executable.
    """

    heading = "Invalid shebang line:"

    # Accepted interpreters are /bin/sh, /bin/bash and /usr/bin/env.
    # At most one argument is allowed (this is a Linux limitation).
    # With sh and bash, the argument, if present, must be options.
    # With env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')

    # File extension (without the dot) expected for each interpreter.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    def is_valid_shebang(self, first_line, filepath):
        """Tell whether ``first_line`` is an acceptable shebang line.

        The line (as bytes) must have one of the permitted forms, name
        a known interpreter, and that interpreter must be consistent with
        the extension of ``filepath``.
        """
        match = self._shebang_re.match(first_line)
        if match is None:
            return False
        # Group 1 is set for /bin/sh and /bin/bash, group 2 for env.
        interpreter = match.group(1) or match.group(2)
        expected_extension = self._extensions.get(interpreter)
        if expected_extension is None:
            return False
        return filepath.endswith('.' + expected_extension)

    def check_file_for_issue(self, filepath):
        """Check that the shebang line and the executable bit agree."""
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as stream:
            first_line = stream.readline()
        has_shebang = first_line.startswith(b'#!')
        if has_shebang and not is_executable:
            # Shebang on a non-executable file
            self.files_with_issues[filepath] = None
        elif has_shebang:
            if not self.is_valid_shebang(first_line, filepath):
                # The problem is located on the first line.
                self.files_with_issues[filepath] = [1]
        elif is_executable:
            # Executable without a shebang
            self.files_with_issues[filepath] = None
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files whose last line is incomplete.

    A line is incomplete when it does not end with a newline character.
    Empty files are accepted.
    """

    heading = "Missing newline at end of file:"

    # Binary files are not made of text lines: do not check them.
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record ``filepath`` if its last byte is not a newline."""
        with open(filepath, "rb") as stream:
            try:
                # Go to the last byte of the file.
                stream.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            last_byte = stream.read(1)
        if last_byte != b"\n":
            self.files_with_issues[filepath] = None
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.

    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM.
    """

    heading = "UTF-8 BOM present:"

    # Files with these suffixes are not checked for a BOM.
    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    # Binary files are not text: do not check them.
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record ``filepath`` if its content starts with a UTF-8 BOM."""
        with open(filepath, "rb") as f:
            # Only the leading bytes matter: read just enough to compare
            # with the BOM rather than loading the whole file into memory.
            prefix = f.read(len(codecs.BOM_UTF8))
        if prefix == codecs.BOM_UTF8:
            self.files_with_issues[filepath] = None
-												Reject invalid UTF-8 and weird characters in text files

Reject "weird" characters in text files, especially control characters that
might be escape sequences or that might cause other text to appear garbled
(as in https://trojansource.codes/).

Also reject byte sequences that aren't valid UTF-8.

Accept only ASCII (except most control characters), letters, some non-ASCII
punctuation and some mathematical and technical symbols. This covers
everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥).

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2023-01-05 20:28:57 +01:00
class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example U+002D HYPHEN-MINUS vs U+00AD SOFT HYPHEN vs U+2010 HYPHEN,
    # or U+0041 LATIN CAPITAL LETTER A vs U+0391 GREEK CAPITAL LETTER ALPHA.
    GOOD_CHARACTERS = ''.join((
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
        '\u2500-\u257F', # Box Drawings characters used in markdown trees
    ))

    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Tell whether ``line`` is not valid UTF-8 or has a bad character.

        ``line`` is the raw content of the line as bytes.
        """
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        starts_with_bom = text.startswith('\uFEFF')
        if line_number == 1 and starts_with_bom:
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return self.GOOD_CHARACTERS_RE.match(text) is None
-												Also check Windows files

Check Windows files for some issues, including permissions. Omit the
checks related to special characters (whitespace, line endings,
encoding) as appropriate.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:26:01 +01:00
class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check every file accepted by the parent class, except Windows files."""
        if super().should_check_file(filepath):
            return not is_windows_file(filepath)
        return False

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` (as bytes) contains a carriage return."""
        carriage_return = b"\r"
        return carriage_return in line
-												Check that Windows files have Windows line endings

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:29:11 +01:00
class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only the Windows files accepted by the parent class."""
        if super().should_check_file(filepath):
            return is_windows_file(filepath)
        return False

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` (as bytes) is not terminated by a lone CRLF."""
        if not line.endswith(b"\r\n"):
            # LF without CR, or no line terminator at all.
            return True
        # A CR anywhere before the final CRLF is a stray one.
        return b"\r" in line[:-2]
-												Check that Windows files have Windows line endings

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:29:11 +01:00
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"

    # Files with these suffixes are not checked for trailing whitespace.
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` (as bytes) has whitespace before its end.

        The line has the issue if removing all trailing whitespace removes
        more than just the line terminator characters.
        """
        without_line_ending = line.rstrip(b"\r\n")
        without_trailing_whitespace = line.rstrip()
        return without_line_ending != without_trailing_whitespace
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"

    # Files whose name ends with one of these strings may contain tabs.
    # An entry starting with "/" matches a complete base name.
    suffix_exemptions = frozenset((
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ))

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` (as bytes) contains a tab character."""
        tab = b"\t"
        return tab in line
-												check-files.py: clean up class structure

Line issue trackers are conceptually a subclass of file issue
trackers: they're file issue trackers where issues arise from checking
each line independently. So make it an actual subclass.

Pylint pointed out the design smell: there was an abstract method that
wasn't always overridden in concrete child classes.

											
										
										
											2019-02-25 20:59:05 +01:00
class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.

    These are leftovers from a ``git merge`` that wasn't fully edited.
    """

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` (as bytes) looks like a git conflict marker."""
        # Markers that are followed by a space and a description. The
        # "|||||||" marker comes from merge.conflictStyle=diff3.
        labelled_markers = (b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')
        if line.startswith(labelled_markers):
            return True
        # A line consisting of exactly "=======" separates the two sides of
        # a conflict. Markdown files are exempt from this part of the check.
        is_separator = line.rstrip(b'\r\n') == b'======='
        return is_separator and not _filepath.endswith('.md')
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
-												Pylint: abide by useless-object-inheritance warnings

Inheriting from object is a remainder of Python 2 habits and is just
clutter in Python 3.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 18:25:17 +01:00
+								class IntegrityChecker:
-												check-files.py: document some classes and methods

Document all classes and longer methods.

Declare a static method as such. Pointed out by pylint.

											
										
										
											2019-02-25 20:35:31 +01:00
+								    """Sanity-check files under the current directory."""
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
 								    def __init__(self, log_file):
-												check-files.py: document some classes and methods

Document all classes and longer methods.

Declare a static method as such. Pointed out by pylint.

											
										
										
											2019-02-25 20:35:31 +01:00
+								        """Instantiate the sanity checker.
 								        Check files under the current directory.
 								        Write a report of issues to log_file."""
-												Unify check_repo_path

We had 4 identical copies of the check_repo_path function. Replace them by a
single copy in the build_tree module where it naturally belongs.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2022-09-18 21:17:09 +02:00
 								        # NOTE(review): presumably refuses to continue when not run from
 								        # the root of the source tree -- confirm in mbedtls_dev.build_tree.
+								        build_tree.check_repo_path()
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								        self.logger = None
 								        # setup_logger() replaces the None placeholder with the logger
 								        # that output_issues() later hands to each tracker.
 								        self.setup_logger(log_file)
 								        # One tracker instance per kind of issue. check_files() offers
 								        # every collected file to each tracker in this list.
 								        self.issues_to_check = [
-												Test shebang lines

Executable scripts must have shebang (#!) line to be effectively
executable on most Unix-like systems. Enforce this, and conversely
enforce that files with a shebang line are executable.

Check that the specified interperter is consistent with the file
extension.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-08-08 23:15:18 +02:00
+								            ShebangIssueTracker(),
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								            EndOfFileNewlineIssueTracker(),
 								            Utf8BomIssueTracker(),
-												Reject invalid UTF-8 and weird characters in text files

Reject "weird" characters in text files, especially control characters that
might be escape sequences or that might cause other text to appear garbled
(as in https://trojansource.codes/).

Also reject byte sequences that aren't valid UTF-8.

Accept only ASCII (except most control characters), letters, some non-ASCII
punctuation and some mathematical and technical symbols. This covers
everything that's currently present in Mbed TLS ( §áèéëñóöüłŽ–—’“”…≥).

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2023-01-05 20:28:57 +01:00
+								            UnicodeIssueTracker(),
-												Also check Windows files

Check Windows files for some issues, including permissions. Omit the
checks related to special characters (whitespace, line endings,
encoding) as appropriate.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:26:01 +01:00
+								            UnixLineEndingIssueTracker(),
-												Check that Windows files have Windows line endings

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-03-24 22:29:11 +01:00
+								            WindowsLineEndingIssueTracker(),
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								            TrailingWhitespaceIssueTracker(),
 								            TabIssueTracker(),
-												check-files: detect merge artifacts

Detect Git merge artifacts. These are lines starting with "<<<<<<",
"|||||||" or ">>>>>>>" followed by a space, or containing just
"=======". For "=======", exempt Markdown files, because this can be
used to underline a title, as a compromise between false negatives and
false positives.

											
										
										
											2018-11-23 21:11:52 +01:00
+								            MergeArtifactIssueTracker(),
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								        ]
 								    def setup_logger(self, log_file, level=logging.INFO):
 								        self.logger = logging.getLogger()
 								        self.logger.setLevel(level)
 								        if log_file:
 								            handler = logging.FileHandler(log_file)
 								            self.logger.addHandler(handler)
 								        else:
 								            console = logging.StreamHandler()
 								            self.logger.addHandler(console)
-												Check only files checked into Git

We're only interested in files that are committed and pushed to be
included in Mbed TLS, not in any other files that may be lying around.
So ask git for the list of file names.

This script is primarily intended to run on the CI, and there it runs
on a fresh Git checkout plus potentially some other checkouts or
leftovers from a previous part of the CI job. It should also run
reasonably well on developer machines, where there may be various
additional files. In both cases, git is available.

Ad hoc directory exclusions are no longer needed.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 17:18:06 +02:00
+								    @staticmethod
 								    def collect_files():
 								        """Return the paths of the files listed by ``git ls-files -z``.

 								        File names are decoded as ASCII. A name without a directory part
 								        is returned with ``os.curdir`` joined in front of it.
 								        """
 								        raw_listing = subprocess.check_output(['git', 'ls-files', '-z'])
 								        collected = []
 								        # The piece after the final NUL separator is dropped by the slice.
 								        for raw_name in raw_listing.split(b'\0')[:-1]:
 								            name = raw_name.decode('ascii')
 								            if not os.path.dirname(name):
 								                # Prepend './' to files in the top-level directory so
 								                # that something like `'/Makefile' in fp` matches in the
 								                # top-level directory as well as in subdirectories.
 								                name = os.path.join(os.curdir, name)
 								            collected.append(name)
 								        return collected
-												check-files: exclude .git and third-party files

Exclude ".git" directories anywhere. This avoids spurious errors in git
checkouts that contain branch names that look like a file
check-files.py would check. Fix #1713

Exclude "mbed-os" anywhere and "examples" from the root. Switch to the
new mechanism to exclude "yotta/module". These are directories where
we store third-party files that do not need to match our preferences.

Exclude "cov-int" from the root. Fix #1691

											
										
										
											2018-09-28 11:48:10 +02:00
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								    def check_files(self):
 								        """Run every issue tracker over the files it agrees to check.

 								        For each tracker in ``self.issues_to_check``, each path returned
 								        by ``collect_files()`` is passed to the tracker's
 								        ``check_file_for_issue`` if its ``should_check_file`` accepts it.
 								        """
-												Check only files checked into Git

We're only interested in files that are committed and pushed to be
included in Mbed TLS, not in any other files that may be lying around.
So ask git for the list of file names.

This script is primarily intended to run on the CI, and there it runs
on a fresh Git checkout plus potentially some other checkouts or
leftovers from a previous part of the CI job. It should also run
reasonably well on developer machines, where there may be various
additional files. In both cases, git is available.

Ad hoc directory exclusions are no longer needed.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>

											
										
										
											2020-05-10 17:18:06 +02:00
+								        for issue_to_check in self.issues_to_check:
 								            # NOTE(review): collect_files() is re-evaluated for each
 								            # tracker, i.e. one `git ls-files` run per tracker; the list
 								            # could be computed once before the outer loop.
 								            for filepath in self.collect_files():
 								                if issue_to_check.should_check_file(filepath):
 								                    issue_to_check.check_file_for_issue(filepath)
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
 								    def output_issues(self):
 								        integrity_return_code = 0
 								        for issue_to_check in self.issues_to_check:
 								            if issue_to_check.files_with_issues:
 								                integrity_return_code = 1
 								            issue_to_check.output_file_issues(self.logger)
 								        return integrity_return_code
 								def run_main():
 								    """Parse the command line, run all the checks and exit.

 								    The optional ``-l``/``--log_file`` argument is passed to
 								    ``IntegrityChecker`` as the log file. The process exits through
 								    ``sys.exit`` with the value returned by ``output_issues()``.
 								    """
-												Use the docstring in the command line help

											
										
										
											2019-07-04 19:31:02 +02:00
+								    parser = argparse.ArgumentParser(description=__doc__)
-												Add script for source integrity checking

											
										
										
											2018-02-28 11:02:55 +01:00
+								    parser.add_argument(
 								        "-l", "--log_file", type=str, help="path to optional output log",
 								    )
 								    check_args = parser.parse_args()
 								    integrity_check = IntegrityChecker(check_args.log_file)
 								    integrity_check.check_files()
 								    return_code = integrity_check.output_issues()
 								    sys.exit(return_code)
 								# Run the checks only when this file is executed as a script, not when
 								# it is imported as a module.
 								if __name__ == "__main__":
 								    run_main()