Switch assemble_changelog to using text strings

Changelog contents should be UTF-8 text files. There's no need to be
binary-safe. So switch to using text strings in Python (str, not bytes).

This commit makes the following changes:

* Bytes literals (b'…') to string literals ('…').
* Subprocess output (which is all git information) is decoded as ascii.
* Inject text directly in exceptions rather than calling a decode method.

This is enough to make the script work as desired in a UTF-8 locale.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
2021-05-18 14:39:40 +02:00 · 2021-05-18 14:39:40 +02:00 · 7261fff37b
commit 7261fff37b
parent 383339c1f1
1 changed files with 39 additions and 39 deletions
--- a/scripts/assemble_changelog.py
+++ b/scripts/assemble_changelog.py
@@ -63,15 +63,15 @@ class LostContent(Exception):
 # The category names we use in the changelog.
 # If you edit this, update ChangeLog.d/README.md.
 STANDARD_CATEGORIES = (
-    b'API changes',
-    b'Default behavior changes',
-    b'Requirement changes',
-    b'New deprecations',
-    b'Removals',
-    b'Features',
-    b'Security',
-    b'Bugfix',
-    b'Changes',
+    'API changes',
+    'Default behavior changes',
+    'Requirement changes',
+    'New deprecations',
+    'Removals',
+    'Features',
+    'Security',
+    'Bugfix',
+    'Changes',
 )

 # The maximum line length for an entry
@@ -122,13 +122,13 @@ class ChangelogFormat:
 class TextChangelogFormat(ChangelogFormat):
    """The traditional Mbed TLS changelog format."""

-    _unreleased_version_text = b'= mbed TLS x.x.x branch released xxxx-xx-xx'
+    _unreleased_version_text = '= mbed TLS x.x.x branch released xxxx-xx-xx'
    @classmethod
    def is_released_version(cls, title):
        # Look for an incomplete release date
-        return not re.search(br'[0-9x]{4}-[0-9x]{2}-[0-9x]?x', title)
+        return not re.search(r'[0-9x]{4}-[0-9x]{2}-[0-9x]?x', title)

-    _top_version_re = re.compile(br'(?:\A|\n)(=[^\n]*\n+)(.*?\n)(?:=|$)',
+    _top_version_re = re.compile(r'(?:\A|\n)(=[^\n]*\n+)(.*?\n)(?:=|$)',
                                 re.DOTALL)
    @classmethod
    def extract_top_version(cls, changelog_file_content):
@@ -140,17 +140,17 @@ class TextChangelogFormat(ChangelogFormat):
        top_version_body = m.group(2)
        if cls.is_released_version(top_version_title):
            top_version_end = top_version_start
-            top_version_title = cls._unreleased_version_text + b'\n\n'
-            top_version_body = b''
+            top_version_title = cls._unreleased_version_text + '\n\n'
+            top_version_body = ''
        return (changelog_file_content[:top_version_start],
                top_version_title, top_version_body,
                changelog_file_content[top_version_end:])

    @classmethod
    def version_title_text(cls, version_title):
-        return re.sub(br'\n.*', version_title, re.DOTALL)
+        return re.sub(r'\n.*', version_title, re.DOTALL)

-    _category_title_re = re.compile(br'(^\w.*)\n+', re.MULTILINE)
+    _category_title_re = re.compile(r'(^\w.*)\n+', re.MULTILINE)
    @classmethod
    def split_categories(cls, version_body):
        """A category title is a line with the title in column 0."""
@@ -163,10 +163,10 @@ class TextChangelogFormat(ChangelogFormat):
        title_starts = [m.start(1) for m in title_matches]
        body_starts = [m.end(0) for m in title_matches]
        body_ends = title_starts[1:] + [len(version_body)]
-        bodies = [version_body[body_start:body_end].rstrip(b'\n') + b'\n'
+        bodies = [version_body[body_start:body_end].rstrip('\n') + '\n'
                  for (body_start, body_end) in zip(body_starts, body_ends)]
-        title_lines = [version_body[:pos].count(b'\n') for pos in title_starts]
-        body_lines = [version_body[:pos].count(b'\n') for pos in body_starts]
+        title_lines = [version_body[:pos].count('\n') for pos in title_starts]
+        body_lines = [version_body[:pos].count('\n') for pos in body_starts]
        return [CategoryContent(title_match.group(1), title_line,
                                body, body_line)
                for title_match, title_line, body, body_line
@@ -176,9 +176,9 @@ class TextChangelogFormat(ChangelogFormat):
    def format_category(cls, title, body):
        # `split_categories` ensures that each body ends with a newline.
        # Make sure that there is additionally a blank line between categories.
-        if not body.endswith(b'\n\n'):
-            body += b'\n'
-        return title + b'\n' + body
+        if not body.endswith('\n\n'):
+            body += '\n'
+        return title + '\n' + body

 class ChangeLog:
    """An Mbed TLS changelog.
@@ -199,10 +199,10 @@ class ChangeLog:
    # Only accept dotted version numbers (e.g. "3.1", not "3").
    # Refuse ".x" in a version number where x is a letter: this indicates
    # a version that is not yet released. Something like "3.1a" is accepted.
-    _version_number_re = re.compile(br'[0-9]+\.[0-9A-Za-z.]+')
-    _incomplete_version_number_re = re.compile(br'.*\.[A-Za-z]')
-    _only_url_re = re.compile(br'^\s*\w+://\S+\s*$')
-    _has_url_re = re.compile(br'.*://.*')
+    _version_number_re = re.compile(r'[0-9]+\.[0-9A-Za-z.]+')
+    _incomplete_version_number_re = re.compile(r'.*\.[A-Za-z]')
+    _only_url_re = re.compile(r'^\s*\w+://\S+\s*$')
+    _has_url_re = re.compile(r'.*://.*')

    def add_categories_from_text(self, filename, line_offset,
                                 text, allow_unknown_category):
@@ -218,7 +218,7 @@ class ChangeLog:
                raise InputFormatError(filename,
                                       line_offset + category.title_line,
                                       'Unknown category: "{}"',
-                                       category.name.decode('utf8'))
+                                       category.name)

            body_split = category.body.splitlines()

@@ -250,8 +250,8 @@ class ChangeLog:
        # Split the top version section into categories.
        self.categories = OrderedDict()
        for category in STANDARD_CATEGORIES:
-            self.categories[category] = b''
-        offset = (self.header + self.top_version_title).count(b'\n') + 1
+            self.categories[category] = ''
+        offset = (self.header + self.top_version_title).count('\n') + 1
        self.add_categories_from_text(input_stream.name, offset,
                                      top_version_body, True)

@@ -264,7 +264,7 @@ class ChangeLog:
    def write(self, filename):
        """Write the changelog to the specified file.
        """
-        with open(filename, 'wb') as out:
+        with open(filename, 'w') as out:
            out.write(self.header)
            out.write(self.top_version_title)
            for title, body in self.categories.items():
@@ -303,7 +303,7 @@ class EntryFileSortKey:
        hashes = subprocess.check_output(['git', 'log', '--format=%H',
                                          '--follow',
                                          '--', filename])
-        m = re.search(b'(.+)$', hashes)
+        m = re.search('(.+)$', hashes.decode('ascii'))
        if not m:
            # The git output is empty. This means that the file was
            # never checked in.
@@ -320,8 +320,8 @@ class EntryFileSortKey:
        """
        text = subprocess.check_output(['git', 'rev-list',
                                        '--merges', *options,
-                                        b'..'.join([some_hash, target])])
-        return text.rstrip(b'\n').split(b'\n')
+                                        '..'.join([some_hash, target])])
+        return text.decode('ascii').rstrip('\n').split('\n')

    @classmethod
    def merge_hash(cls, some_hash):
@@ -329,7 +329,7 @@ class EntryFileSortKey:

        Return None if the given commit was never merged.
        """
-        target = b'HEAD'
+        target = 'HEAD'
        # List the merges from some_hash to the target in two ways.
        # The ancestry list is the ones that are both descendants of
        # some_hash and ancestors of the target.
@@ -407,12 +407,12 @@ def check_output(generated_output_file, main_input_file, merged_files):
    is also present in an output file. This is not perfect but good enough
    for now.
    """
-    generated_output = set(open(generated_output_file, 'rb'))
-    for line in open(main_input_file, 'rb'):
+    generated_output = set(open(generated_output_file, 'r'))
+    for line in open(main_input_file, 'r'):
        if line not in generated_output:
            raise LostContent('original file', line)
    for merged_file in merged_files:
-        for line in open(merged_file, 'rb'):
+        for line in open(merged_file, 'r'):
            if line not in generated_output:
                raise LostContent(merged_file, line)

@@ -455,14 +455,14 @@ def merge_entries(options):
    Write the new changelog to options.output.
    Remove the merged entries if options.keep_entries is false.
    """
-    with open(options.input, 'rb') as input_file:
+    with open(options.input, 'r') as input_file:
        changelog = ChangeLog(input_file, TextChangelogFormat)
    files_to_merge = list_files_to_merge(options)
    if not files_to_merge:
        sys.stderr.write('There are no pending changelog entries.\n')
        return
    for filename in files_to_merge:
-        with open(filename, 'rb') as input_file:
+        with open(filename, 'r') as input_file:
            changelog.add_file(input_file)
    finish_output(changelog, options.output, options.input, files_to_merge)
    if not options.keep_entries: