From eda81eee9ee730911b3779cc367c51a7123fded8 Mon Sep 17 00:00:00 2001
From: Simon Marchi <simon.marchi@efficios.com>
Date: Fri, 22 Sep 2023 16:43:00 -0400
Subject: [PATCH] tests: add moultipart.py

Add moultipart.py, a Python module to help read text files meant to be
split in different parts.  A moultipart-compliant file is a succession
of parts, where each part is made of a header line followed by zero or
more lines of content.

A header line consists of `---` optionally followed by a space and some
custom information.

The moultipart module exposes the `parse` function, which takes a TextIO
object and returns a list of Part objects.

For instance, the following input:

    --- Victoria
    Parenteau
    ---
    Taillon
    --- This part is empty
    --- Josianne
    Gervais

results in the following parts:

    [Part('Victoria', 'Parenteau\n', 2),
     Part('', 'Taillon\n', 4),
     Part('This part is empty', '', 6),
     Part('Josianne', 'Gervais\n', 7)]

Change-Id: I50fb88ed6e064c09deaf1d56187415bc26002a14
Signed-off-by: Simon Marchi <simon.marchi@efficios.com>
Signed-off-by: Philippe Proulx <eeppeliteloop@gmail.com>
Reviewed-on: https://review.lttng.org/c/babeltrace/+/10912
Tested-by: jenkins <jenkins@lttng.org>
---
 setup.cfg                        |   3 +-
 tests/utils/python/moultipart.py | 147 +++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 tests/utils/python/moultipart.py

diff --git a/setup.cfg b/setup.cfg
index fa412107..02702a12 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -2,7 +2,8 @@
 # E501: line too long
 # W503: line break before binary operator (conflicts with black's way of
 #       formatting)
-ignore = E501,W503
+# E203: Whitespace before ':' (conflicts with black's way of formatting)
+ignore = E501,W503,E203
 
 # Disabled warnings for `bt2/__init__.py`:
 #
diff --git a/tests/utils/python/moultipart.py b/tests/utils/python/moultipart.py
new file mode 100644
index 00000000..d33748c7
--- /dev/null
+++ b/tests/utils/python/moultipart.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (C) 2023 EfficiOS Inc.
+#
+# pyright: strict, reportTypeCommentUsage=false
+
+import re
+from typing import TextIO
+
+
+# One part of a moultipart document.
+#
+# For example, for this part of which the header is at line 37:
+#
+#     --- Another Oscar Wilde quote
+#     I can resist everything except temptation.
+#
+# The corresponding `Part` object is:
+#
+#     Part('Another Oscar Wilde quote',
+#          'I can resist everything except temptation.\n',
+#          38)
+class Part:
+    def __init__(self, header_info: str, content: str, first_content_line_no: int):
+        self._header_info = header_info
+        self._content = content
+        self._first_content_line_no = first_content_line_no
+
+    @property
+    def header_info(self):
+        return self._header_info
+
+    @property
+    def content(self):
+        return self._content
+
+    # 1-based line number, within the containing moultipart document,
+    # of the first content line of this part (the line after its header).
+    @property
+    def first_content_line_no(self):
+        return self._first_content_line_no
+
+    def __repr__(self):
+        return "Part({}, {}, {})".format(
+            repr(self.header_info), repr(self.content), self.first_content_line_no
+        )
+
+
+def _try_parse_header(line: str):
+    m = re.match(r"---(\s*| .+)$", line)  # `---` then only whitespace, or `---`, a space, and info
+
+    if m is None:
+        return  # Not a header line: implicit `None`
+
+    return m.group(1).strip()  # Header info without surrounding whitespace ("" if none)
+
+
+# Parses the moultipart document file `in_file` and returns its parts
+# (list of `Part` objects).
+#
+# A moultipart document is a sequence of parts.
+#
+# A moultipart part is:
+#
+# 1. A header line, that is, in this order:
+#
+#    a) Exactly `---`.
+#    b) Optional: a space, then custom information until the end of
+#       the line (surrounding whitespace is stripped).
+#
+# 2. Zero or more lines of text which aren't header lines.
+#
+# For example, consider the following moultipart document:
+#
+#     --- Victoria
+#     Parenteau
+#     ---
+#     Taillon
+#     --- This part is empty
+#     --- Josianne
+#     Gervais
+#
+# Then this function would return the following part objects:
+#
+#     [
+#         Part('Victoria',           'Parenteau\n', 2),
+#         Part('',                   'Taillon\n',   4),
+#         Part('This part is empty', '',            6),
+#         Part('Josianne',           'Gervais\n',   7),
+#     ]
+#
+# Raises `RuntimeError` on any parsing error, including an empty document.
+def parse(in_file: TextIO):
+    # Read the first header; an exhausted (empty) file reads as `""`
+    cur_part_content = ""
+    cur_first_content_line_no = 2
+    parts = []  # type: list[Part]
+    line_no = 1
+    line = next(in_file, "")
+    cur_part_header_info = _try_parse_header(line)
+
+    if cur_part_header_info is None:
+        raise RuntimeError(
+            "Expecting header line starting with `---`, got `{}`".format(
+                line.strip("\n")
+            )
+        )
+
+    for line in in_file:
+        line_no += 1
+        maybe_part_header_info = _try_parse_header(line)
+
+        if maybe_part_header_info is not None:
+            # New header
+            parts.append(
+                Part(
+                    cur_part_header_info,
+                    cur_part_content,
+                    cur_first_content_line_no,
+                )
+            )
+            cur_part_content = ""
+            cur_part_header_info = maybe_part_header_info
+            cur_first_content_line_no = line_no + 1
+            continue
+
+        # Accumulate content lines
+        cur_part_content += line
+
+    # Last part (always exists)
+    parts.append(
+        Part(
+            cur_part_header_info,
+            cur_part_content,
+            cur_first_content_line_no,
+        )
+    )
+
+    return parts
+
+
+if __name__ == "__main__":  # Script mode: pretty-print the parts of a document
+    import sys
+    import pprint
+
+    with open(sys.argv[1]) as f:  # First argument: path of the moultipart document
+        pprint.pprint(parse(f))
-- 
2.34.1