Commit | Line | Data |
---|---|---|
eda81eee SM |
1 | # SPDX-License-Identifier: GPL-2.0-only |
2 | # | |
3 | # Copyright (C) 2023 EfficiOS Inc. | |
4 | # | |
5 | # pyright: strict, reportTypeCommentUsage=false | |
6 | ||
7 | import re | |
8 | from typing import TextIO | |
9 | ||
10 | ||
11 | # One part of a moultipart document. | |
12 | # | |
13 | # For example, for this part of which the header is at line 37: | |
14 | # | |
15 | # --- Another Oscar Wilde quote | |
16 | # I can resist everything except temptation. | |
17 | # | |
18 | # The corresponding `Part` object is: | |
19 | # | |
20 | # Part('Another Oscar Wilde quote', | |
21 | # 'I can resist everything except temptation.\n', | |
22 | # 38) | |
class Part:
    # Keeps the three values given at construction time; each one is
    # exposed through a read-only property of the same name.
    def __init__(self, header_info: str, content: str, first_content_line_no: int):
        self._header_info = header_info
        self._content = content
        self._first_content_line_no = first_content_line_no

    # Value passed as `header_info` at construction time.
    @property
    def header_info(self):
        return self._header_info

    # Value passed as `content` at construction time.
    @property
    def content(self):
        return self._content

    # Line number, counted from the beginning of the containing
    # moultipart document, of the first content line of this part.
    @property
    def first_content_line_no(self):
        return self._first_content_line_no

    # Constructor-like representation: the two strings are shown with
    # `repr()` and the line number as is.
    def __repr__(self):
        fields = (self.header_info, self.content, self.first_content_line_no)
        return "Part({!r}, {!r}, {})".format(*fields)
47 | ||
48 | ||
49 | def _try_parse_header(line: str): | |
50 | m = re.match(r"---(\s*| .+)$", line) | |
51 | ||
52 | if m is None: | |
53 | return | |
54 | ||
55 | return m.group(1).strip() | |
56 | ||
57 | ||
58 | # Parses the moultipart document file `in_file` and returns its parts | |
59 | # (list of `Part` objects). | |
60 | # | |
61 | # A moultipart document is a sequence of parts. | |
62 | # | |
63 | # A moultipart part is: | |
64 | # | |
65 | # 1. A header line, that is, in this order: | |
66 | # | |
67 | # a) Exactly `---`. | |
68 | # b) Zero or more spaces. | |
69 | # c) Optional: custom information until the end of the line (at least one space must separate it from `---`). | |
70 | # | |
71 | # 2. Zero or more lines of text which aren't header lines. | |
72 | # | |
73 | # For example, consider the following moultipart document: | |
74 | # | |
75 | # --- Victoria | |
76 | # Parenteau | |
77 | # --- | |
78 | # Taillon | |
79 | # --- This part is empty | |
80 | # --- Josianne | |
81 | # Gervais | |
82 | # | |
83 | # Then this function would return the following part objects: | |
84 | # | |
85 | # [ | |
86 | # Part('Victoria', 'Parenteau\n', 2), | |
87 | # Part('', 'Taillon\n', 4), | |
88 | # Part('This part is empty', '', 6), | |
89 | # Part('Josianne', 'Gervais\n', 7), | |
90 | # ] | |
91 | # | |
92 | # Raises `RuntimeError` on any parsing error. | |
def parse(in_file: TextIO):
    # Parses the moultipart document read from `in_file` and returns
    # its parts as a list of `Part` objects, in document order.
    #
    # The first line must be a header line. Each following header line
    # closes the current part and opens a new one; any other line is
    # appended as is (line terminator included) to the content of the
    # current part.
    #
    # Raises `RuntimeError` if `in_file` is empty or if its first line
    # isn't a header line.

    # Read the first header. Pass a default to next() so that an empty
    # document is reported as a parsing error (`RuntimeError`) instead
    # of leaking `StopIteration` to the caller.
    first_line = next(in_file, None)

    if first_line is None:
        raise RuntimeError(
            "Expecting header line starting with `---`, got end of file"
        )

    cur_part_header_info = _try_parse_header(first_line)

    if cur_part_header_info is None:
        raise RuntimeError(
            "Expecting header line starting with `---`, got `{}`".format(
                first_line.strip("\n")
            )
        )

    parts = []  # type: list[Part]

    # Content lines of the current part, joined once when it's closed.
    cur_part_content_lines = []  # type: list[str]

    # The first header is line 1, so its content starts at line 2.
    cur_first_content_line_no = 2

    # Line 1 is already consumed: the remaining lines start at 2.
    for line_no, line in enumerate(in_file, start=2):
        maybe_part_header_info = _try_parse_header(line)

        if maybe_part_header_info is None:
            # Accumulate content lines
            cur_part_content_lines.append(line)
            continue

        # New header: close the current part and start the next one,
        # of which the content begins right after this header line.
        parts.append(
            Part(
                cur_part_header_info,
                "".join(cur_part_content_lines),
                cur_first_content_line_no,
            )
        )
        cur_part_content_lines = []
        cur_part_header_info = maybe_part_header_info
        cur_first_content_line_no = line_no + 1

    # Last part (always exists)
    parts.append(
        Part(
            cur_part_header_info,
            "".join(cur_part_content_lines),
            cur_first_content_line_no,
        )
    )

    return parts
140 | ||
141 | ||
if __name__ == "__main__":
    import pprint
    import sys

    # Command-line usage: parse the moultipart document of which the
    # path is the first argument and pretty-print the resulting parts.
    with open(sys.argv[1]) as doc_file:
        parsed_parts = parse(doc_file)
        pprint.pprint(parsed_parts)