dtlib: use IntEnum for token IDs

The way that _init_tokens() manipulates globals() defeats static
analysis of the file: the _T_<TOKEN> names are only created at
runtime, so an analyzer cannot infer a type for the 'tok_id'
variable in assignments like 'tok_id = _T_INCLUDE'.

To make things easier on the analyzer, define the token types as an
enum.IntEnum named _T. This lets us write e.g. '_T.INCLUDE' instead
of '_T_INCLUDE', avoiding any increase in line length in the lexing
code.
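
As a rough sketch of the difference (illustrative only; the real
enum has 21 members and appears in the diff below):

    import enum

    # Old approach: names like _T_INCLUDE only come into existence at
    # runtime, so static analysis sees no definition for them at all.
    for i, name in enumerate(("_T_INCLUDE", "_T_LINE"), 1):
        globals()[name] = i

    # New approach: every token ID has a statically visible definition.
    class _T(enum.IntEnum):
        INCLUDE = 1
        LINE = 2
        # ... remaining members elided

    tok_id = _T.INCLUDE  # an analyzer can now infer tok_id's type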

While we're here, use '==' and '!=' instead of 'is' and 'is not'
when comparing a tok_id obtained from re.Match.lastindex against a
_T.FOO value.

This is now necessary, since a plain int object and a _T object are
never the same object in memory. The identity checks only worked
before because CPython caches the integer objects from -5 to 256,
which is an implementation detail rather than a language guarantee.
And since the ints come from re.Match.lastindex rather than from a
list containing the exact _T_FOO objects, strictly speaking this
code should never have been using 'is' in the first place.
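
A quick demonstration, assuming the _T enum added by this commit
(the snippet below is illustrative):

    import enum

    class _T(enum.IntEnum):
        NUM = 17

    tok_id = 17              # like a value from re.Match.lastindex

    print(tok_id == _T.NUM)  # True: IntEnum members compare equal to ints
    print(tok_id is _T.NUM)  # False: an enum member is a distinct object

    # The old 'is' checks only passed because CPython caches the int
    # objects for -5..256, so equal small ints were the same object:
    a, b = 256, 255 + 1
    print(a is b)            # True in CPython, an implementation detail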

Also initialize the global _token_re explicitly at module level, to
make it visible to static analysis.
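
In miniature (a toy two-group pattern stands in for the real one,
which is built in _init_tokens() in the diff below):

    import re

    def _init_tokens():
        # Build and return the compiled regex rather than assigning a
        # global from inside the function body.
        return re.compile(r"(\s+)|(\Z)", re.MULTILINE)

    # A module-level assignment makes the type of _token_re obvious to
    # static analysis, with no 'global' statement to trace.
    _token_re = _init_tokens()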

Signed-off-by: Martí Bolívar <marti.bolivar@nordicsemi.no>

commit 9d4ccf23ec
Martí Bolívar, 2021-04-23 11:06:43 -07:00 (committed by Kumar Gala)

@@ -618,6 +618,34 @@ class Property:
         if marker_type is _MarkerType.PHANDLE:
             self.value += b"\0\0\0\0"
 
+
+class _T(enum.IntEnum):
+    # Token IDs used by the DT lexer.
+
+    # These values must be contiguous and start from 1.
+    INCLUDE = 1
+    LINE = 2
+    STRING = 3
+    DTS_V1 = 4
+    PLUGIN = 5
+    MEMRESERVE = 6
+    BITS = 7
+    DEL_PROP = 8
+    DEL_NODE = 9
+    OMIT_IF_NO_REF = 10
+    LABEL = 11
+    CHAR_LITERAL = 12
+    REF = 13
+    INCBIN = 14
+    SKIP = 15
+    EOF = 16
+
+    # These values must be larger than the above contiguous range.
+    NUM = 17
+    PROPNODENAME = 18
+    MISC = 19
+    BYTE = 20
+    BAD = 21
+
+
 class DT:
     """
     Represents a devicetree parsed from a .dts file (or from many files, if the
@@ -810,14 +838,14 @@ class DT:
                     self.root = Node(name="/", parent=None, dt=self)
                 self._parse_node(self.root)
 
-            elif tok.id in (_T_LABEL, _T_REF):
+            elif tok.id in (_T.LABEL, _T.REF):
                 # '&foo { ... };' or 'label: &foo { ... };'. The C tools only
                 # support a single label here too.
-                if tok.id is _T_LABEL:
+                if tok.id == _T.LABEL:
                     label = tok.val
                     tok = self._next_token()
-                    if tok.id is not _T_REF:
+                    if tok.id != _T.REF:
                         self._parse_error("expected label reference (&foo)")
                 else:
                     label = None
@@ -831,15 +859,15 @@ class DT:
                 if label:
                     _append_no_dup(node.labels, label)
 
-            elif tok.id is _T_DEL_NODE:
+            elif tok.id == _T.DEL_NODE:
                 self._next_ref2node()._del()
                 self._expect_token(";")
 
-            elif tok.id is _T_OMIT_IF_NO_REF:
+            elif tok.id == _T.OMIT_IF_NO_REF:
                 self._next_ref2node()._omit_if_no_ref = True
                 self._expect_token(";")
 
-            elif tok.id is _T_EOF:
+            elif tok.id == _T.EOF:
                 if not self.root:
                     self._parse_error("no root node defined")
                 return
@@ -853,12 +881,12 @@ class DT:
         has_dts_v1 = False
 
-        while self._peek_token().id is _T_DTS_V1:
+        while self._peek_token().id == _T.DTS_V1:
             has_dts_v1 = True
             self._next_token()
             self._expect_token(";")
             # /plugin/ always comes after /dts-v1/
-            if self._peek_token().id is _T_PLUGIN:
+            if self._peek_token().id == _T.PLUGIN:
                 self._parse_error("/plugin/ is not supported")
 
         if not has_dts_v1:
@@ -871,10 +899,10 @@ class DT:
         while True:
             # Labels before /memreserve/
             labels = []
-            while self._peek_token().id is _T_LABEL:
+            while self._peek_token().id == _T.LABEL:
                 _append_no_dup(labels, self._next_token().val)
 
-            if self._peek_token().id is _T_MEMRESERVE:
+            if self._peek_token().id == _T.MEMRESERVE:
                 self._next_token()
                 self.memreserves.append(
                     (labels, self._eval_prim(), self._eval_prim()))
@@ -894,7 +922,7 @@ class DT:
             labels, omit_if_no_ref = self._parse_propnode_labels()
             tok = self._next_token()
 
-            if tok.id is _T_PROPNODENAME:
+            if tok.id == _T.PROPNODENAME:
                 if self._peek_token().val == "{":
                     # '<tok> { ...', expect node
@@ -933,17 +961,17 @@ class DT:
                     for label in labels:
                         _append_no_dup(prop.labels, label)
 
-            elif tok.id is _T_DEL_NODE:
+            elif tok.id == _T.DEL_NODE:
                 tok2 = self._next_token()
-                if tok2.id is not _T_PROPNODENAME:
+                if tok2.id != _T.PROPNODENAME:
                     self._parse_error("expected node name")
                 if tok2.val in node.nodes:
                     node.nodes[tok2.val]._del()
                 self._expect_token(";")
 
-            elif tok.id is _T_DEL_PROP:
+            elif tok.id == _T.DEL_PROP:
                 tok2 = self._next_token()
-                if tok2.id is not _T_PROPNODENAME:
+                if tok2.id != _T.PROPNODENAME:
                     self._parse_error("expected property name")
                 node.props.pop(tok2.val, None)
                 self._expect_token(";")
@@ -964,11 +992,11 @@ class DT:
         omit_if_no_ref = False
         while True:
             tok = self._peek_token()
-            if tok.id is _T_LABEL:
+            if tok.id == _T.LABEL:
                 _append_no_dup(labels, tok.val)
-            elif tok.id is _T_OMIT_IF_NO_REF:
+            elif tok.id == _T.OMIT_IF_NO_REF:
                 omit_if_no_ref = True
-            elif (labels or omit_if_no_ref) and tok.id is not _T_PROPNODENAME:
+            elif (labels or omit_if_no_ref) and tok.id != _T.PROPNODENAME:
                 # Got something like 'foo: bar: }'
                 self._parse_error("expected node or property name")
             else:
@@ -996,7 +1024,7 @@ class DT:
             if tok.val == "<":
                 self._parse_cells(prop, 4)
 
-            elif tok.id is _T_BITS:
+            elif tok.id == _T.BITS:
                 n_bits = self._expect_num()
                 if n_bits not in {8, 16, 32, 64}:
                     self._parse_error("expected 8, 16, 32, or 64")
@@ -1006,14 +1034,14 @@ class DT:
             elif tok.val == "[":
                 self._parse_bytes(prop)
 
-            elif tok.id is _T_STRING:
+            elif tok.id == _T.STRING:
                 prop._add_marker(_MarkerType.STRING)
                 prop.value += self._unescape(tok.val.encode("utf-8")) + b"\0"
 
-            elif tok.id is _T_REF:
+            elif tok.id == _T.REF:
                 prop._add_marker(_MarkerType.PATH, tok.val)
 
-            elif tok.id is _T_INCBIN:
+            elif tok.id == _T.INCBIN:
                 self._parse_incbin(prop)
 
             else:
@@ -1036,14 +1064,14 @@ class DT:
         while True:
             tok = self._peek_token()
-            if tok.id is _T_REF:
+            if tok.id == _T.REF:
                 self._next_token()
                 if n_bytes != 4:
                     self._parse_error("phandle references are only allowed in "
                                       "arrays with 32-bit elements")
                 prop._add_marker(_MarkerType.PHANDLE, tok.val)
 
-            elif tok.id is _T_LABEL:
+            elif tok.id == _T.LABEL:
                 prop._add_marker(_MarkerType.LABEL, tok.val)
                 self._next_token()
@@ -1070,10 +1098,10 @@ class DT:
         while True:
             tok = self._next_token()
-            if tok.id is _T_BYTE:
+            if tok.id == _T.BYTE:
                 prop.value += tok.val.to_bytes(1, "big")
 
-            elif tok.id is _T_LABEL:
+            elif tok.id == _T.LABEL:
                 prop._add_marker(_MarkerType.LABEL, tok.val)
 
             elif tok.val == "]":
@@ -1096,7 +1124,7 @@ class DT:
         self._expect_token("(")
 
         tok = self._next_token()
-        if tok.id is not _T_STRING:
+        if tok.id != _T.STRING:
             self._parse_error("expected quoted filename")
         filename = tok.val
@@ -1128,7 +1156,7 @@ class DT:
         while True:
             tok = self._peek_token()
-            if tok.id is not _T_LABEL:
+            if tok.id != _T.LABEL:
                 return
             prop._add_marker(_MarkerType.LABEL, tok.val)
             self._next_token()
@@ -1162,7 +1190,7 @@ class DT:
     def _eval_prim(self):
         tok = self._peek_token()
-        if tok.id in (_T_NUM, _T_CHAR_LITERAL):
+        if tok.id in (_T.NUM, _T.CHAR_LITERAL):
             return self._next_token().val
 
         tok = self._next_token()
@@ -1309,7 +1337,7 @@ class DT:
             match = _token_re.match(self._file_contents, self._tok_end_i)
             if match:
                 tok_id = match.lastindex
-                if tok_id is _T_CHAR_LITERAL:
+                if tok_id == _T.CHAR_LITERAL:
                     val = self._unescape(match.group(tok_id).encode("utf-8"))
                     if len(val) != 1:
                         self._parse_error("character literals must be length 1")
@@ -1320,7 +1348,7 @@ class DT:
             elif self._lexer_state is _DEFAULT:
                 match = _num_re.match(self._file_contents, self._tok_end_i)
                 if match:
-                    tok_id = _T_NUM
+                    tok_id = _T.NUM
                     num_s = match.group(1)
                     tok_val = int(num_s,
                                   16 if num_s.startswith(("0x", "0X")) else
@@ -1331,21 +1359,20 @@ class DT:
                 match = _propnodename_re.match(self._file_contents,
                                                self._tok_end_i)
                 if match:
-                    tok_id = _T_PROPNODENAME
+                    tok_id = _T.PROPNODENAME
                     tok_val = match.group(1)
                     self._lexer_state = _DEFAULT
 
             else:  # self._lexer_state is _EXPECT_BYTE
                 match = _byte_re.match(self._file_contents, self._tok_end_i)
                 if match:
-                    tok_id = _T_BYTE
+                    tok_id = _T.BYTE
                     tok_val = int(match.group(), 16)
 
             if not tok_id:
                 match = _misc_re.match(self._file_contents, self._tok_end_i)
                 if match:
-                    tok_id = _T_MISC
+                    tok_id = _T.MISC
                     tok_val = match.group()
                 else:
                     self._tok_i = self._tok_end_i
@@ -1354,18 +1381,18 @@ class DT:
                     # files. Generate a token for it so that the error can
                     # trickle up to some context where we can give a more
                     # helpful error message.
-                    return _Token(_T_BAD, "<unknown token>")
+                    return _Token(_T.BAD, "<unknown token>")
 
             self._tok_i = match.start()
             self._tok_end_i = match.end()
 
-            if tok_id is _T_SKIP:
+            if tok_id == _T.SKIP:
                 self._lineno += tok_val.count("\n")
                 continue
 
             # /include/ is handled in the lexer in the C tools as well, and can
             # appear anywhere
-            if tok_id is _T_INCLUDE:
+            if tok_id == _T.INCLUDE:
                 # Can have newlines between /include/ and the filename
                 self._lineno += tok_val.count("\n")
                 # Do this manual extraction instead of doing it in the regex so
@@ -1374,21 +1401,21 @@ class DT:
                 self._enter_file(filename)
                 continue
 
-            if tok_id is _T_LINE:
+            if tok_id == _T.LINE:
                 # #line directive
                 self._lineno = int(tok_val.split()[0]) - 1
                 self.filename = tok_val[tok_val.find('"') + 1:-1]
                 continue
 
-            if tok_id is _T_EOF:
+            if tok_id == _T.EOF:
                 if self._filestack:
                     self._leave_file()
                     continue
-                return _Token(_T_EOF, "<EOF>")
+                return _Token(_T.EOF, "<EOF>")
 
             # State handling
-            if tok_id in (_T_DEL_PROP, _T_DEL_NODE, _T_OMIT_IF_NO_REF) or \
+            if tok_id in (_T.DEL_PROP, _T.DEL_NODE, _T.OMIT_IF_NO_REF) or \
                tok_val in ("{", ";"):
                 self._lexer_state = _EXPECT_PROPNODENAME
@@ -1396,7 +1423,7 @@ class DT:
             elif tok_val == "[":
                 self._lexer_state = _EXPECT_BYTE
 
-            elif tok_id in (_T_MEMRESERVE, _T_BITS) or tok_val == "]":
+            elif tok_id in (_T.MEMRESERVE, _T.BITS) or tok_val == "]":
                 self._lexer_state = _DEFAULT
 
             return _Token(tok_id, tok_val)
@@ -1416,7 +1443,7 @@ class DT:
         # Raises an error if the next token is not a number. Returns the token.
 
         tok = self._next_token()
-        if tok.id is not _T_NUM:
+        if tok.id != _T.NUM:
             self._parse_error("expected number")
         return tok.val
@@ -1472,7 +1499,7 @@ class DT:
         # on errors to save some code in callers.
 
         label = self._next_token()
-        if label.id is not _T_REF:
+        if label.id != _T.REF:
             self._parse_error(
                 "expected label (&foo) or path (&{/foo/bar}) reference")
         try:
@@ -1892,58 +1919,45 @@ _line_re = re.compile(
     re.MULTILINE)
 
 
 def _init_tokens():
-    # Builds a (<token 1>)|(<token 2>)|... regex and assigns the index of each
-    # capturing group to a corresponding _T_<TOKEN> variable. This makes the
-    # token type appear in match.lastindex after a match.
-
-    global _token_re
-    global _T_NUM
-    global _T_PROPNODENAME
-    global _T_MISC
-    global _T_BYTE
-    global _T_BAD
+    # Builds a (<token 1>)|(<token 2>)|... regex and returns it. The
+    # way this is constructed makes the token's value as an int appear
+    # in match.lastindex after a match.
 
     # Each pattern must have exactly one capturing group, which can capture any
    # part of the pattern. This makes match.lastindex match the token type.
     # _Token.val is based on the captured string.
-    token_spec = (("_T_INCLUDE", r'(/include/\s*"(?:[^\\"]|\\.)*")'),
-                  ("_T_LINE",  # #line directive
-                   r'^#(?:line)?[ \t]+([0-9]+[ \t]+"(?:[^\\"]|\\.)*")(?:[ \t]+[0-9]+)?'),
-                  ("_T_STRING", r'"((?:[^\\"]|\\.)*)"'),
-                  ("_T_DTS_V1", r"(/dts-v1/)"),
-                  ("_T_PLUGIN", r"(/plugin/)"),
-                  ("_T_MEMRESERVE", r"(/memreserve/)"),
-                  ("_T_BITS", r"(/bits/)"),
-                  ("_T_DEL_PROP", r"(/delete-property/)"),
-                  ("_T_DEL_NODE", r"(/delete-node/)"),
-                  ("_T_OMIT_IF_NO_REF", r"(/omit-if-no-ref/)"),
-                  ("_T_LABEL", r"([a-zA-Z_][a-zA-Z0-9_]*):"),
-                  ("_T_CHAR_LITERAL", r"'((?:[^\\']|\\.)*)'"),
-                  ("_T_REF",
-                   r"&([a-zA-Z_][a-zA-Z0-9_]*|{[a-zA-Z0-9,._+*#?@/-]*})"),
-                  ("_T_INCBIN", r"(/incbin/)"),
-                  # Whitespace, C comments, and C++ comments
-                  ("_T_SKIP", r"(\s+|(?:/\*(?:.|\n)*?\*/)|//.*$)"),
-                  # Return a token for end-of-file so that the parsing code can
-                  # always assume that there are more tokens when looking
-                  # ahead. This simplifies things.
-                  ("_T_EOF", r"(\Z)"))
+    token_spec = {
+        _T.INCLUDE: r'(/include/\s*"(?:[^\\"]|\\.)*")',
+        # #line directive
+        _T.LINE:
+        r'^#(?:line)?[ \t]+([0-9]+[ \t]+"(?:[^\\"]|\\.)*")(?:[ \t]+[0-9]+)?',
+        _T.STRING: r'"((?:[^\\"]|\\.)*)"',
+        _T.DTS_V1: r"(/dts-v1/)",
+        _T.PLUGIN: r"(/plugin/)",
+        _T.MEMRESERVE: r"(/memreserve/)",
+        _T.BITS: r"(/bits/)",
+        _T.DEL_PROP: r"(/delete-property/)",
+        _T.DEL_NODE: r"(/delete-node/)",
+        _T.OMIT_IF_NO_REF: r"(/omit-if-no-ref/)",
+        _T.LABEL: r"([a-zA-Z_][a-zA-Z0-9_]*):",
+        _T.CHAR_LITERAL: r"'((?:[^\\']|\\.)*)'",
+        _T.REF: r"&([a-zA-Z_][a-zA-Z0-9_]*|{[a-zA-Z0-9,._+*#?@/-]*})",
+        _T.INCBIN: r"(/incbin/)",
+        # Whitespace, C comments, and C++ comments
+        _T.SKIP: r"(\s+|(?:/\*(?:.|\n)*?\*/)|//.*$)",
+        # Return a token for end-of-file so that the parsing code can
+        # always assume that there are more tokens when looking
+        # ahead. This simplifies things.
+        _T.EOF: r"(\Z)",
+    }
 
     # MULTILINE is needed for C++ comments and #line directives
-    _token_re = re.compile("|".join(spec[1] for spec in token_spec),
-                           re.MULTILINE | re.ASCII)
-
-    for i, spec in enumerate(token_spec, 1):
-        globals()[spec[0]] = i
-
-    # pylint: disable=undefined-loop-variable
-    _T_NUM = i + 1
-    _T_PROPNODENAME = i + 2
-    _T_MISC = i + 3
-    _T_BYTE = i + 4
-    _T_BAD = i + 5
+    return re.compile("|".join(token_spec[tok_id] for tok_id in
+                               range(1, _T.EOF + 1)),
+                      re.MULTILINE | re.ASCII)
 
-_init_tokens()
+
+_token_re = _init_tokens()
 
 _TYPE_TO_N_BYTES = {
     _MarkerType.UINT8: 1,