docs: parse-headers.py: convert parse-headers.pl

When the Kernel started to use Sphinx, we had to come up with
a solution to parse media headers. On that time, we didn't have
much experience with Sphinx extensions. So, we came up with our
own script-based solution that were basically implementing a
set of rules we used to have at the Makefile.

Convert it to Python, keeping it bug-compatible with the
original script.

While here, try to better document it.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Link: https://lore.kernel.org/r/ae5cfa8dff37e280cc9493fc95a51cd0cc0ba127.1755872208.git.mchehab+huawei@kernel.org
pull/1354/merge
Mauro Carvalho Chehab 2025-08-22 16:19:14 +02:00 committed by Jonathan Corbet
parent b5698da669
commit 8a5a85be4d
1 changed files with 429 additions and 0 deletions

View File

@ -0,0 +1,429 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2016 by Mauro Carvalho Chehab <mchehab@kernel.org>.
# pylint: disable=C0103,R0902,R0912,R0914,R0915
"""
Convert a C header or source file (C_FILE), into a ReStructured Text
included via ..parsed-literal block with cross-references for the
documentation files that describe the API. It accepts an optional
EXCEPTIONS_FILE with describes what elements will be either ignored or
be pointed to a non-default reference.
The output is written at the (OUT_FILE).
It is capable of identifying defines, functions, structs, typedefs,
enums and enum symbols and create cross-references for all of them.
It is also capable of distinguish #define used for specifying a Linux
ioctl.
The EXCEPTIONS_FILE contains a set of rules like:
ignore ioctl VIDIOC_ENUM_FMT
replace ioctl VIDIOC_DQBUF vidioc_qbuf
replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
"""
import argparse
import os
import re
import sys
class ParseHeader:
"""
Creates an enriched version of a Kernel header file with cross-links
to each C data structure type.
It is meant to allow having a more comprehensive documentation, where
uAPI headers will create cross-reference links to the code.
It is capable of identifying defines, functions, structs, typedefs,
enums and enum symbols and create cross-references for all of them.
It is also capable of distinguish #define used for specifying a Linux
ioctl.
By default, it create rules for all symbols and defines, but it also
allows parsing an exception file. Such file contains a set of rules
using the syntax below:
1. Ignore rules:
ignore <type> <symbol>`
Removes the symbol from reference generation.
2. Replace rules:
replace <type> <old_symbol> <new_reference>
Replaces how old_symbol with a new reference. The new_reference can be:
- A simple symbol name;
- A full Sphinx reference.
On both cases, <type> can be:
- ioctl: for defines that end with _IO*, e.g. ioctl definitions
- define: for other defines
- symbol: for symbols defined within enums;
- typedef: for typedefs;
- enum: for the name of a non-anonymous enum;
- struct: for structs.
Examples:
ignore define __LINUX_MEDIA_H
ignore ioctl VIDIOC_ENUM_FMT
replace ioctl VIDIOC_DQBUF vidioc_qbuf
replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
"""
# Parser regexes with multiple ways to capture enums and structs
RE_ENUMS = [
re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
]
RE_STRUCTS = [
re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
]
# FIXME: the original code was written a long time before Sphinx C
# domain to have multiple namespaces. To avoid to much turn at the
# existing hyperlinks, the code kept using "c:type" instead of the
# right types. To change that, we need to change the types not only
# here, but also at the uAPI media documentation.
DEF_SYMBOL_TYPES = {
"ioctl": {
"prefix": "\\ ",
"suffix": "\\ ",
"ref_type": ":ref",
},
"define": {
"prefix": "\\ ",
"suffix": "\\ ",
"ref_type": ":ref",
},
# We're calling each definition inside an enum as "symbol"
"symbol": {
"prefix": "\\ ",
"suffix": "\\ ",
"ref_type": ":ref",
},
"typedef": {
"prefix": "\\ ",
"suffix": "\\ ",
"ref_type": ":c:type",
},
# This is the name of the enum itself
"enum": {
"prefix": "",
"suffix": "\\ ",
"ref_type": ":c:type",
},
"struct": {
"prefix": "",
"suffix": "\\ ",
"ref_type": ":c:type",
},
}
def __init__(self, debug: bool = False):
"""Initialize internal vars"""
self.debug = debug
self.data = ""
self.symbols = {}
for symbol_type in self.DEF_SYMBOL_TYPES:
self.symbols[symbol_type] = {}
def store_type(self, symbol_type: str, symbol: str,
ref_name: str = None, replace_underscores: bool = True):
"""
Stores a new symbol at self.symbols under symbol_type.
By default, underscores are replaced by "-"
"""
defs = self.DEF_SYMBOL_TYPES[symbol_type]
prefix = defs.get("prefix", "")
suffix = defs.get("suffix", "")
ref_type = defs.get("ref_type")
# Determine ref_link based on symbol type
if ref_type:
if symbol_type == "enum":
ref_link = f"{ref_type}:`{symbol}`"
else:
if not ref_name:
ref_name = symbol.lower()
if replace_underscores:
ref_name = ref_name.replace("_", "-")
ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
else:
ref_link = symbol
self.symbols[symbol_type][symbol] = f"{prefix}{ref_link}{suffix}"
def store_line(self, line):
"""Stores a line at self.data, properly indented"""
line = " " + line.expandtabs()
self.data += line.rstrip(" ")
def parse_file(self, file_in: str):
"""Reads a C source file and get identifiers"""
self.data = ""
is_enum = False
is_comment = False
multiline = ""
with open(file_in, "r",
encoding="utf-8", errors="backslashreplace") as f:
for line_no, line in enumerate(f):
self.store_line(line)
line = line.strip("\n")
# Handle continuation lines
if line.endswith(r"\\"):
multiline += line[-1]
continue
if multiline:
line = multiline + line
multiline = ""
# Handle comments. They can be multilined
if not is_comment:
if re.search(r"/\*.*", line):
is_comment = True
else:
# Strip C99-style comments
line = re.sub(r"(//.*)", "", line)
if is_comment:
if re.search(r".*\*/", line):
is_comment = False
else:
multiline = line
continue
# At this point, line variable may be a multilined statement,
# if lines end with \ or if they have multi-line comments
# With that, it can safely remove the entire comments,
# and there's no need to use re.DOTALL for the logic below
line = re.sub(r"(/\*.*\*/)", "", line)
if not line.strip():
continue
# It can be useful for debug purposes to print the file after
# having comments stripped and multi-lines grouped.
if self.debug > 1:
print(f"line {line_no + 1}: {line}")
# Now the fun begins: parse each type and store it.
# We opted for a two parsing logic here due to:
# 1. it makes easier to debug issues not-parsed symbols;
# 2. we want symbol replacement at the entire content, not
# just when the symbol is detected.
if is_enum:
match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
if match:
self.store_type("symbol", match.group(1))
if "}" in line:
is_enum = False
continue
match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
if match:
self.store_type("ioctl", match.group(1),
replace_underscores=False)
continue
match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
if match:
self.store_type("define", match.group(1))
continue
match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
line)
if match:
name = match.group(2).strip()
symbol = match.group(3)
self.store_type("typedef", symbol, ref_name=name,
replace_underscores=False)
continue
for re_enum in self.RE_ENUMS:
match = re_enum.match(line)
if match:
self.store_type("enum", match.group(1))
is_enum = True
break
for re_struct in self.RE_STRUCTS:
match = re_struct.match(line)
if match:
self.store_type("struct", match.group(1),
replace_underscores=False)
break
def process_exceptions(self, fname: str):
"""
Process exceptions file with rules to ignore or replace references.
"""
if not fname:
return
name = os.path.basename(fname)
with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
for ln, line in enumerate(f):
ln += 1
line = line.strip()
if not line or line.startswith("#"):
continue
# Handle ignore rules
match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
if match:
c_type = match.group(1)
symbol = match.group(2)
if c_type not in self.DEF_SYMBOL_TYPES:
sys.exit(f"{name}:{ln}: {c_type} is invalid")
d = self.symbols[c_type]
if symbol in d:
del d[symbol]
continue
# Handle replace rules
match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
if not match:
sys.exit(f"{name}:{ln}: invalid line: {line}")
c_type, old, new = match.groups()
if c_type not in self.DEF_SYMBOL_TYPES:
sys.exit(f"{name}:{ln}: {c_type} is invalid")
reftype = None
# Parse reference type when the type is specified
match = re.match(r"^\:c\:(data|func|macro|type)\:\`(.+)\`", new)
if match:
reftype = f":c:{match.group(1)}"
new = match.group(2)
else:
match = re.search(r"(\:ref)\:\`(.+)\`", new)
if match:
reftype = match.group(1)
new = match.group(2)
# If the replacement rule doesn't have a type, get default
if not reftype:
reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
if not reftype:
reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")
new_ref = f"{reftype}:`{old} <{new}>`"
# Change self.symbols to use the replacement rule
if old in self.symbols[c_type]:
self.symbols[c_type][old] = new_ref
else:
print(f"{name}:{ln}: Warning: can't find {old} {c_type}")
def debug_print(self):
"""
Print debug information containing the replacement rules per symbol.
To make easier to check, group them per type.
"""
if not self.debug:
return
for c_type, refs in self.symbols.items():
if not refs: # Skip empty dictionaries
continue
print(f"{c_type}:")
for symbol, ref in sorted(refs.items()):
print(f" {symbol} -> {ref}")
print()
def write_output(self, file_in: str, file_out: str):
"""Write the formatted output to a file."""
# Avoid extra blank lines
text = re.sub(r"\s+$", "", self.data) + "\n"
text = re.sub(r"\n\s+\n", "\n\n", text)
# Escape Sphinx special characters
text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)
# Source uAPI files may have special notes. Use bold font for them
text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)
# Delimiters to catch the entire symbol after escaped
start_delim = r"([ \n\t\(=\*\@])"
end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"
# Process all reference types
for ref_dict in self.symbols.values():
for symbol, replacement in ref_dict.items():
symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))
text = re.sub(fr'{start_delim}{symbol}{end_delim}',
fr'\1{replacement}\2', text)
# Remove "\ " where not needed: before spaces and at the end of lines
text = re.sub(r"\\ ([\n ])", r"\1", text)
title = os.path.basename(file_in)
with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
f.write(f"{title}\n")
f.write("=" * len(title))
f.write("\n\n.. parsed-literal::\n\n")
f.write(text)
def main():
"""Main function"""
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-d", "--debug", action="count", default=0,
help="Increase debug level. Can be used multiple times")
parser.add_argument("file_in", help="Input C file")
parser.add_argument("file_out", help="Output RST file")
parser.add_argument("file_exceptions", nargs="?",
help="Exceptions file (optional)")
args = parser.parse_args()
parser = ParseHeader(debug=args.debug)
parser.parse_file(args.file_in)
if args.file_exceptions:
parser.process_exceptions(args.file_exceptions)
parser.debug_print()
parser.write_output(args.file_in, args.file_out)
if __name__ == "__main__":
main()