pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: https://github.com/python/cpython/commit/ff365ebe98f8e8317403869f6e95c82922ed305c

55097560d244c08.css" /> [3.14] GH-145000: Add a tool to record/check removed HTML IDs (GH-145… · python/cpython@ff365eb · GitHub
Skip to content

Commit ff365eb

Browse files
[3.14] GH-145000: Add a tool to record/check removed HTML IDs (GH-145001) (GH-145212)
(cherry picked from commit 9b22261) Co-authored-by: Petr Viktorin <encukou@gmail.com>
1 parent a7beca8 commit ff365eb

File tree

3 files changed

+190
-0
lines changed

3 files changed

+190
-0
lines changed

Doc/.ruff.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ ignore = [
3232
"E501", # Ignore line length errors (we use auto-formatting)
3333
]
3434

35+
[lint.per-file-ignores]
36+
"tools/check-html-ids.py" = ["I001"] # Unsorted imports
37+
3538
[format]
3639
preview = true
3740
quote-style = "preserve"

Doc/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,3 +336,9 @@ autobuild-stable-html:
336336
exit 1;; \
337337
esac
338338
@$(MAKE) autobuild-dev-html
339+
340+
# Collect HTML IDs to a JSON document
341+
.PHONY: html-ids
342+
html-ids:
343+
$(PYTHON) tools/check-html-ids.py collect build/html \
344+
-o build/html/html-ids.json.gz

Doc/tools/check-html-ids.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
from compression import gzip
2+
import concurrent.futures
3+
from pathlib import Path
4+
import html.parser
5+
import functools
6+
import argparse
7+
import json
8+
import sys
9+
import re
10+
11+
12+
# Auto-generated anchors that are not stable across builds and therefore
# must not be tracked as removable IDs:
#   index-<n>     -- Sphinx general-index targets
#   id<n>         -- docutils fallback ids for unnamed targets
#   <word>_<n>    -- numbered duplicate-name anchors
IGNORED_ID_RE = re.compile(
    r"""
    index-\d+
    | id\d+
    | [_a-z]+_\d+
    """,
    re.VERBOSE,
)
20+
21+
22+
class IDGatherer(html.parser.HTMLParser):
    """Streaming HTML parser that records ``id`` attribute values.

    Every id encountered on any start tag is added to the set passed to
    the constructor, except ids matching IGNORED_ID_RE (auto-generated
    anchors that are not stable across builds).
    """

    def __init__(self, ids):
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        for attr_name, attr_value in attrs:
            if attr_name != 'id':
                continue
            if IGNORED_ID_RE.fullmatch(attr_value):
                continue
            self.__ids.add(attr_value)
32+
33+
34+
def get_ids_from_file(path):
    """Return the set of non-ignored HTML ids found in the file *path*.

    The file is streamed through an IDGatherer in fixed-size chunks so
    that large pages are never held in memory whole.
    """
    ids = set()
    gatherer = IDGatherer(ids)
    with path.open(encoding='utf-8') as file:
        while chunk := file.read(4096):
            gatherer.feed(chunk)
    # Flush data still buffered inside the parser (e.g. a tag that was
    # split across the final chunk boundary); without close() such a
    # trailing fragment would never be parsed.
    gatherer.close()
    return ids
41+
42+
43+
def gather_ids(htmldir, *, verbose_print):
    """Collect HTML ids from every page under *htmldir*.

    Returns a dict mapping relative page path (str) to a sorted list of
    its ids, after filtering out ids common to *all* pages (navigation
    boilerplate).  Returns an empty dict when no pages are found.

    Raises ValueError if *htmldir* does not look like Sphinx HTML output.
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # Parsing is CPU-bound: with the GIL enabled only processes give real
    # parallelism; on free-threaded builds threads are cheaper.
    # BUG FIX: the original tested `sys._is_gil_enabled` without calling
    # it -- a bound function object is always truthy, so the thread-pool
    # branch was unreachable.  getattr keeps this working on Pythons that
    # predate the attribute (where the GIL is always enabled).
    if getattr(sys, '_is_gil_enabled', lambda: True)():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()

    # Ensure the executor is shut down even if a worker raises.
    with pool:
        tasks = {}
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            # Static assets and what's-new pages don't carry stable anchors.
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        ids_by_page = {}
        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    if not ids_by_page:
        # Nothing to intersect: set.intersection() with no arguments
        # would raise TypeError.
        return {}

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page
73+
74+
75+
def do_check(baseline, checked, excluded, *, verbose_print):
    """Report ids present in *baseline* but missing from *checked*.

    *excluded* is a set of (page, id) pairs to skip; ids matching
    IGNORED_ID_RE are skipped as well.  Failures are printed to stdout,
    one ``page: id`` line each, with a blank line after each page.
    (*verbose_print* is accepted for interface symmetry with the other
    helpers; this function prints its findings unconditionally.)

    Returns True when nothing was removed, False otherwise.
    """
    ok = True
    for page, old_ids in sorted(baseline.items()):
        if page not in checked:
            ok = False
            print(f'{page}: (page missing)')
            print()
            continue
        gone = set(old_ids) - set(checked[page])
        if not gone:
            continue
        gone = {
            anchor
            for anchor in gone
            if not IGNORED_ID_RE.fullmatch(anchor)
            and (page, anchor) not in excluded
        }
        if gone:
            ok = False
            for anchor in sorted(gone):
                print(f'{page}: {anchor}')
            print()
    return ok
99+
100+
101+
def _build_parser():
    """Create the argument parser with the collect/check subcommands."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='File to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )
    return parser


def _load_exclusions(exclude_file):
    """Parse *exclude_file* into a set of (page, id) pairs.

    Blank lines and '#'-comment lines are skipped; every other line is
    expected to have the form ``page.html: anchor-id`` (lines without a
    colon are ignored).
    """
    excluded = set()
    with open(exclude_file, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line and not line.startswith('#'):
                name, sep, excluded_id = line.partition(':')
                if sep:
                    excluded.add((name.strip(), excluded_id.strip()))
    return excluded


def main(argv):
    """Command-line entry point.

    ``collect`` gathers ids from an HTML tree into a gzipped JSON file;
    ``check`` compares two such files and reports removed ids.
    Exits with status 1 when ``check`` finds removed ids.
    """
    args = _build_parser().parse_args(argv[1:])

    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*args, **kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        with gzip.open(args.baseline_file) as zfile:
            baseline = json.load(zfile)['ids_by_page']
        with gzip.open(args.checked_file) as zfile:
            checked = json.load(zfile)['ids_by_page']
        excluded = set()
        if args.exclude_file:
            excluded = _load_exclusions(args.exclude_file)
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                print(f'Alternatively, add them to {args.exclude_file}.')
            # BUG FIX: signal failure to callers (Makefile / CI).  The
            # original printed the error but still exited with status 0,
            # so automated checks could never fail.
            sys.exit(1)
179+
180+
# Script entry point: pass the full argv so main() can strip argv[0].
if __name__ == '__main__':
    main(sys.argv)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy