pokecrystal/tools/toc.py

100 lines
2.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Usage: python toc.py file.md

Replace a "## TOC" heading in a Markdown file with a table of contents,
generated from the headings that follow it in the file. Supports multiple
files. Headings must start with "##" signs to be detected.
"""
import sys
import re
from collections import namedtuple
from urllib.parse import quote
# Text of the heading that replaces the "## TOC" placeholder.
toc_name = 'Contents'

# Placeholder lines (after trailing whitespace is stripped) that mark where
# the table of contents is inserted.
valid_toc_headings = {'## TOC', '##TOC'}

# One table-of-contents entry: the heading text, its link anchor, and its
# nesting depth (0 for a "##" heading, 1 for "###", and so on).
TocItem = namedtuple('TocItem', ['name', 'anchor', 'level'])

# Runs of characters that are neither word characters, hyphens, nor spaces;
# these are dropped when building an anchor.
punctuation_rx = re.compile(r'[^\w\- ]+')

# Leading "<number>. " prefix that makes a heading render as a numbered entry.
numbered_heading_rx = re.compile(r'^[0-9]+\. ')

# Characters that survive the punctuation filter (they count as word
# characters) but still have to be removed from anchors.
specialchar_rx = re.compile(r'[⅔]+')
def name_to_anchor(name):
    """Return the URL-encoded link anchor for the heading text *name*.

    Follows GitHub's algorithm for generating anchors from headings:
    https://github.com/jch/html-pipeline/blob/master/lib/html/pipeline/toc_filter.rb
    """
    slug = name.strip().lower()
    # Drop punctuation first so that the spaces around it are still present
    # when they get turned into dashes.
    slug = punctuation_rx.sub('', slug)
    slug = slug.replace(' ', '-')
    # Remove the special characters the punctuation pattern lets through.
    slug = specialchar_rx.sub('', slug)
    return quote(slug)
def get_toc_index(lines):
    """Return the index of the first TOC placeholder line in *lines*.

    A line counts as a placeholder when, with trailing whitespace removed,
    it is one of ``valid_toc_headings``. Returns ``None`` if there is none.
    """
    matches = (
        position
        for position, text in enumerate(lines)
        if text.rstrip() in valid_toc_headings
    )
    return next(matches, None)
def get_toc_items(lines, toc_index):
    """Yield a ``TocItem`` for each heading located after the TOC placeholder.

    Only lines beginning with "##" are treated as headings. The item's level
    is the number of "#" signs beyond the first two, so "##" is level 0.

    When several headings produce the same anchor, the second and later ones
    get a "-1", "-2", ... suffix, the way GitHub disambiguates repeated
    headings; without it every duplicate entry would link to the first one.
    Only the headings scanned here are counted, not those before the TOC.
    """
    occurrences = {}  # base anchor -> how many headings have produced it so far
    for index, line in enumerate(lines):
        if index <= toc_index or not line.startswith('##'):
            continue
        name = line.lstrip('#')
        # Both lengths include the rest of the line, so the difference is
        # exactly the count of leading "#" characters.
        level = len(line) - len(name) - len('##')
        name = name.strip()
        anchor = name_to_anchor(name)
        repeats = occurrences.get(anchor, 0)
        occurrences[anchor] = repeats + 1
        if repeats:
            anchor = f'{anchor}-{repeats}'
        yield TocItem(name, anchor, level)
def toc_string(toc_items):
    """Render *toc_items* as Markdown: a heading followed by a nested list.

    Each item becomes a link to its anchor. Headings that start with
    "<number>. " keep that number as their list marker; all others use "-".
    Returns the text with a trailing newline.
    """
    lines = [f'## {toc_name}', '']
    for name, anchor, level in toc_items:
        # A sub-list only nests when it is indented to where the parent
        # item's text starts, which is two columns for a "- " marker. One
        # column per level would render as a flat list.
        # NOTE(review): children of a numbered ("1. ") entry would need three
        # columns; that case is not handled here.
        padding = '  ' * level
        if numbered_heading_rx.match(name):
            # Reuse the heading's own number as an ordered-list marker.
            bullet, name = name.split('.', 1)
            bullet += '.'
            name = name.lstrip()
        else:
            bullet = '-'
        lines.append(f'{padding}{bullet} [{name}](#{anchor})')
    return '\n'.join(lines) + '\n'
def add_toc(filename):
    """Replace the TOC placeholder in *filename* with a generated TOC.

    The file is read and rewritten in place as UTF-8.

    Returns:
        None  if the file has no TOC placeholder heading (file untouched),
        False if no headings follow the placeholder (file untouched),
        True  if the file was rewritten.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    toc_index = get_toc_index(lines)
    if toc_index is None:
        return None  # no TOC heading

    toc_items = list(get_toc_items(lines, toc_index))
    if not toc_items:
        return False  # no content headings

    with open(filename, 'w', encoding='utf-8') as target:
        # Every line is copied through unchanged except the placeholder,
        # which is swapped for the rendered table of contents.
        target.writelines(
            toc_string(toc_items) if position == toc_index else text
            for position, text in enumerate(lines)
        )
    return True  # OK
def main():
    """Command-line entry point: add a TOC to each file named in argv.

    Prints a usage message and exits with status 1 when no file is given.
    For each file, prints its name and then either 'OK' on stdout or a
    warning on stderr.
    """
    targets = sys.argv[1:]
    if not targets:
        print(f'Usage: {sys.argv[0]} file.md', file=sys.stderr)
        sys.exit(1)

    for filename in targets:
        print(filename)
        outcome = add_toc(filename)
        if outcome is None:
            print('Warning: No "## TOC" heading found', file=sys.stderr)
            continue
        if outcome is False:
            print('Warning: No content headings found', file=sys.stderr)
            continue
        print('OK')
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()