html5lib-python だけを使用することにしました。これは私が思いついたものです:
#!/usr/bin/env python
import sys
from xml.dom.minidom import Node
import html5lib
from html5lib import (HTMLParser, sanitizer, serializer, treebuilders,
treewalkers)
def _find_child_element(parent, tag_name):
    """Return the first direct child element of *parent* named *tag_name*.

    Only immediate children are inspected; non-element nodes are skipped.

    Raises:
        LookupError: if *parent* has no element child with that name.
    """
    for child in parent.childNodes:
        if child.nodeType == Node.ELEMENT_NODE and child.nodeName == tag_name:
            return child
    raise LookupError(
        "no <%s> element found under %s" % (tag_name, parent.nodeName))


# Parse stdin with the sanitizing tokenizer, building a tree with the "dom"
# tree builder.
# NOTE(review): `tokenizer=` / `sanitizer.HTMLSanitizer` and the `encoding=`
# keyword look like the pre-1.0 html5lib API -- confirm against the installed
# html5lib version.
parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                    tree=treebuilders.getTreeBuilder("dom"))
# Bound to a distinct name so the imported `serializer` module is not shadowed.
html_serializer = serializer.htmlserializer.HTMLSerializer(
    omit_optional_tags=False)
document = parser.parse(sys.stdin.read(), encoding="utf-8")

# find the <html> node, then the <body> node beneath it
htmlNode = _find_child_element(document, "html")
bodyNode = _find_child_element(htmlNode, "body")

# serialize all children of the <body> node, so that only the body's content
# is written out and the <html>/<body> wrappers themselves are omitted
walker_for = treewalkers.getTreeWalker("dom")
for child in bodyNode.childNodes:
    stream = walker_for(child)
    # NOTE(review): with `encoding=` set, render() presumably returns an
    # encoded byte string; writing that to sys.stdout assumes Python 2 --
    # confirm before running under Python 3.
    sys.stdout.write(html_serializer.render(stream, encoding="utf-8"))
入力例:
<script>alert("hax")</script>
<p onload="alert('this is a dangerous attribute')"><b>hello,</b> world</p>
出力例:
&lt;script&gt;alert("hax")&lt;/script&gt;
<p><b>hello,</b> world</p>