私は通常、MS Word を呪ってから、次のWindows Script Hostスクリプトを実行します。
// Windows Script Host (JScript) script: reads the file named by PATH, replaces
// typographic symbols with plain-ASCII or HTML-entity equivalents, and writes
// the result to a sibling file whose name is prefixed with "clean-".

// Replace with path to a file that needs cleaning
var PATH = "test.html";

// Ordered [pattern, replacement] pairs applied to the whole file content.
// The symbol rows map to HTML entities; replacing a symbol with itself would
// leave the text unchanged.
var REPLACEMENTS = [
    // Curly quotes and en dash -> plain ASCII
    [/“/g, '"'],
    [/”/g, '"'],
    [/‘/g, "'"],
    [/’/g, "'"],
    [/–/g, "-"],
    // Symbols -> HTML entities
    [/©/g, "&copy;"],
    [/®/g, "&reg;"],
    [/°/g, "&deg;"],
    // Pilcrow becomes a paragraph tag
    [/¶/g, "<p>"],
    [/¿/g, "&iquest;"],
    [/¡/g, "&iexcl;"],
    [/¢/g, "&cent;"],
    [/£/g, "&pound;"],
    [/¥/g, "&yen;"]
];

var go = WScript.CreateObject("Scripting.FileSystemObject");

// Read the whole input. ReadAll() raises an error on an empty stream, so
// check AtEndOfStream first. Always close the stream, even on failure.
var content;
var input = go.GetFile(PATH).OpenAsTextStream();
try {
    content = input.AtEndOfStream ? "" : input.ReadAll();
} finally {
    input.Close();
}

// Plain for-loop keeps the script compatible with the classic JScript engine.
for (var i = 0; i < REPLACEMENTS.length; i++) {
    content = content.replace(REPLACEMENTS[i][0], REPLACEMENTS[i][1]);
}

// Prefix only the file name with "clean-" so that a PATH containing a
// directory still yields a valid output path next to the input file.
var folder = go.GetParentFolderName(PATH);
var cleanName = "clean-" + go.GetFileName(PATH);
var outPath = folder ? go.BuildPath(folder, cleanName) : cleanName;

// Second argument true: overwrite an existing output file.
var out = go.CreateTextFile(outPath, true);
try {
    out.Write(content);
} finally {
    // Close explicitly so the output is flushed and the handle released.
    out.Close();
}