0

RSS フィードから取得した HTML 入力を、MVC ビューに表示する前に検証しています。

次のホワイトリスト方式を使用して、HTML をサニタイズしています。

// Finds every tag-like span in the input: a '<', any run of characters other
// than '>', and then either the closing '>' or the end of the input (so an
// unterminated "<..." at the end is also treated as a tag).
private static Regex _tags = new Regex("<[^>]*(>|$)",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
// Accepts a whole tag string that is an attribute-free opening or closing tag
// for one of: b, blockquote, code, dd, dt, dl, del, em, h1-h3, i, kbd, u, li,
// ol, p, pre, s, sub, sup, strong, strike, ul - or a <br>/<hr>, optionally
// self-closed. There is no IgnoreCase option; Sanitize lower-cases each tag
// before testing it against this pattern.
private static Regex _whitelist = new Regex(@"
^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|u|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
^<(b|h)r\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
// Accepts "</a>" or an opening <a> whose first attribute is href, holding
// either "#<digits>" or an http/https/ftp URL made only of the listed
// characters, optionally followed by a title attribute. Any other attribute,
// or a different attribute order, makes the tag fail the match.
private static Regex _whitelist_a = new Regex(@"
^<a\s
href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
(\stitle=""[^""<>]+"")?\s?>$|
^</a>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
// Accepts an <img> whose src is an http/https URL made only of the listed
// characters, followed by optional width, height (1-3 digits each), alt and
// title attributes in exactly that order, optionally self-closed.
private static Regex _whitelist_img = new Regex(@"
^<img\s
src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\salt=""[^""<>]*"")?
(\stitle=""[^""<>]*"")?
\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);


/// <summary>
/// Strips every tag that is not explicitly whitelisted from the supplied raw HTML,
/// leaving the text content and the "safe" tags untouched.
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
/// <param name="html">Raw HTML to clean; may be null or empty.</param>
/// <returns>
/// The input with all non-whitelisted tags removed. A null or empty input is
/// returned unchanged.
/// </returns>
public static string Sanitize(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Collect every tag-like span up front; the offsets refer to the original string.
    MatchCollection candidates = _tags.Matches(html);

    // Walk from the last match to the first so that removing a tag never
    // shifts the offsets of the matches that are still to be processed.
    for (int position = candidates.Count - 1; position >= 0; position--)
    {
        Match candidate = candidates[position];

        // The whitelist patterns are case-sensitive, so compare a lower-cased copy.
        string lowered = candidate.Value.ToLowerInvariant();

        bool allowed = _whitelist.IsMatch(lowered)
            || _whitelist_a.IsMatch(lowered)
            || _whitelist_img.IsMatch(lowered);
        if (allowed)
        {
            continue;
        }

        html = html.Remove(candidate.Index, candidate.Length);
    }

    return html;
}

YouTube や Vimeo の動画コンテンツを、iframe または HTML5 の video タグを使用して表示できるようにしたいと考えています。

もう少し柔軟な正規表現にするにはどうすればよいか、どなたか方向性を教えていただけますか?

以下は iframe 用に私が試した正規表現です。

// Accepts "</iframe>" or an opening <iframe> whose src is an http/https URL on
// player.vimeo.com or www.youtube.com, followed by optional width, height,
// frameborder (1-3 digits each) and a bare allowfullscreen attribute, in
// exactly that order.
// The dots in the host names are escaped: an unescaped '.' matches any
// character, which would also accept look-alike hosts such as
// "wwwxyoutube.com" or "player-vimeo.com".
// There is no IgnoreCase option, so the tag must be lower-cased before it is
// tested against this pattern.
private static Regex _whitelist_iframe = new Regex(@"
             ^<iframe\s
            src=""https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+""
            (\swidth=""\d{1,3}"")?
            (\sheight=""\d{1,3}"")?
            (\sframeborder=""\d{1,3}"")?
            (\sallowfullscreen)?
            \s?>$|^</iframe>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
4

1 に答える 1

1

上記の正規表現によるアプローチは厳格すぎました。ケビンの的を射た指摘は言うまでもありません!

これが私がしたことです:

html-agility-pack を使用して HTML を解析し、この Stack Overflow の回答に記載されている方法でサニタイズしました。

また、img および iframe の src 属性を正規表現で検証するコードを追加しました (もっとうまく書けるはずだとは思います)。

/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Elements that are not whitelisted are unwrapped (the element is dropped, its
/// children are kept), attributes that are not whitelisted for their element are
/// removed, and URL-carrying attributes (img/iframe src, a href) must match a
/// strict pattern or the owning element is unwrapped as well.
/// </summary>
public class HtmlSanitizer
{
    // Placeholder name given to every element that has to be removed. The
    // document is serialized with this name and the placeholders are then
    // selected by XPath and unwrapped in StripHtml. Kept lower-case so the
    // written name and the XPath test agree regardless of name normalization.
    private const string RemovableNodeName = "removeablenode";

    // img src: absolute http/https URL made only of the listed characters.
    private static readonly Regex _srcAttribute = new Regex(@"^https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // iframe src: http/https URL on player.vimeo.com or www.youtube.com only.
    // The pattern is anchored at both ends and the host dots are escaped;
    // without the anchors IsMatch accepts any value that merely CONTAINS an
    // allowed URL (e.g. "javascript:...//http://www.youtube.com/x"), and an
    // unescaped '.' also accepts look-alike hosts such as "wwwxyoutube.com".
    private static readonly Regex _iframeSrc = new Regex(@"^https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // a href: "#<digits>" or an absolute http/https/ftp URL. Without this
    // check a whitelisted href could carry a "javascript:" URL.
    private static readonly Regex _hrefAttribute = new Regex(@"^(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Element name -> attribute names allowed on that element.
    // A null array means the element is allowed but must carry no attributes.
    private readonly IDictionary<string, string[]> _whitelist;

    /// <summary>
    /// Creates a sanitizer with the built-in element/attribute whitelist.
    /// </summary>
    public HtmlSanitizer()
    {
        _whitelist = new Dictionary<string, string[]>
                        {
                            {"a", new[] {"href", "target", "title"}},
                            {"img", new[] {"src", "alt", "width", "height"}},
                            {"iframe", new[] {"src", "width", "height", "frameborder", "allowfullscreen" }},
                            {"strong", null},
                            {"em", null},
                            {"blockquote", null},
                            {"b", null},
                            {"p", null},
                            {"ul", null},
                            {"ol", null},
                            {"li", null},
                            {"div", new[] {"align"}},
                            {"strike", null},
                            {"u", null},
                            {"sub", null},
                            {"sup", null},
                            {"table", null},
                            {"tr", null},
                            {"td", null},
                            {"th", null},
                            {"dd", null},
                            {"dt", null},
                            {"dl", null},
                            {"h1", null},
                            {"h2", null},
                            {"h3", null},
                        };
    }

    /// <summary>
    /// Sanitizes an HTML fragment against the whitelist.
    /// </summary>
    /// <param name="input">Untrusted HTML; may be null, empty or whitespace.</param>
    /// <returns>
    /// The sanitized markup, or <see cref="string.Empty"/> when
    /// <paramref name="input"/> is null, empty or only whitespace.
    /// </returns>
    public string Sanitize(string input)
    {
        // Also covers null, which previously threw a NullReferenceException.
        if (string.IsNullOrWhiteSpace(input))
        {
            return string.Empty;
        }

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);
        SanitizeNode(htmlDocument.DocumentNode);

        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim());
    }

    // Sanitizes every child of parentNode, last child first.
    private void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    // Applies the whitelist to one node and then recurses into its children.
    // Nothing is removed from the tree here: rejected elements are only renamed
    // to the placeholder, so child indexes stay stable during the traversal.
    private void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            // Resolve the whitelist entry once, BEFORE any rename. Looking it
            // up again inside the attribute loop after the element had been
            // renamed threw a KeyNotFoundException.
            string elementName = node.Name;
            string[] allowedAttributes;

            if (!_whitelist.TryGetValue(elementName, out allowedAttributes))
            {
                node.Name = RemovableNodeName;
            }
            else if (node.HasAttributes)
            {
                bool hasRejectedUrl = false;

                // Iterate backwards because attributes are removed while looping.
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];

                    if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                    {
                        node.Attributes.Remove(currentAttribute);
                        continue;
                    }

                    if (!HasAcceptableUrl(elementName, currentAttribute))
                    {
                        hasRejectedUrl = true;
                    }
                }

                if (hasRejectedUrl)
                {
                    node.Name = RemovableNodeName;
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    // Returns false when the attribute is a URL-carrying attribute (img src,
    // iframe src, a href) whose value does not match its pattern; returns true
    // for every other attribute.
    private static bool HasAcceptableUrl(string elementName, HtmlAttribute attribute)
    {
        string value = attribute.Value ?? string.Empty;

        if (attribute.Name == "src")
        {
            if (elementName == "img")
            {
                return _srcAttribute.IsMatch(value);
            }
            if (elementName == "iframe")
            {
                return _iframeSrc.IsMatch(value);
            }
        }

        if (attribute.Name == "href" && elementName == "a")
        {
            return _hrefAttribute.IsMatch(value);
        }

        return true;
    }

    // Re-parses the serialized markup and unwraps every placeholder element,
    // keeping its children in place.
    private static string StripHtml(string html)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // SelectNodes yields null rather than an empty collection when nothing
        // matches, so it must be checked before enumerating.
        HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes("//" + RemovableNodeName);
        if (invalidNodes != null)
        {
            foreach (HtmlNode node in invalidNodes)
            {
                node.ParentNode.RemoveChild(node, true);
            }
        }

        return htmlDoc.DocumentNode.WriteContentTo();
    }
}
2012-05-28T17:13:27.157 に回答