// Copyright (C) 2006 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /** * @fileoverview * An HTML sanitizer that can satisfy a variety of security policies. * *

* The HTML sanitizer is built around a SAX parser and HTML element and * attributes schemas. * * If the cssparser is loaded, inline styles are sanitized using the * css property and value schemas. Else they are remove during * sanitization. * * If it exists, uses parseCssDeclarations, sanitizeCssProperty, cssSchema * * @author mikesamuel@gmail.com * @author jasvir@gmail.com * \@requires html4, URI * \@overrides window * \@provides html, html_sanitize */ // The Turkish i seems to be a non-issue, but abort in case it is. if ('I'.toLowerCase() !== 'i') { throw 'I/i problem'; } /** * \@namespace */ define(['lib/html4-defs'], function (html4) { var html = (function(html4) { // For closure compiler var parseCssDeclarations, sanitizeCssProperty, cssSchema; if ('undefined' !== typeof window) { parseCssDeclarations = window['parseCssDeclarations']; sanitizeCssProperty = window['sanitizeCssProperty']; cssSchema = window['cssSchema']; } // The keys of this object must be 'quoted' or JSCompiler will mangle them! // This is a partial list -- lookupEntity() uses the host browser's parser // (when available) to implement full entity lookup. // Note that entities are in general case-sensitive; the uppercase ones are // explicitly defined by HTML5 (presumably as compatibility). var ENTITIES = { 'lt': '<', 'LT': '<', 'gt': '>', 'GT': '>', 'amp': '&', 'AMP': '&', 'quot': '"', 'apos': '\'', 'nbsp': '\240' }; // Patterns for types of entity/character reference names. var decimalEscapeRe = /^#(\d+)$/; var hexEscapeRe = /^#x([0-9A-Fa-f]+)$/; // contains every entity per http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html var safeEntityNameRe = /^[A-Za-z][A-za-z0-9]+$/; // Used as a hook to invoke the browser's entity parsing. is used // because its content is parsed for entities but not tags. // TODO(kpreid): This retrieval is a kludge and leads to silent loss of // functionality if the document isn't available. 
var entityLookupElement =
    ('undefined' !== typeof window && window['document'])
    ? window['document'].createElement('textarea') : null;

/**
 * Converts a numeric character reference value to a string.
 *
 * String.fromCharCode truncates its argument to 16 bits, so code points
 * above U+FFFF must be encoded as a surrogate pair by hand (this file
 * targets engines without String.fromCodePoint).
 *
 * @param {number} codePoint the parsed number; may be NaN or out of range.
 * @return {string} the character, or U+FFFD when codePoint is not a valid
 *     Unicode code point.
 */
function codePointToString(codePoint) {
  if (!(codePoint >= 0 && codePoint <= 0x10FFFF)) { return '\uFFFD'; }
  if (codePoint <= 0xFFFF) { return String.fromCharCode(codePoint); }
  var offset = codePoint - 0x10000;
  return String.fromCharCode(
      0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
}

/**
 * Decodes an HTML entity.
 *
 * {\@updoc
 * $ lookupEntity('lt')
 * # '<'
 * $ lookupEntity('GT')
 * # '>'
 * $ lookupEntity('amp')
 * # '&'
 * $ lookupEntity('nbsp')
 * # '\xA0'
 * $ lookupEntity('apos')
 * # "'"
 * $ lookupEntity('quot')
 * # '"'
 * $ lookupEntity('#xa')
 * # '\n'
 * $ lookupEntity('#10')
 * # '\n'
 * $ lookupEntity('#x0a')
 * # '\n'
 * $ lookupEntity('#010')
 * # '\n'
 * $ lookupEntity('#x00A')
 * # '\n'
 * $ lookupEntity('Pi')      // Known failure
 * # '\u03A0'
 * $ lookupEntity('pi')      // Known failure
 * # '\u03C0'
 * }
 *
 * @param {string} name the content between the '&' and the ';'.
 * @return {string} the decoded text, or '&' + name + ';' unchanged when the
 *     name is not recognized.
 */
function lookupEntity(name) {
  // TODO: entity lookup as specified by HTML5 actually depends on the
  // presence of the ";".
  if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
  var m = name.match(decimalEscapeRe);
  if (m) {
    return codePointToString(parseInt(m[1], 10));
  } else if (!!(m = name.match(hexEscapeRe))) {
    return codePointToString(parseInt(m[1], 16));
  } else if (entityLookupElement && safeEntityNameRe.test(name)) {
    // Let the browser decode the named entity, then cache the answer.
    entityLookupElement.innerHTML = '&' + name + ';';
    var text = entityLookupElement.textContent;
    ENTITIES[name] = text;
    return text;
  } else {
    return '&' + name + ';';
  }
}

/** String.replace callback: decodes the entity captured in group 1. */
function decodeOneEntity(_, name) {
  return lookupEntity(name);
}

var nulRe = /\0/g;
/** Removes every NUL character from s. */
function stripNULs(s) {
  return s.replace(nulRe, '');
}

var ENTITY_RE_1 = /&(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/g;
var ENTITY_RE_2 = /^(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/;

/**
 * The plain text of a chunk of HTML CDATA which possibly contains entities.
 *
 * {\@updoc
 * $ unescapeEntities('')
 * # ''
 * $ unescapeEntities('hello World!')
 * # 'hello World!'
 * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
 * # '1 < 2 && 4 > 3\n'
 * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
 * # '<&lt <- unfinished entity>'
 * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
 * # '/foo?bar=baz&copy=true'
 * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
 * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
 * }
 *
 * @param {string} s a chunk of HTML CDATA.  It must not start or end inside
 *     an HTML entity.
 * @return {string} s with every ';'-terminated entity decoded.
 */
function unescapeEntities(s) {
  return s.replace(ENTITY_RE_1, decodeOneEntity);
}

var ampRe = /&/g;
var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
var ltRe = /[<]/g;
var gtRe = />/g;
var quotRe = /\"/g;

/**
 * Escapes HTML special characters in attribute values.
 *
 * {\@updoc
 * $ escapeAttrib('')
 * # ''
 * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
 * # '&#34;&lt;&lt;&amp;==&amp;&gt;&gt;&#34;'
 * $ escapeAttrib('Hello <World>!')
 * # 'Hello &lt;World&gt;!'
 * }
 *
 * @param {*} s the value to escape; it is coerced to a string.
 * @return {string} text that is safe inside a double-quoted attribute.
 */
function escapeAttrib(s) {
  // '&' must be replaced first so the entities added below are not
  // escaped a second time.
  return ('' + s).replace(ampRe, '&amp;').replace(ltRe, '&lt;')
      .replace(gtRe, '&gt;').replace(quotRe, '&#34;');
}

/**
 * Escape entities in RCDATA that can be escaped without changing the meaning.
 * {\@updoc
 * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
 * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
 * }
 *
 * @param {string} rcdata raw RCDATA text.
 * @return {string} the text with loose '&', '<' and '>' escaped; existing
 *     well-formed entities are left alone.
 */
function normalizeRCData(rcdata) {
  return rcdata
      .replace(looseAmpRe, '&amp;$1')
      .replace(ltRe, '&lt;')
      .replace(gtRe, '&gt;');
}

// TODO(felix8a): validate sanitizer regexs against the HTML5 grammar at
// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

// We initially split input so that potentially meaningful characters
// like '<' and '>' are separate tokens, using a fast dumb process that
// ignores quoting.
// Then we walk that token stream, and when we see a
// '<' that's the start of a tag, we use ATTR_RE to extract tag
// attributes from the next token.  That token will never have a '>'
// character.  However, it might have an unbalanced quote character, and
// when we see that, we combine additional tokens to balance the quote.

var ATTR_RE = new RegExp(
  '^\\s*' +
  '([-.:\\w]+)' +             // 1 = Attribute name
  '(?:' + (
    '\\s*(=)\\s*' +           // 2 = Is there a value?
    '(' + (                   // 3 = Attribute value
      // TODO(felix8a): maybe use backref to match quotes
      '(\")[^\"]*(\"|$)' +    // 4, 5 = Double-quoted string
      '|' +
      '(\')[^\']*(\'|$)' +    // 6, 7 = Single-quoted string
      '|' +
      // Positive lookahead to prevent interpretation of
      // <foo a= b=c> as <foo a='b=c'>
      // TODO(felix8a): might be able to drop this case
      '(?=[a-z][-\\w]*\\s*=)' +
      '|' +
      // Unquoted value that isn't an attribute name
      // (since we didn't match the positive lookahead above)
      '[^\"\'\\s]*' ) +
    ')' ) +
  ')?',
  'i');

// false on IE<=8, true on most other browsers
var splitWillCapture = ('a,b'.split(/(,)/).length === 3);

// bitmask for tags with special parsing, like <script> and <textarea>
var EFLAGS_TEXT = html4.eflags['CDATA'] | html4.eflags['RCDATA'];

/**
 * Given a SAX-like event handler, produce a function that feeds those
 * events and a parameter to the event handler.
 *
 * The event handler has the form:{@code
 * {
 *   // Name is a lower-case HTML tag name.  Attribs is an array of
 *   // alternating lower-case attribute names, and attribute values.
 *   // Param is the value passed to the saxParser.
 *   startTag: function (name, attribs, param) { ... },
 *   endTag:   function (name, param) { ... },
 *   pcdata:   function (text, param) { ... },
 *   rcdata:   function (text, param) { ... },
 *   cdata:    function (text, param) { ... },
 *   comment:  function (text, param) { ... },
 *   startDoc: function (param) { ... },
 *   endDoc:   function (param) { ... }
 * }}
 *
 * Every handler is optional.  All handlers except startDoc and endDoc
 * receive two further trailing arguments from the parser: a continuation
 * marker object and a function that resumes parsing after the current
 * event.
 *
 * @param {Object} handler a record containing event handlers.
 * @return {function(string, Object)} A function that takes a chunk of HTML
 *     and a parameter.  The parameter is passed on to the handler methods.
 */
function makeSaxParser(handler) {
  // Accept quoted or unquoted keys (Closure compat)
  var hcopy = {
    cdata: handler.cdata || handler['cdata'],
    comment: handler.comment || handler['comment'],
    endDoc: handler.endDoc || handler['endDoc'],
    endTag: handler.endTag || handler['endTag'],
    pcdata: handler.pcdata || handler['pcdata'],
    rcdata: handler.rcdata || handler['rcdata'],
    startDoc: handler.startDoc || handler['startDoc'],
    startTag: handler.startTag || handler['startTag']
  };
  return function(htmlText, param) {
    return parse(htmlText, hcopy, param);
  };
}

// Parsing strategy is to split input into parts that might be lexically
// meaningful (every ">" becomes a separate part), and then recombine
// parts if we discover they're in a different context.

// TODO(felix8a): Significant performance regressions from -legacy,
// tested on
//    Chrome 18.0
//    Firefox 11.0
//    IE 6, 7, 8, 9
//    Opera 11.61
//    Safari 5.1.3
// Many of these are unusual patterns that are linearly slower and still
// pretty fast (eg 1ms to 5ms), so not necessarily worth fixing.

// TODO(felix8a): "<script> && && && ... <\/script>" is slower on all
// browsers.  The hotspot is htmlSplit.

// TODO(felix8a): "<p title='>>>>...'><\/p>" is slower on all browsers.
// This is partly htmlSplit, but the hotspot is parseTagAndAttrs.

// TODO(felix8a): "<a><\/a><a><\/a>..." is slower on IE9.
// "<a>1<\/a><a>1<\/a>..." is faster, "<a><\/a>2<a><\/a>2..." is faster.

// TODO(felix8a): "<p<p<p..."
// is slower on IE[6-8]

// Sentinel that a handler may throw to abandon the current parse; parseCPS
// swallows it and rethrows everything else.
var continuationMarker = {};

/**
 * Splits htmlText into lexical parts and feeds them to the handler.
 *
 * @param {string} htmlText the markup to parse.
 * @param {Object} handler the normalized SAX handler record.
 * @param {*} param opaque value forwarded to every handler call.
 */
function parse(htmlText, handler, param) {
  var parts = htmlSplit(htmlText);
  var state = {
    noMoreGT: false,
    noMoreEndComments: false
  };
  parseCPS(handler, parts, 0, state, param);
}

/**
 * Builds a zero-argument function that resumes parsing at parts[initial].
 */
function continuationMaker(h, parts, initial, state, param) {
  return function () {
    parseCPS(h, parts, initial, state, param);
  };
}

/**
 * Walks the part list from index `initial`, dispatching SAX events.
 *
 * Text that turns out not to be markup ('&', '<', '>', '</', '<!', '<?',
 * '<!--') is reported through pcdata in escaped form, so pcdata handlers
 * always receive valid HTML text.
 *
 * @param {Object} h handler record; every handler is optional.
 * @param {Array.<string>} parts output of htmlSplit.
 * @param {number} initial index of the first part to process; startDoc is
 *     only fired when this is 0.
 * @param {Object} state flags remembering failed look-aheads
 *     (noMoreGT, noMoreEndComments) so they are not repeated.
 * @param {*} param opaque value forwarded to every handler call.
 */
function parseCPS(h, parts, initial, state, param) {
  try {
    if (h.startDoc && initial === 0) { h.startDoc(param); }
    var m, p, tagName;
    for (var pos = initial, end = parts.length; pos < end;) {
      var current = parts[pos++];
      var next = parts[pos];
      switch (current) {
      case '&':
        if (ENTITY_RE_2.test(next)) {
          // The entity text lives in `next`, so consume it before building
          // the continuation; otherwise resuming would emit it twice.
          pos++;
          if (h.pcdata) {
            h.pcdata('&' + next, param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        } else {
          if (h.pcdata) {
            h.pcdata('&amp;', param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        }
        break;
      case '<\/':
        if ((m = /^([-\w:]+)[^\'\"]*/.exec(next))) {
          if (m[0].length === next.length && parts[pos + 1] === '>') {
            // fast case, no attribute parsing needed
            pos += 2;
            tagName = m[1].toLowerCase();
            if (h.endTag) {
              h.endTag(tagName, param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          } else {
            // slow case, need to parse attributes
            // TODO(felix8a): do we really care about misparsing this?
            pos = parseEndTag(
              parts, pos, h, param, continuationMarker, state);
          }
        } else {
          if (h.pcdata) {
            h.pcdata('&lt;/', param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        }
        break;
      case '<':
        if ((m = /^([-\w:]+)\s*\/?/.exec(next))) {
          if (m[0].length === next.length && parts[pos + 1] === '>') {
            // fast case, no attribute parsing needed
            pos += 2;
            tagName = m[1].toLowerCase();
            if (h.startTag) {
              h.startTag(tagName, [], param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
            // tags like <script> and <textarea> have special parsing
            var eflags = html4.ELEMENTS[tagName];
            if (eflags & EFLAGS_TEXT) {
              var tag = { name: tagName, next: pos, eflags: eflags };
              pos = parseText(
                parts, tag, h, param, continuationMarker, state);
            }
          } else {
            // slow case, need to parse attributes
            pos = parseStartTag(
              parts, pos, h, param, continuationMarker, state);
          }
        } else {
          if (h.pcdata) {
            h.pcdata('&lt;', param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        }
        break;
      case '<\!--':
        // The pathological case is n copies of '<\!--' without '-->', and
        // repeated failure to find '-->' is quadratic.  We avoid that by
        // remembering when search for '-->' fails.
        if (!state.noMoreEndComments) {
          // A comment <\!--x--> is split into three tokens:
          //   '<\!--', 'x--', '>'
          // We want to find the next '>' token that has a preceding '--'.
          // pos is at the 'x--'.
          for (p = pos + 1; p < end; p++) {
            if (parts[p] === '>' && /--$/.test(parts[p - 1])) { break; }
          }
          if (p < end) {
            if (h.comment) {
              var comment = parts.slice(pos, p).join('');
              h.comment(
                comment.substr(0, comment.length - 2), param,
                continuationMarker,
                continuationMaker(h, parts, p + 1, state, param));
            }
            pos = p + 1;
          } else {
            state.noMoreEndComments = true;
          }
        }
        if (state.noMoreEndComments) {
          if (h.pcdata) {
            h.pcdata('&lt;!--', param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        }
        break;
      case '<\!':
        if (!/^\w/.test(next)) {
          if (h.pcdata) {
            h.pcdata('&lt;!', param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        } else {
          // similar to noMoreEndComment logic
          if (!state.noMoreGT) {
            for (p = pos + 1; p < end; p++) {
              if (parts[p] === '>') { break; }
            }
            if (p < end) {
              pos = p + 1;
            } else {
              state.noMoreGT = true;
            }
          }
          if (state.noMoreGT) {
            if (h.pcdata) {
              h.pcdata('&lt;!', param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          }
        }
        break;
      case '<?':
        // similar to noMoreEndComment logic
        if (!state.noMoreGT) {
          for (p = pos + 1; p < end; p++) {
            if (parts[p] === '>') { break; }
          }
          if (p < end) {
            pos = p + 1;
          } else {
            state.noMoreGT = true;
          }
        }
        if (state.noMoreGT) {
          if (h.pcdata) {
            h.pcdata('&lt;?', param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
        }
        break;
      case '>':
        if (h.pcdata) {
          h.pcdata('&gt;', param, continuationMarker,
            continuationMaker(h, parts, pos, state, param));
        }
        break;
      case '':
        break;
      default:
        if (h.pcdata) {
          h.pcdata(current, param, continuationMarker,
            continuationMaker(h, parts, pos, state, param));
        }
        break;
      }
    }
    if (h.endDoc) { h.endDoc(param); }
  } catch (e) {
    // A handler throws continuationMarker to stop this pass on purpose.
    if (e !== continuationMarker) { throw e; }
  }
}

/**
 * Split str into parts for the html parser.
 *
 * @param {*} str the markup; coerced to a string.
 * @return {Array.<string>} alternating text and delimiter parts, where the
 *     delimiters are '</', '<!--', '<!', '<?', '&', '<' and '>'.
 */
function htmlSplit(str) {
  // can't hoist this out of the function because of the re.exec loop.
  var re = /(<\/|<\!--|<[!?]|[&<>])/g;
  str += '';
  if (splitWillCapture) {
    return str.split(re);
  } else {
    // Emulate a capturing split for engines that drop captured groups.
    var parts = [];
    var lastPos = 0;
    var m;
    while ((m = re.exec(str)) !== null) {
      parts.push(str.substring(lastPos, m.index));
      parts.push(m[0]);
      lastPos = m.index + m[0].length;
    }
    parts.push(str.substring(lastPos));
    return parts;
  }
}

/**
 * Handles an end tag that needs full attribute-style parsing.
 *
 * @return {number} index of the first part after the tag, or parts.length
 *     when the tag is never closed.
 */
function parseEndTag(parts, pos, h, param, continuationMarker, state) {
  var tag = parseTagAndAttrs(parts, pos);
  // drop unclosed tags
  if (!tag) { return parts.length; }
  if (h.endTag) {
    // Resume after the whole tag, matching parseStartTag and the fast path.
    h.endTag(tag.name, param, continuationMarker,
      continuationMaker(h, parts, tag.next, state, param));
  }
  return tag.next;
}

/**
 * Handles a start tag that has attributes (or other trailing content).
 *
 * @return {number} index of the first part after the tag and, for CDATA or
 *     RCDATA elements, after its text content; parts.length when the tag
 *     is never closed.
 */
function parseStartTag(parts, pos, h, param, continuationMarker, state) {
  var tag = parseTagAndAttrs(parts, pos);
  // drop unclosed tags
  if (!tag) { return parts.length; }
  if (h.startTag) {
    h.startTag(tag.name, tag.attrs, param, continuationMarker,
      continuationMaker(h, parts, tag.next, state, param));
  }
  // tags like <script> and <textarea> have special parsing
  if (tag.eflags & EFLAGS_TEXT) {
    return parseText(parts, tag, h, param, continuationMarker, state);
  } else {
    return tag.next;
  }
}

// Cache of per-tag-name regexes that recognize the matching close tag.
var endTagRe = {};

// Tags like <script> and <textarea> are flagged as CDATA or RCDATA,
// which means everything is text until we see the correct closing tag.
/**
 * Consumes the text content of a CDATA/RCDATA element and reports it.
 *
 * @param {Array.<string>} parts the split input.
 * @param {Object} tag record with name, eflags and next (index of the first
 *     part after the start tag).
 * @return {number} index of the '</' part of the matching close tag, or
 *     parts.length when there is none.
 */
function parseText(parts, tag, h, param, continuationMarker, state) {
  var end = parts.length;
  if (!endTagRe.hasOwnProperty(tag.name)) {
    endTagRe[tag.name] = new RegExp('^' + tag.name + '(?:[\\s\\/]|$)', 'i');
  }
  var re = endTagRe[tag.name];
  var first = tag.next;
  var p = tag.next + 1;
  for (; p < end; p++) {
    if (parts[p - 1] === '<\/' && re.test(parts[p])) { break; }
  }
  // Step back so p points at the '</' that starts the close tag.
  if (p < end) { p -= 1; }
  var buf = parts.slice(first, p).join('');
  if (tag.eflags & html4.eflags['CDATA']) {
    if (h.cdata) {
      h.cdata(buf, param, continuationMarker,
        continuationMaker(h, parts, p, state, param));
    }
  } else if (tag.eflags & html4.eflags['RCDATA']) {
    if (h.rcdata) {
      h.rcdata(normalizeRCData(buf), param, continuationMarker,
        continuationMaker(h, parts, p, state, param));
    }
  } else {
    throw new Error('bug');
  }
  return p;
}

/**
 * Parses a tag name and its attributes starting at parts[pos].
 * At this point, parts[pos-1] is either "<" or "<\/", and the caller has
 * already checked that parts[pos] starts with a tag name.
 *
 * @return {Object|undefined} a record {name, eflags, attrs, next}, or
 *     undefined when no closing '>' exists.
 */
function parseTagAndAttrs(parts, pos) {
  var m = /^([-\w:]+)/.exec(parts[pos]);
  var tag = {};
  tag.name = m[1].toLowerCase();
  tag.eflags = html4.ELEMENTS[tag.name];
  var buf = parts[pos].substr(m[0].length);
  // Find the next '>'.  We optimistically assume this '>' is not in a
  // quoted context, and further down we fix things up if it turns out to
  // be quoted.
  var p = pos + 1;
  var end = parts.length;
  for (; p < end; p++) {
    if (parts[p] === '>') { break; }
    buf += parts[p];
  }
  if (end <= p) { return void 0; }
  var attrs = [];
  while (buf !== '') {
    m = ATTR_RE.exec(buf);
    if (!m) {
      // No attribute found: skip garbage
      buf = buf.replace(/^[\s\S][^a-z\s]*/, '');

    } else if ((m[4] && !m[5]) || (m[6] && !m[7])) {
      // Unterminated quote: slurp to the next unquoted '>'
      var quote = m[4] || m[6];
      var sawQuote = false;
      var abuf = [buf, parts[p++]];
      for (; p < end; p++) {
        if (sawQuote) {
          if (parts[p] === '>') { break; }
        } else if (0 <= parts[p].indexOf(quote)) {
          sawQuote = true;
        }
        abuf.push(parts[p]);
      }
      // Slurp failed: lose the garbage
      if (end <= p) { break; }
      // Otherwise retry attribute parsing
      buf = abuf.join('');
      continue;

    } else {
      // We have an attribute
      var aName = m[1].toLowerCase();
      var aValue = m[2] ? decodeValue(m[3]) : '';
      attrs.push(aName, aValue);
      buf = buf.substr(m[0].length);
    }
  }
  tag.attrs = attrs;
  tag.next = p + 1;
  return tag;
}

/**
 * Strips the surrounding quotes (if any) and NULs from a raw attribute
 * value and decodes its entities.
 */
function decodeValue(v) {
  var q = v.charCodeAt(0);
  if (q === 0x22 || q === 0x27) { // " or '
    v = v.substr(1, v.length - 2);
  }
  return unescapeEntities(stripNULs(v));
}

/**
 * Returns a function that strips unsafe tags and attributes from html.
 * @param {function(string, Array.<string>): ?Array.<string>} tagPolicy
 *     A function that takes (tagName, attribs[]), where tagName is a key in
 *     html4.ELEMENTS and attribs is an array of alternating attribute names
 *     and values.  It should return a record (as follows), or null to delete
 *     the element.  It's okay for tagPolicy to modify the attribs array,
 *     but the same array is reused, so it should not be held between calls.
 *     Record keys:
 *        attribs: (required) Sanitized attributes array.
 *        tagName: Replacement tag name.
 * @return {function(string, Array)} A function that sanitizes a string of
 *     HTML and appends result strings to the second argument, an array.
 */
function makeHtmlSanitizer(tagPolicy) {
  var stack;
  var ignoring;
  var emit = function (text, out) {
    if (!ignoring) { out.push(text); }
  };
  return makeSaxParser({
    'startDoc': function(_) {
      stack = [];
      ignoring = false;
    },
    'startTag': function(tagNameOrig, attribs, out) {
      if (ignoring) { return; }
      if (!html4.ELEMENTS.hasOwnProperty(tagNameOrig)) { return; }
      var eflagsOrig = html4.ELEMENTS[tagNameOrig];
      if (eflagsOrig & html4.eflags['FOLDABLE']) {
        return;
      }

      var decision = tagPolicy(tagNameOrig, attribs);
      if (!decision) {
        // Rejected element: drop its content too, unless it has none.
        ignoring = !(eflagsOrig & html4.eflags['EMPTY']);
        return;
      } else if (typeof decision !== 'object') {
        throw new Error('tagPolicy did not return object (old API?)');
      }
      if ('attribs' in decision) {
        attribs = decision['attribs'];
      } else {
        throw new Error('tagPolicy gave no attribs');
      }
      var eflagsRep;
      var tagNameRep;
      if ('tagName' in decision) {
        tagNameRep = decision['tagName'];
        eflagsRep = html4.ELEMENTS[tagNameRep];
      } else {
        tagNameRep = tagNameOrig;
        eflagsRep = eflagsOrig;
      }
      // TODO(mikesamuel): relying on tagPolicy not to insert unsafe
      // attribute names.

      // If this is an optional-end-tag element and either this element or its
      // previous like sibling was rewritten, then insert a close tag to
      // preserve structure.
      if (eflagsOrig & html4.eflags['OPTIONAL_ENDTAG']) {
        var onStack = stack[stack.length - 1];
        if (onStack && onStack.orig === tagNameOrig &&
            (onStack.rep !== tagNameRep || tagNameOrig !== tagNameRep)) {
          out.push('<\/', onStack.rep, '>');
        }
      }

      if (!(eflagsOrig & html4.eflags['EMPTY'])) {
        stack.push({orig: tagNameOrig, rep: tagNameRep});
      }

      out.push('<', tagNameRep);
      for (var i = 0, n = attribs.length; i < n; i += 2) {
        var attribName = attribs[i],
            value = attribs[i + 1];
        if (value !== null && value !== void 0) {
          out.push(' ', attribName, '="', escapeAttrib(value), '"');
        }
      }
      out.push('>');

      if ((eflagsOrig & html4.eflags['EMPTY'])
          && !(eflagsRep & html4.eflags['EMPTY'])) {
        // replacement is non-empty, synthesize end tag
        out.push('<\/', tagNameRep, '>');
      }
    },
    'endTag': function(tagName, out) {
      if (ignoring) {
        ignoring = false;
        return;
      }
      if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
      var eflags = html4.ELEMENTS[tagName];
      if (!(eflags & (html4.eflags['EMPTY'] | html4.eflags['FOLDABLE']))) {
        var index;
        if (eflags & html4.eflags['OPTIONAL_ENDTAG']) {
          for (index = stack.length; --index >= 0;) {
            var stackElOrigTag = stack[index].orig;
            if (stackElOrigTag === tagName) { break; }
            if (!(html4.ELEMENTS[stackElOrigTag] &
                  html4.eflags['OPTIONAL_ENDTAG'])) {
              // Don't pop non optional end tags looking for a match.
              return;
            }
          }
        } else {
          for (index = stack.length; --index >= 0;) {
            if (stack[index].orig === tagName) { break; }
          }
        }
        if (index < 0) { return; }  // Not opened.
        // Close every element opened after the match that needs a close tag.
        for (var i = stack.length; --i > index;) {
          var stackElRepTag = stack[i].rep;
          if (!(html4.ELEMENTS[stackElRepTag] &
                html4.eflags['OPTIONAL_ENDTAG'])) {
            out.push('<\/', stackElRepTag, '>');
          }
        }
        if (index < stack.length) {
          tagName = stack[index].rep;
        }
        stack.length = index;
        out.push('<\/', tagName, '>');
      }
    },
    'pcdata': emit,
    'rcdata': emit,
    'cdata': emit,
    'endDoc': function(out) {
      for (; stack.length; stack.length--) {
        out.push('<\/', stack[stack.length - 1].rep, '>');
      }
    }
  });
}

var ALLOWED_URI_SCHEMES = /^(?:https?|mailto)$/i;

/**
 * Passes a URI through naiveUriRewriter if its scheme is absent or allowed.
 *
 * @return {?string} the rewritten URI, or null when there is no rewriter,
 *     the scheme is not allowed, the rewriter rejects the URI, or anything
 *     throws while parsing or rewriting.
 */
function safeUri(uri, effect, ltype, hints, naiveUriRewriter) {
  if (!naiveUriRewriter) { return null; }
  try {
    var parsed = URI.parse('' + uri);
    if (parsed) {
      if (!parsed.hasScheme() ||
          ALLOWED_URI_SCHEMES.test(parsed.getScheme())) {
        var safe = naiveUriRewriter(parsed, effect, ltype, hints);
        return safe ? safe.toString() : null;
      }
    }
  } catch (e) {
    // Deliberate: a URI that cannot be parsed or rewritten is dropped.
    return null;
  }
  return null;
}

/**
 * Reports a sanitizer decision to logger.
 *
 * With no attribName the call records the removal of the whole tag;
 * otherwise it records an attribute change, and only when the value
 * actually changed.
 *
 * @param {function(string, Object)} logger receives a message and a record.
 * @param {string} tagName
 * @param {string=} attribName
 * @param {?string=} oldValue
 * @param {?string=} newValue
 */
function log(logger, tagName, attribName, oldValue, newValue) {
  if (!attribName) {
    logger(tagName + " removed", {
      change: "removed",
      tagName: tagName
    });
    // Nothing attribute-specific to report for a removed tag.
    return;
  }
  if (oldValue !== newValue) {
    var changed = "changed";
    if (oldValue && !newValue) {
      changed = "removed";
    } else if (!oldValue && newValue) {
      changed = "added";
    }
    logger(tagName + "." + attribName + " " + changed, {
      change: changed,
      tagName: tagName,
      attribName: attribName,
      oldValue: oldValue,
      newValue: newValue
    });
  }
}

/**
 * Looks up 'tagName::attribName' in map, falling back to '*::attribName'.
 * @return {*} the mapped value, or undefined when neither key exists.
 */
function lookupAttribute(map, tagName, attribName) {
  var attribKey;
  attribKey = tagName + '::' + attribName;
  if (map.hasOwnProperty(attribKey)) {
    return map[attribKey];
  }
  attribKey = '*::' + attribName;
  if (map.hasOwnProperty(attribKey)) {
    return map[attribKey];
  }
  return void 0;
}
function getAttributeType(tagName, attribName) {
  return lookupAttribute(html4.ATTRIBS, tagName, attribName);
}
function getLoaderType(tagName, attribName) {
  return lookupAttribute(html4.LOADERTYPES, tagName, attribName);
}
function getUriEffect(tagName, attribName) {
  return lookupAttribute(html4.URIEFFECTS, tagName, attribName);
}

/**
 * Sanitizes attributes on an HTML tag.
 * @param {string} tagName An HTML tag name in lowercase.
 * @param {Array.<?string>} attribs An array of alternating names and values.
 * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
 *     apply to URI attributes; it can return a new string value, or null to
 *     delete the attribute.  If unspecified, URI attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes; it can return a new string value, or null to delete
 *     the attribute.  If unspecified, these attributes are kept unchanged.
 * @param {function(string, Object)=} opt_logger If given, called with a
 *     message and a change record whenever an attribute value is altered.
 * @return {Array.<?string>} The sanitized attributes as a list of alternating
 *     names and values, where a null value means to omit the attribute.
 *     This is the same array as attribs, modified in place.
 */
function sanitizeAttribs(tagName, attribs,
  opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  // TODO(felix8a): it's obnoxious that domado duplicates much of this
  // TODO(felix8a): maybe consistently enforce constraints like target=
  for (var i = 0; i < attribs.length; i += 2) {
    var attribName = attribs[i];
    var value = attribs[i + 1];
    var oldValue = value;
    var atype = getAttributeType(tagName, attribName);
    if (atype !== null && atype !== void 0) {
      switch (atype) {
        case html4.atype['NONE']: break;
        case html4.atype['SCRIPT']:
          value = null;
          if (opt_logger) {
            log(opt_logger, tagName, attribName, oldValue, value);
          }
          break;
        case html4.atype['STYLE']:
          if ('undefined' === typeof parseCssDeclarations) {
            // No CSS parser loaded: inline styles cannot be vetted.
            value = null;
            if (opt_logger) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
          }
          var sanitizedDeclarations = [];
          parseCssDeclarations(
              value,
              {
                'declaration': function (property, tokens) {
                  var normProp = property.toLowerCase();
                  sanitizeCssProperty(
                      normProp, tokens,
                      opt_naiveUriRewriter
                      ? function (url) {
                          return safeUri(
                              url, html4.ueffects.SAME_DOCUMENT,
                              html4.ltypes.SANDBOXED,
                              {
                                "TYPE": "CSS",
                                "CSS_PROP": normProp
                              }, opt_naiveUriRewriter);
                        }
                      : null);
                  if (tokens.length) {
                    sanitizedDeclarations.push(
                        normProp + ': ' + tokens.join(' '));
                  }
                }
              });
          value = sanitizedDeclarations.length > 0 ?
            sanitizedDeclarations.join(' ; ') : null;
          if (opt_logger) {
            log(opt_logger, tagName, attribName, oldValue, value);
          }
          break;
        case html4.atype['ID']:
        case html4.atype['IDREF']:
        case html4.atype['IDREFS']:
        case html4.atype['GLOBAL_NAME']:
        case html4.atype['LOCAL_NAME']:
        case html4.atype['CLASSES']:
          value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
          if (opt_logger) {
            log(opt_logger, tagName, attribName, oldValue, value);
          }
          break;
        case html4.atype['URI']:
          value = safeUri(value,
            getUriEffect(tagName, attribName),
            getLoaderType(tagName, attribName),
            {
              "TYPE": "MARKUP",
              "XML_ATTR": attribName,
              "XML_TAG": tagName
            }, opt_naiveUriRewriter);
          if (opt_logger) {
            log(opt_logger, tagName, attribName, oldValue, value);
          }
          break;
        case html4.atype['URI_FRAGMENT']:
          if (value && '#' === value.charAt(0)) {
            value = value.substring(1);  // remove the leading '#'
            value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
            if (value !== null && value !== void 0) {
              value = '#' + value;  // restore the leading '#'
            }
          } else {
            value = null;
          }
          if (opt_logger) {
            log(opt_logger, tagName, attribName, oldValue, value);
          }
          break;
        default:
          value = null;
          if (opt_logger) {
            log(opt_logger, tagName, attribName, oldValue, value);
          }
          break;
      }
    } else {
      // Attribute is not in the schema for this tag: drop it.
      value = null;
      if (opt_logger) {
        log(opt_logger, tagName, attribName, oldValue, value);
      }
    }
    attribs[i + 1] = value;
  }
  return attribs;
}

/**
 * Creates a tag policy that omits all tags marked UNSAFE in html4-defs.js
 * and applies the default attribute sanitizer with the supplied policy for
 * URI attributes and NMTOKEN attributes.
 * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
 *     apply to URI attributes.  If not given, URI attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes.  If not given, such attributes are left unchanged.
 * @param {function(string, Object)=} opt_logger If given, notified of every
 *     removed tag and altered attribute.
 * @return {function(string, Array.<?string>)} A tagPolicy suitable for
 *     passing to html.sanitize.
 */
function makeTagPolicy(
  opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  return function(tagName, attribs) {
    if (!(html4.ELEMENTS[tagName] & html4.eflags['UNSAFE'])) {
      return {
        'attribs': sanitizeAttribs(tagName, attribs,
          opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger)
      };
    } else {
      // Unsafe tag: returning nothing tells the sanitizer to delete it.
      if (opt_logger) {
        log(opt_logger, tagName, undefined, undefined, undefined);
      }
    }
  };
}

/**
 * Sanitizes HTML tags and attributes according to a given policy.
 * @param {string} inputHtml The HTML to sanitize.
 * @param {function(string, Array.<?string>)} tagPolicy A function that
 *     decides which tags to accept and sanitizes their attributes (see
 *     makeHtmlSanitizer above for details).
 * @return {string} The sanitized HTML.
 */
function sanitizeWithPolicy(inputHtml, tagPolicy) {
  var outputArray = [];
  makeHtmlSanitizer(tagPolicy)(inputHtml, outputArray);
  return outputArray.join('');
}

/**
 * Strips unsafe tags and attributes from HTML.
 * @param {string} inputHtml The HTML to sanitize.
 * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
 *     apply to URI attributes.  If not given, URI attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes.  If not given, such attributes are left unchanged.
 * @param {function(string, Object)=} opt_logger If given, notified of every
 *     removed tag and altered attribute.
 * @return {string} The sanitized HTML.
 */
function sanitize(inputHtml,
  opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  var tagPolicy = makeTagPolicy(
    opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
  return sanitizeWithPolicy(inputHtml, tagPolicy);
}

// Export both quoted and unquoted names for Closure linkage.
var html = {}; html.escapeAttrib = html['escapeAttrib'] = escapeAttrib; html.makeHtmlSanitizer = html['makeHtmlSanitizer'] = makeHtmlSanitizer; html.makeSaxParser = html['makeSaxParser'] = makeSaxParser; html.makeTagPolicy = html['makeTagPolicy'] = makeTagPolicy; html.normalizeRCData = html['normalizeRCData'] = normalizeRCData; html.sanitize = html['sanitize'] = sanitize; html.sanitizeAttribs = html['sanitizeAttribs'] = sanitizeAttribs; html.sanitizeWithPolicy = html['sanitizeWithPolicy'] = sanitizeWithPolicy; html.unescapeEntities = html['unescapeEntities'] = unescapeEntities; return html; })(html4); var html_sanitize = html['sanitize']; return { html: html }; });