summaryrefslogtreecommitdiff
path: root/web-ui/app/js/lib/html-sanitizer.js
diff options
context:
space:
mode:
Diffstat (limited to 'web-ui/app/js/lib/html-sanitizer.js')
-rw-r--r--web-ui/app/js/lib/html-sanitizer.js1064
1 files changed, 1064 insertions, 0 deletions
diff --git a/web-ui/app/js/lib/html-sanitizer.js b/web-ui/app/js/lib/html-sanitizer.js
new file mode 100644
index 00000000..80fb0041
--- /dev/null
+++ b/web-ui/app/js/lib/html-sanitizer.js
@@ -0,0 +1,1064 @@
+// Copyright (C) 2006 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @fileoverview
+ * An HTML sanitizer that can satisfy a variety of security policies.
+ *
+ * <p>
+ * The HTML sanitizer is built around a SAX parser and HTML element and
+ * attributes schemas.
+ *
+ * If the cssparser is loaded, inline styles are sanitized using the
+ * css property and value schemas. Otherwise they are removed during
+ * sanitization.
+ *
+ * If it exists, uses parseCssDeclarations, sanitizeCssProperty, cssSchema
+ *
+ * @author mikesamuel@gmail.com
+ * @author jasvir@gmail.com
+ * \@requires html4, URI
+ * \@overrides window
+ * \@provides html, html_sanitize
+ */
+
+// Tag and attribute names are normalized with toLowerCase() throughout this
+// file. The Turkish i seems to be a non-issue, but abort loading in case the
+// host lower-cases 'I' to something other than 'i'.
+// A real Error (not a bare string) is thrown so the failure carries a stack.
+if ('I'.toLowerCase() !== 'i') { throw new Error('I/i problem'); }
+
+/**
+ * \@namespace
+ */
+define(['lib/html4-defs'], function (html4) {
+var html = (function(html4) {
+
+ // Optional CSS sanitizer hooks, read with quoted keys for the closure compiler; they stay undefined when there is no window or the CSS parser is not loaded.
+ var parseCssDeclarations, sanitizeCssProperty, cssSchema;
+ if ('undefined' !== typeof window) {
+ parseCssDeclarations = window['parseCssDeclarations']; // sanitizeAttribs checks this before sanitizing STYLE attributes
+ sanitizeCssProperty = window['sanitizeCssProperty'];
+ cssSchema = window['cssSchema']; // not read anywhere else in the visible code
+ }
+
+ // The keys of this object must be 'quoted' or JSCompiler will mangle them!
+ // This is a partial list -- lookupEntity() uses the host browser's parser
+ // (when available) to implement full entity lookup, and caches what it
+ // finds back into this table.
+ // Note that entities are in general case-sensitive; the uppercase ones are
+ // explicitly defined by HTML5 (presumably as compatibility).
+ var ENTITIES = {
+   'lt': '<',
+   'LT': '<',
+   'gt': '>',
+   'GT': '>',
+   'amp': '&',
+   'AMP': '&',
+   'quot': '"',
+   'apos': '\'',
+   // U+00A0 NO-BREAK SPACE. Written as a \u escape because the legacy octal
+   // form '\240' is a syntax error in strict-mode and ES-module code.
+   'nbsp': '\u00A0'
+ };
+
+ // Patterns for types of entity/character reference names.
+ var decimalEscapeRe = /^#(\d+)$/;
+ // Accepts both "#x" and "#X", matching what ENTITY_RE_1 / ENTITY_RE_2 let
+ // through; a lower-case-only pattern would leave "&#X41;" undecoded.
+ var hexEscapeRe = /^#[xX]([0-9A-Fa-f]+)$/;
+ // contains every entity per http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
+ // (The second class is A-Z, a-z, 0-9. A range written "A-z" would also
+ // admit the punctuation between 'Z' and 'a': [ \ ] ^ _ `.)
+ var safeEntityNameRe = /^[A-Za-z][A-Za-z0-9]+$/;
+ // Used as a hook to invoke the browser's entity parsing. <textarea> is used
+ // because its content is parsed for entities but not tags.
+ // TODO(kpreid): This retrieval is a kludge and leads to silent loss of
+ // functionality if the document isn't available.
+ var entityLookupElement =
+   ('undefined' !== typeof window && window['document'])
+     ? window['document'].createElement('textarea') : null;
+ /**
+  * Converts the numeric value of a character reference to a string.
+  *
+  * String.fromCharCode() keeps only the low 16 bits of its argument, so
+  * code points above U+FFFF are emitted as a surrogate pair instead. Values
+  * that are not a code point at all (above U+10FFFF, or not a number)
+  * become U+FFFD, as HTML5 specifies for out-of-range references.
+  *
+  * @param {number} codePoint the parsed numeric value.
+  * @return {string} the character for that code point.
+  */
+ function codePointToString(codePoint) {
+   if (!(codePoint >= 0 && codePoint <= 0x10FFFF)) { return '\uFFFD'; }
+   if (codePoint <= 0xFFFF) { return String.fromCharCode(codePoint); }
+   var offset = codePoint - 0x10000;
+   return String.fromCharCode(
+       0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
+ }
+
+ /**
+  * Decodes an HTML entity.
+  *
+  * Lookup order: the ENTITIES table, then decimal and hexadecimal numeric
+  * references, then (when a document is available) the browser's own entity
+  * parser, whose answer is cached in ENTITIES. Anything else is returned
+  * unchanged as '&name;'.
+  *
+  * {\@updoc
+  * $ lookupEntity('lt')
+  * # '<'
+  * $ lookupEntity('GT')
+  * # '>'
+  * $ lookupEntity('amp')
+  * # '&'
+  * $ lookupEntity('nbsp')
+  * # '\xA0'
+  * $ lookupEntity('apos')
+  * # "'"
+  * $ lookupEntity('quot')
+  * # '"'
+  * $ lookupEntity('#xa')
+  * # '\n'
+  * $ lookupEntity('#10')
+  * # '\n'
+  * $ lookupEntity('#x0a')
+  * # '\n'
+  * $ lookupEntity('#010')
+  * # '\n'
+  * $ lookupEntity('#x00A')
+  * # '\n'
+  * $ lookupEntity('#x1F600')
+  * # '\uD83D\uDE00'
+  * $ lookupEntity('#65596') // must not be truncated to '<'
+  * # '\uD800\uDC3C'
+  * $ lookupEntity('Pi') // Known failure
+  * # '\u03A0'
+  * $ lookupEntity('pi') // Known failure
+  * # '\u03C0'
+  * }
+  *
+  * @param {string} name the content between the '&' and the ';'.
+  * @return {string} a single unicode code-point as a string, or the
+  *     original '&name;' text when the name cannot be decoded.
+  */
+ function lookupEntity(name) {
+   // TODO: entity lookup as specified by HTML5 actually depends on the
+   // presence of the ";".
+   if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
+   var m = name.match(decimalEscapeRe);
+   if (m) {
+     return codePointToString(parseInt(m[1], 10));
+   } else if (!!(m = name.match(hexEscapeRe))) {
+     return codePointToString(parseInt(m[1], 16));
+   } else if (entityLookupElement && safeEntityNameRe.test(name)) {
+     // Let the browser decode it inside a <textarea>, then remember the
+     // answer so the DOM is only consulted once per name.
+     entityLookupElement.innerHTML = '&' + name + ';';
+     var text = entityLookupElement.textContent;
+     ENTITIES[name] = text;
+     return text;
+   } else {
+     return '&' + name + ';';
+   }
+ }
+
+ /**
+  * Replacement callback for String.prototype.replace: ignores the whole
+  * match and decodes the captured entity name.
+  * @param {string} _ the full matched text (unused).
+  * @param {string} name the captured entity name.
+  * @return {string} whatever lookupEntity returns for that name.
+  */
+ function decodeOneEntity(_, name) {
+   var decoded = lookupEntity(name);
+   return decoded;
+ }
+
+ // Matches every NUL (U+0000) character in a string.
+ var nulRe = /\0/g;
+ /**
+  * Removes all NUL characters from a string.
+  * @param {string} s the text to clean.
+  * @return {string} s without any U+0000 characters.
+  */
+ function stripNULs(s) {
+   var withoutNuls = s.replace(nulRe, '');
+   return withoutNuls;
+ }
+
+ // Global form: finds every complete "&name;" reference inside a string.
+ var ENTITY_RE_1 = /&(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/g;
+ // Anchored form: tests whether text begins with "name;" (the '&' itself
+ // has already been split off by the caller).
+ var ENTITY_RE_2 = /^(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/;
+ /**
+  * Returns the plain text of a chunk of HTML CDATA that may contain
+  * entities, with every complete entity decoded.
+  *
+  * {\@updoc
+  * $ unescapeEntities('')
+  * # ''
+  * $ unescapeEntities('hello World!')
+  * # 'hello World!'
+  * $ unescapeEntities('1 &lt; 2 &amp;&AMP; 4 &gt; 3&#10;')
+  * # '1 < 2 && 4 > 3\n'
+  * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
+  * # '<&lt <- unfinished entity>'
+  * $ unescapeEntities('/foo?bar=baz&copy=true') // & often unescaped in URLS
+  * # '/foo?bar=baz&copy=true'
+  * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
+  * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
+  * }
+  *
+  * @param {string} s a chunk of HTML CDATA. It must not start or end inside
+  *     an HTML entity.
+  * @return {string} s with each "&name;" replaced by lookupEntity(name).
+  */
+ function unescapeEntities(s) {
+   return s.replace(ENTITY_RE_1, function (wholeMatch, entityName) {
+     return lookupEntity(entityName);
+   });
+ }
+
+ // Shared patterns for the escaping helpers below.
+ var ampRe = /&/g;
+ var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
+ var ltRe = /[<]/g;
+ var gtRe = />/g;
+ var quotRe = /\"/g;
+
+ /**
+  * Escapes HTML special characters in attribute values: '&', '<', '>' and
+  * the double quote. The argument is converted to a string first.
+  *
+  * {\@updoc
+  * $ escapeAttrib('')
+  * # ''
+  * $ escapeAttrib('"<<&==&>>"') // Do not just escape the first occurrence.
+  * # '&#34;&lt;&lt;&amp;==&amp;&gt;&gt;&#34;'
+  * $ escapeAttrib('Hello <World>!')
+  * # 'Hello &lt;World&gt;!'
+  * }
+  *
+  * @param {*} s the raw attribute value.
+  * @return {string} the escaped value, safe inside a double-quoted attribute.
+  */
+ function escapeAttrib(s) {
+   var text = '' + s;
+   // '&' goes first so the entities produced below are not escaped again.
+   text = text.replace(ampRe, '&amp;');
+   text = text.replace(ltRe, '&lt;');
+   text = text.replace(gtRe, '&gt;');
+   return text.replace(quotRe, '&#34;');
+ }
+
+ /**
+  * Escapes the characters in RCDATA that can be escaped without changing
+  * its meaning: ampersands that do not start an entity, '<' and '>'.
+  * {\@updoc
+  * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
+  * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
+  * }
+  * @param {string} rcdata the raw RCDATA text.
+  * @return {string} the normalized text.
+  */
+ function normalizeRCData(rcdata) {
+   var escaped = rcdata.replace(looseAmpRe, '&amp;$1');
+   escaped = escaped.replace(ltRe, '&lt;');
+   return escaped.replace(gtRe, '&gt;');
+ }
+
+ // TODO(felix8a): validate sanitizer regexs against the HTML5 grammar at
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
+
+ // We initially split input so that potentially meaningful characters
+ // like '<' and '>' are separate tokens, using a fast dumb process that
+ // ignores quoting. Then we walk that token stream, and when we see a
+ // '<' that's the start of a tag, we use ATTR_RE to extract tag
+ // attributes from the next token. That token will never have a '>'
+ // character. However, it might have an unbalanced quote character, and
+ // when we see that, we combine additional tokens to balance the quote.
+
+ var ATTR_RE = new RegExp( // Matches one attribute (name plus optional value) at the start of a string.
+ '^\\s*' + // Leading whitespace
+ '([-.:\\w]+)' + // 1 = Attribute name
+ '(?:' + (
+ '\\s*(=)\\s*' + // 2 = Is there a value?
+ '(' + ( // 3 = Attribute value
+ // TODO(felix8a): maybe use backref to match quotes
+ '(\")[^\"]*(\"|$)' + // 4, 5 = Double-quoted string; 5 is empty when the closing quote is missing
+ '|' +
+ '(\')[^\']*(\'|$)' + // 6, 7 = Single-quoted string; 7 is empty when the closing quote is missing
+ '|' +
+ // Positive lookahead to prevent interpretation of
+ // <foo a= b=c> as <foo a='b=c'>
+ // TODO(felix8a): might be able to drop this case
+ '(?=[a-z][-\\w]*\\s*=)' +
+ '|' +
+ // Unquoted value that isn't an attribute name
+ // (since we didn't match the positive lookahead above)
+ '[^\"\'\\s]*' ) +
+ ')' ) +
+ ')?', // The whole "= value" part is optional
+ 'i'); // Case-insensitive
+
+ // Whether split() keeps captured separators in its result: false on IE<=8, true on most other browsers. htmlSplit uses a manual exec loop when this is false.
+ var splitWillCapture = ('a,b'.split(/(,)/).length === 3);
+
+ // bitmask for tags with special parsing, like <script> and <textarea>; parseText handles elements whose flags intersect it
+ var EFLAGS_TEXT = html4.eflags['CDATA'] | html4.eflags['RCDATA'];
+
+ /**
+ * Given a SAX-like event handler, produce a function that feeds those
+ * events and a parameter to the event handler.
+ *
+ * The event handler has the form:{@code
+ * {
+ * // Name is a lower-case HTML tag name. Attribs is an array of
+ * // alternating lower-case attribute names, and attribute values. Param is
+ * // the value passed to the saxParser. Every handler except startDoc and endDoc is also
+ * // passed two trailing arguments: a continuation marker and a function that resumes parsing.
+ * startTag: function (name, attribs, param) { ... },
+ * endTag: function (name, param) { ... },
+ * pcdata: function (text, param) { ... },
+ * rcdata: function (text, param) { ... },
+ * cdata: function (text, param) { ... },
+ * startDoc: function (param) { ... },
+ * endDoc: function (param) { ... }
+ * }} An optional comment: function (text, param) handler is accepted as well.
+ *
+ * @param {Object} handler a record containing event handlers.
+ * @return {function(string, Object)} A function that takes a chunk of HTML
+ * and a parameter. The parameter is passed on to the handler methods. The function itself returns nothing.
+ */
+ function makeSaxParser(handler) {
+ // Accept quoted or unquoted keys (Closure compat); the handlers are copied once so later changes to `handler` are not seen.
+ var hcopy = {
+ cdata: handler.cdata || handler['cdata'],
+ comment: handler.comment || handler['comment'],
+ endDoc: handler.endDoc || handler['endDoc'],
+ endTag: handler.endTag || handler['endTag'],
+ pcdata: handler.pcdata || handler['pcdata'],
+ rcdata: handler.rcdata || handler['rcdata'],
+ startDoc: handler.startDoc || handler['startDoc'],
+ startTag: handler.startTag || handler['startTag']
+ };
+ return function(htmlText, param) {
+ return parse(htmlText, hcopy, param);
+ };
+ }
+
+ // Parsing strategy is to split input into parts that might be lexically
+ // meaningful (every ">" becomes a separate part), and then recombine
+ // parts if we discover they're in a different context.
+
+ // TODO(felix8a): Significant performance regressions from -legacy,
+ // tested on
+ // Chrome 18.0
+ // Firefox 11.0
+ // IE 6, 7, 8, 9
+ // Opera 11.61
+ // Safari 5.1.3
+ // Many of these are unusual patterns that are linearly slower and still
+ // pretty fast (eg 1ms to 5ms), so not necessarily worth fixing.
+
+ // TODO(felix8a): "<script> && && && ... <\/script>" is slower on all
+ // browsers. The hotspot is htmlSplit.
+
+ // TODO(felix8a): "<p title='>>>>...'><\/p>" is slower on all browsers.
+ // This is partly htmlSplit, but the hotspot is parseTagAndAttrs.
+
+ // TODO(felix8a): "<a><\/a><a><\/a>..." is slower on IE9.
+ // "<a>1<\/a><a>1<\/a>..." is faster, "<a><\/a>2<a><\/a>2..." is faster.
+
+ // TODO(felix8a): "<p<p<p..." is slower on IE[6-8]
+
+ // Sentinel passed to every handler. parseCPS swallows this exact object
+ // when it is thrown and rethrows anything else.
+ var continuationMarker = {};
+ /**
+  * Splits htmlText into tokens and runs the parser over them from the start.
+  * @param {string} htmlText the HTML to parse.
+  * @param {Object} handler the record of event handlers built by
+  *     makeSaxParser.
+  * @param {*} param an opaque value passed through to every handler.
+  */
+ function parse(htmlText, handler, param) {
+   var parts = htmlSplit(htmlText);
+   // Flags that stop the parser from repeating a forward search that has
+   // already failed once.
+   var state = {
+     noMoreGT: false,
+     noMoreEndComments: false
+   };
+   parseCPS(handler, parts, 0, state, param);
+ }
+
+ /**
+  * Builds a zero-argument function that resumes parsing at a fixed token
+  * index.
+  * @param {Object} h the event handlers.
+  * @param {Array.<string>} parts the token list from htmlSplit.
+  * @param {number} initial index in parts at which parsing resumes.
+  * @param {Object} state the shared parser state flags.
+  * @param {*} param the opaque value passed through to the handlers.
+  * @return {function()} a function that calls parseCPS with these arguments.
+  */
+ function continuationMaker(h, parts, initial, state, param) {
+   var resume = function () {
+     parseCPS(h, parts, initial, state, param);
+   };
+   return resume;
+ }
+
+ /**
+  * Walks the token list produced by htmlSplit, starting at index `initial`,
+  * and reports each piece of markup or text to the matching handler.
+  *
+  * Every handler call (other than startDoc/endDoc) is given
+  * continuationMarker and a continuation that restarts this function just
+  * after the event being reported. If continuationMarker itself is thrown
+  * out of a handler, this function returns quietly; any other exception
+  * propagates.
+  *
+  * startDoc is only reported when starting from index 0; endDoc is reported
+  * whenever the loop runs to the end of the token list.
+  *
+  * @param {Object} h the event handlers (any of them may be missing).
+  * @param {Array.<string>} parts the token list from htmlSplit.
+  * @param {number} initial index in parts at which to start.
+  * @param {Object} state flags noMoreGT / noMoreEndComments, set once a
+  *     forward search fails so it is not repeated.
+  * @param {*} param opaque value passed through to every handler.
+  */
+ function parseCPS(h, parts, initial, state, param) {
+   try {
+     if (h.startDoc && initial === 0) { h.startDoc(param); }
+     var m, p, tagName;
+     for (var pos = initial, end = parts.length; pos < end;) {
+       var current = parts[pos++];
+       var next = parts[pos];
+       switch (current) {
+         case '&':
+           if (ENTITY_RE_2.test(next)) {
+             if (h.pcdata) {
+               // `next` is delivered as part of this event, so a resumed
+               // parse must start after it (pos + 1); resuming at pos
+               // would report the same text a second time.
+               h.pcdata('&' + next, param, continuationMarker,
+                 continuationMaker(h, parts, pos + 1, state, param));
+             }
+             pos++;
+           } else {
+             if (h.pcdata) { h.pcdata("&amp;", param, continuationMarker,
+               continuationMaker(h, parts, pos, state, param));
+             }
+           }
+           break;
+         case '<\/':
+           if ((m = /^([-\w:]+)[^\'\"]*/.exec(next))) {
+             if (m[0].length === next.length && parts[pos + 1] === '>') {
+               // fast case, no attribute parsing needed
+               pos += 2;
+               tagName = m[1].toLowerCase();
+               if (h.endTag) {
+                 h.endTag(tagName, param, continuationMarker,
+                   continuationMaker(h, parts, pos, state, param));
+               }
+             } else {
+               // slow case, need to parse attributes
+               // TODO(felix8a): do we really care about misparsing this?
+               pos = parseEndTag(
+                 parts, pos, h, param, continuationMarker, state);
+             }
+           } else {
+             if (h.pcdata) {
+               h.pcdata('&lt;/', param, continuationMarker,
+                 continuationMaker(h, parts, pos, state, param));
+             }
+           }
+           break;
+         case '<':
+           if ((m = /^([-\w:]+)\s*\/?/.exec(next))) {
+             if (m[0].length === next.length && parts[pos + 1] === '>') {
+               // fast case, no attribute parsing needed
+               pos += 2;
+               tagName = m[1].toLowerCase();
+               if (h.startTag) {
+                 // NOTE(review): a parse resumed through this continuation
+                 // starts at pos and so bypasses the parseText call below
+                 // for CDATA/RCDATA elements -- confirm that is intended.
+                 h.startTag(tagName, [], param, continuationMarker,
+                   continuationMaker(h, parts, pos, state, param));
+               }
+               // tags like <script> and <textarea> have special parsing
+               var eflags = html4.ELEMENTS[tagName];
+               if (eflags & EFLAGS_TEXT) {
+                 var tag = { name: tagName, next: pos, eflags: eflags };
+                 pos = parseText(
+                   parts, tag, h, param, continuationMarker, state);
+               }
+             } else {
+               // slow case, need to parse attributes
+               pos = parseStartTag(
+                 parts, pos, h, param, continuationMarker, state);
+             }
+           } else {
+             if (h.pcdata) {
+               h.pcdata('&lt;', param, continuationMarker,
+                 continuationMaker(h, parts, pos, state, param));
+             }
+           }
+           break;
+         case '<\!--':
+           // The pathological case is n copies of '<\!--' without '-->', and
+           // repeated failure to find '-->' is quadratic. We avoid that by
+           // remembering when search for '-->' fails.
+           if (!state.noMoreEndComments) {
+             // A comment <\!--x--> is split into three tokens:
+             //   '<\!--', 'x--', '>'
+             // We want to find the next '>' token that has a preceding '--'.
+             // pos is at the 'x--'.
+             for (p = pos + 1; p < end; p++) {
+               if (parts[p] === '>' && /--$/.test(parts[p - 1])) { break; }
+             }
+             if (p < end) {
+               if (h.comment) {
+                 var comment = parts.slice(pos, p).join('');
+                 // Drop the trailing '--' that belongs to the '-->'.
+                 h.comment(
+                   comment.substr(0, comment.length - 2), param,
+                   continuationMarker,
+                   continuationMaker(h, parts, p + 1, state, param));
+               }
+               pos = p + 1;
+             } else {
+               state.noMoreEndComments = true;
+             }
+           }
+           if (state.noMoreEndComments) {
+             if (h.pcdata) {
+               h.pcdata('&lt;!--', param, continuationMarker,
+                 continuationMaker(h, parts, pos, state, param));
+             }
+           }
+           break;
+         case '<\!':
+           if (!/^\w/.test(next)) {
+             if (h.pcdata) {
+               h.pcdata('&lt;!', param, continuationMarker,
+                 continuationMaker(h, parts, pos, state, param));
+             }
+           } else {
+             // similar to noMoreEndComments logic
+             if (!state.noMoreGT) {
+               for (p = pos + 1; p < end; p++) {
+                 if (parts[p] === '>') { break; }
+               }
+               if (p < end) {
+                 // Skip the whole declaration; no event is reported for it.
+                 pos = p + 1;
+               } else {
+                 state.noMoreGT = true;
+               }
+             }
+             if (state.noMoreGT) {
+               if (h.pcdata) {
+                 h.pcdata('&lt;!', param, continuationMarker,
+                   continuationMaker(h, parts, pos, state, param));
+               }
+             }
+           }
+           break;
+         case '<?':
+           // similar to noMoreEndComments logic
+           if (!state.noMoreGT) {
+             for (p = pos + 1; p < end; p++) {
+               if (parts[p] === '>') { break; }
+             }
+             if (p < end) {
+               // Skip the whole processing instruction; nothing is reported.
+               pos = p + 1;
+             } else {
+               state.noMoreGT = true;
+             }
+           }
+           if (state.noMoreGT) {
+             if (h.pcdata) {
+               h.pcdata('&lt;?', param, continuationMarker,
+                 continuationMaker(h, parts, pos, state, param));
+             }
+           }
+           break;
+         case '>':
+           if (h.pcdata) {
+             h.pcdata("&gt;", param, continuationMarker,
+               continuationMaker(h, parts, pos, state, param));
+           }
+           break;
+         case '':
+           break;
+         default:
+           if (h.pcdata) {
+             h.pcdata(current, param, continuationMarker,
+               continuationMaker(h, parts, pos, state, param));
+           }
+           break;
+       }
+     }
+     if (h.endDoc) { h.endDoc(param); }
+   } catch (e) {
+     // Only the continuation sentinel is swallowed.
+     if (e !== continuationMarker) { throw e; }
+   }
+ }
+
+ /**
+  * Split str into parts for the html parser. Markup delimiters
+  * ('<\/', '<\!--', '<!', '<?', '&', '<', '>') become their own entries,
+  * alternating with the (possibly empty) text between them.
+  * @param {*} str the input; it is converted to a string first.
+  * @return {Array.<string>} the alternating text / delimiter tokens.
+  */
+ function htmlSplit(str) {
+   // can't hoist this out of the function because of the re.exec loop.
+   var re = /(<\/|<\!--|<[!?]|[&<>])/g;
+   str += '';
+   if (splitWillCapture) { return str.split(re); }
+   // Fallback for engines whose split() drops captured separators.
+   var parts = [];
+   var lastPos = 0;
+   for (var m = re.exec(str); m !== null; m = re.exec(str)) {
+     parts.push(str.substring(lastPos, m.index), m[0]);
+     lastPos = m.index + m[0].length;
+   }
+   parts.push(str.substring(lastPos));
+   return parts;
+ }
+
+ /**
+  * Slow path for an end tag that carries more than a bare name: parses the
+  * tag, reports it through h.endTag, and returns where parsing continues.
+  * @param {Array.<string>} parts the token list from htmlSplit.
+  * @param {number} pos index of the token that starts with the tag name.
+  * @param {Object} h the event handlers.
+  * @param {*} param opaque value passed through to the handler.
+  * @param {Object} continuationMarker sentinel handed to the handler.
+  * @param {Object} state the shared parser state flags.
+  * @return {number} index of the next token to parse; parts.length when the
+  *     tag is never closed.
+  */
+ function parseEndTag(parts, pos, h, param, continuationMarker, state) {
+   var tag = parseTagAndAttrs(parts, pos);
+   // drop unclosed tags
+   if (!tag) { return parts.length; }
+   if (h.endTag) {
+     // Resume after the tag (tag.next), as parseStartTag does. Resuming at
+     // pos would re-read the tag's own name as plain text.
+     h.endTag(tag.name, param, continuationMarker,
+       continuationMaker(h, parts, tag.next, state, param));
+   }
+   return tag.next;
+ }
+
+ /**
+  * Slow path for a start tag that may carry attributes: parses the tag,
+  * reports it through h.startTag, and returns where parsing continues.
+  * @param {Array.<string>} parts the token list from htmlSplit.
+  * @param {number} pos index of the token that starts with the tag name.
+  * @param {Object} h the event handlers.
+  * @param {*} param opaque value passed through to the handler.
+  * @param {Object} continuationMarker sentinel handed to the handler.
+  * @param {Object} state the shared parser state flags.
+  * @return {number} index of the next token to parse; parts.length when the
+  *     tag is never closed.
+  */
+ function parseStartTag(parts, pos, h, param, continuationMarker, state) {
+   var tag = parseTagAndAttrs(parts, pos);
+   if (!tag) {
+     // drop unclosed tags
+     return parts.length;
+   }
+   if (h.startTag) {
+     var resume = continuationMaker(h, parts, tag.next, state, param);
+     h.startTag(tag.name, tag.attrs, param, continuationMarker, resume);
+   }
+   // tags like <script> and <textarea> have special parsing
+   var hasTextContent = tag.eflags & EFLAGS_TEXT;
+   return hasTextContent
+     ? parseText(parts, tag, h, param, continuationMarker, state)
+     : tag.next;
+ }
+
+ // Cache of end-tag matchers, keyed by lower-case tag name.
+ var endTagRe = {};
+
+ /**
+  * Tags like <script> and <textarea> are flagged as CDATA or RCDATA,
+  * which means everything is text until we see the correct closing tag.
+  * Collects that text and reports it through h.cdata or h.rcdata.
+  * @param {Array.<string>} parts the token list from htmlSplit.
+  * @param {Object} tag record with name, eflags and next (index of the
+  *     first token after the start tag).
+  * @param {Object} h the event handlers.
+  * @param {*} param opaque value passed through to the handler.
+  * @param {Object} continuationMarker sentinel handed to the handler.
+  * @param {Object} state the shared parser state flags.
+  * @return {number} index of the '<\/' token of the matching end tag, or
+  *     parts.length when there is none.
+  */
+ function parseText(parts, tag, h, param, continuationMarker, state) {
+   var end = parts.length;
+   var name = tag.name;
+   if (!endTagRe.hasOwnProperty(name)) {
+     endTagRe[name] = new RegExp('^' + name + '(?:[\\s\\/]|$)', 'i');
+   }
+   var closer = endTagRe[name];
+   var first = tag.next;
+   var p = first + 1;
+   // Advance until parts[p] names this element right after a '<\/' token.
+   while (p < end && !(parts[p - 1] === '<\/' && closer.test(parts[p]))) {
+     p++;
+   }
+   // Step back onto the '<\/' so the main loop sees the end tag itself.
+   if (p < end) { p -= 1; }
+   var buf = parts.slice(first, p).join('');
+   if (tag.eflags & html4.eflags['CDATA']) {
+     if (h.cdata) {
+       h.cdata(buf, param, continuationMarker,
+         continuationMaker(h, parts, p, state, param));
+     }
+     return p;
+   }
+   if (!(tag.eflags & html4.eflags['RCDATA'])) {
+     throw new Error('bug');
+   }
+   if (h.rcdata) {
+     h.rcdata(normalizeRCData(buf), param, continuationMarker,
+       continuationMaker(h, parts, p, state, param));
+   }
+   return p;
+ }
+
+ // Parses the tag whose name starts parts[pos]; at this point, parts[pos-1] is either "<" or "<\/". Returns {name, eflags, attrs, next}, or undefined when no closing '>' token is found.
+ function parseTagAndAttrs(parts, pos) {
+ var m = /^([-\w:]+)/.exec(parts[pos]); // parseCPS only reaches here after matching a tag name at the start of parts[pos]
+ var tag = {};
+ tag.name = m[1].toLowerCase();
+ tag.eflags = html4.ELEMENTS[tag.name];
+ var buf = parts[pos].substr(m[0].length); // text after the tag name: the raw attribute source
+ // Find the next '>'. We optimistically assume this '>' is not in a
+ // quoted context, and further down we fix things up if it turns out to
+ // be quoted.
+ var p = pos + 1;
+ var end = parts.length;
+ for (; p < end; p++) {
+ if (parts[p] === '>') { break; }
+ buf += parts[p];
+ }
+ if (end <= p) { return void 0; } // no '>' anywhere: unclosed tag
+ var attrs = []; // alternating lower-case name, decoded value
+ while (buf !== '') {
+ m = ATTR_RE.exec(buf);
+ if (!m) {
+ // No attribute found: skip garbage (always consumes at least one character, so the loop makes progress)
+ buf = buf.replace(/^[\s\S][^a-z\s]*/, '');
+
+ } else if ((m[4] && !m[5]) || (m[6] && !m[7])) {
+ // Unterminated quote: slurp to the next unquoted '>'
+ var quote = m[4] || m[6];
+ var sawQuote = false;
+ var abuf = [buf, parts[p++]]; // parts[p] is the '>' that turned out to be inside the quote
+ for (; p < end; p++) {
+ if (sawQuote) {
+ if (parts[p] === '>') { break; }
+ } else if (0 <= parts[p].indexOf(quote)) {
+ sawQuote = true;
+ }
+ abuf.push(parts[p]);
+ }
+ // Slurp failed: lose the garbage
+ if (end <= p) { break; }
+ // Otherwise retry attribute parsing
+ buf = abuf.join('');
+ continue;
+
+ } else {
+ // We have an attribute
+ var aName = m[1].toLowerCase();
+ var aValue = m[2] ? decodeValue(m[3]) : ''; // an attribute without '=' gets the empty string
+ attrs.push(aName, aValue);
+ buf = buf.substr(m[0].length);
+ }
+ }
+ tag.attrs = attrs;
+ tag.next = p + 1; // index just past the closing '>'
+ return tag;
+ }
+
+ /**
+  * Turns the raw attribute value matched by ATTR_RE into its text: removes
+  * one pair of surrounding quotes if the value starts with a quote, strips
+  * NULs, then decodes entities.
+  * @param {string} v the raw value, possibly still quoted.
+  * @return {string} the decoded attribute value.
+  */
+ function decodeValue(v) {
+   var firstCode = v.charCodeAt(0);
+   var isQuoted = (firstCode === 0x22 || firstCode === 0x27); // " or '
+   var inner = isQuoted ? v.substr(1, v.length - 2) : v;
+   return unescapeEntities(stripNULs(inner));
+ }
+
+ /**
+ * Returns a function that strips unsafe tags and attributes from html.
+ * @param {function(string, Array.<string>): ?Array.<string>} tagPolicy
+ * A function that takes (tagName, attribs[]), where tagName is a key in
+ * html4.ELEMENTS and attribs is an array of alternating attribute names
+ * and values. It should return a record (as follows), or null to delete
+ * the element. It's okay for tagPolicy to modify the attribs array,
+ * but the same array is reused, so it should not be held between calls.
+ * Record keys:
+ * attribs: (required) Sanitized attributes array.
+ * tagName: Replacement tag name.
+ * @return {function(string, Array)} A function that sanitizes a string of
+ * HTML and appends result strings to the second argument, an array.
+ */
+ function makeHtmlSanitizer(tagPolicy) {
+ var stack; // open elements as {orig, rep} records; reset by startDoc
+ var ignoring; // true while inside an element the policy rejected; suppresses output
+ var emit = function (text, out) {
+ if (!ignoring) { out.push(text); }
+ };
+ return makeSaxParser({
+ 'startDoc': function(_) {
+ stack = [];
+ ignoring = false;
+ },
+ 'startTag': function(tagNameOrig, attribs, out) {
+ if (ignoring) { return; }
+ if (!html4.ELEMENTS.hasOwnProperty(tagNameOrig)) { return; } // unknown element: drop the tag only
+ var eflagsOrig = html4.ELEMENTS[tagNameOrig];
+ if (eflagsOrig & html4.eflags['FOLDABLE']) {
+ return; // FOLDABLE element: drop the tag only
+ }
+
+ var decision = tagPolicy(tagNameOrig, attribs);
+ if (!decision) {
+ ignoring = !(eflagsOrig & html4.eflags['EMPTY']); // an EMPTY element has no content to skip
+ return;
+ } else if (typeof decision !== 'object') {
+ throw new Error('tagPolicy did not return object (old API?)');
+ }
+ if ('attribs' in decision) {
+ attribs = decision['attribs'];
+ } else {
+ throw new Error('tagPolicy gave no attribs');
+ }
+ var eflagsRep;
+ var tagNameRep;
+ if ('tagName' in decision) {
+ tagNameRep = decision['tagName'];
+ eflagsRep = html4.ELEMENTS[tagNameRep];
+ } else {
+ tagNameRep = tagNameOrig;
+ eflagsRep = eflagsOrig;
+ }
+ // TODO(mikesamuel): relying on tagPolicy not to insert unsafe
+ // attribute names.
+
+ // If this is an optional-end-tag element and either this element or its
+ // previous like sibling was rewritten, then insert a close tag to
+ // preserve structure.
+ if (eflagsOrig & html4.eflags['OPTIONAL_ENDTAG']) {
+ var onStack = stack[stack.length - 1];
+ if (onStack && onStack.orig === tagNameOrig &&
+ (onStack.rep !== tagNameRep || tagNameOrig !== tagNameRep)) {
+ out.push('<\/', onStack.rep, '>');
+ }
+ }
+
+ if (!(eflagsOrig & html4.eflags['EMPTY'])) {
+ stack.push({orig: tagNameOrig, rep: tagNameRep});
+ }
+
+ out.push('<', tagNameRep);
+ for (var i = 0, n = attribs.length; i < n; i += 2) {
+ var attribName = attribs[i],
+ value = attribs[i + 1];
+ if (value !== null && value !== void 0) { // null or undefined means: omit this attribute
+ out.push(' ', attribName, '="', escapeAttrib(value), '"');
+ }
+ }
+ out.push('>');
+
+ if ((eflagsOrig & html4.eflags['EMPTY'])
+ && !(eflagsRep & html4.eflags['EMPTY'])) {
+ // replacement is non-empty, synthesize end tag
+ out.push('<\/', tagNameRep, '>');
+ }
+ },
+ 'endTag': function(tagName, out) {
+ if (ignoring) {
+ ignoring = false; // any end tag ends the ignored region, whatever its name
+ return;
+ }
+ if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
+ var eflags = html4.ELEMENTS[tagName];
+ if (!(eflags & (html4.eflags['EMPTY'] | html4.eflags['FOLDABLE']))) {
+ var index; // position in stack of the element being closed
+ if (eflags & html4.eflags['OPTIONAL_ENDTAG']) {
+ for (index = stack.length; --index >= 0;) {
+ var stackElOrigTag = stack[index].orig;
+ if (stackElOrigTag === tagName) { break; }
+ if (!(html4.ELEMENTS[stackElOrigTag] &
+ html4.eflags['OPTIONAL_ENDTAG'])) {
+ // Don't pop non optional end tags looking for a match.
+ return;
+ }
+ }
+ } else {
+ for (index = stack.length; --index >= 0;) {
+ if (stack[index].orig === tagName) { break; }
+ }
+ }
+ if (index < 0) { return; } // Not opened.
+ for (var i = stack.length; --i > index;) { // close the elements opened after the match, innermost first
+ var stackElRepTag = stack[i].rep;
+ if (!(html4.ELEMENTS[stackElRepTag] &
+ html4.eflags['OPTIONAL_ENDTAG'])) {
+ out.push('<\/', stackElRepTag, '>');
+ }
+ }
+ if (index < stack.length) {
+ tagName = stack[index].rep; // emit the replacement name that was used when the element opened
+ }
+ stack.length = index; // pop the matched element and everything above it
+ out.push('<\/', tagName, '>');
+ }
+ },
+ 'pcdata': emit,
+ 'rcdata': emit,
+ 'cdata': emit,
+ 'endDoc': function(out) {
+ for (; stack.length; stack.length--) { // close every element that is still open
+ out.push('<\/', stack[stack.length - 1].rep, '>');
+ }
+ }
+ });
+ }
+
+ // URI schemes that may be passed on to the rewriter (case-insensitive).
+ var ALLOWED_URI_SCHEMES = /^(?:https?|mailto)$/i;
+
+ /**
+  * Runs a URI through the caller's rewriter, but only when it is
+  * scheme-less or uses an allowed scheme.
+  * @param {*} uri the raw URI; it is converted to a string before parsing.
+  * @param {*} effect passed through to the rewriter.
+  * @param {*} ltype passed through to the rewriter.
+  * @param {Object} hints passed through to the rewriter.
+  * @param {?function} naiveUriRewriter called as
+  *     (parsedUri, effect, ltype, hints); a falsy result rejects the URI.
+  * @return {?string} the rewritten URI as a string, or null when there is no
+  *     rewriter, the URI is rejected, or anything throws along the way.
+  */
+ function safeUri(uri, effect, ltype, hints, naiveUriRewriter) {
+   if (!naiveUriRewriter) { return null; }
+   try {
+     var parsed = URI.parse('' + uri);
+     if (!parsed) { return null; }
+     if (parsed.hasScheme() &&
+         !ALLOWED_URI_SCHEMES.test(parsed.getScheme())) {
+       return null;
+     }
+     var rewritten = naiveUriRewriter(parsed, effect, ltype, hints);
+     if (!rewritten) { return null; }
+     return rewritten.toString();
+   } catch (e) {
+     return null;
+   }
+ }
+
+ /**
+  * Reports a sanitizer decision to a logger callback.
+  *
+  * With no attribName, the tag itself is reported as removed. Separately,
+  * when oldValue and newValue differ, the attribute is reported as
+  * "removed", "added" or "changed".
+  *
+  * @param {function(string, Object)} logger receives a message and a
+  *     details record.
+  * @param {string} tagName the element concerned.
+  * @param {?string} attribName the attribute concerned, if any.
+  * @param {*} oldValue the attribute value before sanitizing.
+  * @param {*} newValue the attribute value after sanitizing.
+  */
+ function log(logger, tagName, attribName, oldValue, newValue) {
+   if (!attribName) {
+     logger(tagName + " removed", {
+       change: "removed",
+       tagName: tagName
+     });
+   }
+   if (oldValue === newValue) { return; }
+   var changed;
+   if (oldValue && !newValue) {
+     changed = "removed";
+   } else if (!oldValue && newValue) {
+     changed = "added";
+   } else {
+     changed = "changed";
+   }
+   var details = {
+     change: changed,
+     tagName: tagName,
+     attribName: attribName,
+     oldValue: oldValue,
+     newValue: newValue
+   };
+   logger(tagName + "." + attribName + " " + changed, details);
+ }
+
+ /**
+  * Looks an attribute up in a schema map keyed by 'tag::attr', falling back
+  * to the wildcard key '*::attr'. Only the map's own properties count.
+  * @param {Object} map the schema map to search.
+  * @param {string} tagName the element name.
+  * @param {string} attribName the attribute name.
+  * @return {*} the stored value, or undefined when neither key is present.
+  */
+ function lookupAttribute(map, tagName, attribName) {
+   var candidates = [tagName + '::' + attribName, '*::' + attribName];
+   for (var i = 0; i < candidates.length; i++) {
+     var key = candidates[i];
+     if (map.hasOwnProperty(key)) { return map[key]; }
+   }
+   return void 0;
+ }
+ /** Returns the html4.ATTRIBS entry for tag::attr (or *::attr), if any. */
+ function getAttributeType(tagName, attribName) {
+   var schema = html4.ATTRIBS;
+   return lookupAttribute(schema, tagName, attribName);
+ }
+ /** Returns the html4.LOADERTYPES entry for tag::attr (or *::attr), if any. */
+ function getLoaderType(tagName, attribName) {
+   var schema = html4.LOADERTYPES;
+   return lookupAttribute(schema, tagName, attribName);
+ }
+ /** Returns the html4.URIEFFECTS entry for tag::attr (or *::attr), if any. */
+ function getUriEffect(tagName, attribName) {
+   var schema = html4.URIEFFECTS;
+   return lookupAttribute(schema, tagName, attribName);
+ }
+
+ /**
+ * Sanitizes attributes on an HTML tag. The attribs array is modified in place and also returned.
+ * @param {string} tagName An HTML tag name in lowercase.
+ * @param {Array.<?string>} attribs An array of alternating names and values.
+ * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
+ * apply to URI attributes; it can return a new string value, or null to
+ * delete the attribute. If unspecified, URI attributes are deleted.
+ * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
+ * to attributes containing HTML names, element IDs, and space-separated
+ * lists of classes; it can return a new string value, or null to delete
+ * the attribute. If unspecified, these attributes are kept unchanged.
+ * @return {Array.<?string>} The sanitized attributes as a list of alternating
+ * names and values, where a null value means to omit the attribute. If opt_logger (a fifth argument) is given, it is passed to log() for every attribute except those of type NONE.
+ */
+ function sanitizeAttribs(tagName, attribs,
+ opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
+ // TODO(felix8a): it's obnoxious that domado duplicates much of this
+ // TODO(felix8a): maybe consistently enforce constraints like target=
+ for (var i = 0; i < attribs.length; i += 2) {
+ var attribName = attribs[i];
+ var value = attribs[i + 1];
+ var oldValue = value;
+ var atype = null, attribKey;
+ if ((attribKey = tagName + '::' + attribName, // try the tag-specific key first, then the '*' wildcard key
+ html4.ATTRIBS.hasOwnProperty(attribKey)) ||
+ (attribKey = '*::' + attribName,
+ html4.ATTRIBS.hasOwnProperty(attribKey))) {
+ atype = html4.ATTRIBS[attribKey];
+ }
+ if (atype !== null) {
+ switch (atype) {
+ case html4.atype['NONE']: break; // kept unchanged
+ case html4.atype['SCRIPT']:
+ value = null; // script attributes are always dropped
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ case html4.atype['STYLE']:
+ if ('undefined' === typeof parseCssDeclarations) { // no CSS parser loaded: drop the style attribute
+ value = null;
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ }
+ var sanitizedDeclarations = [];
+ parseCssDeclarations(
+ value,
+ {
+ 'declaration': function (property, tokens) {
+ var normProp = property.toLowerCase();
+ sanitizeCssProperty(
+ normProp, tokens,
+ opt_naiveUriRewriter
+ ? function (url) {
+ return safeUri(
+ url, html4.ueffects.SAME_DOCUMENT,
+ html4.ltypes.SANDBOXED,
+ {
+ "TYPE": "CSS",
+ "CSS_PROP": normProp
+ }, opt_naiveUriRewriter);
+ }
+ : null);
+ if (tokens.length) { // presumably sanitizeCssProperty empties tokens for a rejected property -- TODO confirm
+ sanitizedDeclarations.push(
+ normProp + ': ' + tokens.join(' '));
+ }
+ }
+ });
+ value = sanitizedDeclarations.length > 0 ?
+ sanitizedDeclarations.join(' ; ') : null;
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ case html4.atype['ID']:
+ case html4.atype['IDREF']:
+ case html4.atype['IDREFS']:
+ case html4.atype['GLOBAL_NAME']:
+ case html4.atype['LOCAL_NAME']:
+ case html4.atype['CLASSES']:
+ value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ case html4.atype['URI']:
+ value = safeUri(value,
+ getUriEffect(tagName, attribName),
+ getLoaderType(tagName, attribName),
+ {
+ "TYPE": "MARKUP",
+ "XML_ATTR": attribName,
+ "XML_TAG": tagName
+ }, opt_naiveUriRewriter);
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ case html4.atype['URI_FRAGMENT']:
+ if (value && '#' === value.charAt(0)) {
+ value = value.substring(1); // remove the leading '#'
+ value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
+ if (value !== null && value !== void 0) {
+ value = '#' + value; // restore the leading '#'
+ }
+ } else {
+ value = null; // anything that is not a '#fragment' is dropped
+ }
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ default:
+ value = null; // unrecognised attribute type: drop
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ break;
+ }
+ } else {
+ value = null; // attribute not listed in html4.ATTRIBS for this tag: drop
+ if (opt_logger) {
+ log(opt_logger, tagName, attribName, oldValue, value);
+ }
+ }
+ attribs[i + 1] = value;
+ }
+ return attribs;
+ }
+
+ /**
+  * Creates a tag policy that omits all tags marked UNSAFE in html4-defs.js
+  * and applies the default attribute sanitizer with the supplied policy for
+  * URI attributes and NMTOKEN attributes.
+  * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
+  *     apply to URI attributes. If not given, URI attributes are deleted.
+  * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
+  *     to attributes containing HTML names, element IDs, and space-separated
+  *     lists of classes. If not given, such attributes are left unchanged.
+  * @param {function(string, Object)=} opt_logger If given, receives a report
+  *     for each removed tag and is passed on to the attribute sanitizer.
+  * @return {function(string, Array.<?string>)} A tagPolicy suitable for
+  *     passing to html.sanitize. It returns {'attribs': ...} for an accepted
+  *     tag and undefined for an UNSAFE one.
+  */
+ function makeTagPolicy(
+     opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
+   return function(tagName, attribs) {
+     var isUnsafe = html4.ELEMENTS[tagName] & html4.eflags['UNSAFE'];
+     if (isUnsafe) {
+       if (opt_logger) {
+         log(opt_logger, tagName, undefined, undefined, undefined);
+       }
+       return;
+     }
+     var sanitized = sanitizeAttribs(tagName, attribs,
+         opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
+     return { 'attribs': sanitized };
+   };
+ }
+
+ /**
+  * Sanitizes HTML tags and attributes according to a given policy.
+  * @param {string} inputHtml The HTML to sanitize.
+  * @param {function(string, Array.<?string>)} tagPolicy A function that
+  *     decides which tags to accept and sanitizes their attributes (see
+  *     makeHtmlSanitizer above for details).
+  * @return {string} The sanitized HTML.
+  */
+ function sanitizeWithPolicy(inputHtml, tagPolicy) {
+   var sanitizer = makeHtmlSanitizer(tagPolicy);
+   var chunks = [];
+   // The sanitizer appends its output fragments to `chunks`.
+   sanitizer(inputHtml, chunks);
+   return chunks.join('');
+ }
+
+ /**
+  * Strips unsafe tags and attributes from HTML.
+  * @param {string} inputHtml The HTML to sanitize.
+  * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
+  *     apply to URI attributes. If not given, URI attributes are deleted.
+  * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
+  *     to attributes containing HTML names, element IDs, and space-separated
+  *     lists of classes. If not given, such attributes are left unchanged.
+  * @param {function(string, Object)=} opt_logger Passed through to
+  *     makeTagPolicy.
+  * @return {string} The sanitized HTML.
+  */
+ function sanitize(inputHtml,
+     opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
+   return sanitizeWithPolicy(
+       inputHtml,
+       makeTagPolicy(opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger));
+ }
+
+ // Export both quoted and unquoted names for Closure linkage. Each assignment below stores the same function under one property name, written in both forms.
+ var html = {}; // local to the enclosing function; this object is its return value
+ html.escapeAttrib = html['escapeAttrib'] = escapeAttrib;
+ html.makeHtmlSanitizer = html['makeHtmlSanitizer'] = makeHtmlSanitizer;
+ html.makeSaxParser = html['makeSaxParser'] = makeSaxParser;
+ html.makeTagPolicy = html['makeTagPolicy'] = makeTagPolicy;
+ html.normalizeRCData = html['normalizeRCData'] = normalizeRCData;
+ html.sanitize = html['sanitize'] = sanitize;
+ html.sanitizeAttribs = html['sanitizeAttribs'] = sanitizeAttribs;
+ html.sanitizeWithPolicy = html['sanitizeWithPolicy'] = sanitizeWithPolicy;
+ html.unescapeEntities = html['unescapeEntities'] = unescapeEntities;
+ return html;
+})(html4);
+
+var html_sanitize = html['sanitize'];
+
+return {
+ html: html
+};
+});