Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -N -r1.67.2.20 -r1.67.2.21 --- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 12 Jan 2017 13:10:20 -0000 1.67.2.20 +++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 12 Jan 2017 20:00:23 -0000 1.67.2.21 @@ -877,8 +877,8 @@ -unallowed_tags -unallowed_attributes -unallowed_protocols - -remove_js:boolean - -remove_outer_urls:boolean + -no_js:boolean + -no_outer_urls:boolean -validate:boolean } { @@ -906,11 +906,11 @@ @param unallowed_protocols list of protocols we don't allow in the markup. - @param remove_js this flag decides whether every script tag, + @param no_js this flag decides whether every script tag, inline event handlers and the javascript: pseudo-protocol should be stripped from the markup. - @param remove_outer_urls this flag tells the proc to remove + @param no_outer_urls this flag tells the proc to remove every reference to external addresses. Proc will try to distinguish between external URLs and fine fully specified internal ones. Acceptable URLs will be transformed in absolute @@ -990,7 +990,7 @@ # TODO: consider default unallowed stuff to come from a parameter - if {$remove_js_p} { + if {$no_js_p} { lappend unallowed_tags "script" lappend unallowed_attributes {*}{ onafterprint onbeforeprint onbeforeunload onerror @@ -1042,11 +1042,12 @@ set driver_prot [dict get $driver_info proto] set driver_host [dict get $driver_info hostname] set driver_port [dict get $driver_info port] - set ports(http) 80 - set ports(https) 443 - if {[info exists ports($driver_prot)] && $ports($driver_prot) ne $driver_port} { - append driver_host :$driver_port - } + set system_url [util::join_location \ + -proto $driver_prot \ + -hostname $driver_host \ + -port $driver_port] + # protocol-relative version of the system url + regsub ^$driver_prot:// $system_url {//} system_url_noprot set queue [$root childNodes] while {$queue ne {}} { @@ -1058,14 +1059,11 @@ # 1: check tag is allowed set node_name [string tolower [$node nodeName]] - if {[info exists unallowed_tag($node_name)] - || ($allowed_tags ne "*" && ![info exists allowed_tag($node_name)])} { - if {$validate_p} { - return 0 - } else { - $node delete - continue - } + if {[info exists unallowed_tag($node_name)] || + ($allowed_tags ne "*" && ![info exists allowed_tag($node_name)])} { + # invalid tag! + if {$validate_p} {return 0} else {$node delete} + continue } # tag itself is allowed, we can inspect its children @@ -1074,60 +1072,57 @@ # 2: check tag contains only allowed attributes foreach att [$node attributes] { set att [string tolower $att] - if {[info exists unallowed_attribute($att)] - || ($allowed_attributes ne "*" && ![info exists allowed_attribute($att)])} { - if {$validate_p} { - return 0 - } else { - $node removeAttribute $att - continue - } + if {[info exists unallowed_attribute($att)] || + ($allowed_attributes ne "*" && ![info exists allowed_attribute($att)])} { + # invalid attribute! + if {$validate_p} {return 0} else {$node removeAttribute $att} + continue } # 3: check for any attribute that could contain a url # whether this is acceptable switch $att { - "href" - "src" - "content" { - set att_val [string trim [$node getAttribute $att ""]] - if {$att_val eq ""} continue + "href" - "src" - "content" - "action" { + set url [string trim [$node getAttribute $att ""]] + if {$url eq ""} continue - # url in attribute has the protocol part - if {[regexp -nocase {^(\w+:)?(//)?(.*)$} $att_val match prot slashes url]} { - # remove the ':' - set prot [string range $prot 0 end-1] - # no external urls allowed: we still want - # to allow fully specified urls that refer - # to this server, so we try to transform - # the url in a local absolute reference, - # rather than stripping the attribute - # altogheter. Also, accept urls without - # protocol and slashes (e.g. the '#' - # idiom) and we leave the special - # javascript protocol to the next branch - # of the if. - if {$remove_outer_urls_p && ($prot ni {"" "javascript"} || $slashes eq "//")} { - # a fine full reference to our machine... - if {$prot eq $driver_prot && [regsub ^$driver_host $url {} url]} { + set prot "" + + # attribute is a URL as per RFC + if {[util::split_location $url prot hostname port]} { + if {$no_outer_urls_p} { + # no external urls allowed: we still want + # to allow fully specified urls that refer + # to this server, but we'll transform them + # in a local absolute reference. For all + # others, attribute will be removed + # altogether. + # - This is ok, points to our system... + if {[regsub ^($system_url|$system_url_noprot) $url {} url]} { $node setAttribute $att $url - # ...a nasty reference to who knows where. - } elseif {$validate_p} { - return 0 + # ...this is not, points elsewhere! } else { - $node removeAttribute $att + # invalid attribute! + if {$validate_p} {return 0} else {$node removeAttribute $att} + continue } - } elseif {$prot ne ""} { - # check if protocol is allowed - if {[info exists unallowed_protocol($prot)] - || ($allowed_protocols ne "*" && ![info exists allowed_protocol($prot)])} { - if {$validate_p} { - return 0 - } else { - $node removeAttribute $att - continue - } - } } + # this was likely a protocol-relative url + if {$prot eq ""} { + set prot $driver_prot + } } + + # regexp is for stuff like 'javascript:' pseudoprotocol, that is not really a url + if {$prot ne "" || [regexp {^(\w+):.*$} $url match prot]} { + # check if protocol is allowed + if {[info exists unallowed_protocol($prot)] || + ($allowed_protocols ne "*" && ![info exists allowed_protocol($prot)])} { + # invalid attribute! + if {$validate_p} {return 0} else {$node removeAttribute $att} + continue + } + } } } } Index: openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl,v diff -u -N --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl 12 Jan 2017 20:00:23 -0000 1.1.2.1 @@ -0,0 +1,115 @@ +ad_library { + + Tests that deal with the html - text procs + + @creation-date 2017-01-12 +} + + +aa_register_case -cats {api smoke} ad_sanitize_html { + + Test if it HTML sanitization works as expected + +} { + + # - Weird HTML, nonexistent and unclosed tags, '<' and '>' chars: + # result should be ok, with '<' and '>' converted to entities + lappend test_msgs "Invalid markup with single '<' and '>' chars ok?" + lappend test_cases {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_result_trivial {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_result_no_js {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_result_no_outer_urls {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + + # - Weird HTML, nonexistent and unclosed tags, MULTIPLE '<' and '>' chars: + # some loss in translation, multiple '<' and '>' become single ones + lappend test_msgs "Invalid markup with multiple '<' and '>' chars ok?" + lappend test_cases { + sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad <<<<<<<<<< a <<< a << <<< << sasasadsasa < sadASDSA + } + lappend test_result_trivial { + sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA + } + lappend test_result_no_js { + sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA + } + lappend test_result_no_outer_urls { + sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA + } + + # - Half opened HTML into other markup: this markup will be completely rejected + lappend test_msgs "Invalid unparseable markup ok?" + lappend test_cases { + sadsa dfsdafs 3 sadASDSA + } + lappend test_result_trivial {} + lappend test_result_no_js {} + lappend test_result_no_outer_urls {} + + # - Plain text: this should stay as it is + lappend test_msgs "Plain text ok?" + set test_case { + Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed + do eiusmod tempor incididunt ut labore et dolore magna + aliqua. Ut enim ad minim veniam, quis nostrud exercitation + ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis + aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint + occaecat cupidatat non proident, sunt in culpa qui officia + deserunt mollit anim id est laborum. + } + lappend test_cases $test_case + lappend test_result_trivial $test_case + lappend test_result_no_js $test_case + lappend test_result_no_outer_urls $test_case + + foreach msg $test_msgs test_case $test_cases result_trivial $test_result_trivial result_no_js $test_result_no_js result_no_outer_urls $test_result_no_outer_urls { + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols *] + set result [string trim $result] ; set result_trivial [string trim $result_trivial] + aa_true $msg [expr {$result eq $result_trivial}] + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_js] + set result [string trim $result] ; set result_no_js [string trim $result_no_js] + aa_true $msg [expr {$result eq $result_no_js}] + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls] + set result [string trim $result] ; set result_no_outer_urls [string trim $result_no_outer_urls] + aa_true $msg [expr {$result eq $result_no_outer_urls}] + } + + array set r [util::http::get -url [util::configured_location]] + set test_case $r(page) + + set msg "In our index page is removing tags ok" + set unallowed_tags {div style script} + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_tags $unallowed_tags] + set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_tags $unallowed_tags -validate] + aa_true "$msg with validate?" $valid_p + aa_false $msg? [regexp {<(div|style|script)\s*[^>]*>} $result] + + set msg "In our index page is removing attributes ok" + set unallowed_attributes {id style} + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_attributes $unallowed_attributes] + set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_attributes $unallowed_attributes -validate] + aa_true "$msg with validate?" $valid_p + aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(id|style)=".*"[^>]*>} $result] + + set msg "In our index page is removing protocols ok?" + set unallowed_protocols {http javascript https} + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_protocols $unallowed_protocols] + set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_protocols $unallowed_protocols -validate] + aa_true "$msg with validate?" $valid_p + aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(href|src|content|action)="(http|javascript):.*"[^>]*>} $result] + + set msg "In our index page is removing outer links ok?" + set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls] + set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls -validate] + aa_true "$msg with validate?" $valid_p + aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(href|src|content|action)="(http|https|//):.*"[^>]*>} $result] + +} + + + +# Local variables: +# mode: tcl +# tcl-indent-level: 4 +# indent-tabs-mode: nil +# End: