Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -r1.67.2.18 -r1.67.2.19 --- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 10 Jan 2017 08:33:22 -0000 1.67.2.18 +++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 11 Jan 2017 20:14:04 -0000 1.67.2.19 @@ -869,9 +869,282 @@ return {} } + ad_proc ad_sanitize_html { + -html:required + -allowed_tags + -allowed_attributes + -allowed_protocols + -unallowed_tags + -unallowed_attributes + -unallowed_protocols + -remove_js:boolean + -remove_outer_urls:boolean + -validate:boolean + } { + Sanitizes HTML by specified criteria, basically removing + unallowed tags and attributes, javascript or outer references + into page URLs. When desired, this proc can act also as just a + validator in order to enforce some markup policies. + @param html the markup to be checked. + + @param allowed_tags list of tags we allow in the markup. + + @param allowed_attributes list of attributes we allow in the + markup. + + @param allowed_protocols list of attributes we allow into + links + + @param unallowed_tags list of tags we don't allow in the + markup. + + @param unallowed_attributes list of attributes we don't allow + in the markup. + + @param unallowed_protocols list of protocols we don't allow in + the markup. + + @param remove_js this flag decides whether every script tag, + inline event handlers and the javascript: pseudo-protocol + should be stripped from the markup. + + @param remove_outer_urls this flag tells the proc to remove + every reference to external addresses. Proc will try to + distinguish between external URLs and fine fully specified + internal ones. Acceptable URLs will be transformed in absolute + local references, others will be just stripped togheter with + the attribute. + + @param validate This flag will avoid the creation of the + stripped markup and just report whether the original one + respects all the specified requirements. + @return sanitized markup or a (0/1) truth value when the + -validate flag is specified + + @author Antonio Pisano + + } { + ## Allowed/Unallowed tags come from the user or default to + ## those specified in the parameters + + array set allowed_tag {} + if {![info exists allowed_tags]} { + # Use the antispam tags for this package instance and whatever is on the kernel. + set allowed_tags {} + lappend allowed_tags_list {*}[ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedTag antispam] + lappend allowed_tags_list {*}[ad_parameter_all_values_as_list AllowedTag antispam] + } + + array set allowed_attribute {} + if {![info exists allowed_attributes]} { + set allowed_attributes {} + lappend allowed_attributes {*}[ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedAttribute antispam] + lappend allowed_attributes {*}[ad_parameter_all_values_as_list AllowedAttribute antispam] + } + + array set allowed_protocol {} + if {![info exists allowed_protocols]} { + set allowed_protocols {} + lappend allowed_protocols {*}[ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedProtocol antispam] + lappend allowed_protocols {*}[ad_parameter_all_values_as_list AllowedProtocol antispam] + } + + if {"*" in $allowed_tags} { + set allowed_tags "*" + } + foreach tag $allowed_tags { + set allowed_tag([string tolower $tag]) 1 + } + + if {"*" in $allowed_attributes} { + set allowed_attributes "*" + } + foreach attribute $allowed_attributes { + set allowed_attribute([string tolower $attribute]) 1 + } + + if {"*" in $allowed_protocols} { + set allowed_protocols "*" + } + foreach protocol $allowed_protocols { + set allowed_protocol([string tolower $protocol]) 1 + } + + array set unallowed_tag {} + if {![info exists unallowed_tags]} { + set unallowed_tags {} + } + + array set unallowed_attribute {} + if {![info exists unallowed_attributes]} { + set unallowed_attributes {} + } + + array set unallowed_protocol {} + if {![info exists unallowed_protocols]} { + set unallowed_protocols {} + } + + # TODO: consider default unallowed stuff to come from a parameter + + if {$remove_js_p} { + lappend unallowed_tags "script" + lappend unallowed_attributes {*}{ + onafterprint onbeforeprint onbeforeunload onerror + onhashchange onload onmessage onoffline ononline + onpagehide onpageshow onpopstate onresize onstorage + onunload onblur onchange oncontextmenu onfocus oninput + oninvalid onreset onsearch onselect onsubmit onkeydown + onkeypress onkeyup onclick ondblclick onmousedown + onmousemove onmouseout onmouseover onmouseup + onmousewheel onwheel ondrag ondragend ondragenter + ondragleave ondragover ondragstart ondrop onscroll + oncopy oncut onpaste onabort oncanplay + oncanplaythrough oncuechange ondurationchange + onemptied onended onerror onloadeddata + onloadedmetadata onloadstart onpause onplay onplaying + onprogress onratechange onseeked onseeking onstalled + onsuspend ontimeupdate onvolumechange onwaiting onshow + ontoggle + } + lappend unallowed_protocols "javascript" + } + + foreach tag $unallowed_tags { + set unallowed_tag([string tolower $tag]) 1 + } + + foreach attribute $unallowed_attributes { + set unallowed_attribute([string tolower $attribute]) 1 + } + foreach protocol $unallowed_protocols { + set unallowed_protocol([string tolower $protocol]) 1 + } + + ## + + + # root of the document must be unique, this will enforce it by + # wrapping html in an auxiliary root element + set lmarker "" + set rmarker "" + set html "${lmarker}${html}${rmarker}" + if {[catch {dom parse -html $html doc} errmsg]} { + ad_log error "Failed at parsing HTML. Error from tDOM is: $errmsg" + return [expr {$validate ? 0 : ""}] + } + $doc documentElement root + + set driver_info [util_driver_info] + set driver_prot [dict get $driver_info proto] + set driver_host [dict get $driver_info hostname] + set driver_port [dict get $driver_info port] + set ports(http) 80 + set ports(https) 443 + if {[info exists ports($driver_prot)] && $ports($driver_prot) ne $driver_port} { + append driver_host :$driver_port + } + + set queue [$root childNodes] + while {$queue ne {}} { + set node [lindex $queue 0] + set queue [lrange $queue 1 end] + + # skip all non-element nodes + if {$node eq "" || [$node nodeType] ne "ELEMENT_NODE"} continue + + # 1: check tag is allowed + set node_name [string tolower [$node nodeName]] + if {[info exists unallowed_tag($node_name)] + || ($allowed_tags ne "*" && ![info exists allowed_tag($node_name)])} { + if {$validate_p} { + return 0 + } else { + $node delete + continue + } + } + + # tag itself is allowed, we can inspect its children + lappend queue {*}[$node childNodes] + + # 2: check tag contains only allowed attributes + foreach att [$node attributes] { + set att [string tolower $att] + if {[info exists unallowed_attribute($att)] + || ($allowed_attributes ne "*" && ![info exists allowed_attribute($att)])} { + if {$validate_p} { + return 0 + } else { + $node removeAttribute $att + continue + } + } + + # 3: check for any attribute that could contain a url + # whether this is acceptable + switch $att { + "href" - "src" - "content" { + set att_val [string trim [$node getAttribute $att ""]] + if {$att_val eq ""} continue + + # url in attribute has the protocol part + if {[regexp -nocase {^(\w+:)?(//)?(.*)$} $att_val match prot slashes url]} { + # remove the ':' + set prot [string range $prot 0 end-1] + # no external urls allowed: we still want + # to allow fully specified urls that refer + # to this server, so we try to transform + # the url in a local absolute reference, + # rather than stripping the attribute + # altogheter. Also, accept urls without + # protocol and slashes (e.g. the '#' + # idiom) and we leave the special + # javascript protocol to the next branch + # of the if. + if {$remove_outer_urls_p && ($prot ni {"" "javascript"} || $slashes eq "//")} { + # a fine full reference to our machine... + if {$prot eq $driver_prot && [regsub ^$driver_host $url {} url]} { + $node setAttribute $att $url + # ...a nasty reference to who knows where. + } elseif {$validate_p} { + return 0 + } else { + $node removeAttribute $att + } + } elseif {$prot ne ""} { + # check if protocol is allowed + if {[info exists unallowed_protocol($prot)] + || ($allowed_protocols ne "*" && ![info exists allowed_protocol($prot)])} { + if {$validate_p} { + return 0 + } else { + $node removeAttribute $att + continue + } + } + } + } + } + } + } + } + + if {$validate_p} { + return 1 + } else { + set html [$root asHTML] + # remove auxiliary root element from output + set html [string range $html [string length $lmarker] end-[string length $rmarker]] + set html [string trim $html] + return $html + } + } + + #################### # # HTML -> Text