Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -r1.67 -r1.68 --- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 27 Oct 2014 16:40:08 -0000 1.67 +++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 7 Aug 2017 23:48:00 -0000 1.68 @@ -1,7 +1,7 @@ ad_library { Contains procs used to manipulate chunks of text and html, most notably converting between them. - + @author Lars Pind (lars@pinds.com) @creation-date 19 July 2000 @cvs-id $Id$ @@ -20,13 +20,13 @@ -no_quote:boolean -includes_html:boolean -encode:boolean - text + text } { - Converts plaintext to html. Also translates any recognized + Converts plaintext to html. Also translates any recognized email addresses or URLs into a hyperlink. - @param no_links will prevent it from highlighting - @param no_quote will prevent it from HTML-quoting output, so this can be run on + @param no_links will prevent it from highlighting + @param no_quote will prevent it from HTML-quoting output, so this can be run on semi-HTML input and preserve that formatting. This will also cause spaces/tabs to not be replaced with nbsp's, because this can too easily mess up HTML tags. @param includes_html Set this if the text parameter already contains some HTML which should be preserved. @@ -37,46 +37,60 @@ @creation-date 19 July 2000 } { if { $text eq "" } { - return {} + return "" } - + + set space_added 0 + set nr_links 0 if { !$no_links_p } { - # We start by putting a space in front so our URL/email highlighting will work - # for URLs/emails right in the beginning of the text. + # + # We start by putting a space in front so our URL/email + # highlighting will work for URLs/emails right in the + # beginning of the text. + # set text " $text" - - # if something is " http://" or " https://" - # we assume it is a link to an outside source. - - # (bd) The only purpose of thiese sTaRtUrL and - # eNdUrL markers is to get rid of trailing dots, - # commas and things like that. Note that there - # is a \x001 special char before and after each marker. - - regsub -nocase -all {([^a-zA-Z0-9]+)(http://[^\(\)"<>\s]+)} $text "\\1\x001sTaRtUrL\\2eNdUrL\x001" text - regsub -nocase -all {([^a-zA-Z0-9]+)(https://[^\(\)"<>\s]+)} $text "\\1\x001sTaRtUrL\\2eNdUrL\x001" text - regsub -nocase -all {([^a-zA-Z0-9]+)(ftp://[^\(\)"<>\s]+)} $text "\\1\x001sTaRtUrL\\2eNdUrL\x001" text + set space_added 1 - # Don't dress URLs that are already HREF=... or SRC=... chunks - if { $includes_html_p } { - regsub -nocase -all {(href\s*=\s*['"]?)\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1\2} text - regsub -nocase -all {(src\s*=\s*['"]?)\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1\2} text - } - + # if something is " http://" or " https://" or "ftp://" we + # assume it is a link to an outside source. + # + # (bd) The only purpose of the markers is to get rid of + # trailing dots, commas and things like that. Note the code + # uses utf-8 codes \u0002 (start of text) and \u0003 (end of + # text) special chars as marker. Previously, we had \x001 and + # \x002, which do not work reliably (regsub was missing some + # entries, probably due to a mess-up of the internal + # representation). + # + set nr_links [regsub -nocase -all \ + {([^a-zA-Z0-9]+)((http|https|ftp)://[^\(\)\"<>\s]+)} $text \ + "\\1\u0002\\2\u0003" text] + # email links have the form xxx@xxx.xxx + # # JCD: don't treat things =xxx@xxx.xxx as email since most - # common occurance seems to be in urls (although VPATH bounce - # emails like bounce-user=domain.com@sourcehost.com will then not - # work correctly). It's all quite ugly. - - regsub -nocase -all {([^a-zA-Z0-9=]+)(mailto:)?([^=\(\)\s:;,@<>]+@[^\(\)\s.:;,@<>]+[.][^\(\)\s:;,@<>]+)} $text \ - "\\1\x001sTaRtEmAiL\\3eNdEmAiL\x001" text - } + # common occurrence seems to be in urls (although VPATH bounce + # emails like bounce-user=domain.com@sourcehost.com will then + # not work correctly). Another tricky case is + # http://www.postgresql.org/message-id/20060329203545.M43728@narrowpathinc.com + # where we do not want turn the @ into a mailto. + incr nr_links [regsub -nocase -all \ + {([^a-zA-Z0-9=/.]+)(mailto:)?([^=\(\)\s:;,@<>/]+@[^\(\)\s.:;,@<>]+[.][^\(\)\s:;,@<>]+)} $text \ + "\\1\u0002mailto:\\3\u0003" text] + + # + # Remove marker from URLs that are already HREF=... or SRC=... chunks + # + if { $includes_html_p && $nr_links > 0} { + regsub -nocase -all {((href|src)\s*=\s*['\"]?)\u0002([^\u0003]*)\u0003} $text {\1\3} text + } + } + # At this point, before inserting some of our own <, >, and "'s # we quote the ones entered by the user: if { !$no_quote_p } { - set text [ad_quotehtml $text] + set text [ns_quotehtml $text] } if { $encode_p} { @@ -91,57 +105,115 @@ } set myHTML { - ª º À Á Â Ã Ä Å &Aelig; Ç - È É Ê Ë Ì Í Î Ï Ð Ñ - Ò Ó Ô Õ Ö Ø Ù Ú Û Ü + ª º À Á Â Ã Ä Å &Aelig; Ç + È É Ê Ë Ì Í Î Ï Ð Ñ + Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ - ç è é ê ë ì í î ï ð - ñ ò ó ô õ ö ø ù ú û + ç è é ê ë ì í î ï ð + ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ ¿ } - for { set i 0 } { $i < [ llength $myChars ] } { incr i } { - set text [ string map "[ lindex $myChars $i ] [ lindex $myHTML $i ]" $text ] + set map {} + foreach ch $myChars entity $myHTML { + lappend map $ch $entity } + set text [string map $map $text] } # Convert line breaks if { !$no_lines_p } { - if { $includes_html_p } { - set text [util_convert_line_breaks_to_html -includes_html -- $text] - } else { - set text [util_convert_line_breaks_to_html -- $text] - } + set text [util_convert_line_breaks_to_html -includes_html=$includes_html_p -- $text] + # the function strips all leading white space + set space_added 0 } if { !$no_quote_p } { # Convert every two spaces to an nbsp regsub -all { } $text "\\\  " text - + # Convert tabs to four nbsp's regsub -all {\t} $text {\ \ \ \ } text } - if { !$no_links_p } { - # Move the end of the link before any punctuation marks at the end of the URL - regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdUrL\x001)} $text {\2\1} text - regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdEmAiL\x001)} $text {\2\1} text + if { $nr_links > 0} { + # + # Move the end of the link before any punctuation marks at the + # end of the URL. + # + regsub -all {([\]!?.:;,<>\(\)\}\"'-]+)(\u0003)} $text {\2\1} text - # Dress the links and emails with A HREF - regsub -all {\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1} text - regsub -all {\x001sTaRtEmAiL([^\x001]*)eNdEmAiL\x001} $text {\1} text - set text [string trimleft $text] + # + # Convert the marked links and emails into "..." + # + regsub -all {\u0002([^\u0003]+?)\u0003} $text {\1} text + + set changed_back [regsub -all {(\u0002|\u0003)} $text {} text] + if {$includes_html_p} { + # + # All markers should be gone now. + # + # In case we changed something back (means something is + # broken in our regexps above), provide a warning, we have + # to debug. + # + if {$changed_back > 0} { + ad_log warning "Replaced spurious magic marker in ad_text_to_html" + } + } } - # JCD: Remove all the eNd sTaRt stuff and warn if we do it since its bad - # to have these left (means something is broken in our regexps above) - if {[regsub -all {(\x001sTaRtUrL|eNdUrL\x001|\x001sTaRtEmAiL|eNdEmAiL\x001)} $text {} text]} { - ns_log warning "Replaced sTaRt/eNd magic tags in ad_text_to_html" + if {$space_added} { + set text [string range $text 1 end] } return $text } +ad_proc -public ad_html_qualify_links { + -path + html +} { + + Convert in the HTML text relative URLs into fully qualified URLs + including the host name. It performs the following operations: + + 1) prepend paths starting with a "/" by the protocol and host. +2) prepend paths not starting a "/" by the package_url, in case it was passed in. + +links, which are already fully qualified are not modified. + +} { + set host "[string trimright [ad_url] /]/" + + # + # Protect all full qualified URLs with special characters (one + # rule for single quotes, one for double quotes). + # + regsub -nocase -all \ + {(href|src)\s*=\s*'((http|https|ftp|mailto):[^'\"]+)'} $html \ + "\\1='\u0001\\2\u0002'" html +regsub -nocase -all \ + {(href|src)\s*=\s*[\"]((http|https|ftp|mailto):[^'\"]+)[\"]} $html \ + "\\1=\"\u0001\\2\u0002\"" html + +if {[info exists path]} { + set path "[string trim $path /]/" + regsub -all {(href|src)\s*=\s*['\"]([^/][^\u0001:'\"]+?)['\"]} $html \ + "\\1='${host}${path}\\2111'" html +} +regsub -all {(href|src)\s*=\s*['\"]/([^\u0001:'\"]+?)['\"]} $html \ + "\\1=\"${host}\\2222\"" html + +# +# Remove all protection characters again. +# +regsub -nocase -all {((href|src)\s*=\s*['\"]?)\u0001([^\u0002]*)\u0002} $html {\1\3} html + +return $html +} + + ad_proc -public util_convert_line_breaks_to_html { {-includes_html:boolean} text @@ -155,22 +227,17 @@ # Make sure all line breaks are single \n's regsub -all {\r\n} $text "\n" text regsub -all {\r} $text "\n" text - + # Remove whitespace before \n's regsub -all {[ \t]+\n} $text "\n" text - + # Wrap P's around paragraphs regsub -all {([^\n\s])\n\n+([^\n\s])} $text {\1

\2} text # remove line breaks right before and after HTML tags that will insert a paragraph break themselves if { $includes_html_p } { - set tags [join { ul ol li blockquote p div table tr td th } |] + set tags [join { ul ol li blockquote p div table tr td th } |] regsub -all -nocase "\\s*(\]*>)\\s*" $text {\1} text - - #foreach tag { ul ol li blockquote p div table tr td th } { - # regsub -all -nocase "\\n\\s*(\]*>)" $text {\1} text - # regsub -all -nocase "(\]*>)\\s*\\n" $text {\1} text - #} } # Convert _single_ CRLF's to
's to preserve line breaks @@ -188,7 +255,7 @@ Quotes ampersands, double-quotes, and angle brackets in $arg. Analogous to ns_quotehtml except that it quotes double-quotes (which - ns_quotehtml does not). + ns_quotehtml does not). @see ad_unquotehtml } { @@ -200,7 +267,7 @@ @see ad_quotehtml } { - return [string map {> > < < " \" & &} $arg] + return [string map {& & > > < < " \" " \" ' '} $arg] } @@ -210,23 +277,23 @@ # #################### + # # lars@pinds.com, 19 July 2000: # Should this proc change name to something in line with the rest # of the library? # - ad_proc -private util_close_html_tags { - html_fragment - {break_soft 0} + html_fragment + {break_soft 0} {break_hard 0} {ellipsis ""} {more ""} } { Given an HTML fragment, this procedure will close any tags that have been left open. The optional arguments let you specify that - the fragment is to be truncated to a certain number of displayable - characters. After break_soft, it truncates and closes open tags unless + the fragment is to be truncated to a certain number of displayable + characters. After break_soft, it truncates and closes open tags unless you're within non-breaking tags (e.g., Af). After break_hard displayable characters, the procedure simply truncates and closes any open HTML tags that might have resulted from the truncation. @@ -239,25 +306,23 @@

  • remove -- nuke this tag and its closing tag but leave contents.
  • close -- close this tag if left open. - - @param break_soft the number of characters you want the html fragment + + @param break_soft the number of characters you want the html fragment truncated to. Will allow certain tags (A, ADDRESS, NOBR) to close first. - @param break_hard the number of characters you want the html fragment + @param break_hard the number of characters you want the html fragment truncated to. Will truncate, regardless of what tag is currently in action. @param ellipsis This will get put at the end of the truncated string, if the string was truncated. - However, this counts towards the total string length, so that the returned string - including ellipsis is guaranteed to be shorter than the 'len' provided. + However, this counts towards the total string length, so that the returned string + including ellipsis is guaranteed to be shorter than the 'len' provided. @param more This will get put at the end of the truncated string, if the string was truncated. @author Jeff Davis (davis@xarg.net) - -} { - set frag $html_fragment - # +} { + # # The code in this function had an exponential behavior based on # the size. On the current OpenACS.org site (Jan 2009), the # function took on certain forums entries 6 to 9 hours @@ -285,26 +350,32 @@ # -gustaf neumann (Jan 2009) if {$break_soft == 0 && $break_hard == 0} { - set frag [string map [list &# "&#"] $html_fragment] - if {[catch {dom parse -html $frag doc} errorMsg]} { - # we got an error, so do normal processing - #ns_log notice "tdom can't parse the provided HTML, error=$errorMsg,\nchecking fragment without tdom" - } else { - $doc documentElement root - set html "" - # discared forms - foreach node [$root selectNodes //form] {$node delete} - # output wellformed html - set b [lindex [$root selectNodes {//body}] 0] - foreach n [$b childNodes] { - append html [$n asHTML] + # + # We have to protect against crashes, that might happen due to + # unsupported numeric entities in tdom. Therefore, we map + # numeric entities into something sufficiently opaque + # + set frag [string map [list &# "\0&#\0"] $html_fragment] + + if {[catch {dom parse -html $frag doc} errorMsg]} { + # we got an error, so do normal processing + ns_log notice "tdom can't parse the provided HTML, error=$errorMsg,\nchecking fragment without tdom" + } else { + $doc documentElement root + set html "" + # discard forms + foreach node [$root selectNodes //form] {$node delete} + # output wellformed html + set b [lindex [$root selectNodes {//body}] 0] + foreach n [$b childNodes] { + append html [$n asHTML] + } + return [string map [list "\0&#\0" &#] $html] } - return $html - } } - set frag $html_fragment - + set frag $html_fragment + # original code continues set syn(a) nobr @@ -313,7 +384,7 @@ # set syn(form) discard # - set syn(blink) remove + set syn(blink) remove # set syn(table) close set syn(font) close @@ -358,7 +429,7 @@ set syn(q) close set syn(span) close - set out {} + set out {} set out_len 0 # counts how deep we are nested in nonbreaking tags, tracks the nobr point @@ -383,9 +454,9 @@ # First try to fix up < not part of a tag. regsub -all {<([^/[:alpha:]!])} $frag {\<\1} frag - # no we do is chop off any trailing unclosed tag + # no we do is chop off any trailing unclosed tag # since when we substr blobs this sometimes happens - + # this should in theory cut any tags which have been cut open. while {[regexp {(.*)<[^>]*$} $frag match frag]} {} @@ -396,12 +467,12 @@ if {![regexp "(\[^<]*)(<(/?)(\[^ \r\n\t>]+)(\[^>]*)>)?(.*)" $frag match pretag fulltag close tag tagbody frag]} { # should never get here since above will match anything. ns_log Error "util_close_html_tag - NO MATCH: should never happen! frag=$frag" - append out $frag + append out $frag set frag {} } else { #ns_log Notice "pretag=$pretag\n fulltag=$fulltag\n close=$close\n tag=$tag\n tagbody=$tagbody frag length is [string length $frag]" if { ! $discard } { - # figure out if we can break with the pretag chunk + # figure out if we can break with the pretag chunk if { $break_soft } { if {! $nobr && [string length $pretag] + $out_len > $break_soft } { # first chop pretag to the right length @@ -414,80 +485,80 @@ } elseif { $nobr && [string length $pretag] + $out_len > $break_hard } { # we are in a nonbreaking tag and are past the hard break # so chop back to the point we got the nobr tag... - set tagptr $nobr_tagptr - if { $nobr_out_point > 0 } { + set tagptr $nobr_tagptr + if { $nobr_out_point > 0 } { set out [string range $out 0 $nobr_out_point-1] - } else { - # here maybe we should decide if we should keep the tag anyway + } else { + # here maybe we should decide if we should keep the tag anyway # if zero length result would be the result... set out {} } set broken_p 1 break - } + } } - + # tack on pretag append out $pretag incr out_len [string length $pretag] } - + # now deal with the tag if we got one... - if { $tag eq "" } { - # if the tag is empty we might have one of the bad matched that are not eating - # any of the string so check for them - if {[string length $match] == [string length $frag]} { + if { $tag eq "" } { + # if the tag is empty we might have one of the bad matched that are not eating + # any of the string so check for them + if {[string length $match] == [string length $frag]} { append out $frag set frag {} } } else { - set tag [string tolower $tag] + set tag [string tolower $tag] if { ![info exists syn($tag)]} { - # if we don't have an entry in our syntax table just tack it on + # if we don't have an entry in our syntax table just tack it on # and hope for the best. if { ! $discard } { append out $fulltag } } else { if { $close ne "/" } { - # new tag + # new tag # "remove" tags are just ignored here - # discard tags - if { $discard } { + # discard tags + if { $discard } { if { $syn($tag) eq "discard" } { - incr discard - incr tagptr + incr discard + incr tagptr set tagstack($tagptr) $tag } } else { switch $syn($tag) { - nobr { + nobr { if { ! $nobr } { set nobr_out_point [string length $out] set nobr_tagptr $tagptr set nobr_len $out_len } incr nobr - incr tagptr + incr tagptr set tagstack($tagptr) $tag append out $fulltag } - discard { - incr discard - incr tagptr + discard { + incr discard + incr tagptr set tagstack($tagptr) $tag } - close { - incr tagptr + close { + incr tagptr set tagstack($tagptr) $tag append out $fulltag } } } - } else { + } else { # we got a close tag - if { $discard } { - # if we are in discard mode only watch for + if { $discard } { + # if we are in discard mode only watch for # closes to discarded tags if { $syn($tag) eq "discard"} { if {$tagptr > -1} { @@ -509,7 +580,7 @@ incr tagptr -1 if { $syn($tag) eq "nobr"} { incr nobr -1 - } + } append out $fulltag } } @@ -520,33 +591,33 @@ } } } - - # on exit of the look either we parsed it all or we truncated. + + # on exit of the look either we parsed it all or we truncated. # we should now walk the stack and close any open tags. # Chop off extra whitespace at the end if { $broken_p } { set end_index [expr {[string length $out] -1}] while { $end_index >= 0 && [string is space [string index $out $end_index]] } { incr end_index -1 - } + } set out [string range $out 0 $end_index] } - for { set i $tagptr } { $i > -1 } { incr i -1 } { + for { set i $tagptr } { $i > -1 } { incr i -1 } { set tag $tagstack($i) # LARS: Only close tags which we aren't supposed to remove - if { $syn($tag) ne "discard" && $syn($tag) ne "remove" } { + if { $syn($tag) ni {discard remove}} { append out "" } } - + if { $broken_p } { append out $ellipsis append out $more } - + return $out } @@ -568,37 +639,37 @@ return [ad_parse_html_attributes_upvar html pos] } } - -ad_proc ad_parse_html_attributes_upvar { + +ad_proc ad_parse_html_attributes_upvar { -attribute_array html_varname pos_varname -} { +} { Parse attributes in an HTML fragment and return them as a list of lists.

    Each element of that list is either a single element, if the attribute had no value, or - a two-tuple, with the first element being the name of the attribute and the second being + a two-tuple, with the first element being the name of the attribute and the second being the value. The attribute names are all converted to lowercase.

    If you don't really care what happens when the same attribute is present twice, you can also use the attribute_array argument, and the attributes will be set there. For attributes without any value, we'll use the empty string.

    - Example: + Example:

    set html {<tag foo = bar baz greble="&quot;hello you sucker&quot;" foo='blah' Heres = '  something for   you to = "consider" '>}
    -set pos 5 ; # the 'f' in the first 'foo'
    +    set pos 5 ; # the 'f' in the first 'foo'
     
    -set attribute_list [ad_parse_html_attributes_upvar -attribute_array attribute_array html pos]
    + set attribute_list [ad_parse_html_attributes_upvar -attribute_array attribute_array html pos] attribute_list will contain the following: -
    {foo bar} baz {greble {"hello you sucker"}} {foo blah} {heres {  something for   you to = "consider" }}
    +
    {foo bar} baz {greble {"hello you sucker"}} {foo blah} {heres {  something for   you to = "consider" }}
    attribute_array will contain: -
    attribute_array(foo)='blah'
    -attribute_array(greble)='"hello you sucker"'
    -attribute_array(baz)=''
    -attribute_array(heres)='  something for   you to = "consider" '
    +
    attribute_array(foo)='blah'
    +    attribute_array(greble)='"hello you sucker"'
    +    attribute_array(baz)=''
    +    attribute_array(heres)='  something for   you to = "consider" '

    @@ -616,8 +687,8 @@ start. This should point to a character inside the tag, just after the tag name, and before the first attribute. Note, that we will modify this variable. When this proc is done, this variable will point to the tag-closing >. - Example: - if the tag is <img src="foo">, pos_varname should point to either the space between + Example: + if the tag is <img src="foo">, pos_varname should point to either the space between img and src, or the s in src. @param attribute_array This is an alternate way of returning the attributes, if you don't care @@ -626,12 +697,12 @@ @return A list of list holding the attribute names and values. Each element of that list is either a single element, if the attribute had no value, or - a two-tuple, with the first element being the name of the attribute and the second being + a two-tuple, with the first element being the name of the attribute and the second being the value. The attribute names are all converted to lowercase. @author Lars Pind (lars@pinds.com) @creation-date November 10, 2000 -} { +} { upvar $html_varname html upvar $pos_varname i if { [info exists attribute_array] } { @@ -652,641 +723,1083 @@ # This is an XML-style tag ending: <... /> break } - - # This regexp matches an attribute name and an equal sign, if present. - # Also eats whitespace before or after. - # The \A corresponds to ^, except it matches the position we're starting from, not the start of the string + + # This regexp matches an attribute name and an equal sign, if + # present. Also eats whitespace before or after. The \A + # corresponds to ^, except it matches the position we're + # starting from, not the start of the string. if { ![regexp -indices -start $i {\A\s*([^\s=>]+)\s*(=?)\s*} $html match attr_name_idx equal_sign_idx] } { - # Apparantly, there's no attribute name here. Let's eat all whitespace and lonely equal signs. + # + # Apparently, there's no attribute name here. + # Let's eat all whitespace and lonely equal signs. + # regexp -indices -start $i {\A[\s=]*} $html match set i [expr { [lindex $match 1] + 1 }] } { set attr_name [string tolower [string range $html [lindex $attr_name_idx 0] [lindex $attr_name_idx 1]]] - + # Move past the attribute name just found set i [expr { [lindex $match 1] + 1}] - + # If there is an equal sign, we're expecting the next token to be a value if { [lindex $equal_sign_idx 1] - [lindex $equal_sign_idx 0] < 0 } { # No equal sign, no value - lappend attributes [list $attr_name] + lappend attributes [list $attr_name] if { [info exists attribute_array] } { set attribute_array_var($attr_name) {} } } else { - + # is there a single or double quote sign as the first character? switch -- [string index $html $i] { {"} { set exp {\A"([^"]*)"\s*} } - {'} { set exp {\A'([^']*)'\s*} } - default { set exp {\A([^\s>]*)\s*} } - } - if { ![regexp -indices -start $i $exp $html match attr_value_idx] } { - # No end quote. - set attr_value [string range $html $i+1 end] - set i [string length $html] - } else { - set attr_value [string range $html [lindex $attr_value_idx 0] [lindex $attr_value_idx 1]] - set i [expr { [lindex $match 1] + 1}] - } + {'} { set exp {\A'([^']*)'\s*} } + default { set exp {\A([^\s>]*)\s*} } + } + if { ![regexp -indices -start $i $exp $html match attr_value_idx] } { + # No end quote. + set attr_value [string range $html $i+1 end] + set i [string length $html] + } else { + set attr_value [string range $html [lindex $attr_value_idx 0] [lindex $attr_value_idx 1]] + set i [expr { [lindex $match 1] + 1}] + } - set attr_value [util_expand_entities_ie_style $attr_value] - - lappend attributes [list $attr_name $attr_value] - if { [info exists attribute_array] } { - set attribute_array_var($attr_name) $attr_value + set attr_value [util_expand_entities_ie_style $attr_value] + + lappend attributes [list $attr_name $attr_value] + if { [info exists attribute_array] } { + set attribute_array_var($attr_name) $attr_value + } } } } + return $attributes } - return $attributes -} -ad_proc ad_html_security_check { html } { + ad_proc ad_html_security_check { html } { - Returns a human-readable explanation if the user has used any HTML - tag other than the ones marked allowed in antispam section of ad.ini. - Otherwise returns an empty string. - - @return a human-readable, plaintext explanation of what's wrong with the user's input. - - @author Lars Pind (lars@pinds.com) - @creation-date 20 July 2000 + Returns a human-readable explanation if the user has used any HTML + tag other than the ones marked allowed in antispam section of ad.ini. + Otherwise returns an empty string. -} { - if { [string first <% $html] > -1 } { - return "For security reasons, you're not allowed to have the less-than-percent combination in your input." - } - - array set allowed_attribute [list] - array set allowed_tag [list] - array set allowed_protocol [list] + @return a human-readable, plaintext explanation of what's wrong with the user's input. - # Use the antispam tags for this package instance and whatever is on the kernel. - set allowed_tags_list [concat \ - [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedTag antispam] \ - [ad_parameter_all_values_as_list AllowedTag antispam]] + @author Lars Pind (lars@pinds.com) + @creation-date 20 July 2000 - set allowed_attributes_list [concat \ - [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedAttribute antispam] \ - [ad_parameter_all_values_as_list AllowedAttribute antispam]] + } { + if { [string first <% $html] > -1 } { + return "For security reasons, you're not allowed to have the less-than-percent combination in your input." + } - set allowed_protocols_list [concat \ - [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedProtocol antispam] \ - [ad_parameter_all_values_as_list AllowedProtocol antispam]] + array set allowed_attribute [list] + array set allowed_tag [list] + array set allowed_protocol [list] - foreach tag $allowed_tags_list { - set allowed_tag([string tolower $tag]) 1 - } - foreach attribute $allowed_attributes_list { - set allowed_attribute([string tolower $attribute]) 1 - } - foreach tagname $allowed_tags_list { - set allowed_tag([string tolower $tagname]) 1 - } - foreach protocol $allowed_protocols_list { - set allowed_protocol([string tolower $protocol]) 1 - } - - # loop over all tags - for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { - # move past the tag-opening < - incr i + # Use the antispam tags for this package instance and whatever is on the kernel. + set allowed_tags_list [concat \ + [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedTag antispam] \ + [ad_parameter_all_values_as_list AllowedTag antispam]] - if { ![regexp -indices -start $i {\A/?([-_a-zA-Z0-9]+)\s*} $html match name_idx] } { - # The tag-opener isn't followed by USASCII letters (with or without optional inital slash) - # Not considered a tag. Shouldn't do any harm in browsers. - # (Tested with digits, with A syntax, with whitespace) - } else { - # The tag was valid ... now let's see if it's on the allowed list. - set tagname [string tolower [string range $html [lindex $name_idx 0] [lindex $name_idx 1]]] + set allowed_attributes_list [concat \ + [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedAttribute antispam] \ + [ad_parameter_all_values_as_list AllowedAttribute antispam]] - if { ![info exists allowed_tag($tagname)] && ![info exists allowed_tag(*)] } { - # Nope, this was a naughty tag. - return "For security reasons we only accept the submission of HTML - containing the following tags: [join $allowed_tags_list " "]. - You have a [string toupper $tagname] tag in there." + set allowed_protocols_list [concat \ + [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedProtocol antispam] \ + [ad_parameter_all_values_as_list AllowedProtocol antispam]] + + foreach attribute $allowed_attributes_list { + set allowed_attribute([string tolower $attribute]) 1 + } + foreach tagname $allowed_tags_list { + set allowed_tag([string tolower $tagname]) 1 + } + foreach protocol $allowed_protocols_list { + set allowed_protocol([string tolower $protocol]) 1 + } + + # loop over all tags + for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { + # move past the tag-opening < + incr i + + if { ![regexp -indices -start $i {\A/?([-_a-zA-Z0-9]+)\s*} $html match name_idx] } { + # The tag-opener isn't followed by USASCII letters (with or without optional initial slash) + # Not considered a tag. Shouldn't do any harm in browsers. + # (Tested with digits, with A syntax, with whitespace) } else { - # Legal tag. - - # Make i point to the first character inside the tag, after the tag name and any whitespace - set i [expr { [lindex $match 1] + 1}] - - set attr_list [ad_parse_html_attributes_upvar html i] + # The tag was valid ... now let's see if it's on the allowed list. + set tagname [string tolower [string range $html [lindex $name_idx 0] [lindex $name_idx 1]]] - set attr_count 0 - foreach attribute $attr_list { - incr attr_count + if { ![info exists allowed_tag($tagname)] && ![info exists allowed_tag(*)] } { + # Nope, this was a naughty tag. + return "For security reasons we only accept the submission of HTML + containing the following tags: [join $allowed_tags_list " "]. + You have a [string toupper $tagname] tag in there." + } else { + # Legal tag. - lassign $attribute attr_name attr_value - - if { ![info exists allowed_attribute($attr_name)] - && ![info exists allowed_attribute(*)] } { - return "The attribute '$attr_name' is not allowed for $tagname tags" - } - - if { [string tolower $attr_name] ne "style" } { - if { [regexp {^\s*([^\s:]+):\/\/} $attr_value match protocol] } { - if { ![info exists allowed_protocol([string tolower $protocol])] - && ![info exists allowed_protocol(*)] } { - return "Your URLs can only use these protocols: [join $allowed_protocols_list ", "]. + # Make i point to the first character inside the tag, after the tag name and any whitespace + set i [expr { [lindex $match 1] + 1}] + + set attr_list [ad_parse_html_attributes_upvar html i] + + foreach attribute $attr_list { + lassign $attribute attr_name attr_value + + if { ![info exists allowed_attribute($attr_name)] + && ![info exists allowed_attribute(*)] } { + return "The attribute '$attr_name' is not allowed for $tagname tags" + } + + if { [string tolower $attr_name] ne "style" } { + if { [regexp {^\s*([^\s:]+):\/\/} $attr_value match protocol] } { + if { ![info exists allowed_protocol([string tolower $protocol])] + && ![info exists allowed_protocol(*)] } { + return "Your URLs can only use these protocols: [join $allowed_protocols_list ", "]. You have a '$protocol' protocol in there." + } } } } } } } + return {} } - return {} -} + # This was created in order to pre-process some content to be fed + # to tDOM in ad_sanitize_html. In fact, even with its least picky + # behavior, tDOM cannot swallow whatever markup you give it. This + # proc might also be used in order to improve some OpenACS + # routines, like util_close_html_tags. As it has some limitations, + # this is left to future considerations. + ad_proc ad_dom_fix_html { + -html:required + {-marker "root"} + -dom:boolean + } { + + Similar in spirit to the famous Tidy command line utility, + this proc takes a piece of possibly invalid markup and returns + a 'fixed' version where unopened tags have been closed and + attribute specifications have been normalized by transforming them + in the form attribute-name="attribute value". All + attributes with an invalid (non-alphanumeric) name will be + stripped.
    +
    + Be aware that every comment and also the possibly present + DOCTYPE declaration will be stripped from the markup. Also, + most of tag's internal whitespace will be trimmed. This + behavior comes from the htmlparse library used in this + implementation. + @param html Markup to process + @param marker Root element use to enforce a single root of the + DOM tree. + + @param dom When this flag is set, instead of returning markup, + the proc will return the tDOM object built during the + operation. Useful when the result should be used by tDOM + anyway, so we can avoid superfluous parsing. -#################### -# -# HTML -> Text -# -#################### + @return markup or a tDOM document object if the -dom flag is + specified -ad_proc -public ad_html_to_text { - {-maxlen 70} - {-showtags:boolean} - {-no_format:boolean} - html -} { - Returns a best-guess plain text version of an HTML fragment. - Parses the HTML and does some simple formatting. The parser and - formatting - is pretty stupid, but it's better than nothing. - - @param maxlen the line length you want your output wrapped to. - @param showtags causes any unknown (and uninterpreted) tags to get shown in the output. - @param no_format causes hyperlink tags not to get listed at the end of the output. + @author Antonio Pisano + + } { + if {[catch {package require struct}]} { + error "Package struct non found on the system" + } + if {[catch {package require htmlparse}]} { + error "Package htmlparse non found on the system" + } + + set tree [::struct::tree] - @author Lars Pind (lars@pinds.com) - @author Aaron Swartz (aaron@swartzfam.com) - @creation-date 19 July 2000 -} { - set output(text) {} - set output(linelen) 0 - set output(maxlen) $maxlen - set output(pre) 0 - set output(p) 0 - set output(br) 0 - set output(space) 0 - set output(blockquote) 0 - - set length [string length $html] - set last_tag_end 0 + + catch {::htmlparse::tags destroy} - # For showing the URL of links. - set href_urls [list] - set href_stack [list] + ::struct::stack ::htmlparse::tags + ::htmlparse::tags push root + $tree set root type root - for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { - # append everything up to and not including the tag-opening < - ad_html_to_text_put_text output [string range $html $last_tag_end $i-1] + ::htmlparse::parse \ + -cmd [list ::htmlparse::2treeCallback $tree] \ + -incvar errs $html - # Check that: - # - we're not past the end of the string - # - and that the tag starts with either - # - alpha or - # - a slash, and then alpha - # Otherwise, it's probably just a lone < character - if { $i >= $length - 1 || - (![string is alpha [string index $html $i+1]] - && [string index $html $i+1] ne "!" - && ("/" ne [string index $html $i+1] || - ![string is alpha [string index $html $i+2]])) - } { - # Output the < and continue with next character - ad_html_to_text_put_text output "<" - set last_tag_end [incr i] - continue - } elseif {[string match "!--*" [string range $html $i+1 end]]} { - # handle HTML comments, I can't beleive noone noticed this before. - # this code maybe not be elegant but it works - - # find the closing comment tag. - set comment_idx [string first "-->" $html $i] - if {$comment_idx == -1} { - # no comment close, escape - set last_tag_end $i - set i $comment_idx - break - } - set i [expr {$comment_idx + 3}] - set last_tag_end $i - - continue - } - # we're inside a tag now. Find the end of it + $tree walk root -order post n { + ::htmlparse::Reorder $tree $n + } - # make i point to the char after the < - incr i - set tag_start $i + ::htmlparse::tags destroy - set count 0 - while 1 { - if {[incr count] > 3000 } { - # JCD: the programming bug is that an unmatched < in the input runs off forever looking for - # it's closing > and in some long text like program listings you can have lots of quotes - # before you find that > - error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop." - } - # Find the positions of the first quote, apostrophe and greater-than sign. - set quote_idx [string first \" $html $i] - set apostrophe_idx [string first ' $html $i] - set gt_idx [string first > $html $i] - # If there is no greater-than sign, then the tag isn't closed. - if { $gt_idx == -1 } { - set i $length - break - } + set lmarker "<$marker>" + set rmarker "" + set doc [dom createDocument $marker] + set root [$doc documentElement] + + set queue {} + lappend queue [list $root [$tree children [$tree children root]]] + while {$queue ne {}} { + lassign [lindex $queue 0] domparent treechildren + set queue [lrange $queue 1 end] - # Find the first of the quote and the apostrophe - if { $apostrophe_idx == -1 } { - set string_delimiter_idx $quote_idx - } else { - if { $quote_idx == -1 } { - set string_delimiter_idx $apostrophe_idx + foreach child $treechildren { + set type [$tree get $child type] + set data [$tree get $child data] + if {$type eq "PCDATA"} { + set el [$doc createTextNode $data] } else { - if { $apostrophe_idx < $quote_idx } { - set string_delimiter_idx $apostrophe_idx - } else { - set string_delimiter_idx $quote_idx + set el [$doc createElement $type] + + # parse element attributes + while {$data ne ""} { + set data [string trim $data] + # attribute with a value, optionally surrounded by double or single quotes + if {[regexp "^(\[^= \]+)=(\"\[^\"\]*\"|'\[^'\].*'|\[^ \]*)" $data m attname attvalue]} { + if {[string match "\"*\"" $attvalue] || + [string match "'*'" $attvalue]} { + set attvalue [string range $attvalue 1 end-1] + } + # attribute with no value + } elseif {[regexp {^([^\s]+)} $data m attname]} { + set attvalue "" + } else { + error "Unrecoverable attribute spec in supplied markup" + } + + # skip bogus attribute names + if {[string is alnum -strict $attname]} { + $el setAttribute $attname $attvalue + } + + set data [string range $data [string length $m] end] } } + + $domparent appendChild $el + + set elchildren [$tree children $child] + if {$elchildren ne {}} { + lappend queue [list $el $elchildren] + } } - set string_delimiter [string index $html $string_delimiter_idx] + } - # If the greater than sign appears before any of the string delimters, we've found the tag end. - if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { - # we found the tag end - set i $gt_idx - break - } + $tree destroy - # Otherwise, we'll have to skip past the ending string delimiter - set i [string first $string_delimiter $html [incr string_delimiter_idx]] - if { $i == -1 } { - # Missing string end delimiter - set i $length - break + if {$dom_p} { + return $doc + } else { + set html [$doc asHTML] + $doc delete + set html [string range $html [string length $lmarker] end-[string length $rmarker]] + } + + return [string trim $html] + } + + # Original purpose of this proc was to introduce a better way to + # enforce some HTML policies on the content submitted by the uses + # (e.g. forbid some tag/attribute like