Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -r1.56 -r1.57 --- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 5 Aug 2008 14:56:43 -0000 1.56 +++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 5 Aug 2008 15:18:24 -0000 1.57 @@ -41,18 +41,18 @@ } if { !$no_links_p } { - # We start by putting a space in front so our URL/email highlighting will work - # for URLs/emails right in the beginning of the text. - set text " $text" - - # if something is " http://" or " https://" - # we assume it is a link to an outside source. - - # (bd) The only purpose of thiese sTaRtUrL and - # eNdUrL markers is to get rid of trailing dots, - # commas and things like that. Note that there - # is a \x001 special char before and after each marker. - + # We start by putting a space in front so our URL/email highlighting will work + # for URLs/emails right in the beginning of the text. + set text " $text" + + # if something is " http://" or " https://" + # we assume it is a link to an outside source. + + # (bd) The only purpose of thiese sTaRtUrL and + # eNdUrL markers is to get rid of trailing dots, + # commas and things like that. Note that there + # is a \x001 special char before and after each marker. + regsub -nocase -all {([^a-zA-Z0-9]+)(http://[^\(\)"<>\s]+)} $text "\\1\x001sTaRtUrL\\2eNdUrL\x001" text regsub -nocase -all {([^a-zA-Z0-9]+)(https://[^\(\)"<>\s]+)} $text "\\1\x001sTaRtUrL\\2eNdUrL\x001" text regsub -nocase -all {([^a-zA-Z0-9]+)(ftp://[^\(\)"<>\s]+)} $text "\\1\x001sTaRtUrL\\2eNdUrL\x001" text @@ -62,8 +62,8 @@ regsub -nocase -all {(href\s*=\s*['"]?)\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1\2} text regsub -nocase -all {(src\s*=\s*['"]?)\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1\2} text } - - # email links have the form xxx@xxx.xxx + + # email links have the form xxx@xxx.xxx # JCD: don't treat things =xxx@xxx.xxx as email since most # common occurance seems to be in urls (although VPATH bounce # emails like bounce-user=domain.com@sourcehost.com will then not @@ -80,29 +80,29 @@ } if { $encode_p} { - set myChars { - ª º À Á Â Ã Ä Å Æ Ç - È É Ê Ë Ì Í Î Ï Ð Ñ - Ò Ó Ô Õ Ö Ø Ù Ú Û Ü - Ý Þ ß à á â ã ä å æ - ç è é ê ë ì í î ï ð - ñ ò ó ô õ ö ø ù ú û - ü ý þ ÿ ¿ - } + set myChars { + ª º À Á Â Ã Ä Å Æ Ç + È É Ê Ë Ì Í Î Ï Ð Ñ + Ò Ó Ô Õ Ö Ø Ù Ú Û Ü + Ý Þ ß à á â ã ä å æ + ç è é ê ë ì í î ï ð + ñ ò ó ô õ ö ø ù ú û + ü ý þ ÿ ¿ + } - set myHTML { - ª º À Á Â Ã Ä Å &Aelig; Ç - È É Ê Ë Ì Í Î Ï Ð Ñ - Ò Ó Ô Õ Ö Ø Ù Ú Û Ü - Ý Þ ß à á â ã ä å æ - ç è é ê ë ì í î ï ð - ñ ò ó ô õ ö ø ù ú û - ü ý þ ÿ ¿ - } + set myHTML { + ª º À Á Â Ã Ä Å &Aelig; Ç + È É Ê Ë Ì Í Î Ï Ð Ñ + Ò Ó Ô Õ Ö Ø Ù Ú Û Ü + Ý Þ ß à á â ã ä å æ + ç è é ê ë ì í î ï ð + ñ ò ó ô õ ö ø ù ú û + ü ý þ ÿ ¿ + } - for { set i 0 } { $i < [ llength $myChars ] } { incr i } { - set text [ string map "[ lindex $myChars $i ] [ lindex $myHTML $i ]" $text ] - } + for { set i 0 } { $i < [ llength $myChars ] } { incr i } { + set text [ string map "[ lindex $myChars $i ] [ lindex $myHTML $i ]" $text ] + } } # Convert line breaks @@ -120,13 +120,13 @@ if { !$no_links_p } { # Move the end of the link before any punctuation marks at the end of the URL - regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdUrL\x001)} $text {\2\1} text - regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdEmAiL\x001)} $text {\2\1} text + regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdUrL\x001)} $text {\2\1} text + regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdEmAiL\x001)} $text {\2\1} text - # Dress the links and emails with A HREF - regsub -all {\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1} text - regsub -all {\x001sTaRtEmAiL([^\x001]*)eNdEmAiL\x001} $text {\1} text - set text [string trimleft $text] + # Dress the links and emails with A HREF + regsub -all {\x001sTaRtUrL([^\x001]*)eNdUrL\x001} $text {\1} text + regsub -all {\x001sTaRtEmAiL([^\x001]*)eNdEmAiL\x001} $text {\1} text + set text [string trimleft $text] } # JCD: Remove all the eNd sTaRt stuff and warn if we do it since its bad @@ -514,10 +514,10 @@ @creation-date November 10, 2000 } { if { [info exists attribute_array] } { - upvar $attribute_array attribute_array_var - return [ad_parse_html_attributes_upvar -attribute_array attribute_array_var html pos] + upvar $attribute_array attribute_array_var + return [ad_parse_html_attributes_upvar -attribute_array attribute_array_var html pos] } else { - return [ad_parse_html_attributes_upvar html pos] + return [ad_parse_html_attributes_upvar html pos] } } @@ -587,7 +587,7 @@ upvar $html_varname html upvar $pos_varname i if { [info exists attribute_array] } { - upvar $attribute_array attribute_array_var + upvar $attribute_array attribute_array_var } # This is where we're going to return the result @@ -597,59 +597,59 @@ # We maintain counter is so that we don't accidentally enter an infinite loop set count 0 while { $i < [string length $html] && ![string equal [string index $html $i] {>}] } { - if { [incr count] > 1000 } { - error "There appears to be a programming bug in ad_parse_html_attributes_upvar: We've entered an infinite loop. We are here: \noffset $i: [string range $html $i [expr {$i + 60}]]" - } - if { [string equal [string range $html $i [expr { $i + 1 }]] "/>"] } { - # This is an XML-style tag ending: <... /> - break - } - - # This regexp matches an attribute name and an equal sign, if present. - # Also eats whitespace before or after. - # The \A corresponds to ^, except it matches the position we're starting from, not the start of the string - if { ![regexp -indices -start $i {\A\s*([^\s=>]+)\s*(=?)\s*} $html match attr_name_idx equal_sign_idx] } { - # Apparantly, there's no attribute name here. Let's eat all whitespace and lonely equal signs. - regexp -indices -start $i {\A[\s=]*} $html match - set i [expr { [lindex $match 1] + 1 }] - } { - set attr_name [string tolower [string range $html [lindex $attr_name_idx 0] [lindex $attr_name_idx 1]]] - - # Move past the attribute name just found - set i [expr { [lindex $match 1] + 1}] - - # If there is an equal sign, we're expecting the next token to be a value - if { [lindex $equal_sign_idx 1] - [lindex $equal_sign_idx 0] < 0 } { - # No equal sign, no value - lappend attributes $attr_name - if { [info exists attribute_array] } { - set attribute_array_var($attr_name) {} - } - } else { - - # is there a single or double quote sign as the first character? - switch -- [string index $html $i] { - {"} { set exp {\A"([^"]*)"\s*} } - {'} { set exp {\A'([^']*)'\s*} } - default { set exp {\A([^\s>]*)\s*} } - } - if { ![regexp -indices -start $i $exp $html match attr_value_idx] } { - # No end quote. - set attr_value [string range $html [expr {$i+1}] end] - set i [string length $html] - } else { - set attr_value [string range $html [lindex $attr_value_idx 0] [lindex $attr_value_idx 1]] - set i [expr { [lindex $match 1] + 1}] - } + if { [incr count] > 1000 } { + error "There appears to be a programming bug in ad_parse_html_attributes_upvar: We've entered an infinite loop. We are here: \noffset $i: [string range $html $i [expr {$i + 60}]]" + } + if { [string equal [string range $html $i [expr { $i + 1 }]] "/>"] } { + # This is an XML-style tag ending: <... /> + break + } + + # This regexp matches an attribute name and an equal sign, if present. + # Also eats whitespace before or after. + # The \A corresponds to ^, except it matches the position we're starting from, not the start of the string + if { ![regexp -indices -start $i {\A\s*([^\s=>]+)\s*(=?)\s*} $html match attr_name_idx equal_sign_idx] } { + # Apparantly, there's no attribute name here. Let's eat all whitespace and lonely equal signs. + regexp -indices -start $i {\A[\s=]*} $html match + set i [expr { [lindex $match 1] + 1 }] + } { + set attr_name [string tolower [string range $html [lindex $attr_name_idx 0] [lindex $attr_name_idx 1]]] + + # Move past the attribute name just found + set i [expr { [lindex $match 1] + 1}] + + # If there is an equal sign, we're expecting the next token to be a value + if { [lindex $equal_sign_idx 1] - [lindex $equal_sign_idx 0] < 0 } { + # No equal sign, no value + lappend attributes $attr_name + if { [info exists attribute_array] } { + set attribute_array_var($attr_name) {} + } + } else { + + # is there a single or double quote sign as the first character? + switch -- [string index $html $i] { + {"} { set exp {\A"([^"]*)"\s*} } + {'} { set exp {\A'([^']*)'\s*} } + default { set exp {\A([^\s>]*)\s*} } + } + if { ![regexp -indices -start $i $exp $html match attr_value_idx] } { + # No end quote. + set attr_value [string range $html [expr {$i+1}] end] + set i [string length $html] + } else { + set attr_value [string range $html [lindex $attr_value_idx 0] [lindex $attr_value_idx 1]] + set i [expr { [lindex $match 1] + 1}] + } - set attr_value [util_expand_entities_ie_style $attr_value] - - lappend attributes [list $attr_name $attr_value] - if { [info exists attribute_array] } { - set attribute_array_var($attr_name) $attr_value - } - } - } + set attr_value [util_expand_entities_ie_style $attr_value] + + lappend attributes [list $attr_name $attr_value] + if { [info exists attribute_array] } { + set attribute_array_var($attr_name) $attr_value + } + } + } } return $attributes } @@ -670,7 +670,7 @@ } { if { [string first <% $html] > -1 } { - return "For security reasons, you're not allowed to have the less-than-percent combination in your input." + return "For security reasons, you're not allowed to have the less-than-percent combination in your input." } array set allowed_attribute [list] @@ -679,77 +679,77 @@ # Use the antispam tags for this package instance and whatever is on the kernel. set allowed_tags_list [concat \ - [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedTag antispam] \ - [ad_parameter_all_values_as_list AllowedTag antispam]] + [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedTag antispam] \ + [ad_parameter_all_values_as_list AllowedTag antispam]] set allowed_attributes_list [concat \ - [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedAttribute antispam] \ - [ad_parameter_all_values_as_list AllowedAttribute antispam]] + [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedAttribute antispam] \ + [ad_parameter_all_values_as_list AllowedAttribute antispam]] set allowed_protocols_list [concat \ - [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedProtocol antispam] \ - [ad_parameter_all_values_as_list AllowedProtocol antispam]] + [ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedProtocol antispam] \ + [ad_parameter_all_values_as_list AllowedProtocol antispam]] foreach tag $allowed_tags_list { - set allowed_tag([string tolower $tag]) 1 + set allowed_tag([string tolower $tag]) 1 } foreach attribute $allowed_attributes_list { - set allowed_attribute([string tolower $attribute]) 1 + set allowed_attribute([string tolower $attribute]) 1 } foreach tagname $allowed_tags_list { - set allowed_tag([string tolower $tagname]) 1 + set allowed_tag([string tolower $tagname]) 1 } foreach protocol $allowed_protocols_list { - set allowed_protocol([string tolower $protocol]) 1 + set allowed_protocol([string tolower $protocol]) 1 } # loop over all tags for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { - # move past the tag-opening < - incr i + # move past the tag-opening < + incr i - if { ![regexp -indices -start $i {\A/?([-_a-zA-Z0-9]+)\s*} $html match name_idx] } { - # The tag-opener isn't followed by USASCII letters (with or without optional inital slash) - # Not considered a tag. Shouldn't do any harm in browsers. - # (Tested with digits, with A syntax, with whitespace) - } else { - # The tag was valid ... now let's see if it's on the allowed list. - set tagname [string tolower [string range $html [lindex $name_idx 0] [lindex $name_idx 1]]] + if { ![regexp -indices -start $i {\A/?([-_a-zA-Z0-9]+)\s*} $html match name_idx] } { + # The tag-opener isn't followed by USASCII letters (with or without optional inital slash) + # Not considered a tag. Shouldn't do any harm in browsers. + # (Tested with digits, with A syntax, with whitespace) + } else { + # The tag was valid ... now let's see if it's on the allowed list. + set tagname [string tolower [string range $html [lindex $name_idx 0] [lindex $name_idx 1]]] - if { ![info exists allowed_tag($tagname)] && ![info exists allowed_tag(*)] } { - # Nope, this was a naughty tag. - return "For security reasons we only accept the submission of HTML - containing the following tags: [join $allowed_tags_list " "]. - You have a [string toupper $tagname] tag in there." - } else { - # Legal tag. - - # Make i point to the first character inside the tag, after the tag name and any whitespace - set i [expr { [lindex $match 1] + 1}] + if { ![info exists allowed_tag($tagname)] && ![info exists allowed_tag(*)] } { + # Nope, this was a naughty tag. + return "For security reasons we only accept the submission of HTML + containing the following tags: [join $allowed_tags_list " "]. + You have a [string toupper $tagname] tag in there." + } else { + # Legal tag. + + # Make i point to the first character inside the tag, after the tag name and any whitespace + set i [expr { [lindex $match 1] + 1}] + + set attr_list [ad_parse_html_attributes_upvar html i] - set attr_list [ad_parse_html_attributes_upvar html i] - - set attr_count 0 - foreach attribute $attr_list { - incr attr_count - set attr_name [lindex $attribute 0] - set attr_value [lindex $attribute 1] - - if { ![info exists allowed_attribute($attr_name)] && ![info exists allowed_attribute(*)] } { - return "The attribute '$attr_name' is not allowed for $tagname tags" - } - + set attr_count 0 + foreach attribute $attr_list { + incr attr_count + set attr_name [lindex $attribute 0] + set attr_value [lindex $attribute 1] + + if { ![info exists allowed_attribute($attr_name)] && ![info exists allowed_attribute(*)] } { + return "The attribute '$attr_name' is not allowed for $tagname tags" + } + if { [string tolower $attr_name] ne "style" } { if { [regexp {^\s*([^\s:]+):} $attr_value match protocol] } { if { ![info exists allowed_protocol([string tolower $protocol])] && ![info exists allowed_protocol(*)] } { return "Your URLs can only use these protocols: [join $allowed_protocols_list ", "]. - You have a '$protocol' protocol in there." + You have a '$protocol' protocol in there." } } } - } - } - } + } + } + } } return {} } @@ -799,8 +799,8 @@ set href_stack [list] for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { - # append everything up to and not including the tag-opening < - ad_html_to_text_put_text output [string range $html $last_tag_end [expr {$i - 1}]] + # append everything up to and not including the tag-opening < + ad_html_to_text_put_text output [string range $html $last_tag_end [expr {$i - 1}]] # Check that: # - we're not past the end of the string @@ -818,105 +818,105 @@ continue } - # we're inside a tag now. Find the end of it + # we're inside a tag now. Find the end of it - # make i point to the char after the < - incr i - set tag_start $i - - set count 0 - while 1 { + # make i point to the char after the < + incr i + set tag_start $i + + set count 0 + while 1 { if {[incr count] > 1000 } { # JCD: the programming bug is that an unmatched < in the input runs off forever looking for # it's closing > and in some long text like program listings you can have lots of quotes # before you find that > error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop." } - # Find the positions of the first quote, apostrophe and greater-than sign. - set quote_idx [string first \" $html $i] - set apostrophe_idx [string first ' $html $i] - set gt_idx [string first > $html $i] + # Find the positions of the first quote, apostrophe and greater-than sign. + set quote_idx [string first \" $html $i] + set apostrophe_idx [string first ' $html $i] + set gt_idx [string first > $html $i] - # If there is no greater-than sign, then the tag isn't closed. - if { $gt_idx == -1 } { - set i $length - break - } + # If there is no greater-than sign, then the tag isn't closed. + if { $gt_idx == -1 } { + set i $length + break + } - # Find the first of the quote and the apostrophe - if { $apostrophe_idx == -1 } { - set string_delimiter_idx $quote_idx - } else { - if { $quote_idx == -1 } { - set string_delimiter_idx $apostrophe_idx - } else { - if { $apostrophe_idx < $quote_idx } { - set string_delimiter_idx $apostrophe_idx - } else { - set string_delimiter_idx $quote_idx - } - } - } - set string_delimiter [string index $html $string_delimiter_idx] + # Find the first of the quote and the apostrophe + if { $apostrophe_idx == -1 } { + set string_delimiter_idx $quote_idx + } else { + if { $quote_idx == -1 } { + set string_delimiter_idx $apostrophe_idx + } else { + if { $apostrophe_idx < $quote_idx } { + set string_delimiter_idx $apostrophe_idx + } else { + set string_delimiter_idx $quote_idx + } + } + } + set string_delimiter [string index $html $string_delimiter_idx] - # If the greater than sign appears before any of the string delimters, we've found the tag end. - if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { - # we found the tag end - set i $gt_idx - break - } + # If the greater than sign appears before any of the string delimters, we've found the tag end. + if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { + # we found the tag end + set i $gt_idx + break + } - # Otherwise, we'll have to skip past the ending string delimiter - set i [string first $string_delimiter $html [incr string_delimiter_idx]] - if { $i == -1 } { - # Missing string end delimiter - set i $length - break - } - incr i - } - - set full_tag [string range $html $tag_start [expr { $i - 1 }]] + # Otherwise, we'll have to skip past the ending string delimiter + set i [string first $string_delimiter $html [incr string_delimiter_idx]] + if { $i == -1 } { + # Missing string end delimiter + set i $length + break + } + incr i + } + + set full_tag [string range $html $tag_start [expr { $i - 1 }]] + + if { ![regexp {^(/?)([^\s]+)[\s]*(\s.*)?$} $full_tag match slash tagname attributes] } { + # A malformed tag -- just delete it + } else { - if { ![regexp {^(/?)([^\s]+)[\s]*(\s.*)?$} $full_tag match slash tagname attributes] } { - # A malformed tag -- just delete it - } else { - - # Reset/create attribute array + # Reset/create attribute array array unset attribute_array - # Parse the attributes - ad_parse_html_attributes -attribute_array attribute_array $attributes + # Parse the attributes + ad_parse_html_attributes -attribute_array attribute_array $attributes - switch -- [string tolower $tagname] { - p - ul - ol - table { - set output(p) 1 - } - br { - ad_html_to_text_put_newline output - } - tr - td - th { - set output(br) 1 - } - h1 - h2 - h3 - h4 - h5 - h6 { - set output(p) 1 - if { $slash eq "" } { - ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]] - } - } - li { - set output(br) 1 - if { $slash eq "" } { - ad_html_to_text_put_text output "- " - } - } - strong - b { - ad_html_to_text_put_text output "*" - } - em - i - cite - u { - ad_html_to_text_put_text output "_" - } - a { + switch -- [string tolower $tagname] { + p - ul - ol - table { + set output(p) 1 + } + br { + ad_html_to_text_put_newline output + } + tr - td - th { + set output(br) 1 + } + h1 - h2 - h3 - h4 - h5 - h6 { + set output(p) 1 + if { $slash eq "" } { + ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]] + } + } + li { + set output(br) 1 + if { $slash eq "" } { + ad_html_to_text_put_text output "- " + } + } + strong - b { + ad_html_to_text_put_text output "*" + } + em - i - cite - u { + ad_html_to_text_put_text output "_" + } + a { if { !$no_format_p } { if { $slash eq ""} { if { [info exists attribute_array(href)] } { @@ -942,101 +942,101 @@ } } } - } - pre { - set output(p) 1 - if { $slash eq "" } { - incr output(pre) - } else { - incr output(pre) -1 - } - } - blockquote { - set output(p) 1 - if { $slash eq "" } { - incr output(blockquote) - incr output(maxlen) -4 - } else { - incr output(blockquote) -1 - incr output(maxlen) 4 - } - } - hr { - set output(p) 1 - ad_html_to_text_put_text output [string repeat "-" $output(maxlen)] - set output(p) 1 - } - q { - ad_html_to_text_put_text output \" - } - img { - if { $slash eq "" && !$no_format_p } { - set img_info {} - if { [info exists attribute_array(alt)] } { - lappend img_info "'$attribute_array(alt)'" - } - if { [info exists attribute_array(src)] } { - lappend img_info $attribute_array(src) - } - if { [llength $img_info] == 0 } { - ad_html_to_text_put_text output {[IMAGE]} - } else { - ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "] \]" - } - } - } - default { - # Other tag - if { $showtags_p } { - ad_html_to_text_put_text output "<$slash$tagname$attributes>" - } - } - } - } - - # set end of last tag to the character following the > - set last_tag_end [incr i] + } + pre { + set output(p) 1 + if { $slash eq "" } { + incr output(pre) + } else { + incr output(pre) -1 + } + } + blockquote { + set output(p) 1 + if { $slash eq "" } { + incr output(blockquote) + incr output(maxlen) -4 + } else { + incr output(blockquote) -1 + incr output(maxlen) 4 + } + } + hr { + set output(p) 1 + ad_html_to_text_put_text output [string repeat "-" $output(maxlen)] + set output(p) 1 + } + q { + ad_html_to_text_put_text output \" + } + img { + if { $slash eq "" && !$no_format_p } { + set img_info {} + if { [info exists attribute_array(alt)] } { + lappend img_info "'$attribute_array(alt)'" + } + if { [info exists attribute_array(src)] } { + lappend img_info $attribute_array(src) + } + if { [llength $img_info] == 0 } { + ad_html_to_text_put_text output {[IMAGE]} + } else { + ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "] \]" + } + } + } + default { + # Other tag + if { $showtags_p } { + ad_html_to_text_put_text output "<$slash$tagname$attributes>" + } + } + } + } + + # set end of last tag to the character following the > + set last_tag_end [incr i] } # append everything after the last tag ad_html_to_text_put_text output [string range $html $last_tag_end end] # Close any unclosed tags set output(pre) 0 while { $output(blockquote) > 0 } { - incr output(blockquote) -1 - incr output(maxlen) 4 + incr output(blockquote) -1 + incr output(maxlen) 4 } # write out URLs, if necessary: if { [llength $href_urls] > 0 } { - append output(text) "\n\n[join $href_urls "\n"]" + append output(text) "\n\n[join $href_urls "\n"]" } #--- # conversion like in ad_text_to_html # 2006/09/12 set myChars { - ª º À Á Â Ã Ä Å Æ Ç - È É Ê Ë Ì Í Î Ï Ð Ñ - Ò Ó Ô Õ Ö Ø Ù Ú Û Ü - Ý Þ ß à á â ã ä å æ - ç è é ê ë ì í î ï ð - ñ ò ó ô õ ö ø ù ú û - ü ý þ ÿ ¿ + ª º À Á Â Ã Ä Å Æ Ç + È É Ê Ë Ì Í Î Ï Ð Ñ + Ò Ó Ô Õ Ö Ø Ù Ú Û Ü + Ý Þ ß à á â ã ä å æ + ç è é ê ë ì í î ï ð + ñ ò ó ô õ ö ø ù ú û + ü ý þ ÿ ¿ } set myHTML { - ª º À Á Â Ã Ä Å &Aelig; Ç - È É Ê Ë Ì Í Î Ï Ð Ñ - Ò Ó Ô Õ Ö Ø Ù Ú Û Ü - Ý Þ ß à á â ã ä å æ - ç è é ê ë ì í î ï ð - ñ ò ó ô õ ö ø ù ú û - ü ý þ ÿ ¿ + ª º À Á Â Ã Ä Å &Aelig; Ç + È É Ê Ë Ì Í Î Ï Ð Ñ + Ò Ó Ô Õ Ö Ø Ù Ú Û Ü + Ý Þ ß à á â ã ä å æ + ç è é ê ë ì í î ï ð + ñ ò ó ô õ ö ø ù ú û + ü ý þ ÿ ¿ } for { set i 0 } { $i < [ llength $myHTML ] } { incr i } { - set output(text) [ string map "[ lindex $myHTML $i ] [ lindex $myChars $i ]" $output(text) ] + set output(text) [ string map "[ lindex $myHTML $i ] [ lindex $myChars $i ]" $output(text) ] } #--- @@ -1071,92 +1071,92 @@ # If we're not in a PRE if { $output(pre) <= 0 } { - # collapse all whitespace - regsub -all {\s+} $text { } text - - # if there's only spaces in the string, wait until later - if {$text eq " "} { - set output(space) 1 - return - } - - # if it's nothing, do nothing - if { $text eq "" } { - return - } - - # if the first character is a space, set the space bit - if {[string index $text 0] eq " "} { - set output(space) 1 - set text [string trimleft $text] - } + # collapse all whitespace + regsub -all {\s+} $text { } text + + # if there's only spaces in the string, wait until later + if {$text eq " "} { + set output(space) 1 + return + } + + # if it's nothing, do nothing + if { $text eq "" } { + return + } + + # if the first character is a space, set the space bit + if {[string index $text 0] eq " "} { + set output(space) 1 + set text [string trimleft $text] + } } else { - # we're in a PRE: clean line breaks and tabs - regsub -all {\r\n} $text "\n" text - regsub -all {\r} $text "\n" text - # tabs become four spaces - regsub -all {[\v\t]} $text { } text + # we're in a PRE: clean line breaks and tabs + regsub -all {\r\n} $text "\n" text + regsub -all {\r} $text "\n" text + # tabs become four spaces + regsub -all {[\v\t]} $text { } text } # output any pending paragraph breaks, line breaks or spaces. # as long as we're not at the beginning of the document if { $output(p) || $output(br) || $output(space) } { - if { $output(text) ne "" } { - if { $output(p) } { - ad_html_to_text_put_newline output - ad_html_to_text_put_newline output - } elseif { $output(br) } { - ad_html_to_text_put_newline output - } else { - # Don't add the space if we're at the beginning of a line, - # unless we're in a PRE - if { $output(pre) > 0 || $output(linelen) != 0 } { - append output(text) " " - incr output(linelen) - } - } - } - set output(p) 0 - set output(br) 0 - set output(space) 0 + if { $output(text) ne "" } { + if { $output(p) } { + ad_html_to_text_put_newline output + ad_html_to_text_put_newline output + } elseif { $output(br) } { + ad_html_to_text_put_newline output + } else { + # Don't add the space if we're at the beginning of a line, + # unless we're in a PRE + if { $output(pre) > 0 || $output(linelen) != 0 } { + append output(text) " " + incr output(linelen) + } + } + } + set output(p) 0 + set output(br) 0 + set output(space) 0 } # if the last character is a space, save it until the next time if { [regexp {^(.*) $} $text match text] } { - set output(space) 1 + set output(space) 1 } - + # If there's a blockquote in the beginning of the text, we wouldn't have caught it before if { $output(text) eq "" } { - append output(text) [string repeat { } $output(blockquote)] + append output(text) [string repeat { } $output(blockquote)] } # Now output the text. while { [regexp {^( +|\s|\S+)(.*)$} $text match word text] } { - # convert  's - # We do this now, so that they're displayed, but not treated, whitespace. - regsub -all { } $word { } word - - set wordlen [string length $word] - switch -glob -- $word { - " *" { - append output(text) "$word" - incr output(linelen) $wordlen - } - "\n" { - if { $output(text) ne "" } { - ad_html_to_text_put_newline output - } - } - default { - if { [expr {$output(linelen) + $wordlen}] > $output(maxlen) && $output(maxlen) != 0 } { - ad_html_to_text_put_newline output - } - append output(text) "$word" - incr output(linelen) $wordlen - } - } + # convert  's + # We do this now, so that they're displayed, but not treated, whitespace. + regsub -all { } $word { } word + + set wordlen [string length $word] + switch -glob -- $word { + " *" { + append output(text) "$word" + incr output(linelen) $wordlen + } + "\n" { + if { $output(text) ne "" } { + ad_html_to_text_put_newline output + } + } + default { + if { [expr {$output(linelen) + $wordlen}] > $output(maxlen) && $output(maxlen) != 0 } { + ad_html_to_text_put_newline output + } + append output(text) "$word" + incr output(linelen) $wordlen + } + } } } @@ -1215,51 +1215,49 @@ array set entities { lt < gt > quot \" ob \{ cb \} amp & } # Expand HTML entities on the value - for { set i [string first & $html] } \ - { $i != -1 } \ - { set i [string first & $html $i] } { - - set match_p 0 - switch -regexp -- [string index $html [expr {$i+1}]] { - # { - switch -regexp -- [string index $html [expr {$i+2}]] { - [xX] { - regexp -indices -start [expr {$i+3}] {[0-9a-fA-F]*} $html hex_idx - set hex [string range $html [lindex $hex_idx 0] [lindex $hex_idx 1]] - set html [string replace $html $i [lindex $hex_idx 1] \ - [subst -nocommands -novariables "\\x$hex"]] - set match_p 1 - } - [0-9] { - regexp -indices -start [expr {$i+2}] {[0-9]*} $html dec_idx - set dec [string range $html [lindex $dec_idx 0] [lindex $dec_idx 1]] + for { set i [string first & $html] } { $i != -1 } { set i [string first & $html $i] } { + + set match_p 0 + switch -regexp -- [string index $html [expr {$i+1}]] { + # { + switch -regexp -- [string index $html [expr {$i+2}]] { + [xX] { + regexp -indices -start [expr {$i+3}] {[0-9a-fA-F]*} $html hex_idx + set hex [string range $html [lindex $hex_idx 0] [lindex $hex_idx 1]] + set html [string replace $html $i [lindex $hex_idx 1] \ + [subst -nocommands -novariables "\\x$hex"]] + set match_p 1 + } + [0-9] { + regexp -indices -start [expr {$i+2}] {[0-9]*} $html dec_idx + set dec [string range $html [lindex $dec_idx 0] [lindex $dec_idx 1]] # $dec might contain leading 0s. Since format evaluates $dec as expr # leading 0s cause octal interpretation and therefore errors on e.g. & - set dec [string trimleft $dec 0] + set dec [string trimleft $dec 0] if {$dec eq ""} {set dec 0} - set html [string replace $html $i [lindex $dec_idx 1] \ - [format "%c" $dec]] - set match_p 1 - } - } - } - [a-zA-Z] { - if { [regexp -indices -start [expr {$i}] {\A&([^\s;]+)} $html match entity_idx] } { - set entity [string tolower [string range $html [lindex $entity_idx 0] [lindex $entity_idx 1]]] - if { [info exists entities($entity)] } { - set html [string replace $html $i [lindex $match 1] $entities($entity)] - } - set match_p 1 - } - } - } - incr i - if { $match_p } { - # remove trailing semicolon - if { [string equal [string index $html $i] {;}] } { - set html [string replace $html $i $i] - } - } + set html [string replace $html $i [lindex $dec_idx 1] \ + [format "%c" $dec]] + set match_p 1 + } + } + } + [a-zA-Z] { + if { [regexp -indices -start [expr {$i}] {\A&([^\s;]+)} $html match entity_idx] } { + set entity [string tolower [string range $html [lindex $entity_idx 0] [lindex $entity_idx 1]]] + if { [info exists entities($entity)] } { + set html [string replace $html $i [lindex $match 1] $entities($entity)] + } + set match_p 1 + } + } + } + incr i + if { $match_p } { + # remove trailing semicolon + if { [string equal [string index $html $i] {;}] } { + set html [string replace $html $i $i] + } + } } return $html } @@ -1279,44 +1277,44 @@ set result_rows [list] set start_of_line_index 0 while 1 { - set this_line [string range $input $start_of_line_index [expr {$start_of_line_index + $threshold - 1}]] - if { $this_line eq "" } { - return [join $result_rows "\n"] - } - set first_new_line_pos [string first "\n" $this_line] - if { $first_new_line_pos != -1 } { - # there is a newline - lappend result_rows [string range $input $start_of_line_index [expr {$start_of_line_index + $first_new_line_pos - 1}]] - set start_of_line_index [expr {$start_of_line_index + $first_new_line_pos + 1}] - continue - } - if { [expr {$start_of_line_index + $threshold + 1}] >= [string length $input] } { - # we're on the last line and it is < threshold so just return it - lappend result_rows $this_line - return [join $result_rows "\n"] - } - set last_space_pos [string last " " $this_line] - if { $last_space_pos == -1 } { - # no space found! Try the first space in the whole rest of the string - set next_space_pos [string first " " [string range $input $start_of_line_index end]] - set next_newline_pos [string first "\n" [string range $input $start_of_line_index end]] - if {$next_space_pos == -1} { - set last_space_pos $next_newline_pos - } elseif {$next_space_pos < $next_newline_pos} { - set last_space_pos $next_space_pos - } else { - set last_space_pos $next_newline_pos - } - if { $last_space_pos == -1 } { - # didn't find any more whitespace, append the whole thing as a line - lappend result_rows [string range $input $start_of_line_index end] - return [join $result_rows "\n"] - } - } - # OK, we have a last space pos of some sort - set real_index_of_space [expr {$start_of_line_index + $last_space_pos}] - lappend result_rows [string range $input $start_of_line_index [expr {$real_index_of_space - 1}]] - set start_of_line_index [expr {$start_of_line_index + $last_space_pos + 1}] + set this_line [string range $input $start_of_line_index [expr {$start_of_line_index + $threshold - 1}]] + if { $this_line eq "" } { + return [join $result_rows "\n"] + } + set first_new_line_pos [string first "\n" $this_line] + if { $first_new_line_pos != -1 } { + # there is a newline + lappend result_rows [string range $input $start_of_line_index [expr {$start_of_line_index + $first_new_line_pos - 1}]] + set start_of_line_index [expr {$start_of_line_index + $first_new_line_pos + 1}] + continue + } + if { [expr {$start_of_line_index + $threshold + 1}] >= [string length $input] } { + # we're on the last line and it is < threshold so just return it + lappend result_rows $this_line + return [join $result_rows "\n"] + } + set last_space_pos [string last " " $this_line] + if { $last_space_pos == -1 } { + # no space found! Try the first space in the whole rest of the string + set next_space_pos [string first " " [string range $input $start_of_line_index end]] + set next_newline_pos [string first "\n" [string range $input $start_of_line_index end]] + if {$next_space_pos == -1} { + set last_space_pos $next_newline_pos + } elseif {$next_space_pos < $next_newline_pos} { + set last_space_pos $next_space_pos + } else { + set last_space_pos $next_newline_pos + } + if { $last_space_pos == -1 } { + # didn't find any more whitespace, append the whole thing as a line + lappend result_rows [string range $input $start_of_line_index end] + return [join $result_rows "\n"] + } + } + # OK, we have a last space pos of some sort + set real_index_of_space [expr {$start_of_line_index + $last_space_pos}] + lappend result_rows [string range $input $start_of_line_index [expr {$real_index_of_space - 1}]] + set start_of_line_index [expr {$start_of_line_index + $last_space_pos + 1}] } } @@ -1424,57 +1422,57 @@ # Do the conversion switch $from { text/enhanced { - switch $to { + switch $to { text/html { set text [ad_enhanced_text_to_html $text] - } + } text/plain { - set text [ad_enhanced_text_to_plain_text -maxlen $maxlen -- $text] - } - } + set text [ad_enhanced_text_to_plain_text -maxlen $maxlen -- $text] + } + } } text/plain { - switch $to { + switch $to { text/html { - set text [ad_text_to_html -- $text] - } + set text [ad_text_to_html -- $text] + } text/plain { - set text [wrap_string $text $maxlen] - } - } + set text [wrap_string $text $maxlen] + } + } } text/fixed-width { - switch $to { + switch $to { text/html { - set text "
[ad_text_to_html -no_lines -- $text]
" - } + set text "
[ad_text_to_html -no_lines -- $text]
" + } text/plain { - set text [wrap_string $text $maxlen] - } - } - } + set text [wrap_string $text $maxlen] + } + } + } text/html { - switch $to { + switch $to { text/html { # Handled below - } + } text/plain { - set text [ad_html_to_text -maxlen $maxlen -- $text] - } - } - } - text/xml { - switch $to { + set text [ad_html_to_text -maxlen $maxlen -- $text] + } + } + } + text/xml { + switch $to { text/html { set text "
[ad_text_to_html -no_lines -- $text]
" - } + } text/plain { - set text [wrap_string $text $maxlen] - } - } - } + set text [wrap_string $text $maxlen] + } + } + } } - + # Handle closing of HTML tags, truncation switch $to { text/html { @@ -1484,7 +1482,7 @@ set text [string_truncate -ellipsis $ellipsis -more $more -len $truncate_len -- $text] } } - + return $text } @@ -1526,9 +1524,9 @@ @creation-date 19 July 2000 } { if {$html_p eq "t"} { - set from html + set from html } else { - set from text + set from text } return [ad_html_text_convert -from $from -to html -- $text] } @@ -1547,9 +1545,9 @@ @creation-date 19 July 2000 } { if {$html_p eq "t"} { - set from html + set from html } else { - set from text + set from text } return [ad_html_text_convert -from $from -to text -- $text] } @@ -1567,9 +1565,9 @@ @creation-date 19 July 2000 } { if { [regexp -nocase {

} $text] || [regexp -nocase {
} $text] || [regexp -nocase { 0 } { if { [string length $string] > $len } { set end_index [expr $len-[string length $ellipsis]-1] - + # Back up to the nearest whitespace if { ![string is space [string index $string [expr {$end_index + 1}]]] } { while { $end_index >= 0 && ![string is space [string index $string $end_index]] } { @@ -1686,10 +1684,10 @@ @see ad_text_to_html } { if { [regexp -nocase {

} $raw_string] || [regexp -nocase {
} $raw_string] } { - # user was already trying to do this as HTML - return $raw_string + # user was already trying to do this as HTML + return $raw_string } else { - return [ad_text_to_html -no_links -- $raw_string] + return [ad_text_to_html -no_links -- $raw_string] } } @@ -1703,9 +1701,9 @@ } { if { $html_p eq "t" } { - return $raw_string + return $raw_string } else { - return [util_convert_plaintext_to_html $raw_string] + return [util_convert_plaintext_to_html $raw_string] } } @@ -1735,4 +1733,3 @@ } { return [ad_quotehtml $arg] } -