Index: openacs-4/packages/feed-parser/tcl/feed-parser-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/feed-parser/tcl/feed-parser-procs.tcl,v diff -u -r1.1 -r1.2 --- openacs-4/packages/feed-parser/tcl/feed-parser-procs.tcl 19 Mar 2004 19:45:28 -0000 1.1 +++ openacs-4/packages/feed-parser/tcl/feed-parser-procs.tcl 30 May 2004 10:16:23 -0000 1.2 @@ -8,6 +8,181 @@ namespace eval feed_parser {} +ad_proc -public feed_parser::sort_result { + {-result:required} +} { + @author Simon Carstensen +} { + set sorted [list] + for {set i 0} {$i < [llength $result]} {incr i} { + lappend sorted [lindex $result end-$i] + } + return $sorted +} + +ad_proc -private feed_parser::items_fetch { + {-doc_node:required} +} { + Takes a tDOM document node which is supposed to be some + form of RSS. Returns a list with the item nodes. Returns + an empty list if none could be found. + + @author Guan Yang (guan@unicast.org) + @creation-date 2003-07-03 +} { + set items [$doc_node selectNodes {//*[local-name()='item' or local-name()='entry']}] + return $items +} + +ad_proc -private feed_parser::item_parse { + {-item_node:required} +} { + Takes a tDOM node which is supposed to represent an RSS item, + and returns a list with the RSS/RDF elements of that node. + + @author Simon Carstensen + @author Guan Yang (guan@unicast.org) +} { + set title "" + set link "" + set guid "" + set permalink_p false + set description "" + set content_encoded "" + + feed_parser::dom::set_child_text -node $item_node -child title + feed_parser::dom::set_child_text -node $item_node -child link + feed_parser::dom::set_child_text -node $item_node -child guid + feed_parser::dom::set_child_text -node $item_node -child description + + set maybe_atom_p 0 + + # Try to handle Atom link + if { [string equal $link ""] } { + set link_attr [$item_node selectNodes {*[local-name()='link']/@href}] + if { [llength $link_attr] == 1 } { + set link [lindex [lindex $link_attr 0] 1] + set maybe_atom_p 1 + } + } + + set encoded_nodes [$item_node selectNodes {*[local-name()='encoded' and namespace-uri()='http://purl.org/rss/1.0/modules/content/']}] + if { [llength $encoded_nodes] == 1 } { + set encoded_node [lindex $encoded_nodes 0] + set content_encoded [$encoded_node text] + } + + if { [llength [$item_node selectNodes "*\[local-name()='guid'\]"]] } { + # If guid exists, we assume that it's a permalink + set permalink_p true + } + + # Retrieve isPermaLink attribute + set isPermaLink_nodes [$item_node selectNodes "*\[local-name()='guid'\]/@isPermaLink"] + if { [llength isPermaLink_nodes] == 1} { + set isPermaLink_node [lindex $isPermaLink_nodes 0] + set isPermaLink [lindex $isPermaLink_node 1] + if { [string equal $isPermaLink false] } { + set permalink_p false + } + } + + if { [empty_string_p $link] } { + if { [exists_and_not_null guid] } { + set link $guid + set permalink_p true + } elseif { [empty_string_p $guid] && ![string equal $link $guid] } { + set permalink_p true + } + } + + # Try to handle Atom guid + if { [empty_string_p $guid] && $maybe_atom_p } { + feed_parser::dom::set_child_text -node $item_node -child id + if { [info exists id] } { + # We don't really know if it's an URL + set guid $id + if { [util_url_valid_p $id] } { + set permalink_p true + } else { + set permalink_p false + } + } + } + + # For Atom, description is summary, content is content_encoded + if { $maybe_atom_p } { + feed_parser::dom::set_child_text -node $item_node -child summary + if { [info exists summary] } { + set description $summary + } + + feed_parser::dom::set_child_text -node $item_node -child content + if { [info exists content] } { + set content_encoded $content + } + } + + #remove unsafe html + set description [feed_parser::remove_unsafe_html -html $description] + + return [list title $title link $link guid $guid permalink_p $permalink_p description $description content_encoded $content_encoded] +} + +ad_proc -private feed_parser::channel_parse { + {-channel_node:required} +} { + Takes a tDOM node which is supposed to represent an RSS + channel, and returns a list with the RSS/RDF elements + of that node. This proc should later be extended to + support Dublin Core elements and other funk. + + @author Guan Yang (guan@unicast.org) + @creation-date 2003-07-03 +} { + set properties [list title link description language copyright lastBuildDate docs generator managingEditor webMaster] + + foreach property $properties { + set $property "" + feed_parser::dom::set_child_text -node $channel_node -child $property + set channel($property) [set $property] + } + + set channel_name [$channel_node nodeName] + + # Do weird Atom-like stuff + if { [string equal $channel_name "feed"] } { + # link + if { [string equal $link ""] } { + # Link is in a href + set link_node [$channel_node selectNodes {*[local-name()='link' and @rel = 'alternate' and @type = 'text/html']/@href}] + if { [llength $link_node] == 1 } { + set link_node [lindex $link_node 0] + set channel(link) [lindex $link_node 1] + } + } + + # author + set author_node [$channel_node selectNodes {*[local-name()='author']}] + if { [llength $author_node] == 1 } { + set author_node [lindex $author_node 0] + feed_parser::dom::set_child_text -node $author_node -child name + feed_parser::dom::set_child_text -node $author_node -child email + if { [info exists email] && [info exists name] } { + set channel(managingEditor) "$email ($name)" + } + } + + # tagline + feed_parser::dom::set_child_text -node $channel_node -child tagline + if { [info exists tagline] } { + set channel(tagline) $tagline + } + } + + return [array get channel] +} + ad_proc -private feed_parser::remove_unsafe_html { -html:required } { @@ -108,20 +283,20 @@ if { [string equal $doc_name "feed"] } { # It's an Atom feed - set channel [news_aggregator::channel_parse \ + set channel [feed_parser::channel_parse \ -channel_node $doc_node] } else { # It looks RSS/RDF'fy - set channel [news_aggregator::channel_parse \ + set channel [feed_parser::channel_parse \ -channel_node [$doc_node getElementsByTagName channel]] } - set item_nodes [news_aggregator::items_fetch -doc_node $doc_node] - set item_nodes [news_aggregator::sort_result -result $item_nodes] + set item_nodes [feed_parser::items_fetch -doc_node $doc_node] + set item_nodes [feed_parser::sort_result -result $item_nodes] set items [list] foreach item_node $item_nodes { - lappend items [news_aggregator::item_parse -item_node $item_node] + lappend items [feed_parser::item_parse -item_node $item_node] } $doc delete Index: openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl,v diff -u -r1.7 -r1.8 --- openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl 20 Mar 2004 11:12:29 -0000 1.7 +++ openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl 30 May 2004 10:16:09 -0000 1.8 @@ -21,7 +21,6 @@ return [string equal $link $domain] } - ad_proc -public news_aggregator::last_scanned { {-diff:required} } { @@ -44,318 +43,3 @@ return $to_return } -ad_proc -private news_aggregator::dom_set_child_text { - {-node:required} - {-child:required} -} { - If node contains a child node named child, - the variable child is set to the text of that node - in the caller's stack frame. - - @author Guan Yang - @creation-date 2003-07-03 -} { - if { [$node hasChildNodes] } { - set child_nodes [$node selectNodes "*\[local-name()='$child'\]"] - if { [llength $child_nodes] == 1 } { - set child_node [lindex $child_nodes 0] - upvar $child var - set var [$child_node text] - } - } -} - -ad_proc -private news_aggregator::channel_parse { - {-channel_node:required} -} { - Takes a tDOM node which is supposed to represent an RSS - channel, and returns a list with the RSS/RDF elements - of that node. This proc should later be extended to - support Dublin Core elements and other funk. - - @author Guan Yang (guan@unicast.org) - @creation-date 2003-07-03 -} { - set properties [list title link description language copyright lastBuildDate docs generator managingEditor webMaster] - - foreach property $properties { - set $property "" - news_aggregator::dom_set_child_text -node $channel_node -child $property - set channel($property) [set $property] - } - - set channel_name [$channel_node nodeName] - - # Do weird Atom-like stuff - if { [string equal $channel_name "feed"] } { - # link - if { [string equal $link ""] } { - # Link is in a href - set link_node [$channel_node selectNodes {*[local-name()='link' and @rel = 'alternate' and @type = 'text/html']/@href}] - if { [llength $link_node] == 1 } { - set link_node [lindex $link_node 0] - set channel(link) [lindex $link_node 1] - } - } - - # author - set author_node [$channel_node selectNodes {*[local-name()='author']}] - if { [llength $author_node] == 1 } { - set author_node [lindex $author_node 0] - news_aggregator::dom_set_child_text -node $author_node -child name - news_aggregator::dom_set_child_text -node $author_node -child email - if { [info exists email] && [info exists name] } { - set channel(managingEditor) "$email ($name)" - } - } - - # tagline - news_aggregator::dom_set_child_text -node $channel_node -child tagline - if { [info exists tagline] } { - set channel(tagline) $tagline - } - } - - return [array get channel] -} - -ad_proc -private news_aggregator::items_fetch { - {-doc_node:required} -} { - Takes a tDOM document node which is supposed to be some - form of RSS. Returns a list with the item nodes. Returns - an empty list if none could be found. - - @author Guan Yang (guan@unicast.org) - @creation-date 2003-07-03 -} { - set items [$doc_node selectNodes {//*[local-name()='item' or local-name()='entry']}] - return $items -} - -ad_proc -private news_aggregator::item_parse { - {-item_node:required} -} { - Takes a tDOM node which is supposed to represent an RSS item, - and returns a list with the RSS/RDF elements of that node. - - @author Simon Carstensen - @author Guan Yang (guan@unicast.org) -} { - set title "" - set link "" - set guid "" - set permalink_p false - set description "" - set content_encoded "" - - news_aggregator::dom_set_child_text -node $item_node -child title - news_aggregator::dom_set_child_text -node $item_node -child link - news_aggregator::dom_set_child_text -node $item_node -child guid - news_aggregator::dom_set_child_text -node $item_node -child description - - set maybe_atom_p 0 - - # Try to handle Atom link - if { [string equal $link ""] } { - set link_attr [$item_node selectNodes {*[local-name()='link']/@href}] - if { [llength $link_attr] == 1 } { - set link [lindex [lindex $link_attr 0] 1] - set maybe_atom_p 1 - } - } - - set encoded_nodes [$item_node selectNodes {*[local-name()='encoded' and namespace-uri()='http://purl.org/rss/1.0/modules/content/']}] - if { [llength $encoded_nodes] == 1 } { - set encoded_node [lindex $encoded_nodes 0] - set content_encoded [$encoded_node text] - } - - if { [llength [$item_node selectNodes "*\[local-name()='guid'\]"]] } { - # If guid exists, we assume that it's a permalink - set permalink_p true - } - - # Retrieve isPermaLink attribute - set isPermaLink_nodes [$item_node selectNodes "*\[local-name()='guid'\]/@isPermaLink"] - if { [llength isPermaLink_nodes] == 1} { - set isPermaLink_node [lindex $isPermaLink_nodes 0] - set isPermaLink [lindex $isPermaLink_node 1] - if { [string equal $isPermaLink false] } { - set permalink_p false - } - } - - if { [empty_string_p $link] } { - if { [exists_and_not_null guid] } { - set link $guid - set permalink_p true - } elseif { [empty_string_p $guid] && ![string equal $link $guid] } { - set permalink_p true - } - } - - # Try to handle Atom guid - if { [empty_string_p $guid] && $maybe_atom_p } { - news_aggregator::dom_set_child_text -node $item_node -child id - if { [info exists id] } { - # We don't really know if it's an URL - set guid $id - if { [util_url_valid_p $id] } { - set permalink_p true - } else { - set permalink_p false - } - } - } - - # For Atom, description is summary, content is content_encoded - if { $maybe_atom_p } { - news_aggregator::dom_set_child_text -node $item_node -child summary - if { [info exists summary] } { - set description $summary - } - - news_aggregator::dom_set_child_text -node $item_node -child content - if { [info exists content] } { - set content_encoded $content - } - } - - #remove unsafe html - set description [news_aggregator::remove_unsafe_html -html $description] - - return [list title $title link $link guid $guid permalink_p $permalink_p description $description content_encoded $content_encoded] -} - -ad_proc -private news_aggregator::external_entity { - base_uri - system_identifier - public_identifier -} { - A callback for tDOM to resolve external entities. - - @author Guan Yang - @creation-date 2003-07-03 -} { - return [list string "" ""] -} - -ad_proc -public news_aggregator::sort_result { - {-result:required} -} { - @author Simon Carstensen -} { - set sorted [list] - for {set i 0} {$i < [llength $result]} {incr i} { - lappend sorted [lindex $result end-$i] - } - return $sorted -} - -ad_proc -public news_aggregator::parse { - {-xml:required} -} { - The workhorse of news-aggregator: A Very Ugly RSS Parser. - Now also supports Atom and weird formats in between. - - @author Simon Carstensen - @author Guan Yang (guan@unicast.org) -} { - if { [catch { - # Pre-process the doc and remove any processing instruction - regsub {^<\?xml [^\?]+\?>} $xml {} xml - set doc [dom parse $xml] - set doc_node [$doc documentElement] - set node_name [$doc_node nodeName] - - # feed is the doc-node name for Atom feeds - if { [lsearch {rdf RDF rdf:RDF rss feed} $node_name] == -1 } { - ns_log Debug "news_aggregator::parse: doc node name is not rdf, RDF, rdf:RDF or rss" - set rss_p 0 - } else { - set rss_p 1 - } - } errmsg] } { - ns_log Debug "news_aggregator::parse: error in initial itdom parse, errmsg = $errmsg" - set rss_p 0 - } - - if { !$rss_p } { - # not valid xml, let's try autodiscovery - ns_log Debug "news_aggregator::parse: not valid xml, we'll try autodiscovery" - - set doc [dom parse -html $xml] - set doc_node [$doc documentElement] - - set link_nodes [$doc_node selectNodes {/html/head/link[@rel='alternate' and @title='RSS' and @type='application/rss+xml']/@href}] - - $doc delete - - if { [llength $link_nodes] == 1} { - set link_node [lindex $link_nodes 0] - set feed_url [lindex $link_node 1] - array set f [ad_httpget -url $feed_url] - return [news_aggregator::parse -xml $f(page)] - } - return 0 - } - - if { [catch { - set doc_name [$doc_node nodeName] - - if { [string equal $doc_name "feed"] } { - # It's an Atom feed - set channel [news_aggregator::channel_parse \ - -channel_node $doc_node] - } else { - # It looks RSS/RDF'fy - set channel [news_aggregator::channel_parse \ - -channel_node [$doc_node getElementsByTagName channel]] - } - - set item_nodes [news_aggregator::items_fetch -doc_node $doc_node] - set item_nodes [news_aggregator::sort_result -result $item_nodes] - set items [list] - - foreach item_node $item_nodes { - lappend items [news_aggregator::item_parse -item_node $item_node] - } - - $doc delete - } err] } { - return 0 - } else { - return [list channel "$channel" items "$items"] - } -} - -ad_proc -public news_aggregator::remove_unsafe_html { - -html:required -} { - Make sure we are consuming RSS safely by removing unsafe tags. - - See http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely.html. - - @author Simon Carstensen - @creation-date 2003-07-06 -} { - - set unsafe_tags { - script - embed - object - frameset - frame - iframe - meta - link - style - } - - foreach tag $unsafe_tags { - regsub -all "(<$tag\[^>\]*>(\[^<\]*)?)+" $html {} html - } - - return $html -}