Index: openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl,v
diff -u -r1.6 -r1.7
--- openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl	16 Nov 2003 09:21:20 -0000	1.6
+++ openacs-4/packages/news-aggregator/tcl/news-aggregator-procs.tcl	20 Mar 2004 11:12:29 -0000	1.7
@@ -1,74 +1,35 @@
 ad_library {
-     Procs used by the News Aggregator module.
+     Procs used by the News Aggregator application.
+
      @author Simon Carstensen (simon@bcuni.net)
-     @creation-date January 2003
+     @author Guan Yang (guan@unicast.org)
+     @creation-date 2003-06-28
 }
 
-ad_proc -public util_httpget_full {
-    url {headers ""} {timeout 30} {depth 0}
-} {
-    Just like ns_httpget, but first optional argument is an ns_set of
-    headers to send during the fetch.
-} {
-    if {[incr depth] > 10} {
-        return -code error "util_httpget:  Recursive redirection:  $url"
-    }
-    ns_log Debug "Getting {$url} {$headers} {$timeout} {$depth}"
-    set http [ns_httpopen GET $url $headers $timeout]
-    set rfd [lindex $http 0]
-    close [lindex $http 1]
-    set headers [lindex $http 2]
-    set response [ns_set name $headers]
-    set status [lindex $response 1]
-    set last_modified [ns_set iget $headers last-modified]
-    if {$status == 302} {
-	set location [ns_set iget $headers location]
-	if {$location != ""} {
-            ns_set free $headers
-            close $rfd
-	    return [util_httpget_full $location {} $timeout $depth]
-	}
-    } elseif { $status == 304 } {
-        # The requested variant has not been modified since the time specified
-        # A conditional get didn't return anything.
-        ns_set free $headers
-        close $rfd
-	return [list "" $status]
-    }
-    set length [ns_set iget $headers content-length]
-    if [string match "" $length] {set length -1}
-    set err [catch {
-        while 1 {
-            set buf [_ns_http_read $timeout $rfd $length]
-            append page $buf
-            if [string match "" $buf] break
-            if {$length > 0} {
-                incr length -[string length $buf]
-                if {$length <= 0} break
-            }
-        }
-    } errMsg]
-	ns_set free $headers
-	close $rfd
-	if $err {
-	    global errorInfo
-	    return -code error -errorinfo $errorInfo $errMsg
-	}
-	return [list $page $status $last_modified]
-    }
 
-ad_proc -public na_check_link {
-    domain
-    link
+namespace eval news_aggregator {}
+
+
+ad_proc -public news_aggregator::check_link {
+    {-domain:required}
+    {-link:required}
 } {
+    @author Simon Carstensen
+} {
     regexp {(https?://[^/]+)+} $domain domain
     regexp {(https?://[^/]+)+} $link link
     return [string equal $link $domain]
 }
 
-ad_proc -public na_last_scanned {
-    diff
+
+ad_proc -public news_aggregator::last_scanned {
+    {-diff:required}
 } {
+
+    Returns the number of hours and minutes since the feed was last updated.
+
+    @author Simon Carstensen    
+} {
     if {$diff < 120 && $diff > 60} {
 	set to_return "1 hour and "
     } elseif {$diff >= 60} {
@@ -83,197 +44,318 @@
     return $to_return
 }
 
-ad_proc -public na_get_nodes {
-    nodes
+ad_proc -private news_aggregator::dom_set_child_text {
+    {-node:required}
+    {-child:required}
 } {
-    foreach node_id $nodes {
-        switch -- [xml_node_get_name $node_id] {
-            title {
-		catch {set title [xml_node_get_content $node_id]}
-	    }
-            "link" {
-                catch {set link [xml_node_get_content $node_id]}
-	    }
-	    "guid" {
-                catch {set guid [xml_node_get_content $node_id]}
+    If node contains a child node named child,
+    the variable child is set to the text of that node
+    in the caller's stack frame.
+
+    @author Guan Yang
+    @creation-date 2003-07-03
+} {
+    if { [$node hasChildNodes] } {
+        set child_nodes [$node selectNodes "*\[local-name()='$child'\]"]
+        if { [llength $child_nodes] == 1 } {
+            set child_node [lindex $child_nodes 0]
+            upvar $child var
+            set var [$child_node text]
+        }
+    }
+}
+
+ad_proc -private news_aggregator::channel_parse {
+    {-channel_node:required}
+} {
+    Takes a tDOM node which is supposed to represent an RSS
+    channel, and returns a list with the RSS/RDF elements
+    of that node. This proc should later be extended to
+    support Dublin Core elements and other funk.
+
+    @author Guan Yang (guan@unicast.org)
+    @creation-date 2003-07-03
+} {
+    set properties [list title link description language copyright lastBuildDate docs generator managingEditor webMaster]
+
+    foreach property $properties {
+        set $property ""
+	    news_aggregator::dom_set_child_text -node $channel_node -child $property
+        set channel($property) [set $property]
+    }
+    
+    set channel_name [$channel_node nodeName]
+    
+    # Do weird Atom-like stuff
+    if { [string equal $channel_name "feed"] } {
+        # link
+        if { [string equal $link ""] } {
+            # Link is in a href
+            set link_node [$channel_node selectNodes {*[local-name()='link' and @rel = 'alternate' and @type = 'text/html']/@href}]
+            if { [llength $link_node] == 1 } {
+                set link_node [lindex $link_node 0]
+                set channel(link) [lindex $link_node 1]
             }
-            "description" {
-                catch {set description [xml_node_get_content $node_id]}
+        }
+        
+        # author
+        set author_node [$channel_node selectNodes {*[local-name()='author']}]
+        if { [llength $author_node] == 1 } {
+            set author_node [lindex $author_node 0]
+            news_aggregator::dom_set_child_text -node $author_node -child name
+            news_aggregator::dom_set_child_text -node $author_node -child email
+            if { [info exists email] && [info exists name] } {
+                set channel(managingEditor) "$email ($name)"
             }
-	}
+        }
+        
+        # tagline
+        news_aggregator::dom_set_child_text -node $channel_node -child tagline
+        if { [info exists tagline] } {
+            set channel(tagline) $tagline
+        }
     }
 
-    if { ![exists_and_not_null title] } {
-	set title ""
+    return [array get channel]
+}
+
+ad_proc -private news_aggregator::items_fetch {
+    {-doc_node:required}
+} {
+    Takes a tDOM document node which is supposed to be some
+    form of RSS. Returns a list with the item nodes. Returns
+    an empty list if none could be found.
+
+    @author Guan Yang (guan@unicast.org)
+    @creation-date 2003-07-03
+} {
+    set items [$doc_node selectNodes {//*[local-name()='item' or local-name()='entry']}]
+    return $items
+}
+
+ad_proc -private news_aggregator::item_parse {
+    {-item_node:required}
+} {
+    Takes a tDOM node which is supposed to represent an RSS item,
+    and returns a list with the RSS/RDF elements of that node.
+
+    @author Simon Carstensen
+    @author Guan Yang (guan@unicast.org)
+} {
+    set title ""
+    set link ""
+    set guid ""
+    set permalink_p false
+    set description ""
+    set content_encoded ""
+
+    news_aggregator::dom_set_child_text -node $item_node -child title
+    news_aggregator::dom_set_child_text -node $item_node -child link
+    news_aggregator::dom_set_child_text -node $item_node -child guid
+    news_aggregator::dom_set_child_text -node $item_node -child description
+    
+    set maybe_atom_p 0
+    
+    # Try to handle Atom link
+    if { [string equal $link ""] } {
+        set link_attr [$item_node selectNodes {*[local-name()='link']/@href}]
+        if { [llength $link_attr] == 1 } {
+            set link [lindex [lindex $link_attr 0] 1]
+            set maybe_atom_p 1
+        }
     }
-    if { ![exists_and_not_null link] } {
-	if { [exists_and_not_null guid] } {
-	    set link $guid
-	} else {
-	    set link ""
+
+    set encoded_nodes [$item_node selectNodes {*[local-name()='encoded' and namespace-uri()='http://purl.org/rss/1.0/modules/content/']}]
+    if { [llength $encoded_nodes] == 1 } {
+        set encoded_node [lindex $encoded_nodes 0]
+	    set content_encoded [$encoded_node text]
+    }
+
+    if { [llength [$item_node selectNodes "*\[local-name()='guid'\]"]] } {
+        # If guid exists, we assume that it's a permalink
+        set permalink_p true
+    }
+
+    # Retrieve isPermaLink attribute
+    set isPermaLink_nodes [$item_node selectNodes "*\[local-name()='guid'\]/@isPermaLink"]
+    if { [llength isPermaLink_nodes] == 1} {
+        set isPermaLink_node [lindex $isPermaLink_nodes 0]
+    	set isPermaLink [lindex $isPermaLink_node 1]
+	if { [string equal $isPermaLink false] } {
+	    set permalink_p false
 	}
     }
-    if { ![exists_and_not_null description] } {
-	set description ""
+
+    if { [empty_string_p $link] } {
+        if { [exists_and_not_null guid] } {
+            set link $guid
+            set permalink_p true
+        } elseif { [empty_string_p $guid] && ![string equal $link $guid] } {
+            set permalink_p true
+        }
     }
-	
-    return [list $title $link $description]
+    
+    # Try to handle Atom guid
+    if { [empty_string_p $guid] && $maybe_atom_p } {
+        news_aggregator::dom_set_child_text -node $item_node -child id
+        if { [info exists id] } {
+            # We don't really know if it's an URL
+            set guid $id
+            if { [util_url_valid_p $id] } {
+                set permalink_p true
+            } else {
+                set permalink_p false
+            }
+        }
+    }
+    
+    # For Atom, description is summary, content is content_encoded
+    if { $maybe_atom_p } {
+        news_aggregator::dom_set_child_text -node $item_node -child summary
+        if { [info exists summary] } {
+            set description $summary
+        }
+        
+        news_aggregator::dom_set_child_text -node $item_node -child content
+        if { [info exists content] } {
+            set content_encoded $content
+        }
+    }
+
+    #remove unsafe html
+    set description [news_aggregator::remove_unsafe_html -html $description]
+
+    return [list title $title link $link guid $guid permalink_p $permalink_p description $description content_encoded $content_encoded]
 }
 
-ad_proc -public na_sort_result {
-    result
+ad_proc -private news_aggregator::external_entity {
+    base_uri
+    system_identifier
+    public_identifier
 } {
+    A callback for tDOM to resolve external entities.
+
+    @author Guan Yang
+    @creation-date 2003-07-03
+} {
+    return [list string "" ""]
+}
+
+ad_proc -public news_aggregator::sort_result {
+    {-result:required}
+} {
+    @author Simon Carstensen
+} {
     set sorted [list]
     for {set i 0} {$i < [llength $result]} {incr i} {
 	lappend sorted [lindex $result end-$i]
     }
     return $sorted
 }
 
-ad_proc -public na_parse {
-    xml
+ad_proc -public news_aggregator::parse {
+    {-xml:required}
 } {
+    The workhorse of news-aggregator: A Very Ugly RSS Parser.
+    Now also supports Atom and weird formats in between. 
+
+    @author Simon Carstensen
+    @author Guan Yang (guan@unicast.org)
+} {
     if { [catch {
-	set doc_id [xml_parse -persist $xml]
-	set nodes [xml_node_get_children [xml_doc_get_first_node $doc_id]]
-	set channel [na_get_elements $nodes "channel"]
-	set result [list [na_get_nodes [xml_node_get_children $channel]]]
+    	# Pre-process the doc and remove any processing instruction
+	regsub {^<\?xml [^\?]+\?>} $xml {<?xml version="1.0"?>} xml
+	set doc [dom parse $xml]
+        set doc_node [$doc documentElement]
+        set node_name [$doc_node nodeName]
 
-	set items [na_get_elements $nodes "item"]
-	set items_sorted [na_sort_result $items]
-	foreach item $items_sorted {
-	    lappend result [na_get_nodes [xml_node_get_children $item]]
-	}
+	# feed is the doc-node name for Atom feeds
+        if { [lsearch {rdf RDF rdf:RDF rss feed} $node_name] == -1 } {
+	    ns_log Debug "news_aggregator::parse: doc node name is not rdf, RDF, rdf:RDF or rss"
+            set rss_p 0
+        } else {
+            set rss_p 1
+        }
+    } errmsg] } {
+	ns_log Debug "news_aggregator::parse: error in initial itdom parse, errmsg = $errmsg"
+        set rss_p 0
+    }
+
+    if { !$rss_p } {
+        # not valid xml, let's try autodiscovery
+	ns_log Debug "news_aggregator::parse: not  valid xml, we'll try autodiscovery"
+        
+        set doc [dom parse -html $xml]
+        set doc_node [$doc documentElement]
+
+        set link_nodes [$doc_node selectNodes {/html/head/link[@rel='alternate' and @title='RSS' and @type='application/rss+xml']/@href}]
+      
+	$doc delete
+
+        if { [llength $link_nodes] == 1} {
+            set link_node [lindex $link_nodes 0]
+            set feed_url [lindex $link_node 1]
+            array set f [ad_httpget -url $feed_url]
+            return [news_aggregator::parse -xml $f(page)]
+        }
+        return 0
+    }
+
+    if { [catch {
+	set doc_name [$doc_node nodeName]
 	
+	if { [string equal $doc_name "feed"] } {
+	    # It's an Atom feed
+	    set channel [news_aggregator::channel_parse \
+	                   -channel_node $doc_node]
+    } else {
+        # It looks RSS/RDF'fy
+	    set channel [news_aggregator::channel_parse \
+			-channel_node [$doc_node getElementsByTagName channel]]
+    }    
+        
+    set item_nodes [news_aggregator::items_fetch -doc_node $doc_node]
+	set item_nodes [news_aggregator::sort_result -result $item_nodes]
+	set items [list]
+  
+        foreach item_node $item_nodes {
+            lappend items [news_aggregator::item_parse -item_node $item_node]
+        }
 
+	$doc delete
     } err] } {
-	ns_log Notice "Error parsing RSS feed: $err"
         return 0
     } else {
-	return $result
+        return [list channel "$channel" items "$items"]
     }
 }
 
-ad_proc -public na_add_source {
-    feed_url
-    owner_id
-    package_id
-    source_id
+ad_proc -public news_aggregator::remove_unsafe_html { 
+    -html:required
 } {
-    Parse $feed_url for host_url, title, and description. Then add the source.
-} {
-    if { ![catch {set f [util_httpget_full $feed_url]}] && [string equal 200 [lindex $f 1]] } {
-	set result [na_parse [lindex $f 0]]
-	if { ![string equal 0 $result] } {
+    Make sure we are consuming RSS safely by removing unsafe tags.
 
-	    set channel [lindex $result 0]
-	    set title [string_truncate -len 245 -no_format -- [lindex $channel 0]]
-	    set link [string_truncate -len 245 -no_format -- [lindex $channel 1]]
-	    set description [string_truncate -len 245 -no_format -- [lindex $channel 2]]
-	    set source_id [db_nextval "acs_object_id_seq"]
-	    set creation_ip [ns_conn peeraddr]
-	    set last_modified [lindex $f 2]
-	
-	    # check whether the source already exists
-	    if { ![db_0or1row source { *SQL* }] } {
-		db_exec_plsql add_source { *SQL* }
-		set items [lrange $result 1 end]
-		foreach item $items {
-		    set title [string_truncate -len 245 -no_format -- [lindex $item 0]]
-		    set link [string_truncate -len 245 -no_format -- [lindex $item 1]]
-		    set description [lindex $item 2]
-		    db_exec_plsql add_item { *SQL* }
-		}
-	    }
-	}
-    }
-}
+    See http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely.html.
 
-ad_proc -public na_update_source {
-    owner_id
-    source_id
-    feed_url
-    last_modified
+    @author Simon Carstensen
+    @creation-date 2003-07-06
 } {
-    Parse source and then update the source if it has changed.
-} {
-    set header [ns_set create]
-    ns_set put $header "If-Modified-Since" $last_modified
-    if { [catch { 
-	set f [util_httpget_full $feed_url $header] }]
-    } {
-	return
-    }
-    # check the http status code
-    if { [exists_and_not_null f] && [string equal 200 [lindex $f 1]] } {
 
-	set result [na_parse [lindex $f 0]]
-	set host [lindex [lindex $result 0] 1]
-        set items [lrange $result 1 end]
-        foreach item $items {
-            set title [string_truncate -len 245 -no_format -- [lindex $item 0]]
-            set link [string_truncate -len 245 -no_format -- [lindex $item 1]]
-	    set description [lindex $item 2]
-
-	    # check whether link and description have been set as we
-	    # need these to check against already added items
-	    # also we check whether link is an external or internal URL
-	    # if not, it might occur in other items, and we can't check against it
-	    if { [exists_and_not_null link] && [na_check_link $link $host] } {
-		set identifier "link"
-	    } elseif { [exists_and_not_null description]  } {
-		set identifier "description"
-	    } else {
-		set identifier "none"
-	    }
-
-	    # check whether the item already exists
-	    # that it has at least a link or description
-	    # and that we're not handling a deleted item
-	    
-	    if { ![string equal "none" $identifier] } {
-		if {![db_0or1row item { *SQL* }]} {
-		    db_exec_plsql add_item { *SQL* }
-		    set updated_p 1
-		} elseif { [string equal f $deleted_p] && ((![string equal $title $item_title]) || (![string equal $description $item_description])) } {
-		    db_dml update_item { *SQL* }
-		    set updated_p 1
-		}
-	    }
-	}
-	if { [exists_and_not_null updated_p] } {
-	    # one or more items were added/updated
-	    # let's update the rss metadata as well
-	    set channel [lindex $result 0]
-	    set last_modified [lindex $f 2]
-	    set title [string_truncate -len 245 -no_format -- [lindex $channel 0]]
-	    set link [string_truncate -len 245 -no_format -- [lindex $channel 1]]
-	    set description [string_truncate -len 245 -no_format -- [lindex $channel 2]]
-	    
-	    db_dml update_source { *SQL* }
-	}
+    set unsafe_tags {
+        script
+        embed
+        object
+        frameset
+        frame
+        iframe
+        meta
+        link
+        style
     }
-}
 
-ad_proc -public na_update_sources { } {
-    Update sources by a one hour interval.
-} {
-    ns_log Debug "na_update_sources: updating news-aggregator sources"
-    
-    db_foreach sources { *SQL* } {
-	na_update_source $owner_id $source_id $feed_url $last_modified
+    foreach tag $unsafe_tags {
+        regsub -all "(<$tag\[^>\]*>(\[^<\]*</$tag>)?)+"  $html {} html
     }
 
+    return $html
 }
-
-# schedule hourly updates
-ad_schedule_proc 600 na_update_sources
-
-ad_proc -public na_cleanup_items {} {
-    Clean up the items that have been retrieved more than two months ago.
-} {
-    db_dml deleted_items { *SQL* }
-}
-
-# schedule daily cleanup of one-week-old deleted items
-#ad_schedule_proc 86400 na_cleanup_items