gyang
committed
on 30 May 04
news-aggregator: Moved all RSS parsing procs over to the feed-parser package.
openacs-4/.../tcl/feed-parser-procs.tcl (+180 -5)
1 1 ad_library {
2 2     The procs that make up our Feed Parser.
3 3    
4 4     @creation-date 2003-12-28
5 5     @author Guan Yang (guan@unicast.org)
6 6     @author Simon Carstensen (simon@bcuni.net)
7 7 }
8 8
9 9 namespace eval feed_parser {}
10 10
  11 ad_proc -public feed_parser::sort_result {
  12     {-result:required}
  13 } {
  14     @author Simon Carstensen
  15 } {
  16     set sorted [list]
  17     for {set i 0} {$i < [llength $result]} {incr i} {
  18         lappend sorted [lindex $result end-$i]
  19     }
  20     return $sorted
  21 }
  22
  23 ad_proc -private feed_parser::items_fetch {
  24     {-doc_node:required}
  25 } {
  26     Takes a tDOM document node which is supposed to be some
  27     form of RSS. Returns a list with the item nodes. Returns
  28     an empty list if none could be found.
  29
  30     @author Guan Yang (guan@unicast.org)
  31     @creation-date 2003-07-03
  32 } {
  33     set items [$doc_node selectNodes {//*[local-name()='item' or local-name()='entry']}]
  34     return $items
  35 }
  36
  37 ad_proc -private feed_parser::item_parse {
  38     {-item_node:required}
  39 } {
  40     Takes a tDOM node which is supposed to represent an RSS item,
  41     and returns a list with the RSS/RDF elements of that node.
  42
  43     @author Simon Carstensen
  44     @author Guan Yang (guan@unicast.org)
  45 } {
  46     set title ""
  47     set link ""
  48     set guid ""
  49     set permalink_p false
  50     set description ""
  51     set content_encoded ""
  52
  53     feed_parser::dom::set_child_text -node $item_node -child title
  54     feed_parser::dom::set_child_text -node $item_node -child link
  55     feed_parser::dom::set_child_text -node $item_node -child guid
  56     feed_parser::dom::set_child_text -node $item_node -child description
  57    
  58     set maybe_atom_p 0
  59    
  60     # Try to handle Atom link
  61     if { [string equal $link ""] } {
  62         set link_attr [$item_node selectNodes {*[local-name()='link']/@href}]
  63         if { [llength $link_attr] == 1 } {
  64             set link [lindex [lindex $link_attr 0] 1]
  65             set maybe_atom_p 1
  66         }
  67     }
  68
  69     set encoded_nodes [$item_node selectNodes {*[local-name()='encoded' and namespace-uri()='http://purl.org/rss/1.0/modules/content/']}]
  70     if { [llength $encoded_nodes] == 1 } {
  71         set encoded_node [lindex $encoded_nodes 0]
  72             set content_encoded [$encoded_node text]
  73     }
  74
  75     if { [llength [$item_node selectNodes "*\[local-name()='guid'\]"]] } {
  76         # If guid exists, we assume that it's a permalink
  77         set permalink_p true
  78     }
  79
  80     # Retrieve isPermaLink attribute
  81     set isPermaLink_nodes [$item_node selectNodes "*\[local-name()='guid'\]/@isPermaLink"]
  82     if { [llength isPermaLink_nodes] == 1} {
  83         set isPermaLink_node [lindex $isPermaLink_nodes 0]
  84         set isPermaLink [lindex $isPermaLink_node 1]
  85         if { [string equal $isPermaLink false] } {
  86             set permalink_p false
  87         }
  88     }
  89
  90     if { [empty_string_p $link] } {
  91         if { [exists_and_not_null guid] } {
  92             set link $guid
  93             set permalink_p true
  94         } elseif { [empty_string_p $guid] && ![string equal $link $guid] } {
  95             set permalink_p true
  96         }
  97     }
  98    
  99     # Try to handle Atom guid
  100     if { [empty_string_p $guid] && $maybe_atom_p } {
  101         feed_parser::dom::set_child_text -node $item_node -child id
  102         if { [info exists id] } {
  103             # We don't really know if it's an URL
  104             set guid $id
  105             if { [util_url_valid_p $id] } {
  106                 set permalink_p true
  107             } else {
  108                 set permalink_p false
  109             }
  110         }
  111     }
  112    
  113     # For Atom, description is summary, content is content_encoded
  114     if { $maybe_atom_p } {
  115         feed_parser::dom::set_child_text -node $item_node -child summary
  116         if { [info exists summary] } {
  117             set description $summary
  118         }
  119        
  120         feed_parser::dom::set_child_text -node $item_node -child content
  121         if { [info exists content] } {
  122             set content_encoded $content
  123         }
  124     }
  125
  126     #remove unsafe html
  127     set description [feed_parser::remove_unsafe_html -html $description]
  128
  129     return [list title $title link $link guid $guid permalink_p $permalink_p description $description content_encoded $content_encoded]
  130 }
  131
  132 ad_proc -private feed_parser::channel_parse {
  133     {-channel_node:required}
  134 } {
  135     Takes a tDOM node which is supposed to represent an RSS
  136     channel, and returns a list with the RSS/RDF elements
  137     of that node. This proc should later be extended to
  138     support Dublin Core elements and other funk.
  139
  140     @author Guan Yang (guan@unicast.org)
  141     @creation-date 2003-07-03
  142 } {
  143     set properties [list title link description language copyright lastBuildDate docs generator managingEditor webMaster]
  144
  145     foreach property $properties {
  146         set $property ""
  147             feed_parser::dom::set_child_text -node $channel_node -child $property
  148         set channel($property) [set $property]
  149     }
  150    
  151     set channel_name [$channel_node nodeName]
  152    
  153     # Do weird Atom-like stuff
  154     if { [string equal $channel_name "feed"] } {
  155         # link
  156         if { [string equal $link ""] } {
  157             # Link is in a href
  158             set link_node [$channel_node selectNodes {*[local-name()='link' and @rel = 'alternate' and @type = 'text/html']/@href}]
  159             if { [llength $link_node] == 1 } {
  160                 set link_node [lindex $link_node 0]
  161                 set channel(link) [lindex $link_node 1]
  162             }
  163         }
  164        
  165         # author
  166         set author_node [$channel_node selectNodes {*[local-name()='author']}]
  167         if { [llength $author_node] == 1 } {
  168             set author_node [lindex $author_node 0]
  169             feed_parser::dom::set_child_text -node $author_node -child name
  170             feed_parser::dom::set_child_text -node $author_node -child email
  171             if { [info exists email] && [info exists name] } {
  172                 set channel(managingEditor) "$email ($name)"
  173             }
  174         }
  175        
  176         # tagline
  177         feed_parser::dom::set_child_text -node $channel_node -child tagline
  178         if { [info exists tagline] } {
  179             set channel(tagline) $tagline
  180         }
  181     }
  182
  183     return [array get channel]
  184 }
  185
11 186 ad_proc -private feed_parser::remove_unsafe_html {
12 187     -html:required
13 188 } {
14 189     Make sure we are consuming RSS safely by removing unsafe tags.
15 190
16 191     See http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely.html.
17 192
18 193     @author Simon Carstensen
19 194     @creation-date 2003-07-06
20 195     @param html An HTML string that we need to clean up
21 196     @return The cleaned-up HTML string
22 197 } {
23 198     set unsafe_tags {
24 199         script
25 200         embed
26 201         object
27 202         frameset
28 203         frame
29 204         iframe
30 205         meta
 
91 266      
92 267         $doc delete
93 268    
94 269         if { [llength $link_nodes] == 1} {
95 270             set link_node [lindex $link_nodes 0]
96 271             set feed_url [lindex $link_node 1]
97 272             array set f [ad_httpget -url $feed_url]
98 273             return [feed_parser::parse_feed -xml $f(page)]
99 274         }
100 275        
101 276         set result(status) "error"
102 277         set result(error) "Not RSS and contained no autodiscovery element"
103 278         return [array get result]
104 279     }
105 280    
106 281     if { [catch {
107 282         set doc_name [$doc_node nodeName]
108 283        
109 284         if { [string equal $doc_name "feed"] } {
110 285             # It's an Atom feed
111               set channel [news_aggregator::channel_parse \
  286             set channel [feed_parser::channel_parse \
112 287                            -channel_node $doc_node]
113 288         } else {
114 289             # It looks RSS/RDF'fy
115               set channel [news_aggregator::channel_parse \
  290             set channel [feed_parser::channel_parse \
116 291                 -channel_node [$doc_node getElementsByTagName channel]]
117 292         }   
118 293            
119           set item_nodes [news_aggregator::items_fetch -doc_node $doc_node]
120           set item_nodes [news_aggregator::sort_result -result $item_nodes]
  294         set item_nodes [feed_parser::items_fetch -doc_node $doc_node]
  295         set item_nodes [feed_parser::sort_result -result $item_nodes]
121 296         set items [list]
122 297        
123 298         foreach item_node $item_nodes {
124               lappend items [news_aggregator::item_parse -item_node $item_node]
  299             lappend items [feed_parser::item_parse -item_node $item_node]
125 300         }
126 301        
127 302         $doc delete
128 303     } err] } {
129 304         set result(status) "error"
130 305         set result(error) "Parse error: $err"
131 306         return [array get result]
132 307     } else {
133 308         set result(status) "ok"
134 309         set result(error) ""
135 310         set result(channel) $channel
136 311         set result(items) $items
137 312         return [array get result]
138 313     }
139 314 }