| |
1 |
1 |
ad_library { |
| |
2 |
2 |
The procs that make up our Feed Parser. |
| |
3 |
3 |
|
| |
4 |
4 |
@creation-date 2003-12-28 |
| |
5 |
5 |
@author Guan Yang (guan@unicast.org) |
| |
6 |
6 |
@author Simon Carstensen (simon@bcuni.net) |
| |
7 |
7 |
} |
| |
8 |
8 |
|
| |
9 |
9 |
namespace eval feed_parser {} |
| |
10 |
10 |
|
| |
|
11 |
ad_proc -public feed_parser::sort_result { |
| |
|
12 |
{-result:required} |
| |
|
13 |
} { |
| |
|
14 |
@author Simon Carstensen |
| |
|
15 |
} { |
| |
|
16 |
set sorted [list] |
| |
|
17 |
for {set i 0} {$i < [llength $result]} {incr i} { |
| |
|
18 |
lappend sorted [lindex $result end-$i] |
| |
|
19 |
} |
| |
|
20 |
return $sorted |
| |
|
21 |
} |
| |
|
22 |
|
| |
|
23 |
ad_proc -private feed_parser::items_fetch { |
| |
|
24 |
{-doc_node:required} |
| |
|
25 |
} { |
| |
|
26 |
Takes a tDOM document node which is supposed to be some |
| |
|
27 |
form of RSS. Returns a list with the item nodes. Returns |
| |
|
28 |
an empty list if none could be found. |
| |
|
29 |
|
| |
|
30 |
@author Guan Yang (guan@unicast.org) |
| |
|
31 |
@creation-date 2003-07-03 |
| |
|
32 |
} { |
| |
|
33 |
set items [$doc_node selectNodes {//*[local-name()='item' or local-name()='entry']}] |
| |
|
34 |
return $items |
| |
|
35 |
} |
| |
|
36 |
|
| |
|
37 |
ad_proc -private feed_parser::item_parse { |
| |
|
38 |
{-item_node:required} |
| |
|
39 |
} { |
| |
|
40 |
Takes a tDOM node which is supposed to represent an RSS item, |
| |
|
41 |
and returns a list with the RSS/RDF elements of that node. |
| |
|
42 |
|
| |
|
43 |
@author Simon Carstensen |
| |
|
44 |
@author Guan Yang (guan@unicast.org) |
| |
|
45 |
} { |
| |
|
46 |
set title "" |
| |
|
47 |
set link "" |
| |
|
48 |
set guid "" |
| |
|
49 |
set permalink_p false |
| |
|
50 |
set description "" |
| |
|
51 |
set content_encoded "" |
| |
|
52 |
|
| |
|
53 |
feed_parser::dom::set_child_text -node $item_node -child title |
| |
|
54 |
feed_parser::dom::set_child_text -node $item_node -child link |
| |
|
55 |
feed_parser::dom::set_child_text -node $item_node -child guid |
| |
|
56 |
feed_parser::dom::set_child_text -node $item_node -child description |
| |
|
57 |
|
| |
|
58 |
set maybe_atom_p 0 |
| |
|
59 |
|
| |
|
60 |
# Try to handle Atom link |
| |
|
61 |
if { [string equal $link ""] } { |
| |
|
62 |
set link_attr [$item_node selectNodes {*[local-name()='link']/@href}] |
| |
|
63 |
if { [llength $link_attr] == 1 } { |
| |
|
64 |
set link [lindex [lindex $link_attr 0] 1] |
| |
|
65 |
set maybe_atom_p 1 |
| |
|
66 |
} |
| |
|
67 |
} |
| |
|
68 |
|
| |
|
69 |
set encoded_nodes [$item_node selectNodes {*[local-name()='encoded' and namespace-uri()='http://purl.org/rss/1.0/modules/content/']}] |
| |
|
70 |
if { [llength $encoded_nodes] == 1 } { |
| |
|
71 |
set encoded_node [lindex $encoded_nodes 0] |
| |
|
72 |
set content_encoded [$encoded_node text] |
| |
|
73 |
} |
| |
|
74 |
|
| |
|
75 |
if { [llength [$item_node selectNodes "*\[local-name()='guid'\]"]] } { |
| |
|
76 |
# If guid exists, we assume that it's a permalink |
| |
|
77 |
set permalink_p true |
| |
|
78 |
} |
| |
|
79 |
|
| |
|
80 |
# Retrieve isPermaLink attribute |
| |
|
81 |
set isPermaLink_nodes [$item_node selectNodes "*\[local-name()='guid'\]/@isPermaLink"] |
| |
|
82 |
if { [llength isPermaLink_nodes] == 1} { |
| |
|
83 |
set isPermaLink_node [lindex $isPermaLink_nodes 0] |
| |
|
84 |
set isPermaLink [lindex $isPermaLink_node 1] |
| |
|
85 |
if { [string equal $isPermaLink false] } { |
| |
|
86 |
set permalink_p false |
| |
|
87 |
} |
| |
|
88 |
} |
| |
|
89 |
|
| |
|
90 |
if { [empty_string_p $link] } { |
| |
|
91 |
if { [exists_and_not_null guid] } { |
| |
|
92 |
set link $guid |
| |
|
93 |
set permalink_p true |
| |
|
94 |
} elseif { [empty_string_p $guid] && ![string equal $link $guid] } { |
| |
|
95 |
set permalink_p true |
| |
|
96 |
} |
| |
|
97 |
} |
| |
|
98 |
|
| |
|
99 |
# Try to handle Atom guid |
| |
|
100 |
if { [empty_string_p $guid] && $maybe_atom_p } { |
| |
|
101 |
feed_parser::dom::set_child_text -node $item_node -child id |
| |
|
102 |
if { [info exists id] } { |
| |
|
103 |
# We don't really know if it's an URL |
| |
|
104 |
set guid $id |
| |
|
105 |
if { [util_url_valid_p $id] } { |
| |
|
106 |
set permalink_p true |
| |
|
107 |
} else { |
| |
|
108 |
set permalink_p false |
| |
|
109 |
} |
| |
|
110 |
} |
| |
|
111 |
} |
| |
|
112 |
|
| |
|
113 |
# For Atom, description is summary, content is content_encoded |
| |
|
114 |
if { $maybe_atom_p } { |
| |
|
115 |
feed_parser::dom::set_child_text -node $item_node -child summary |
| |
|
116 |
if { [info exists summary] } { |
| |
|
117 |
set description $summary |
| |
|
118 |
} |
| |
|
119 |
|
| |
|
120 |
feed_parser::dom::set_child_text -node $item_node -child content |
| |
|
121 |
if { [info exists content] } { |
| |
|
122 |
set content_encoded $content |
| |
|
123 |
} |
| |
|
124 |
} |
| |
|
125 |
|
| |
|
126 |
#remove unsafe html |
| |
|
127 |
set description [feed_parser::remove_unsafe_html -html $description] |
| |
|
128 |
|
| |
|
129 |
return [list title $title link $link guid $guid permalink_p $permalink_p description $description content_encoded $content_encoded] |
| |
|
130 |
} |
| |
|
131 |
|
| |
|
132 |
ad_proc -private feed_parser::channel_parse { |
| |
|
133 |
{-channel_node:required} |
| |
|
134 |
} { |
| |
|
135 |
Takes a tDOM node which is supposed to represent an RSS |
| |
|
136 |
channel, and returns a list with the RSS/RDF elements |
| |
|
137 |
of that node. This proc should later be extended to |
| |
|
138 |
support Dublin Core elements and other funk. |
| |
|
139 |
|
| |
|
140 |
@author Guan Yang (guan@unicast.org) |
| |
|
141 |
@creation-date 2003-07-03 |
| |
|
142 |
} { |
| |
|
143 |
set properties [list title link description language copyright lastBuildDate docs generator managingEditor webMaster] |
| |
|
144 |
|
| |
|
145 |
foreach property $properties { |
| |
|
146 |
set $property "" |
| |
|
147 |
feed_parser::dom::set_child_text -node $channel_node -child $property |
| |
|
148 |
set channel($property) [set $property] |
| |
|
149 |
} |
| |
|
150 |
|
| |
|
151 |
set channel_name [$channel_node nodeName] |
| |
|
152 |
|
| |
|
153 |
# Do weird Atom-like stuff |
| |
|
154 |
if { [string equal $channel_name "feed"] } { |
| |
|
155 |
# link |
| |
|
156 |
if { [string equal $link ""] } { |
| |
|
157 |
# Link is in a href |
| |
|
158 |
set link_node [$channel_node selectNodes {*[local-name()='link' and @rel = 'alternate' and @type = 'text/html']/@href}] |
| |
|
159 |
if { [llength $link_node] == 1 } { |
| |
|
160 |
set link_node [lindex $link_node 0] |
| |
|
161 |
set channel(link) [lindex $link_node 1] |
| |
|
162 |
} |
| |
|
163 |
} |
| |
|
164 |
|
| |
|
165 |
# author |
| |
|
166 |
set author_node [$channel_node selectNodes {*[local-name()='author']}] |
| |
|
167 |
if { [llength $author_node] == 1 } { |
| |
|
168 |
set author_node [lindex $author_node 0] |
| |
|
169 |
feed_parser::dom::set_child_text -node $author_node -child name |
| |
|
170 |
feed_parser::dom::set_child_text -node $author_node -child email |
| |
|
171 |
if { [info exists email] && [info exists name] } { |
| |
|
172 |
set channel(managingEditor) "$email ($name)" |
| |
|
173 |
} |
| |
|
174 |
} |
| |
|
175 |
|
| |
|
176 |
# tagline |
| |
|
177 |
feed_parser::dom::set_child_text -node $channel_node -child tagline |
| |
|
178 |
if { [info exists tagline] } { |
| |
|
179 |
set channel(tagline) $tagline |
| |
|
180 |
} |
| |
|
181 |
} |
| |
|
182 |
|
| |
|
183 |
return [array get channel] |
| |
|
184 |
} |
| |
|
185 |
|
| |
11 |
186 |
ad_proc -private feed_parser::remove_unsafe_html { |
| |
12 |
187 |
-html:required |
| |
13 |
188 |
} { |
| |
14 |
189 |
Make sure we are consuming RSS safely by removing unsafe tags. |
| |
15 |
190 |
|
| |
16 |
191 |
See http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely.html. |
| |
17 |
192 |
|
| |
18 |
193 |
@author Simon Carstensen |
| |
19 |
194 |
@creation-date 2003-07-06 |
| |
20 |
195 |
@param html An HTML string that we need to clean up |
| |
21 |
196 |
@return The cleaned-up HTML string |
| |
22 |
197 |
} { |
| |
23 |
198 |
set unsafe_tags { |
| |
24 |
199 |
script |
| |
25 |
200 |
embed |
| |
26 |
201 |
object |
| |
27 |
202 |
frameset |
| |
28 |
203 |
frame |
| |
29 |
204 |
iframe |
| |
30 |
205 |
meta |
|
| |
91 |
266 |
|
| |
92 |
267 |
$doc delete |
| |
93 |
268 |
|
| |
94 |
269 |
if { [llength $link_nodes] == 1} { |
| |
95 |
270 |
set link_node [lindex $link_nodes 0] |
| |
96 |
271 |
set feed_url [lindex $link_node 1] |
| |
97 |
272 |
array set f [ad_httpget -url $feed_url] |
| |
98 |
273 |
return [feed_parser::parse_feed -xml $f(page)] |
| |
99 |
274 |
} |
| |
100 |
275 |
|
| |
101 |
276 |
set result(status) "error" |
| |
102 |
277 |
set result(error) "Not RSS and contained no autodiscovery element" |
| |
103 |
278 |
return [array get result] |
| |
104 |
279 |
} |
| |
105 |
280 |
|
| |
106 |
281 |
if { [catch { |
| |
107 |
282 |
set doc_name [$doc_node nodeName] |
| |
108 |
283 |
|
| |
109 |
284 |
if { [string equal $doc_name "feed"] } { |
| |
110 |
285 |
# It's an Atom feed |
| |
111 |
|
set channel [news_aggregator::channel_parse \ |
| |
|
286 |
set channel [feed_parser::channel_parse \ |
| |
112 |
287 |
-channel_node $doc_node] |
| |
113 |
288 |
} else { |
| |
114 |
289 |
# It looks RSS/RDF'fy |
| |
115 |
|
set channel [news_aggregator::channel_parse \ |
| |
|
290 |
set channel [feed_parser::channel_parse \ |
| |
116 |
291 |
-channel_node [$doc_node getElementsByTagName channel]] |
| |
117 |
292 |
} |
| |
118 |
293 |
|
| |
119 |
|
set item_nodes [news_aggregator::items_fetch -doc_node $doc_node] |
| |
120 |
|
set item_nodes [news_aggregator::sort_result -result $item_nodes] |
| |
|
294 |
set item_nodes [feed_parser::items_fetch -doc_node $doc_node] |
| |
|
295 |
set item_nodes [feed_parser::sort_result -result $item_nodes] |
| |
121 |
296 |
set items [list] |
| |
122 |
297 |
|
| |
123 |
298 |
foreach item_node $item_nodes { |
| |
124 |
|
lappend items [news_aggregator::item_parse -item_node $item_node] |
| |
|
299 |
lappend items [feed_parser::item_parse -item_node $item_node] |
| |
125 |
300 |
} |
| |
126 |
301 |
|
| |
127 |
302 |
$doc delete |
| |
128 |
303 |
} err] } { |
| |
129 |
304 |
set result(status) "error" |
| |
130 |
305 |
set result(error) "Parse error: $err" |
| |
131 |
306 |
return [array get result] |
| |
132 |
307 |
} else { |
| |
133 |
308 |
set result(status) "ok" |
| |
134 |
309 |
set result(error) "" |
| |
135 |
310 |
set result(channel) $channel |
| |
136 |
311 |
set result(items) $items |
| |
137 |
312 |
return [array get result] |
| |
138 |
313 |
} |
| |
139 |
314 |
} |