Don't raise when an RSS source is down.
zzz-spline-frontpage.git: splinext/frontpage/sources.py
1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import re
8 import subprocess
9 from subprocess import PIPE
10 from urllib2 import URLError
11
12 import feedparser
13 import lxml.html
14
15 from pylons import cache
16
17 from spline.lib import helpers
18
19 def max_age_to_datetime(max_age):
20 """``max_age`` is specified in config as a number of seconds old. This
21 function takes that number and returns a corresponding datetime object.
22 """
23 if max_age == None:
24 return None
25
26 dt = datetime.datetime.now()
27 dt -= datetime.timedelta(seconds=int(max_age))
28
29 return dt
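# For illustration: given max_age = '3600' from the config,
#
#   cutoff = max_age_to_datetime('3600')
#
# yields a datetime one hour in the past; items older than it are excluded.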


class Source(object):
    """Represents a source to be polled for updates.  Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional.

    ``max_age``
        Items older than this age (in seconds) will be excluded.  Optional.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, config, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        # ``limit`` is optional and may arrive as None; only coerce it when
        # it was actually given
        self.limit = int(limit) if limit is not None else None
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling the global and
        per-source item limit and max age.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins
        limit = min(self.limit, global_limit)

        # Latest max age wins.  Note that either could be None, but that's
        # fine, because None is less than everything else
        max_age = max(self.max_age, global_max_age)

        return self._poll(limit, max_age)
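    # For illustration: Python 2 sorts None below every number and datetime,
    # so the reconciliation above behaves like this:
    #
    #   min(None, 10)          # => None
    #   max(None, a_datetime)  # => a_datetime
    #
    # which is why a missing ``max_age`` simply defers to the global one.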

    def _poll(self, limit, max_age):
        """Implementation of polling for updates.  Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties.  A namedtuple works well.
        """
        raise NotImplementedError


class CachedSource(Source):
    """Supports caching a source's updates in memcache.

    On the surface, this functions just like any other ``Source``.  Calling
    ``poll`` still returns a list of updates.  However, ``poll`` does not call
    your ``_poll``; instead, your implementation is called by the spline cron,
    and the results are cached.  ``poll`` then returns the contents of the
    cache.

    ``_poll`` may return None, in which case the cache will be left unchanged.

    You must define a ``_cache_key`` method that returns a key uniquely
    identifying this object.  Your key will be combined with the class name,
    so it only needs to be unique for that source, not globally.

    You may also override ``poll_frequency``, the number of minutes between
    pollings.  By default, this is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        return repr(type(self)) + ':' + self._cache_key()

    def _cache_key(self):
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        if tic % self.poll_frequency != 0:
            # Too early!
            return

        updates = self._poll(self.limit, self.max_age)
        if updates is not None:
            cache.get_cache('spline-frontpage')[self.cache_key()] = updates

        return

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates."""
        try:
            return cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Haven't cached anything yet, apparently
            return []


FrontPageRSS = namedtuple('FrontPageRSS',
    ['source', 'time', 'entry', 'content'])
class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        return self.feed_url

    def _poll(self, limit, max_age):
        feed = feedparser.parse(self.feed_url)

        if feed.bozo and isinstance(feed.bozo_exception, URLError):
            # Feed is DOWN.  Bail here; otherwise, old entries might be lost
            # just because, say, Bulbanews is down yet again
            return None

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read
            try:
                timestamp_tuple = entry.published_parsed
            except AttributeError:
                timestamp_tuple = entry.updated_parsed
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries should be newest-first, so everything after the
                # first expired entry is also too old; bail here
                break

            # Try to find something to show!  Default to the summary, if
            # there is one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually
                # a summary
                content = entry.summary
            elif 'content' in entry and \
                    len(entry.content[0].value) <= self.SUMMARY_LENGTH:

                # Full content is short; use as-is!
                content = entry.content[0].value
            elif 'content' in entry:
                # Full content is way too much, especially for my giant blog
                # posts.  Cut this down to some arbitrary number of
                # characters, then feed it to lxml.html to fix tag nesting
                broken_html = entry.content[0].value[:self.SUMMARY_LENGTH]
                fragment = lxml.html.fromstring(broken_html)

                # Insert an ellipsis at the end of the last node with text
                last_text_node = None
                last_tail_node = None
                # Need to find the last node with a tail, OR the last node
                # with text if it's later
                for node in fragment.iter():
                    if node.tail:
                        last_tail_node = node
                        last_text_node = None
                    elif node.text:
                        last_text_node = node
                        last_tail_node = None

                if last_text_node is not None:
                    last_text_node.text += '...'
                if last_tail_node is not None:
                    last_tail_node.tail += '...'

                # Serialize
                content = lxml.html.tostring(fragment)

            content = helpers.literal(content)

            update = FrontPageRSS(
                source = self,
                time = timestamp,
                content = content,
                entry = entry,
            )
            updates.append(update)

        return updates


FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'email', 'time', 'subject', 'repo'])

class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository
        PATHS, not arbitrary git URLs.  Only the first one will be checked
        for the list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with
        ``repo_paths``.  Used for constructing gitweb URLs and identifying
        the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``bug_tracker``
        URL to a bug tracker; anything matching "#xxx" will be converted into
        a link to this.  Should contain a "{0}", which will be replaced by
        the bug number.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, bug_tracker=None,
            tag_pattern=None, **kwargs):

        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.bug_tracker = bug_tracker
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        return self.repo_paths[0]

    def _poll(self, limit, max_age):
        # Fetch the main repo's git tags
        git_dir = '--git-dir=' + self.repo_paths[0]
        args = [
            'git',
            git_dir,
            'tag', '-l',
        ]
        if self.tag_pattern:
            args.append(self.tag_pattern)

        git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        tags = git_output.strip().split('\n')

        # Tags come out in alphabetical order, which means earliest first.
        # Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the
        # most recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            args = [
                'git',
                git_dir,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            ]
            tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
            tag_unixtime, tag_timezone = tag_timestamp.split(None, 1)
            tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, email, commit timestamp, subject.
                git_log_args = [
                    'git',
                    '--git-dir=' + repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%aE%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                ]
                proc = subprocess.Popen(git_log_args, stdout=PIPE)
                for line in proc.stdout:
                    hash, author, email, time, subject \
                        = line.strip().decode('utf8').split('\x00')

                    # Convert bug numbers in subject to URLs
                    if self.bug_tracker:
                        subject = helpers.literal(
                            re.sub(ur'#(\d+)', self._linkify_bug_number,
                                subject)
                        )

                    commits.append(
                        FrontPageGitCommit(
                            hash = hash,
                            author = author,
                            email = email,
                            time = datetime.datetime.fromtimestamp(int(time)),
                            subject = subject,
                            repo = repo_name,
                        )
                    )

            update = FrontPageGit(
                source = self,
                time = tagged_timestamp,
                log = commits,
                tag = tag,
            )
            updates.append(update)

        return updates

    def _linkify_bug_number(self, match):
        """Regex replace function for changing bug numbers into links."""
        n = match.group(1)
        bug_url = self.bug_tracker.format(n)
        return helpers.literal(
            u"""<a href="{0}">{1}</a>""".format(bug_url, match.group(0)))