No need for ellipses on short RSS entries.
[zzz-spline-frontpage.git] / splinext / frontpage / sources.py
1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import re
8 import subprocess
9 from subprocess import PIPE
10 from urllib2 import URLError
11
12 import feedparser
13 import lxml.html
14
15 from pylons import cache
16
17 from spline.lib import helpers
18
def max_age_to_datetime(max_age):
    """Convert a maximum age in seconds into an absolute cutoff datetime.

    ``max_age`` is specified in config as a number of seconds old; it is
    coerced with ``int``, so a numeric string is accepted as well.

    Returns the naive local datetime lying ``max_age`` seconds before now, or
    ``None`` when ``max_age`` is ``None`` (i.e. no age restriction).
    """
    # Identity test: ``== None`` can be fooled by objects overriding __eq__
    if max_age is None:
        return None

    age = datetime.timedelta(seconds=int(max_age))
    return datetime.datetime.now() - age
30
31
class Source(object):
    """Represents a source to be polled for updates.  Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional; ``None`` means this source imposes no limit of its own.

    ``max_age``
        Items older than this age (in seconds) will be excluded.  Optional.
        Stored on the instance as an absolute cutoff datetime, or ``None``.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, config, title, icon, link, limit=None, max_age=None):
        # ``config`` is accepted for the caller's benefit but is not used here
        self.title = title
        self.icon = icon
        self.link = link
        # ``limit`` is optional, and int(None) would raise TypeError
        self.limit = None if limit is None else int(limit)
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        """Cron hook.  Does nothing for a plain source."""
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling global and source item
        limit and max age.

        Either value of each pair may be ``None``, meaning "unrestricted"; a
        ``None`` never overrides a real restriction.  Returns whatever
        ``_poll`` returns.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins.  Nones are filtered out explicitly rather than
        # relying on how None happens to order against integers.
        limits = [
            value for value in (self.limit, global_limit) if value is not None
        ]
        limit = min(limits) if limits else None

        # Latest max age wins, again ignoring any missing value
        cutoffs = [
            value for value in (self.max_age, global_max_age)
            if value is not None
        ]
        max_age = max(cutoffs) if cutoffs else None

        return self._poll(limit, max_age)

    def _poll(self, limit, max_age):
        """Implementation of polling for updates.  Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties.  A namedtuple works well.

        ``limit`` is the reconciled item limit and ``max_age`` the reconciled
        cutoff datetime; either may be ``None``.
        """
        raise NotImplementedError
90
class CachedSource(Source):
    """A ``Source`` whose updates are cached (in memcache) instead of being
    computed on every request.

    Seen from outside, this behaves like any other ``Source``: ``poll`` hands
    back a list of updates.  The difference is that ``poll`` never invokes
    your ``_poll``.  ``_poll`` is run by the spline cron via ``do_cron``, its
    result is stored in the ``spline-frontpage`` cache, and ``poll`` only
    returns what is stored there.

    Subclasses must provide ``_cache_key``, returning a key that identifies
    the object.  The key is prefixed with the class, so it only has to be
    unique among sources of the same class, not globally.

    Subclasses may also override ``poll_frequency``, the number of minutes
    between pollings.  The default is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        """Return the full cache key: the class repr, a colon, then the
        subclass-specific ``_cache_key()``.
        """
        class_prefix = repr(type(self))
        return class_prefix + ':' + self._cache_key()

    def _cache_key(self):
        """Return a string identifying this object among its own class."""
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        """Re-poll and refresh the cache whenever ``tic`` is a multiple of
        ``poll_frequency``; otherwise do nothing.

        Only the source's own ``limit`` and ``max_age`` are applied here.
        """
        is_due = tic % self.poll_frequency == 0
        if is_due:
            fresh_updates = self._poll(self.limit, self.max_age)
            frontpage_cache = cache.get_cache('spline-frontpage')
            frontpage_cache[self.cache_key()] = fresh_updates

        return

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates.

        Both arguments are accepted for interface compatibility and ignored.
        Returns an empty list when nothing has been cached yet.
        """
        try:
            cached = cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Nothing has been stored under this key so far
            cached = []
        return cached
136
137
FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])


class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    # Full entry content longer than this many characters is truncated
    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        # The title is optional for feeds; it is filled in from the feed
        # itself the first time the feed is polled
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        return self.feed_url

    def _poll(self, limit, max_age):
        """Fetch the feed and return a list of ``FrontPageRSS`` updates.

        At most ``limit`` entries are examined, and processing stops at the
        first entry older than ``max_age``.  Raises the underlying
        ``URLError`` if the feed could not be fetched.
        """
        feed = feedparser.parse(self.feed_url)

        if feed.bozo and isinstance(feed.bozo_exception, URLError):
            # Feed is DOWN.  Bail here; otherwise, old entries might be lost
            # just because, say, Bulbanews is down yet again
            raise feed.bozo_exception

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read
            try:
                timestamp_tuple = entry.published_parsed
            except AttributeError:
                timestamp_tuple = entry.updated_parsed
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Bail after the first expired entry.
                # NOTE(review): this assumes the feed lists entries
                # newest-first, so everything after this one is older still
                # -- confirm for the feeds in use.
                break

            content = helpers.literal(self._summarize(entry))

            update = FrontPageRSS(
                source=self,
                time=timestamp,
                content=content,
                entry=entry,
            )
            updates.append(update)

        return updates

    def _summarize(self, entry):
        """Return the markup to display for ``entry``.

        Preference order: the entry's summary, then its full content if that
        is no longer than ``SUMMARY_LENGTH``, then a truncated copy of the
        full content with an ellipsis appended.  Returns an empty string if
        the entry has neither a summary nor content.
        """
        if 'summary' in entry:
            # If there be a summary, cheerfully trust that it's actually a
            # summary
            return entry.summary

        if 'content' not in entry:
            return u''

        full_html = entry.content[0].value
        if len(full_html) <= self.SUMMARY_LENGTH:
            # Full content is short; use as-is, with no ellipsis
            return full_html

        # Full content is way too much, especially for giant blog posts.
        # Cut this down to some arbitrary number of characters, then feed
        # it to lxml.html to fix tag nesting
        broken_html = full_html[:self.SUMMARY_LENGTH]
        fragment = lxml.html.fromstring(broken_html)

        # Insert an ellipsis at the end of the last node with text.
        # Need to find the last node with a tail, OR the last node with
        # text if it's later; at most one of the two is set at any time.
        last_text_node = None
        last_tail_node = None
        for node in fragment.iter():
            if node.tail:
                last_tail_node = node
                last_text_node = None
            elif node.text:
                last_text_node = node
                last_tail_node = None

        if last_text_node is not None:
            last_text_node.text += '...'
        if last_tail_node is not None:
            last_tail_node.tail += '...'

        # Serialize
        return lxml.html.tostring(fragment)
240
241
FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'email', 'time', 'subject', 'repo'])


class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository PATHS,
        not arbitrary git URLs.  Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``bug_tracker``
        URL to a bug tracker; anything matching "#xxx" will be converted into a
        link to this.  Should contain a "{0}", which will be replaced by the
        bug number.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    # Matches a bug reference such as "#123"; group 1 is the number
    _BUG_NUMBER_RE = re.compile(r'#(\d+)')

    def __init__(self, repo_paths, repo_names, gitweb, bug_tracker=None,
            tag_pattern=None, **kwargs):

        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.bug_tracker = bug_tracker
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        return self.repo_paths[0]

    def _poll(self, limit, max_age):
        """Return a list of ``FrontPageGit`` updates, one per tag, most recent
        tag first.

        Each update carries the commits between that tag and the one before
        it, gathered from every configured repository.  At most ``limit``
        tags are reported, and processing stops at the first tag created
        before ``max_age``.
        """
        # Fetch the main repo's git tags
        git_dir = '--git-dir=' + self.repo_paths[0]
        args = [
            'git',
            git_dir,
            'tag', '-l',
        ]
        if self.tag_pattern:
            args.append(self.tag_pattern)

        git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        tags = git_output.strip().split('\n')

        # Tags come out in alphabetical order, which means earliest first.
        # Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the most
        # recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            args = [
                'git',
                git_dir,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            ]
            tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
            # The timezone offset is parsed out but deliberately not used
            tag_unixtime, _tag_timezone = tag_timestamp.split(None, 1)
            tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []
            revision_range = "{0}..{1}".format(since_tag, tag)

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, author's email, commit timestamp,
                # subject.
                git_log_args = [
                    'git',
                    '--git-dir=' + repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%aE%x00%at%x00%s',
                    revision_range,
                ]
                proc = subprocess.Popen(git_log_args, stdout=PIPE)
                try:
                    for line in proc.stdout:
                        commit_hash, author, email, commit_time, subject \
                            = line.strip().decode('utf8').split('\x00')

                        # Convert bug numbers in subject to URLs.
                        # NOTE(review): presumably helpers.literal marks its
                        # argument as safe markup; if so, the rest of the
                        # subject goes out unescaped -- confirm and escape
                        # it if commit subjects are not trusted.
                        if self.bug_tracker:
                            subject = helpers.literal(
                                self._BUG_NUMBER_RE.sub(
                                    self._linkify_bug_number, subject)
                            )

                        commits.append(
                            FrontPageGitCommit(
                                hash=commit_hash,
                                author=author,
                                email=email,
                                time=datetime.datetime.fromtimestamp(
                                    int(commit_time)),
                                subject=subject,
                                repo=repo_name,
                            )
                        )
                finally:
                    # Close the pipe and reap the child so that repeated
                    # polling does not accumulate defunct git processes
                    proc.stdout.close()
                    proc.wait()

            update = FrontPageGit(
                source=self,
                time=tagged_timestamp,
                log=commits,
                tag=tag,
            )
            updates.append(update)

        return updates

    def _linkify_bug_number(self, match):
        """Regex replace function for changing bug numbers into links.

        ``match`` is a match of "#" followed by digits; the digits are
        substituted into ``bug_tracker`` to build the link target, and the
        whole matched text becomes the link text.
        """
        bug_url = self.bug_tracker.format(match.group(1))
        return helpers.literal(
            u"""<a href="{0}">{1}</a>""".format(bug_url, match.group(0)))
384 return helpers.literal(
385 u"""<a href="{0}">{1}</a>""".format(bug_url, match.group(0)))