43d8db8338fb62cecdaa5caceaad9dc0d54a097a
[zzz-spline-frontpage.git] / splinext / frontpage / sources.py
1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import subprocess
8 from subprocess import PIPE
9 from urllib2 import URLError
10
11 import feedparser
12 import lxml.html
13
14 from pylons import cache
15
16 from spline.lib import helpers
17
def max_age_to_datetime(max_age):
    """Convert a configured maximum age into an absolute cutoff time.

    ``max_age`` is specified in config as a number of seconds old.  This
    function takes that number (anything ``int()`` accepts) and returns the
    naive local ``datetime`` lying that many seconds before now.

    Returns ``None`` when ``max_age`` is ``None``, i.e. when no age limit was
    given.
    """
    # Identity test: ``None`` is a singleton, and ``==`` could be fooled by an
    # object with a custom ``__eq__``
    if max_age is None:
        return None

    return datetime.datetime.now() - datetime.timedelta(seconds=int(max_age))
29
30
class Source(object):
    """Represents a source to be polled for updates.  Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional; ``None`` means this source imposes no limit of its own.

    ``max_age``
        Items older than this age (in seconds) will be excluded.  Optional.
        Stored as the cutoff ``datetime`` computed when the source is
        constructed, or ``None`` if no age was given.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        # ``limit`` is optional, so only coerce it when one was actually given;
        # int(None) would raise TypeError
        self.limit = None if limit is None else int(limit)
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        """Cron hook.  Accepts and ignores any arguments and does nothing;
        subclasses that need periodic work override this.
        """
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling global and source item
        limit and max age.

        Either value of each pair may be ``None``, meaning "not set"; an unset
        value never overrides a set one.  If both are unset, ``None`` is passed
        through to ``_poll``.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins, ignoring limits that aren't set
        limits = [value for value in (self.limit, global_limit)
                  if value is not None]
        limit = min(limits) if limits else None

        # Latest max age wins, again ignoring unset ones.  Filtering explicitly
        # avoids relying on how None orders against other values
        cutoffs = [value for value in (self.max_age, global_max_age)
                   if value is not None]
        max_age = max(cutoffs) if cutoffs else None

        return self._poll(limit, max_age)

    def _poll(self, limit, max_age):
        """Implementation of polling for updates.  Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties.  A namedtuple works well.

        ``limit`` is the reconciled item limit and ``max_age`` the reconciled
        cutoff datetime; either may be ``None``.
        """
        raise NotImplementedError
89
class CachedSource(Source):
    """Supports caching a source's updates in memcache.

    On the surface, this functions just like any other ``Source``.  Calling
    ``poll`` still returns a list of updates.  However, ``poll`` does not call
    your ``_poll``; instead, your implementation is called by the spline cron,
    and the results are cached.  ``poll`` then returns the contents of the
    cache.

    You must define a ``_cache_key`` method that returns a key uniquely
    identifying this object.  Your key will be combined with the class name, so
    it only needs to be unique for that source, not globally.

    You may also override ``poll_frequency``, the number of minutes between
    pollings.  By default, this is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        """Return the full cache key: the repr of this object's class, a
        colon, and whatever ``_cache_key`` returns.
        """
        return repr(type(self)) + ':' + self._cache_key()

    def _cache_key(self):
        """Return a string identifying this particular source.  Must be
        implemented by subclasses.
        """
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        """Poll the source and store the result in the cache.

        Does nothing unless ``tic`` is a multiple of ``poll_frequency``.  The
        source's own ``limit`` and ``max_age`` are used for the poll.  Any
        failure while polling or caching is logged and otherwise ignored, so
        the previously cached updates stay in place.
        """
        if tic % self.poll_frequency != 0:
            # Too early!
            return

        try:
            updates = self._poll(self.limit, self.max_age)
            cache.get_cache('spline-frontpage')[self.cache_key()] = updates
        except Exception:
            # Hmm, polling broke.  Be conservative and keep the old data, which
            # is probably still OK for now -- but leave a trace in the log so
            # a permanently broken source doesn't go unnoticed.  The class
            # name is logged rather than cache_key(), since computing the key
            # may be the very thing that failed.
            import logging
            logging.getLogger(__name__).exception(
                "Polling front page source %s failed; keeping cached updates",
                type(self).__name__)

        return

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates.

        Returns whatever ``do_cron`` last stored, or an empty list if nothing
        has been cached yet.  ``global_limit`` and ``global_max_age`` are
        accepted for compatibility with ``Source.poll`` but are not applied
        here.
        """
        try:
            return cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Haven't cached anything yet, apparently
            return []
140
141
# One update from a feed: the owning source, the entry's timestamp, the raw
# feedparser entry, and the HTML content chosen for display
FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])


class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    # Number of characters of an entry's full content kept when a summary has
    # to be generated
    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        # The title is optional for feeds; when missing it is filled in from
        # the feed itself on the first poll
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        """The feed URL identifies this source."""
        return self.feed_url

    def _summarize(self, full_html):
        """Return at most ``SUMMARY_LENGTH`` characters of ``full_html``, run
        through lxml.html to fix tag nesting.  An ellipsis is appended to the
        last piece of text only if something was actually cut off.
        """
        broken_html = full_html[:self.SUMMARY_LENGTH]
        if not broken_html.strip():
            # Nothing to show, and lxml refuses to parse an empty document
            return u''

        fragment = lxml.html.fromstring(broken_html)

        if len(full_html) > self.SUMMARY_LENGTH:
            # Insert an ellipsis at the end of the last node with text.
            # Need to find the last node with a tail, OR the last node with
            # text if it's later
            last_text_node = None
            last_tail_node = None
            for node in fragment.iter():
                if node.tail:
                    last_tail_node = node
                    last_text_node = None
                elif node.text:
                    last_text_node = node
                    last_tail_node = None

            if last_text_node is not None:
                last_text_node.text += '...'
            if last_tail_node is not None:
                last_tail_node.tail += '...'

        # Serialize
        return lxml.html.tostring(fragment)

    def _poll(self, limit, max_age):
        """Fetch the feed and return a list of ``FrontPageRSS`` updates.

        At most ``limit`` entries are examined, and processing stops at the
        first entry older than ``max_age``.  Entries without a parsed date are
        skipped.  Raises the underlying ``URLError`` if the feed could not be
        fetched.
        """
        feed = feedparser.parse(self.feed_url)

        if feed.bozo and isinstance(feed.bozo_exception, URLError):
            # Feed is DOWN.  Bail here; otherwise, old entries might be lost
            # just because, say, Bulbanews is down yet again
            raise feed.bozo_exception

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- prefer published, fall back to updated.  Both
            # come out as time tuples, which datetime.datetime() can read
            timestamp_tuple = (entry.get('published_parsed')
                               or entry.get('updated_parsed'))
            if not timestamp_tuple:
                # No usable date, so this entry can't be placed on the front
                # page; skip it rather than abandoning the whole feed
                continue
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries are assumed to be newest-first, so everything after
                # the first expired entry is expired as well
                break

            # Try to find something to show!  Default to the summary, if there
            # is one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually a
                # summary
                content = entry.summary
            elif 'content' in entry:
                # Full content is way too much, especially for my giant blog
                # posts, so cut it down to size
                content = self._summarize(entry.content[0].value)

            content = helpers.literal(content)

            updates.append(FrontPageRSS(
                source=self,
                time=timestamp,
                content=content,
                entry=entry,
            ))

        return updates
239
240
# One update from a git source: the owning source, the time the tag was
# created, the list of commits leading up to it, and the tag name
FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
# One commit within such an update
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'email', 'time', 'subject', 'repo'])


class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository PATHS,
        not arbitrary git URLs.  Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, tag_pattern=None, **kwargs):
        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        """The main (first) repository path identifies this source."""
        return self.repo_paths[0]

    def _git(self, repo_path, *git_args):
        """Run git with ``git_args`` against the repository at ``repo_path``
        and return everything it wrote to standard output.
        """
        command = ['git', '--git-dir=' + repo_path]
        command.extend(git_args)
        # communicate() reads the pipe to the end and waits for the child, so
        # no process or file descriptor is left dangling
        output, _ = subprocess.Popen(command, stdout=PIPE).communicate()
        return output

    def _poll(self, limit, max_age):
        """Return a list of ``FrontPageGit`` updates, one per tag, newest tag
        first.

        At most ``limit`` tags are reported, and processing stops at the first
        tag created before ``max_age``.  Tags for which git reports no tagger
        date are skipped.
        """
        main_repo = self.repo_paths[0]

        # Fetch the main repo's git tags
        tag_args = ['tag', '-l']
        if self.tag_pattern:
            tag_args.append(self.tag_pattern)
        tags = self._git(main_repo, *tag_args).strip().split('\n')

        # Tags come out in alphabetical order, which is assumed to mean
        # earliest first.  Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the most
        # recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            tag_timestamp = self._git(
                main_repo,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            )
            timestamp_fields = tag_timestamp.split()
            if not timestamp_fields:
                # No tagger date came back (e.g. the ref is not an annotated
                # tag); skip it instead of letting one odd tag break the poll
                continue
            tagged_timestamp = datetime.datetime.fromtimestamp(
                int(timestamp_fields[0]))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, author's email, commit timestamp,
                # subject.
                git_log = self._git(
                    repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%aE%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                )
                for line in git_log.split('\n'):
                    line = line.strip()
                    if not line:
                        # Blank line, e.g. after the final newline
                        continue

                    commit_hash, author, email, commit_time, subject = \
                        line.split('\x00')
                    commits.append(
                        FrontPageGitCommit(
                            hash=commit_hash,
                            author=author,
                            email=email,
                            time=datetime.datetime.fromtimestamp(
                                int(commit_time)),
                            subject=subject,
                            repo=repo_name,
                        )
                    )

            updates.append(FrontPageGit(
                source=self,
                time=tagged_timestamp,
                log=commits,
                tag=tag,
            ))

        return updates
360
361 return updates