dc97bf3f9ce9338b2f136cfc541ac40adaf1dade
[zzz-spline-frontpage.git] / splinext / frontpage / sources.py
1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import subprocess
8 from subprocess import PIPE
9
10 import feedparser
11 import lxml.html
12
13 from pylons import cache
14
15 from spline.lib import helpers
16
def max_age_to_datetime(max_age):
    """Convert a configured maximum age into an absolute cutoff time.

    ``max_age`` is specified in config as a number of seconds old.  This
    function takes that number (anything ``int()`` accepts) and returns the
    corresponding naive datetime: the current local time minus that many
    seconds.

    Returns ``None`` when ``max_age`` is ``None``, meaning "no age limit".
    """
    if max_age is None:
        return None

    seconds = int(max_age)

    # The cutoff is computed once, relative to the moment of the call
    return datetime.datetime.now() - datetime.timedelta(seconds=seconds)

27
class Source(object):
    """Represents a source to be polled for updates.  Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional; ``None`` means this source imposes no limit of its own.

    ``max_age``
        Items older than this age (in seconds) will be excluded.  Optional.
        Stored as the datetime cutoff returned by ``max_age_to_datetime``.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        # ``limit`` is optional, so only coerce it when one was given;
        # int(None) would raise TypeError
        self.limit = None if limit is None else int(limit)
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        """Periodic hook.  Plain sources have nothing to do here."""
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling global and source item
        limit and max age.

        Either value of each pair may be ``None``, meaning "unrestricted"; a
        ``None`` never overrides a real restriction.  Returns whatever
        ``_poll`` returns.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins, ignoring limits that were not set
        limits = [
            candidate for candidate in (self.limit, global_limit)
            if candidate is not None
        ]
        limit = min(limits) if limits else None

        # Latest max age wins, ignoring cutoffs that were not set.  Comparing
        # against None directly is avoided: it is an error on Python 3
        cutoffs = [
            candidate for candidate in (self.max_age, global_max_age)
            if candidate is not None
        ]
        max_age = max(cutoffs) if cutoffs else None

        return self._poll(limit, max_age)

    def _poll(self, limit, max_age):
        """Implementation of polling for updates.  Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties.  A namedtuple works well.

        ``limit`` is the maximum number of items to return and ``max_age`` is
        a datetime cutoff; either may be ``None``.
        """
        raise NotImplementedError

86
class CachedSource(Source):
    """A ``Source`` whose updates are served out of a cache.

    From the outside this behaves like any other ``Source``: ``poll`` returns
    a list of updates.  The difference is that ``poll`` never runs your
    ``_poll``.  Instead ``do_cron`` (invoked by the spline cron) runs
    ``_poll`` and stores the result, and ``poll`` merely reads back whatever
    was stored last.

    Subclasses must provide ``_cache_key``, returning a key that identifies
    this particular object.  It is prefixed with the class, so it only has to
    be unique among sources of the same class.

    Subclasses may also override ``poll_frequency``, the number of minutes
    between pollings.  The default is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        """Return the full cache key: the class repr, a colon, then the
        subclass-supplied ``_cache_key()``.
        """
        return ':'.join((repr(type(self)), self._cache_key()))

    def _cache_key(self):
        """Return a key identifying this source within its class."""
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        """Refresh the cached updates, but only on tics that are a multiple
        of ``poll_frequency``.
        """
        is_due = tic % self.poll_frequency == 0
        if is_due:
            fresh_updates = self._poll(self.limit, self.max_age)
            frontpage_cache = cache.get_cache('spline-frontpage')
            frontpage_cache[self.cache_key()] = fresh_updates

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates; the arguments are not consulted."""
        try:
            frontpage_cache = cache.get_cache('spline-frontpage')
            cached_updates = frontpage_cache[self.cache_key()]
        except KeyError:
            # Nothing has been cached under this key yet
            return []
        return cached_updates

132
133
# Update record built by ``FeedSource._poll``, one per feed entry
FrontPageRSS = namedtuple('FrontPageRSS', 'source time entry content')
class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    # Number of characters of an entry's full content kept when a summary has
    # to be generated
    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        # The title is optional for feeds: when it is left empty, ``_poll``
        # fills it in from the feed itself
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        """The feed URL identifies this source among feed sources."""
        return self.feed_url

    def _summarize(self, full_html):
        """Cut ``full_html`` down to at most ``SUMMARY_LENGTH`` characters and
        return it re-serialized by lxml, which repairs the tag nesting broken
        by the cut.  An ellipsis is appended only when something was actually
        cut off.  Returns an empty string for empty content.
        """
        was_truncated = len(full_html) > self.SUMMARY_LENGTH
        broken_html = full_html[:self.SUMMARY_LENGTH]
        if not broken_html.strip():
            # lxml refuses to parse an empty document, and there is nothing
            # to show anyway
            return u''

        fragment = lxml.html.fromstring(broken_html)

        if was_truncated:
            # Insert an ellipsis at the end of the last node with text.
            # Need to find the last node with a tail, OR the last node with
            # text if it's later
            last_text_node = None
            last_tail_node = None
            for node in fragment.iter():
                if node.tail:
                    last_tail_node = node
                    last_text_node = None
                elif node.text:
                    last_text_node = node
                    last_tail_node = None

            # At most one of the two is set; neither is if there is no text
            if last_text_node is not None:
                last_text_node.text += '...'
            elif last_tail_node is not None:
                last_tail_node.tail += '...'

        # Serialize
        return lxml.html.tostring(fragment)

    def _poll(self, limit, max_age):
        """Fetch the feed and return a list of ``FrontPageRSS`` updates.

        At most ``limit`` entries are examined.  Processing stops at the first
        entry older than ``max_age``.  Entries without a parsed date are
        skipped, since an update needs a ``time``.
        """
        feed = feedparser.parse(self.feed_url)

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- prefer the published date, fall back to the
            # updated date.  Both come out as time tuples, which
            # datetime.datetime() can read.  Either may be missing or None
            timestamp_tuple = (entry.get('published_parsed')
                               or entry.get('updated_parsed'))
            if not timestamp_tuple:
                continue
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries are assumed to be newest-first, so everything after
                # the first expired entry is expired too and we can bail
                break

            # Try to find something to show!  Default to the summary, if there
            # is one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually a
                # summary
                content = entry.summary
            elif 'content' in entry:
                # Full content is way too much, especially for giant blog
                # posts, so cut it down
                content = self._summarize(entry.content[0].value)

            # NOTE(review): this marks feed-supplied HTML as safe for the
            # template -- confirm the feed (or feedparser's sanitizing) is
            # trusted
            content = helpers.literal(content)

            update = FrontPageRSS(
                source = self,
                time = timestamp,
                content = content,
                entry = entry,
            )
            updates.append(update)

        return updates

226
227
# Update record built by ``GitSource._poll``: one per tag, with ``log``
# holding the list of commits leading up to that tag
FrontPageGit = namedtuple('FrontPageGit', 'source time log tag')

# A single commit inside a ``FrontPageGit`` log
FrontPageGitCommit = namedtuple(
    'FrontPageGitCommit',
    'hash author time subject repo',
)
231
class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository PATHS,
        not arbitrary git URLs.  Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, tag_pattern=None, **kwargs):
        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        """The main repository path identifies this source."""
        return self.repo_paths[0]

    @staticmethod
    def _git(repo_path, *git_args):
        """Run ``git --git-dir=<repo_path> <git_args...>`` and return its
        standard output.

        ``communicate()`` reads the output to the end and waits for the child
        to exit, so no process or pipe is left dangling.
        """
        args = ['git', '--git-dir=' + repo_path]
        args.extend(git_args)
        output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        return output

    def _poll(self, limit, max_age):
        """Return a list of ``FrontPageGit`` updates, one per interesting tag.

        Each update carries the commits between the previous tag and that tag,
        gathered from every configured repository.  At most ``limit`` tags are
        reported and processing stops at the first tag older than ``max_age``.
        Tags without a tagger date are skipped.
        """
        main_repo = self.repo_paths[0]

        # Fetch the main repo's git tags
        tag_args = ['tag', '-l']
        if self.tag_pattern:
            tag_args.append(self.tag_pattern)
        tags = self._git(main_repo, *tag_args).strip().split('\n')

        # Tags come out in alphabetical order, which is taken to mean earliest
        # first.  Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the most
        # recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        # Pair each tag with the one that precedes it chronologically
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            tag_timestamp = self._git(
                main_repo,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            )
            timestamp_fields = tag_timestamp.split(None, 1)
            if not timestamp_fields:
                # No tagger date came back (e.g. not an annotated tag), so
                # there is no time to attach to an update
                continue
            tagged_timestamp = datetime.datetime.fromtimestamp(
                int(timestamp_fields[0]))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, commit timestamp, subject.
                log_output = self._git(
                    repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                )
                for line in log_output.split('\n'):
                    line = line.strip()
                    if not line:
                        # Trailing newline leaves an empty final piece
                        continue

                    commit_hash, author, commit_time, subject = \
                        line.split('\x00')
                    commits.append(
                        FrontPageGitCommit(
                            hash = commit_hash,
                            author = author,
                            time = datetime.datetime.fromtimestamp(
                                int(commit_time)),
                            subject = subject,
                            repo = repo_name,
                        )
                    )

            update = FrontPageGit(
                source = self,
                time = tagged_timestamp,
                log = commits,
                tag = tag,
            )
            updates.append(update)

        return updates
346
347 return updates