1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import subprocess
8 from subprocess import PIPE
9
10 import feedparser
11 import lxml.html
12
13 from pylons import cache
14
15 from spline.lib import helpers
16
17 def max_age_to_datetime(max_age):
18 """``max_age`` is specified in config as a number of seconds old. This
19 function takes that number and returns a corresponding datetime object.
20 """
21 if max_age == None:
22 return None
23
24 dt = datetime.datetime.now()
25 dt -= datetime.timedelta(seconds=int(max_age))
26
27 return dt
28
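# For illustration: a max_age of 86400 seconds means "only items from the
# last day"; anything older than the returned cutoff gets excluded.
#
#     cutoff = max_age_to_datetime('86400')
#     # roughly datetime.datetime.now() - datetime.timedelta(days=1)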

class Source(object):
    """Represents a source to be polled for updates. Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional.

    ``max_age``
        Items older than this age (in seconds) will be excluded. Optional.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        # ``limit`` is optional; leave it as None if it wasn't configured
        self.limit = int(limit) if limit is not None else None
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling the global and
        per-source item limit and max age.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins; a limit of None means no limit at all
        limits = [x for x in (self.limit, global_limit) if x is not None]
        limit = min(limits) if limits else None

        # Latest max age wins. Note that either could be None, but that's
        # fine, because None is less than everything else
        max_age = max(self.max_age, global_max_age)

        return self._poll(limit, max_age)

    def _poll(self, limit, max_age):
        """Implementation of polling for updates. Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties. A namedtuple works well.
        """
        raise NotImplementedError

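# A minimal sketch of the ``Source`` contract, with hypothetical names used
# purely for illustration: ``_poll`` returns objects carrying ``source`` and
# ``time``, and ``template`` points at a Mako template that renders them.
#
#     FrontPageNote = namedtuple('FrontPageNote', ['source', 'time', 'text'])
#
#     class NoteSource(Source):
#         template = '/front_page/note.mako'  # hypothetical template path
#
#         def _poll(self, limit, max_age):
#             note = FrontPageNote(source=self, time=datetime.datetime.now(),
#                                  text=u'hello')
#             return [note]
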
class CachedSource(Source):
    """Supports caching a source's updates in memcache.

    On the surface, this functions just like any other ``Source``. Calling
    ``poll`` still returns a list of updates. However, ``poll`` does not call
    your ``_poll``; instead, your implementation is called by the spline cron,
    and the results are cached. ``poll`` then returns the contents of the
    cache.

    You must define a ``_cache_key`` method that returns a key uniquely
    identifying this object. Your key will be combined with the class name, so
    it only needs to be unique for that source, not globally.

    You may also override ``poll_frequency``, the number of minutes between
    pollings. By default, this is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        return repr(type(self)) + ':' + self._cache_key()

    def _cache_key(self):
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        if tic % self.poll_frequency != 0:
            # Too early!
            return

        updates = self._poll(self.limit, self.max_age)
        cache.get_cache('spline-frontpage')[self.cache_key()] = updates

        return

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates."""
        try:
            return cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Haven't cached anything yet, apparently
            return []


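# A sketch of the ``CachedSource`` contract, again with hypothetical names:
# define ``_cache_key`` so the cron job can store results under a stable key,
# and optionally override ``poll_frequency``.
#
#     class WeatherSource(CachedSource):
#         template = '/front_page/weather.mako'  # hypothetical
#         poll_frequency = 30  # minutes between cron-driven polls
#
#         def _cache_key(self):
#             return 'weather'  # only needs to be unique for this source
#
#         def _poll(self, limit, max_age):
#             return []  # fetch updates here; cron caches whatever is returned
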
FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])
class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        return self.feed_url

    def _poll(self, limit, max_age):
        feed = feedparser.parse(self.feed_url)

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read
            try:
                timestamp_tuple = entry.published_parsed
            except AttributeError:
                timestamp_tuple = entry.updated_parsed
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries should be newest-first, so we can bail at the first
                # expired entry
                break

            # Try to find something to show! Default to the summary, if there is
            # one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually a
                # summary
                content = entry.summary
            elif 'content' in entry:
                # Full content is way too much, especially for my giant blog posts.
                # Cut this down to some arbitrary number of characters, then feed
                # it to lxml.html to fix tag nesting
                broken_html = entry.content[0].value[:self.SUMMARY_LENGTH]
                fragment = lxml.html.fromstring(broken_html)

                # Insert an ellipsis at the end of the last node with text
                last_text_node = None
                last_tail_node = None
                # Need to find the last node with a tail, OR the last node with
                # text if it's later
                for node in fragment.iter():
                    if node.tail:
                        last_tail_node = node
                        last_text_node = None
                    elif node.text:
                        last_text_node = node
                        last_tail_node = None

                if last_text_node is not None:
                    last_text_node.text += '...'
                if last_tail_node is not None:
                    last_tail_node.tail += '...'

                # Serialize
                content = lxml.html.tostring(fragment)

            content = helpers.literal(content)

            update = FrontPageRSS(
                source = self,
                time = timestamp,
                content = content,
                entry = entry,
            )
            updates.append(update)

        return updates

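# How a feed source might be instantiated; the values are made up for
# illustration, and in practice they come from the frontpage configuration:
#
#     source = FeedSource(
#         feed_url='http://example.com/atom.xml',
#         icon='feed',
#         link='http://example.com/',
#         limit=5,
#     )
#     # title was omitted, so it is filled in from the feed itself when first polled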

FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'time', 'subject', 'repo'])

class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them. If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories. These must be repository PATHS,
        not arbitrary git URLs. Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``tag_pattern``
        Optional. A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, tag_pattern=None, **kwargs):
        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        return self.repo_paths[0]

    def _poll(self, limit, max_age):
        # Fetch the main repo's git tags
        git_dir = '--git-dir=' + self.repo_paths[0]
        args = [
            'git',
            git_dir,
            'tag', '-l',
        ]
        if self.tag_pattern:
            args.append(self.tag_pattern)

        git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        tags = git_output.strip().split('\n')

        # Tags come out in alphabetical order, which means earliest first. Reverse
        # it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the most
        # recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            args = [
                'git',
                git_dir,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            ]
            tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
            tag_unixtime, tag_timezone = tag_timestamp.split(None, 1)
            tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, commit timestamp, subject.
                git_log_args = [
                    'git',
                    '--git-dir=' + repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                ]
                proc = subprocess.Popen(git_log_args, stdout=PIPE)
                for line in proc.stdout:
                    hash, author, time, subject = line.strip().split('\x00')
                    commits.append(
                        FrontPageGitCommit(
                            hash = hash,
                            author = author,
                            time = datetime.datetime.fromtimestamp(int(time)),
                            subject = subject,
                            repo = repo_name,
                        )
                    )

            update = FrontPageGit(
                source = self,
                time = tagged_timestamp,
                log = commits,
                tag = tag,
            )
            updates.append(update)

        return updates
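
# How a git source might be configured; paths, names, and URLs are invented
# for illustration:
#
#     source = GitSource(
#         repo_paths='/srv/git/spline.git /srv/git/spline-frontpage.git',
#         repo_names='spline spline-frontpage',
#         gitweb='http://git.example.com/',
#         tag_pattern='v*',
#         title='Site code',
#         icon='script',
#         link='http://git.example.com/',
#         limit=3,
#     )
#     # Each resulting update is one tag's worth of commits, newest tag first.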