1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import re
8 import subprocess
9 from subprocess import PIPE
10 from urllib2 import URLError
11
12 import feedparser
13 import lxml.html
14
15 from pylons import cache
16
17 from spline.lib import helpers
18

def max_age_to_datetime(max_age):
    """``max_age`` is specified in config as a number of seconds old. This
    function takes that number and returns a corresponding datetime object.
    """
    if max_age is None:
        return None

    dt = datetime.datetime.now()
    dt -= datetime.timedelta(seconds=int(max_age))

    return dt
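
# A quick sketch of the conversion (illustrative values; the exact result
# depends on the time of the call):
#
#     max_age_to_datetime(None)    # -> None (no age cutoff)
#     max_age_to_datetime('3600')  # -> datetime one hour before now
#     max_age_to_datetime(86400)   # -> datetime one day before now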

class Source(object):
    """Represents a source to be polled for updates. Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional.

    ``max_age``
        Items older than this age (in seconds) will be excluded. Optional.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, config, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        # Both of these are optional; ``limit`` arrives from config as a
        # string, so coerce it, but a missing limit stays None
        self.limit = int(limit) if limit is not None else None
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling the global and
        per-source item limits and max ages.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins; either limit may be None, meaning no limit
        limits = [l for l in (self.limit, global_limit) if l is not None]
        limit = min(limits) if limits else None

        # Latest max age wins. Note that either could be None, but that's
        # fine, because None is less than everything else
        max_age = max(self.max_age, global_max_age)

        return self._poll(limit, max_age)
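
    # For example (illustrative numbers): a source limit of 10 against a
    # global limit of 5 polls with limit 5; a source limit of None simply
    # defers to the global limit, and vice versa.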

    def _poll(self, limit, max_age):
        """Implementation of polling for updates. Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties. A namedtuple works well.
        """
        raise NotImplementedError

class CachedSource(Source):
    """Supports caching a source's updates in memcache.

    On the surface, this functions just like any other ``Source``. Calling
    ``poll`` still returns a list of updates. However, ``poll`` does not call
    your ``_poll``; instead, your implementation is called by the spline cron,
    and the results are cached. ``poll`` then returns the contents of the
    cache.

    You must define a ``_cache_key`` method that returns a key uniquely
    identifying this object. Your key will be combined with the class name, so
    it only needs to be unique for that source, not globally.

    You may also override ``poll_frequency``, the number of minutes between
    pollings. By default, this is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        return repr(type(self)) + ':' + self._cache_key()

    def _cache_key(self):
        raise NotImplementedError
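
    # e.g. (illustrative): for a FeedSource watching some feed URL, the
    # combined key would look roughly like
    # "<class 'splinext.frontpage.sources.FeedSource'>:http://example.com/feed"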

    def do_cron(self, tic, *args, **kwargs):
        if tic % self.poll_frequency != 0:
            # Too early!
            return

        updates = self._poll(self.limit, self.max_age)
        cache.get_cache('spline-frontpage')[self.cache_key()] = updates

        return

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates."""
        try:
            return cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Haven't cached anything yet, apparently
            return []
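
# A minimal sketch of a concrete CachedSource, assuming only the interface
# described above (FeedSource and GitSource below are the real examples; this
# class and its template path are hypothetical):
#
#     class WikiSource(CachedSource):
#         template = '/front_page/wiki.mako'
#         poll_frequency = 30
#
#         def _cache_key(self):
#             return self.link
#
#         def _poll(self, limit, max_age):
#             # fetch at most ``limit`` updates newer than ``max_age`` and
#             # return them as objects with source/time properties
#             return []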

FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])
class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        return self.feed_url

    def _poll(self, limit, max_age):
        feed = feedparser.parse(self.feed_url)

        if feed.bozo and isinstance(feed.bozo_exception, URLError):
            # Feed is DOWN. Bail here; otherwise, old entries might be lost
            # just because, say, Bulbanews is down yet again
            raise feed.bozo_exception

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read
            try:
                timestamp_tuple = entry.published_parsed
            except AttributeError:
                timestamp_tuple = entry.updated_parsed
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Feeds are typically newest-first, so we can bail at the
                # first expired entry
                break

            # Try to find something to show! Default to the summary, if there
            # is one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually a
                # summary
                content = entry.summary
            elif 'content' in entry:
                # Full content is way too much, especially for my giant blog
                # posts. Cut this down to some arbitrary number of characters,
                # then feed it to lxml.html to fix tag nesting
                broken_html = entry.content[0].value[:self.SUMMARY_LENGTH]
                fragment = lxml.html.fromstring(broken_html)

                # Insert an ellipsis at the end of the last node with text
                last_text_node = None
                last_tail_node = None
                # Need to find the last node with a tail, OR the last node
                # with text if it's later
                for node in fragment.iter():
                    if node.tail:
                        last_tail_node = node
                        last_text_node = None
                    elif node.text:
                        last_text_node = node
                        last_tail_node = None

                if last_text_node is not None:
                    last_text_node.text += '...'
                if last_tail_node is not None:
                    last_tail_node.tail += '...'

                # Serialize
                content = lxml.html.tostring(fragment)
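
                # For instance (an illustrative input): a post truncated to
                # '<p>Hello <b>wor' parses, gains the ellipsis on the last
                # text node, and serializes as '<p>Hello <b>wor...</b></p>'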

            content = helpers.literal(content)

            update = FrontPageRSS(
                source = self,
                time = timestamp,
                content = content,
                entry = entry,
            )
            updates.append(update)

        return updates


FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'email', 'time', 'subject', 'repo'])

class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them. If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories. These must be repository PATHS,
        not arbitrary git URLs. Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``bug_tracker``
        URL to a bug tracker; anything matching "#xxx" will be converted into
        a link to this. Should contain a "{0}", which will be replaced by the
        bug number.

    ``tag_pattern``
        Optional. A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, bug_tracker=None,
                 tag_pattern=None, **kwargs):

        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.bug_tracker = bug_tracker
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        return self.repo_paths[0]

    def _poll(self, limit, max_age):
        # Fetch the main repo's git tags
        git_dir = '--git-dir=' + self.repo_paths[0]
        args = [
            'git',
            git_dir,
            'tag', '-l',
        ]
        if self.tag_pattern:
            args.append(self.tag_pattern)

        git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        tags = git_output.strip().split('\n')

        # Tags come out in alphabetical order, which means earliest first.
        # Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the
        # most recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]
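        # e.g. (illustrative): reversed tags ['v3', 'v2', 'v1'] produce
        # updates for v3 (commits v2..v3) and v2 (commits v1..v2); the
        # earliest tag, v1, has no predecessor to diff against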

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            args = [
                'git',
                git_dir,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            ]
            tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
            tag_unixtime, tag_timezone = tag_timestamp.split(None, 1)
            tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, email, commit timestamp, subject.
                git_log_args = [
                    'git',
                    '--git-dir=' + repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%aE%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                ]
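                # Each output line then looks roughly like (illustrative):
                # abc1234<NUL>Eevee<NUL>eevee@example.com<NUL>1290000000<NUL>Fix the bug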
                proc = subprocess.Popen(git_log_args, stdout=PIPE)
                for line in proc.stdout:
                    hash, author, email, time, subject \
                        = line.strip().decode('utf8').split('\x00')

                    # Convert bug numbers in subject to URLs
                    if self.bug_tracker:
                        subject = helpers.literal(
                            re.sub(ur'#(\d+)', self._linkify_bug_number, subject)
                        )

                    commits.append(
                        FrontPageGitCommit(
                            hash = hash,
                            author = author,
                            email = email,
                            time = datetime.datetime.fromtimestamp(int(time)),
                            subject = subject,
                            repo = repo_name,
                        )
                    )

            update = FrontPageGit(
                source = self,
                time = tagged_timestamp,
                log = commits,
                tag = tag,
            )
            updates.append(update)

        return updates

    def _linkify_bug_number(self, match):
        """Regex replace function for changing bug numbers into links."""
        n = match.group(1)
        bug_url = self.bug_tracker.format(n)
        return helpers.literal(
            u"""<a href="{0}">{1}</a>""".format(bug_url, match.group(0)))