Don't raise when an RSS source is down.
zzz-spline-frontpage.git: splinext/frontpage/sources.py
1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import re
8 import subprocess
9 from subprocess import PIPE
10 from urllib2 import URLError
11
12 import feedparser
13 import lxml.html
14
15 from pylons import cache
16
17 from spline.lib import helpers
18
19 def max_age_to_datetime(max_age):
20 """``max_age`` is specified in config as a number of seconds old. This
21 function takes that number and returns a corresponding datetime object.
22 """
23 if max_age == None:
24 return None
25
26 dt = datetime.datetime.now()
27 dt -= datetime.timedelta(seconds=int(max_age))
28
29 return dt
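# For illustration: given max_age = '3600' from the config,
#
#   cutoff = max_age_to_datetime('3600')
#
# yields a datetime one hour in the past; items older than it are excluded.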


class Source(object):
    """Represents a source to be polled for updates.  Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional.

    ``max_age``
        Items older than this age (in seconds) will be excluded.  Optional.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, config, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        # ``limit`` is optional and may arrive as None; only coerce it when
        # it was actually given
        self.limit = int(limit) if limit is not None else None
        self.max_age = max_age_to_datetime(max_age)

    def do_cron(self, *args, **kwargs):
        return

    def poll(self, global_limit, global_max_age):
        """Public wrapper that takes care of reconciling the global and
        per-source item limit and max age.

        Subclasses should implement ``_poll``, below.
        """
        # Smallest limit wins
        limit = min(self.limit, global_limit)

        # Latest max age wins.  Note that either could be None, but that's
        # fine, because None is less than everything else
        max_age = max(self.max_age, global_max_age)

        return self._poll(limit, max_age)
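    # For illustration: Python 2 sorts None below every number and datetime,
    # so the reconciliation above behaves like this:
    #
    #   min(None, 10)          # => None
    #   max(None, a_datetime)  # => a_datetime
    #
    # which is why a missing ``max_age`` simply defers to the global one.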

    def _poll(self, limit, max_age):
        """Implementation of polling for updates.  Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties.  A namedtuple works well.
        """
        raise NotImplementedError


class CachedSource(Source):
    """Supports caching a source's updates in memcache.

    On the surface, this functions just like any other ``Source``.  Calling
    ``poll`` still returns a list of updates.  However, ``poll`` does not call
    your ``_poll``; instead, your implementation is called by the spline cron,
    and the results are cached.  ``poll`` then returns the contents of the
    cache.

    ``_poll`` may return None, in which case the cache will be left unchanged.

    You must define a ``_cache_key`` method that returns a key uniquely
    identifying this object.  Your key will be combined with the class name,
    so it only needs to be unique for that source, not globally.

    You may also override ``poll_frequency``, the number of minutes between
    pollings.  By default, this is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    poll_frequency = 60

    def cache_key(self):
        return repr(type(self)) + ':' + self._cache_key()

    def _cache_key(self):
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        if tic % self.poll_frequency != 0:
            # Too early!
            return

        updates = self._poll(self.limit, self.max_age)
        if updates is not None:
            cache.get_cache('spline-frontpage')[self.cache_key()] = updates

        return

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates."""
        try:
            return cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Haven't cached anything yet, apparently
            return []


FrontPageRSS = namedtuple('FrontPageRSS',
    ['source', 'time', 'entry', 'content'])
class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    SUMMARY_LENGTH = 1000

    poll_frequency = 15

    def __init__(self, feed_url, **kwargs):
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        return self.feed_url

    def _poll(self, limit, max_age):
        feed = feedparser.parse(self.feed_url)

        if feed.bozo and isinstance(feed.bozo_exception, URLError):
            # Feed is DOWN.  Bail here; otherwise, old entries might be lost
            # just because, say, Bulbanews is down yet again
            return None

        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read
            try:
                timestamp_tuple = entry.published_parsed
            except AttributeError:
                timestamp_tuple = entry.updated_parsed
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries should be newest-first, so everything after the
                # first expired entry is also too old; bail here
                break

            # Try to find something to show!  Default to the summary, if
            # there is one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually
                # a summary
                content = entry.summary
            elif 'content' in entry and \
                    len(entry.content[0].value) <= self.SUMMARY_LENGTH:

                # Full content is short; use as-is!
                content = entry.content[0].value
            elif 'content' in entry:
                # Full content is way too much, especially for my giant blog
                # posts.  Cut this down to some arbitrary number of
                # characters, then feed it to lxml.html to fix tag nesting
                broken_html = entry.content[0].value[:self.SUMMARY_LENGTH]
                fragment = lxml.html.fromstring(broken_html)

                # Insert an ellipsis at the end of the last node with text
                last_text_node = None
                last_tail_node = None
                # Need to find the last node with a tail, OR the last node
                # with text if it's later
                for node in fragment.iter():
                    if node.tail:
                        last_tail_node = node
                        last_text_node = None
                    elif node.text:
                        last_text_node = node
                        last_tail_node = None

                if last_text_node is not None:
                    last_text_node.text += '...'
                if last_tail_node is not None:
                    last_tail_node.tail += '...'

                # Serialize
                content = lxml.html.tostring(fragment)

            content = helpers.literal(content)

            update = FrontPageRSS(
                source = self,
                time = timestamp,
                content = content,
                entry = entry,
            )
            updates.append(update)

        return updates


FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'email', 'time', 'subject', 'repo'])

class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository
        PATHS, not arbitrary git URLs.  Only the first one will be checked
        for the list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with
        ``repo_paths``.  Used for constructing gitweb URLs and identifying
        the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``bug_tracker``
        URL to a bug tracker; anything matching "#xxx" will be converted into
        a link to this.  Should contain a "{0}", which will be replaced by
        the bug number.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, bug_tracker=None,
            tag_pattern=None, **kwargs):

        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.bug_tracker = bug_tracker
        self.tag_pattern = tag_pattern

    def _cache_key(self):
        return self.repo_paths[0]

    def _poll(self, limit, max_age):
        # Fetch the main repo's git tags
        git_dir = '--git-dir=' + self.repo_paths[0]
        args = [
            'git',
            git_dir,
            'tag', '-l',
        ]
        if self.tag_pattern:
            args.append(self.tag_pattern)

        git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        tags = git_output.strip().split('\n')

        # Tags come out in alphabetical order, which means earliest first.
        # Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the
        # most recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            args = [
                'git',
                git_dir,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            ]
            tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
            tag_unixtime, tag_timezone = tag_timestamp.split(None, 1)
            tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, email, commit timestamp, subject.
                git_log_args = [
                    'git',
                    '--git-dir=' + repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%aE%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                ]
                proc = subprocess.Popen(git_log_args, stdout=PIPE)
                for line in proc.stdout:
                    hash, author, email, time, subject \
                        = line.strip().decode('utf8').split('\x00')

                    # Convert bug numbers in subject to URLs
                    if self.bug_tracker:
                        subject = helpers.literal(
                            re.sub(ur'#(\d+)', self._linkify_bug_number,
                                subject)
                        )

                    commits.append(
                        FrontPageGitCommit(
                            hash = hash,
                            author = author,
                            email = email,
                            time = datetime.datetime.fromtimestamp(int(time)),
                            subject = subject,
                            repo = repo_name,
                        )
                    )

            update = FrontPageGit(
                source = self,
                time = tagged_timestamp,
                log = commits,
                tag = tag,
            )
            updates.append(update)

        return updates

    def _linkify_bug_number(self, match):
        """Regex replace function for changing bug numbers into links."""
        n = match.group(1)
        bug_url = self.bug_tracker.format(n)
        return helpers.literal(
            u"""<a href="{0}">{1}</a>""".format(bug_url, match.group(0)))