splinext/frontpage/sources.py

   1 """Base class for a front page source, as well as a handful of specific
   2 implementations.
   3 """
   4
   5 from collections import namedtuple
   6 import datetime
   7 import re
   8 import subprocess
   9 from subprocess import PIPE
  10 from urllib2 import URLError
  11
  12 import feedparser
  13 import lxml.html
  14
  15 from pylons import cache
  16
  17 from spline.lib import helpers
  18
  19 def max_age_to_datetime(max_age):
  20     """``max_age`` is specified in config as a number of seconds old.  This
  21     function takes that number and returns a corresponding datetime object.
  22     """
  23     if max_age == None:
  24         return None
  25
  26     dt = datetime.datetime.now()
  27     dt -= datetime.timedelta(seconds=int(max_age))
  28
  29     return dt
  30
  31
  32 class Source(object):
  33     """Represents a source to be polled for updates.  Sources are populated
  34     directly from the configuration file.
  35
  36     Properties:
  37
  38     ``title``
  39         A name to identify this specific source.
  40
  41     ``icon``
  42         Name of a Fugue icon to show next to the name.
  43
  44     ``link``
  45         A URL where the full history of this source can be found.
  46
  47     ``limit``
  48         The maximum number of items from this source to show at a time.
  49         Optional.
  50
  51     ``max_age``
  52         Items older than this age (in seconds) will be excluded.  Optional.
  53
  54     Additionally, subclasses **must** define a ``template`` property -- a path
  55     to a Mako template that knows how to render an update from this source.
  56     The template will be passed one parameter: the update object, ``update``.
  57     """
  58
  59     def __init__(self, config, title, icon, link, limit=None, max_age=None):
  60         self.title = title
  61         self.icon = icon
  62         self.link = link
  63         self.limit = int(limit)
  64         self.max_age = max_age_to_datetime(max_age)
  65
  66     def do_cron(self, *args, **kwargs):
  67         return
  68
  69     def poll(self, global_limit, global_max_age):
  70         """Public wrapper that takes care of reconciling global and source item
  71         limit and max age.
  72
  73         Subclasses should implement ``_poll``, below.
  74         """
  75         # Smallest limit wins
  76         limit = min(self.limit, global_limit)
  77
  78         # Latest max age wins.  Note that either could be None, but that's
  79         # fine, because None is less than everything else
  80         max_age = max(self.max_age, global_max_age)
  81
  82         return self._poll(limit, max_age)
  83
  84     def _poll(self, limit, max_age):
  85         """Implementation of polling for updates.  Must return an iterable.
  86         Each element should be an object with ``source`` and ``time``
  87         properties.  A namedtuple works well.
  88         """
  89         raise NotImplementedError
  90
  91 class CachedSource(Source):
  92     """Supports caching a source's updates in memcache.
  93
  94     On the surface, this functions just like any other ``Source``.  Calling
  95     ``poll`` still returns a list of updates.  However, ``poll`` does not call
  96     your ``_poll``; instead, your implementation is called by the spline cron,
  97     and the results are cached.  ``poll`` then returns the contents of the
  98     cache.
  99
 100     You must define a ``_cache_key`` method that returns a key uniquely
 101     identifying this object.  Your key will be combined with the class name, so
 102     it only needs to be unique for that source, not globally.
 103
 104     You may also override ``poll_frequency``, the number of minutes between
 105     pollings.  By default, this is a rather conservative 60.
 106
 107     Note that it may take up to a minute after server startup for updates
 108     from a cached source to appear.
 109     """
 110
 111     poll_frequency = 60
 112
 113     def cache_key(self):
 114         return repr(type(self)) + ':' + self._cache_key()
 115
 116     def _cache_key(self):
 117         raise NotImplementedError
 118
 119     def do_cron(self, tic, *args, **kwargs):
 120         if tic % self.poll_frequency != 0:
 121             # Too early!
 122             return
 123
 124         updates = self._poll(self.limit, self.max_age)
 125         cache.get_cache('spline-frontpage')[self.cache_key()] = updates
 126
 127         return
 128
 129     def poll(self, global_limit, global_max_age):
 130         """Fetches cached updates."""
 131         try:
 132             return cache.get_cache('spline-frontpage')[self.cache_key()]
 133         except KeyError:
 134             # Haven't cached anything yet, apparently
 135             return []
 136
 137
 138 FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])
 139 class FeedSource(CachedSource):
 140     """Represents an RSS or Atom feed.
 141
 142     Extra properties:
 143
 144     ``feed_url``
 145         URL for the feed.
 146     """
 147
 148     template = '/front_page/rss.mako'
 149
 150     SUMMARY_LENGTH = 1000
 151
 152     poll_frequency = 15
 153
 154     def __init__(self, feed_url, **kwargs):
 155         kwargs.setdefault('title', None)
 156         super(FeedSource, self).__init__(**kwargs)
 157
 158         self.feed_url = feed_url
 159
 160     def _cache_key(self):
 161         return self.feed_url
 162
 163     def _poll(self, limit, max_age):
 164         feed = feedparser.parse(self.feed_url)
 165
 166         if feed.bozo and isinstance(feed.bozo_exception, URLError):
 167             # Feed is DOWN.  Bail here; otherwise, old entries might be lost
 168             # just because, say, Bulbanews is down yet again
 169             raise feed.bozo_exception
 170
 171         if not self.title:
 172             self.title = feed.feed.title
 173
 174         updates = []
 175         for entry in feed.entries[:limit]:
 176             # Grab a date -- Atom has published, RSS usually just has updated.
 177             # Both come out as time tuples, which datetime.datetime() can read
 178             try:
 179                 timestamp_tuple = entry.published_parsed
 180             except AttributeError:
 181                 timestamp_tuple = entry.updated_parsed
 182             timestamp = datetime.datetime(*timestamp_tuple[:6])
 183
 184             if max_age and timestamp < max_age:
 185                 # Entries should be oldest-first, so we can bail after the first
 186                 # expired entry
 187                 break
 188
 189             # Try to find something to show!  Default to the summary, if there is
 190             # one, or try to generate one otherwise
 191             content = u''
 192             if 'summary' in entry:
 193                 # If there be a summary, cheerfully trust that it's actually a
 194                 # summary
 195                 content = entry.summary
 196             elif 'content' in entry and \
 197                 len(entry.content[0].value) <= self.SUMMARY_LENGTH:
 198
 199                 # Full content is short; use as-is!
 200                 content = entry.content[0].value
 201             elif 'content' in entry:
 202                 # Full content is way too much, especially for my giant blog posts.
 203                 # Cut this down to some arbitrary number of characters, then feed
 204                 # it to lxml.html to fix tag nesting
 205                 broken_html = entry.content[0].value[:self.SUMMARY_LENGTH]
 206                 fragment = lxml.html.fromstring(broken_html)
 207
 208                 # Insert an ellipsis at the end of the last node with text
 209                 last_text_node = None
 210                 last_tail_node = None
 211                 # Need to find the last node with a tail, OR the last node with
 212                 # text if it's later
 213                 for node in fragment.iter():
 214                     if node.tail:
 215                         last_tail_node = node
 216                         last_text_node = None
 217                     elif node.text:
 218                         last_text_node = node
 219                         last_tail_node = None
 220
 221                 if last_text_node is not None:
 222                     last_text_node.text += '...'
 223                 if last_tail_node is not None:
 224                     last_tail_node.tail += '...'
 225
 226                 # Serialize
 227                 content = lxml.html.tostring(fragment)
 228
 229             content = helpers.literal(content)
 230
 231             update = FrontPageRSS(
 232                 source = self,
 233                 time = timestamp,
 234                 content = content,
 235                 entry = entry,
 236             )
 237             updates.append(update)
 238
 239         return updates
 240
 241
 242 FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
 243 FrontPageGitCommit = namedtuple('FrontPageGitCommit',
 244     ['hash', 'author', 'email', 'time', 'subject', 'repo'])
 245
 246 class GitSource(CachedSource):
 247     """Represents a git repository.
 248
 249     The main repository is checked for annotated tags, and an update is
 250     considered to be the list of commits between them.  If any other
 251     repositories are listed and have the same tags, their commits will be
 252     included as well.
 253
 254     Extra properties:
 255
 256     ``repo_paths``
 257         Space-separated list of repositories.  These must be repository PATHS,
 258         not arbitrary git URLs.  Only the first one will be checked for the
 259         list of tags.
 260
 261     ``repo_names``
 262         A list of names for the repositories, in parallel with ``repo_paths``.
 263         Used for constructing gitweb URLs and identifying the repositories.
 264
 265     ``gitweb``
 266         Base URL to a gitweb installation, so commit ids can be linked to the
 267         commit proper.
 268
 269     ``bug_tracker``
 270         URL to a bug tracker; anything matching "#xxx" will be converted into a
 271         link to this.  Should contain a "{0}", which will be replaced by the
 272         bug number.
 273
 274     ``tag_pattern``
 275         Optional.  A shell glob pattern used to filter the tags.
 276     """
 277
 278     template = '/front_page/git.mako'
 279
 280     def __init__(self, repo_paths, repo_names, gitweb, bug_tracker=None,
 281         tag_pattern=None, **kwargs):
 282
 283         kwargs.setdefault('title', None)
 284         super(GitSource, self).__init__(**kwargs)
 285
 286         # Repo stuff can be space-delimited lists
 287         self.repo_paths = repo_paths.split()
 288         self.repo_names = repo_names.split()
 289
 290         self.gitweb = gitweb
 291         self.bug_tracker = bug_tracker
 292         self.tag_pattern = tag_pattern
 293
 294     def _cache_key(self):
 295         return self.repo_paths[0]
 296
 297     def _poll(self, limit, max_age):
 298         # Fetch the main repo's git tags
 299         git_dir = '--git-dir=' + self.repo_paths[0]
 300         args = [
 301             'git',
 302             git_dir,
 303             'tag', '-l',
 304         ]
 305         if self.tag_pattern:
 306             args.append(self.tag_pattern)
 307
 308         git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
 309         tags = git_output.strip().split('\n')
 310
 311         # Tags come out in alphabetical order, which means earliest first.  Reverse
 312         # it to make the slicing easier
 313         tags.reverse()
 314         # Only history from tag to tag is actually interesting, so get the most
 315         # recent $limit tags but skip the earliest
 316         interesting_tags = tags[:-1][:limit]
 317
 318         updates = []
 319         for tag, since_tag in zip(interesting_tags, tags[1:]):
 320             # Get the date when this tag was actually created.
 321             # 'raw' format gives unixtime followed by timezone offset
 322             args = [
 323                 'git',
 324                 git_dir,
 325                 'for-each-ref',
 326                 '--format=%(taggerdate:raw)',
 327                 'refs/tags/' + tag,
 328             ]
 329             tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
 330             tag_unixtime, tag_timezone = tag_timestamp.split(None, 1)
 331             tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))
 332
 333             if max_age and tagged_timestamp < max_age:
 334                 break
 335
 336             commits = []
 337
 338             for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
 339                 # Grab an easily-parsed history: fields delimited by nulls.
 340                 # Hash, author's name, commit timestamp, subject.
 341                 git_log_args = [
 342                     'git',
 343                     '--git-dir=' + repo_path,
 344                     'log',
 345                     '--pretty=%h%x00%an%x00%aE%x00%at%x00%s',
 346                     "{0}..{1}".format(since_tag, tag),
 347                 ]
 348                 proc = subprocess.Popen(git_log_args, stdout=PIPE)
 349                 for line in proc.stdout:
 350                     hash, author, email, time, subject \
 351                         = line.strip().decode('utf8').split('\x00')
 352
 353                     # Convert bug numbers in subject to URLs
 354                     if self.bug_tracker:
 355                         subject = helpers.literal(
 356                             re.sub(u'#(\d+)', self._linkify_bug_number, subject)
 357                         )
 358
 359                     commits.append(
 360                         FrontPageGitCommit(
 361                             hash = hash,
 362                             author = author,
 363                             email = email,
 364                             time = datetime.datetime.fromtimestamp(int(time)),
 365                             subject = subject,
 366                             repo = repo_name,
 367                         )
 368                     )
 369
 370             update = FrontPageGit(
 371                 source = self,
 372                 time = tagged_timestamp,
 373                 log = commits,
 374                 tag = tag,
 375             )
 376             updates.append(update)
 377
 378         return updates
 379
 380     def _linkify_bug_number(self, match):
 381         """Regex replace function for changing bug numbers into links."""
 382         n = match.group(1)
 383         bug_url = self.bug_tracker.format(match.group(1))
 384         return helpers.literal(
 385             u"""<a href="{0}">{1}</a>""".format(bug_url, match.group(0)))