1 """Base class for a front page source, as well as a handful of specific
5 from collections
import namedtuple
8 from subprocess
import PIPE
13 from pylons
import cache
15 from spline
.lib
import helpers
def max_age_to_datetime(max_age):
    """``max_age`` is specified in config as a number of seconds old.  This
    function takes that number and returns a corresponding datetime object:
    the cutoff moment, ``max_age`` seconds before now.

    Returns None when ``max_age`` is None, meaning "no age limit".
    """
    if max_age is None:
        return None

    # Config values arrive as strings; coerce before doing arithmetic
    seconds = int(max_age)

    return datetime.datetime.now() - datetime.timedelta(seconds=seconds)
29 """Represents a source to be polled for updates. Sources are populated
30 directly from the configuration file.
35 A name to identify this specific source.
38 Name of a Fugue icon to show next to the name.
41 A URL where the full history of this source can be found.
44 The maximum number of items from this source to show at a time.
48 Items older than this age (in seconds) will be excluded. Optional.
50 Additionally, subclasses **must** define a ``template`` property -- a path
51 to a Mako template that knows how to render an update from this source.
52 The template will be passed one parameter: the update object, ``update``.
55 def __init__(self
, title
, icon
, link
, limit
=None, max_age
=None):
59 self
.limit
= int(limit
)
60 self
.max_age
= max_age_to_datetime(max_age
)
62 def do_cron(self
, *args
, **kwargs
):
65 def poll(self
, global_limit
, global_max_age
):
66 """Public wrapper that takes care of reconciling global and source item
69 Subclasses should implement ``_poll``, below.
72 limit
= min(self
.limit
, global_limit
)
74 # Latest max age wins. Note that either could be None, but that's
75 # fine, because None is less than everything else
76 max_age
= max(self
.max_age
, global_max_age
)
78 return self
._poll(limit
, max_age
)
    def _poll(self, limit, max_age):
        """Implementation of polling for updates.  Must return an iterable.
        Each element should be an object with ``source`` and ``time``
        properties.  A namedtuple works well.

        ``limit`` is the maximum number of items to return (or None for no
        limit); ``max_age`` is a datetime cutoff (or None for no cutoff).
        """
        # Abstract: every concrete Source subclass must override this.
        raise NotImplementedError
class CachedSource(Source):
    """Supports caching a source's updates in memcache.

    On the surface, this functions just like any other ``Source``.  Calling
    ``poll`` still returns a list of updates.  However, ``poll`` does not call
    your ``_poll``; instead, your implementation is called by the spline cron,
    and the results are cached.  ``poll`` then returns the contents of the
    cache.

    You must define a ``_cache_key`` method that returns a key uniquely
    identifying this object.  Your key will be combined with the class name, so
    it only needs to be unique for that source, not globally.

    You may also override ``poll_frequency``, the number of minutes between
    pollings.  By default, this is a rather conservative 60.

    Note that it may take up to a minute after server startup for updates
    from a cached source to appear.
    """

    # Minutes between pollings; see class docstring
    poll_frequency = 60

    def cache_key(self):
        # Combine the concrete class with the subclass's key so keys from
        # different source classes can never collide
        return repr(type(self)) + ':' + self._cache_key()

    def _cache_key(self):
        # Abstract: subclasses must supply a per-source unique key
        raise NotImplementedError

    def do_cron(self, tic, *args, **kwargs):
        """Called by spline's cron.  Re-polls the source and stashes the
        fresh updates in the 'spline-frontpage' cache region.
        """
        if tic % self.poll_frequency != 0:
            # Not time to poll this source yet
            return

        updates = self._poll(self.limit, self.max_age)
        cache.get_cache('spline-frontpage')[self.cache_key()] = updates

    def poll(self, global_limit, global_max_age):
        """Fetches cached updates."""
        try:
            return cache.get_cache('spline-frontpage')[self.cache_key()]
        except KeyError:
            # Haven't cached anything yet, apparently
            return []
# Lightweight record for a single feed entry shown on the front page.
FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])
class FeedSource(CachedSource):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    # Truncation length (in characters of raw HTML) for summaries generated
    # from full entry content
    SUMMARY_LENGTH = 1000

    def __init__(self, feed_url, **kwargs):
        # Title defaults to None; it's filled in from the feed itself on
        # the first poll (see _poll)
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def _cache_key(self):
        # The feed URL uniquely identifies this source
        return self.feed_url

    def _poll(self, limit, max_age):
        """Fetches the feed and returns up to ``limit`` entries newer than
        ``max_age`` as ``FrontPageRSS`` items.
        """
        feed = feedparser.parse(self.feed_url)

        # Grab a human-readable title from the feed if config didn't give one
        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read
            try:
                timestamp_tuple = entry.published_parsed
            except AttributeError:
                timestamp_tuple = entry.updated_parsed
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries should be oldest-first, so we can bail after the first
                # expired entry
                break

            # Try to find something to show!  Default to the summary, if there is
            # one, or try to generate one otherwise
            content = u''
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually a
                # summary
                content = entry.summary
            elif 'content' in entry:
                # Full content is way too much, especially for my giant blog posts.
                # Cut this down to some arbitrary number of characters, then feed
                # it to lxml.html to fix tag nesting
                broken_html = entry.content[0].value[:self.SUMMARY_LENGTH]
                fragment = lxml.html.fromstring(broken_html)

                # Insert an ellipsis at the end of the last node with text
                last_text_node = None
                last_tail_node = None
                # Need to find the last node with a tail, OR the last node with
                # text if it comes later in document order
                for node in fragment.iter():
                    if node.tail:
                        last_tail_node = node
                        last_text_node = None
                    elif node.text:
                        last_text_node = node
                        last_tail_node = None

                if last_text_node is not None:
                    last_text_node.text += '...'
                if last_tail_node is not None:
                    last_tail_node.tail += '...'

                # Serialize the repaired fragment back into markup
                content = lxml.html.tostring(fragment)

            # Mark the HTML as a safe literal for the template
            content = helpers.literal(content)

            update = FrontPageRSS(
                source=self,
                time=timestamp,
                entry=entry,
                content=content,
            )
            updates.append(update)

        return updates
# Records for git-based updates: one FrontPageGit per tag, carrying a log of
# FrontPageGitCommit entries for the commits under that tag.
FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'time', 'subject', 'repo'])
class GitSource(CachedSource):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository PATHS,
        not arbitrary git URLs.  Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    # Mako template used to render one GitSource update on the front page
    template = '/front_page/git.mako'
261 def __init__(self
, repo_paths
, repo_names
, gitweb
, tag_pattern
=None, **kwargs
):
262 kwargs
.setdefault('title', None)
263 super(GitSource
, self
).__init__(**kwargs
)
265 # Repo stuff can be space-delimited lists
266 self
.repo_paths
= repo_paths
.split()
267 self
.repo_names
= repo_names
.split()
270 self
.tag_pattern
= tag_pattern
    def _cache_key(self):
        # The primary repository path uniquely identifies this source; only
        # the first repo is checked for tags, so it's the natural key.
        return self.repo_paths[0]
275 def _poll(self
, limit
, max_age
):
276 # Fetch the main repo's git tags
277 git_dir
= '--git-dir=' + self
.repo_paths
[0]
284 args
.append(self
.tag_pattern
)
286 git_output
, _
= subprocess
.Popen(args
, stdout
=PIPE
).communicate()
287 tags
= git_output
.strip().split('\n')
289 # Tags come out in alphabetical order, which means earliest first. Reverse
290 # it to make the slicing easier
292 # Only history from tag to tag is actually interesting, so get the most
293 # recent $limit tags but skip the earliest
294 interesting_tags
= tags
[:-1][:limit
]
297 for tag
, since_tag
in zip(interesting_tags
, tags
[1:]):
298 # Get the date when this tag was actually created.
299 # 'raw' format gives unixtime followed by timezone offset
304 '--format=%(taggerdate:raw)',
307 tag_timestamp
, _
= subprocess
.Popen(args
, stdout
=PIPE
).communicate()
308 tag_unixtime
, tag_timezone
= tag_timestamp
.split(None, 1)
309 tagged_timestamp
= datetime
.datetime
.fromtimestamp(int(tag_unixtime
))
311 if max_age
and tagged_timestamp
< max_age
:
316 for repo_path
, repo_name
in zip(self
.repo_paths
, self
.repo_names
):
317 # Grab an easily-parsed history: fields delimited by nulls.
318 # Hash, author's name, commit timestamp, subject.
321 '--git-dir=' + repo_path
,
323 '--pretty=%h%x00%an%x00%at%x00%s',
324 "{0}..{1}".format(since_tag
, tag
),
326 proc
= subprocess
.Popen(git_log_args
, stdout
=PIPE
)
327 for line
in proc
.stdout
:
328 hash, author
, time
, subject
= line
.strip().split('\x00')
333 time
= datetime
.datetime
.fromtimestamp(int(time
)),
339 update
= FrontPageGit(
341 time
= tagged_timestamp
,
345 updates
.append(update
)