# Big ol refactor: make sources into first-class objects.
# splinext/frontpage/sources.py
1 """Base class for a front page source, as well as a handful of specific
2 implementations.
3 """
4
5 from collections import namedtuple
6 import datetime
7 import subprocess
8 from subprocess import PIPE
9
10 import feedparser
11 import lxml.html
12
13 from spline.lib import helpers
14
15
class Source(object):
    """Represents a source to be polled for updates.  Sources are populated
    directly from the configuration file.

    Properties:

    ``title``
        A name to identify this specific source.

    ``icon``
        Name of a Fugue icon to show next to the name.

    ``link``
        A URL where the full history of this source can be found.

    ``limit``
        The maximum number of items from this source to show at a time.
        Optional.

    ``max_age``
        Items older than this age (in seconds) will be excluded.  Optional.

    Additionally, subclasses **must** define a ``template`` property -- a path
    to a Mako template that knows how to render an update from this source.
    The template will be passed one parameter: the update object, ``update``.
    """

    def __init__(self, title, icon, link, limit=None, max_age=None):
        self.title = title
        self.icon = icon
        self.link = link
        self.limit = limit
        self.max_age = max_age

    def poll(self, limit, max_age):
        """Poll for updates.  Must return an iterable.  Each element should be
        an Update object.

        Subclasses override this with the real work.  The signature here
        matches the concrete implementations (``limit`` caps the number of
        items returned; ``max_age`` is a cutoff older items must not pass),
        so callers can rely on the base interface.
        """
        raise NotImplementedError
55
56
# One rendered update from a feed: the owning source, the entry's timestamp,
# the raw feedparser entry, and the HTML-safe summary markup to display.
FrontPageRSS = namedtuple('FrontPageRSS', ['source', 'time', 'entry', 'content'])

class FeedSource(Source):
    """Represents an RSS or Atom feed.

    Extra properties:

    ``feed_url``
        URL for the feed.
    """

    template = '/front_page/rss.mako'

    # When an entry has no summary, its full content is truncated to roughly
    # this many characters of markup before being repaired and shown
    SUMMARY_LENGTH = 1000

    def __init__(self, feed_url, **kwargs):
        # Title is optional for feeds; it can be filled in from the feed
        # itself on the first poll
        kwargs.setdefault('title', None)
        super(FeedSource, self).__init__(**kwargs)

        self.feed_url = feed_url

    def poll(self, limit, max_age):
        """Fetch the feed and return a list of ``FrontPageRSS`` updates.

        At most ``limit`` entries are returned.  If ``max_age`` (a datetime
        cutoff) is given, iteration stops at the first entry older than it.
        """
        feed = feedparser.parse(self.feed_url)

        # Fall back to the feed's own title if none was configured
        if not self.title:
            self.title = feed.feed.title

        updates = []
        for entry in feed.entries[:limit]:
            # Grab a date -- Atom has published, RSS usually just has updated.
            # Both come out as time tuples, which datetime.datetime() can read.
            # Either attribute may be absent, or present but None when
            # feedparser couldn't parse the date; skip such entries rather
            # than let one bad entry break the whole poll.
            timestamp_tuple = (getattr(entry, 'published_parsed', None)
                or getattr(entry, 'updated_parsed', None))
            if timestamp_tuple is None:
                continue
            timestamp = datetime.datetime(*timestamp_tuple[:6])

            if max_age and timestamp < max_age:
                # Entries come newest-first, so everything after the first
                # expired entry is also expired; bail out
                break

            # Try to find something to show!  Default to the summary, if there
            # is one, or try to generate one otherwise
            if 'summary' in entry:
                # If there be a summary, cheerfully trust that it's actually a
                # summary
                content = entry.summary
            elif 'content' in entry:
                content = self._truncate_content(entry.content[0].value)
            else:
                content = u''

            content = helpers.literal(content)

            update = FrontPageRSS(
                source = self,
                time = timestamp,
                content = content,
                entry = entry,
            )
            updates.append(update)

        return updates

    def _truncate_content(self, html):
        """Cut full entry content down to ``SUMMARY_LENGTH`` characters, feed
        it to lxml.html to repair the resulting broken tag nesting, and append
        an ellipsis to the last run of visible text.  Returns serialized HTML.
        """
        # Full content is way too much, especially for giant blog posts.
        broken_html = html[:self.SUMMARY_LENGTH]
        fragment = lxml.html.fromstring(broken_html)

        # Insert an ellipsis at the end of the last node with text.
        # Need to find the last node with a tail, OR the last node with
        # text if it's later -- only one of the two trackers is ever set.
        last_text_node = None
        last_tail_node = None
        for node in fragment.iter():
            if node.tail:
                last_tail_node = node
                last_text_node = None
            elif node.text:
                last_text_node = node
                last_tail_node = None

        if last_text_node is not None:
            last_text_node.text += '...'
        if last_tail_node is not None:
            last_tail_node.tail += '...'

        # Serialize
        return lxml.html.tostring(fragment)
144
145
# One front-page update: all commits between two tags of the main repo.
FrontPageGit = namedtuple('FrontPageGit', ['source', 'time', 'log', 'tag'])
# A single commit within an update; `repo` is the human-readable repo name.
FrontPageGitCommit = namedtuple('FrontPageGitCommit',
    ['hash', 'author', 'time', 'subject', 'repo'])

class GitSource(Source):
    """Represents a git repository.

    The main repository is checked for annotated tags, and an update is
    considered to be the list of commits between them.  If any other
    repositories are listed and have the same tags, their commits will be
    included as well.

    Extra properties:

    ``repo_paths``
        Space-separated list of repositories.  These must be repository PATHS,
        not arbitrary git URLs.  Only the first one will be checked for the
        list of tags.

    ``repo_names``
        A list of names for the repositories, in parallel with ``repo_paths``.
        Used for constructing gitweb URLs and identifying the repositories.

    ``gitweb``
        Base URL to a gitweb installation, so commit ids can be linked to the
        commit proper.

    ``tag_pattern``
        Optional.  A shell glob pattern used to filter the tags.
    """

    template = '/front_page/git.mako'

    def __init__(self, repo_paths, repo_names, gitweb, tag_pattern=None, **kwargs):
        kwargs.setdefault('title', None)
        super(GitSource, self).__init__(**kwargs)

        # Repo stuff can be space-delimited lists
        self.repo_paths = repo_paths.split()
        self.repo_names = repo_names.split()

        self.gitweb = gitweb
        self.tag_pattern = tag_pattern

    def poll(self, limit, max_age):
        """Return a list of ``FrontPageGit`` updates, one per recent tag of
        the main repository, newest first.

        At most ``limit`` tags are examined; if ``max_age`` (a datetime
        cutoff) is given, iteration stops at the first tag older than it.
        """
        # Fetch the main repo's git tags
        git_dir = '--git-dir=' + self.repo_paths[0]
        args = [
            'git',
            git_dir,
            'tag', '-l',
        ]
        if self.tag_pattern:
            args.append(self.tag_pattern)

        git_output, _ = subprocess.Popen(args, stdout=PIPE).communicate()
        git_output = git_output.strip()
        if not git_output:
            # No tags at all.  Bail now: ''.split('\n') would produce ['']
            # and feed a bogus empty tag name to the commands below.
            return []
        tags = git_output.split('\n')

        # Tags come out in alphabetical order, which means earliest first.
        # Reverse it to make the slicing easier
        tags.reverse()
        # Only history from tag to tag is actually interesting, so get the
        # most recent $limit tags but skip the earliest
        interesting_tags = tags[:-1][:limit]

        updates = []
        for tag, since_tag in zip(interesting_tags, tags[1:]):
            # Get the date when this tag was actually created.
            # 'raw' format gives unixtime followed by timezone offset
            args = [
                'git',
                git_dir,
                'for-each-ref',
                '--format=%(taggerdate:raw)',
                'refs/tags/' + tag,
            ]
            tag_timestamp, _ = subprocess.Popen(args, stdout=PIPE).communicate()
            timestamp_fields = tag_timestamp.split(None, 1)
            if len(timestamp_fields) < 2:
                # Lightweight (unannotated) tags have no taggerdate; skip
                # them instead of crashing on the unpack
                continue
            tag_unixtime, tag_timezone = timestamp_fields
            tagged_timestamp = datetime.datetime.fromtimestamp(int(tag_unixtime))

            if max_age and tagged_timestamp < max_age:
                break

            commits = []

            for repo_path, repo_name in zip(self.repo_paths, self.repo_names):
                # Grab an easily-parsed history: fields delimited by nulls.
                # Hash, author's name, commit timestamp, subject.
                git_log_args = [
                    'git',
                    '--git-dir=' + repo_path,
                    'log',
                    '--pretty=%h%x00%an%x00%at%x00%s',
                    "{0}..{1}".format(since_tag, tag),
                ]
                proc = subprocess.Popen(git_log_args, stdout=PIPE)
                for line in proc.stdout:
                    hash, author, time, subject = line.strip().split('\x00')
                    commits.append(
                        FrontPageGitCommit(
                            hash = hash,
                            author = author,
                            time = datetime.datetime.fromtimestamp(int(time)),
                            subject = subject,
                            repo = repo_name,
                        )
                    )

            update = FrontPageGit(
                source = self,
                time = tagged_timestamp,
                log = commits,
                tag = tag,
            )
            updates.append(update)

        return updates