# Reconstructed from a whitespace-collapsed git patch:
#   commit d7972c06f3fa84f7e1cb8ef5c5fd420e835466d2
#   From: Kovid Goyal    Date: Thu, 12 Mar 2009 10:41:18 -0700
#   Subject: [PATCH] Create RSS feed for calibre release. Also new recipe
#            for The Register by vgrama
# The setup.py hunks of that patch import ``upload_rss`` from the upload
# module and register it under the 'upload_rss' command name.
#
# File: src/calibre/utils/rss_gen.py
"""PyRSS2Gen - A Python library for generating RSS 2.0 feeds."""

# NOTE: rebinding __name__ is inherited from the vendored library; the
# generator string below is derived from it, so it is kept as-is.
__name__ = "PyRSS2Gen"
__version__ = (1, 0, 0)
__author__ = "Andrew Dalke "

_generator_name = __name__ + "-" + ".".join(map(str, __version__))

import datetime

# ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere so
# the "is this plain text?" checks below work on either interpreter.
try:
    _string_types = (basestring,)
except NameError:
    _string_types = (str,)

_WEEKDAYS = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
_MONTHS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
           "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")


# Could make this the base class; will need to add 'publish'
class WriteXmlMixin:
    """Mixin that serialises any object exposing ``publish(handler)``."""

    def write_xml(self, outfile, encoding="iso-8859-1"):
        """Write the full XML document to the file-like *outfile*.

        *encoding* is passed to ``xml.sax.saxutils.XMLGenerator`` and ends
        up in the XML declaration.
        """
        from xml.sax import saxutils
        handler = saxutils.XMLGenerator(outfile, encoding)
        handler.startDocument()
        self.publish(handler)
        handler.endDocument()

    def to_xml(self, encoding="iso-8859-1"):
        """Return the XML document as a string built in memory."""
        # Prefer the Python 2 modules the original used; ``io.StringIO`` is
        # the fallback when neither exists.
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
        f = StringIO()
        self.write_xml(f, encoding)
        return f.getvalue()


def _element(handler, name, obj, d=None):
    """Emit element *name* for *obj*.

    Strings and ``None`` are written directly as ``<name attrs>text</name>``
    (no text for ``None``) using the attribute mapping *d*.  Any other
    object is expected to provide ``publish(handler)`` and emits itself.
    """
    if d is None:
        # Fresh dict per call instead of a shared mutable default.
        d = {}
    if obj is None or isinstance(obj, _string_types):
        # special-case handling to make the API easier
        # to use for the common case.
        handler.startElement(name, d)
        if obj is not None:
            handler.characters(obj)
        handler.endElement(name)
    else:
        # It better know how to emit the correct XML.
        obj.publish(handler)


def _opt_element(handler, name, obj):
    """Like ``_element`` but emit nothing at all when *obj* is ``None``."""
    if obj is None:
        return
    _element(handler, name, obj)


def _format_date(dt):
    """convert a datetime into an RFC 822 formatted date

    Input date must be in GMT.
    """
    # Looks like:
    #   Sat, 07 Sep 2002 00:00:01 GMT
    # Can't use strftime because that's locale dependent
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        _WEEKDAYS[dt.weekday()],
        dt.day,
        _MONTHS[dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)


def _wrap_if(value, kind, wrapper, name):
    """Return ``wrapper(name, value)`` when *value* is a *kind*, else *value*."""
    if isinstance(value, kind):
        return wrapper(name, value)
    return value


def _publish_categories(handler, categories):
    """Publish each category, promoting plain strings to ``Category``."""
    for category in categories:
        if isinstance(category, _string_types):
            category = Category(category)
        category.publish(handler)


##
# A couple simple wrapper objects for the fields which
# take a simple value other than a string.
class IntElement:
    """implements the 'publish' API for integers

    Takes the tag name and the integer value to publish.

    (Could be used for anything which uses str() to be published
    to text for XML.)
    """
    element_attrs = {}

    def __init__(self, name, val):
        self.name = name
        self.val = val

    def publish(self, handler):
        handler.startElement(self.name, self.element_attrs)
        handler.characters(str(self.val))
        handler.endElement(self.name)


class DateElement:
    """implements the 'publish' API for a datetime.datetime

    Takes the tag name and the datetime to publish.

    Converts the datetime to RFC 2822 timestamp (4-digit year).
    """
    def __init__(self, name, dt):
        self.name = name
        self.dt = dt

    def publish(self, handler):
        _element(handler, self.name, _format_date(self.dt))
####


class Category:
    """Publish a category element"""
    def __init__(self, category, domain=None):
        self.category = category
        self.domain = domain

    def publish(self, handler):
        d = {}
        if self.domain is not None:
            d["domain"] = self.domain
        _element(handler, "category", self.category, d)


class Cloud:
    """Publish a cloud"""
    def __init__(self, domain, port, path,
                 registerProcedure, protocol):
        self.domain = domain
        self.port = port
        self.path = path
        self.registerProcedure = registerProcedure
        self.protocol = protocol

    def publish(self, handler):
        # An empty element: everything is carried in the attributes.
        _element(handler, "cloud", None, {
            "domain": self.domain,
            "port": str(self.port),
            "path": self.path,
            "registerProcedure": self.registerProcedure,
            "protocol": self.protocol})


class Image:
    """Publish a channel Image"""
    element_attrs = {}

    def __init__(self, url, title, link,
                 width=None, height=None, description=None):
        self.url = url
        self.title = title
        self.link = link
        self.width = width
        self.height = height
        self.description = description

    def publish(self, handler):
        handler.startElement("image", self.element_attrs)

        _element(handler, "url", self.url)
        _element(handler, "title", self.title)
        _element(handler, "link", self.link)

        # Integer dimensions are wrapped so they are written via str().
        _opt_element(handler, "width",
                     _wrap_if(self.width, int, IntElement, "width"))
        _opt_element(handler, "height",
                     _wrap_if(self.height, int, IntElement, "height"))

        _opt_element(handler, "description", self.description)

        handler.endElement("image")


class Guid:
    """Publish a guid

    Defaults to being a permalink, which is the assumption if it's
    omitted.  Hence strings are always permalinks.
    """
    def __init__(self, guid, isPermaLink=1):
        self.guid = guid
        self.isPermaLink = isPermaLink

    def publish(self, handler):
        d = {}
        if self.isPermaLink:
            d["isPermaLink"] = "true"
        else:
            d["isPermaLink"] = "false"
        _element(handler, "guid", self.guid, d)


class TextInput:
    """Publish a textInput

    Apparently this is rarely used.
    """
    element_attrs = {}

    def __init__(self, title, description, name, link):
        self.title = title
        self.description = description
        self.name = name
        self.link = link

    def publish(self, handler):
        handler.startElement("textInput", self.element_attrs)
        _element(handler, "title", self.title)
        _element(handler, "description", self.description)
        _element(handler, "name", self.name)
        _element(handler, "link", self.link)
        handler.endElement("textInput")


class Enclosure:
    """Publish an enclosure"""
    def __init__(self, url, length, type):
        self.url = url
        self.length = length
        self.type = type

    def publish(self, handler):
        _element(handler, "enclosure", None,
                 {"url": self.url,
                  "length": str(self.length),
                  "type": self.type,
                  })


class Source:
    """Publish the item's original source, used by aggregators"""
    def __init__(self, name, url):
        self.name = name
        self.url = url

    def publish(self, handler):
        _element(handler, "source", self.name, {"url": self.url})


class SkipHours:
    """Publish the skipHours

    This takes a list of hours, as integers.
    """
    element_attrs = {}

    def __init__(self, hours):
        self.hours = hours

    def publish(self, handler):
        # Nothing is written for an empty list.
        if self.hours:
            handler.startElement("skipHours", self.element_attrs)
            for hour in self.hours:
                _element(handler, "hour", str(hour))
            handler.endElement("skipHours")


class SkipDays:
    """Publish the skipDays

    This takes a list of days as strings.
    """
    element_attrs = {}

    def __init__(self, days):
        self.days = days

    def publish(self, handler):
        # Nothing is written for an empty list.
        if self.days:
            handler.startElement("skipDays", self.element_attrs)
            for day in self.days:
                _element(handler, "day", day)
            handler.endElement("skipDays")


class RSS2(WriteXmlMixin):
    """The main RSS class.

    Stores the channel attributes, with the "category" elements under
    ".categories" and the RSS items under ".items".
    """

    rss_attrs = {"version": "2.0"}
    element_attrs = {}

    def __init__(self,
                 title,
                 link,
                 description,

                 language=None,
                 copyright=None,
                 managingEditor=None,
                 webMaster=None,
                 pubDate=None,        # a datetime, *in* *GMT*
                 lastBuildDate=None,  # a datetime

                 categories=None,     # list of strings or Category
                 generator=_generator_name,
                 docs="http://blogs.law.harvard.edu/tech/rss",
                 cloud=None,          # a Cloud
                 ttl=None,            # integer number of minutes

                 image=None,          # an Image
                 rating=None,         # a string; I don't know how it's used
                 textInput=None,      # a TextInput
                 skipHours=None,      # a SkipHours with a list of integers
                 skipDays=None,       # a SkipDays with a list of strings

                 items=None,          # list of RSSItems
                 ):
        self.title = title
        self.link = link
        self.description = description
        self.language = language
        self.copyright = copyright
        self.managingEditor = managingEditor

        self.webMaster = webMaster
        self.pubDate = pubDate
        self.lastBuildDate = lastBuildDate

        if categories is None:
            categories = []
        self.categories = categories
        self.generator = generator
        self.docs = docs
        self.cloud = cloud
        self.ttl = ttl
        self.image = image
        self.rating = rating
        self.textInput = textInput
        self.skipHours = skipHours
        self.skipDays = skipDays

        if items is None:
            items = []
        self.items = items

    def publish(self, handler):
        """Emit the ``<rss><channel>...</channel></rss>`` tree to *handler*."""
        handler.startElement("rss", self.rss_attrs)
        handler.startElement("channel", self.element_attrs)
        _element(handler, "title", self.title)
        _element(handler, "link", self.link)
        _element(handler, "description", self.description)

        self.publish_extensions(handler)

        _opt_element(handler, "language", self.language)
        _opt_element(handler, "copyright", self.copyright)
        _opt_element(handler, "managingEditor", self.managingEditor)
        _opt_element(handler, "webMaster", self.webMaster)

        _opt_element(handler, "pubDate", _wrap_if(
            self.pubDate, datetime.datetime, DateElement, "pubDate"))
        _opt_element(handler, "lastBuildDate", _wrap_if(
            self.lastBuildDate, datetime.datetime, DateElement,
            "lastBuildDate"))

        _publish_categories(handler, self.categories)

        _opt_element(handler, "generator", self.generator)
        _opt_element(handler, "docs", self.docs)

        if self.cloud is not None:
            self.cloud.publish(handler)

        # The tag must be "ttl" for non-integer values too; the previous
        # code wrote string values under a misspelled "tt" tag.
        _opt_element(handler, "ttl",
                     _wrap_if(self.ttl, int, IntElement, "ttl"))

        if self.image is not None:
            self.image.publish(handler)

        _opt_element(handler, "rating", self.rating)
        if self.textInput is not None:
            self.textInput.publish(handler)
        if self.skipHours is not None:
            self.skipHours.publish(handler)
        if self.skipDays is not None:
            self.skipDays.publish(handler)

        for item in self.items:
            item.publish(handler)

        handler.endElement("channel")
        handler.endElement("rss")

    def publish_extensions(self, handler):
        # Derived classes can hook into this to insert
        # output after the three required fields.
        pass


class RSSItem(WriteXmlMixin):
    """Publish an RSS Item"""
    element_attrs = {}

    def __init__(self,
                 title=None,        # string
                 link=None,         # url as string
                 description=None,  # string
                 author=None,       # email address as string
                 categories=None,   # list of string or Category
                 comments=None,     # url as string
                 enclosure=None,    # an Enclosure
                 guid=None,         # a unique string
                 pubDate=None,      # a datetime
                 source=None,       # a Source
                 ):

        if title is None and description is None:
            raise TypeError(
                "must define at least one of 'title' or 'description'")
        self.title = title
        self.link = link
        self.description = description
        self.author = author
        if categories is None:
            categories = []
        self.categories = categories
        self.comments = comments
        self.enclosure = enclosure
        self.guid = guid
        self.pubDate = pubDate
        self.source = source

    def publish(self, handler):
        """Emit the ``<item>...</item>`` tree to *handler*."""
        handler.startElement("item", self.element_attrs)
        _opt_element(handler, "title", self.title)
        _opt_element(handler, "link", self.link)
        self.publish_extensions(handler)
        _opt_element(handler, "description", self.description)
        _opt_element(handler, "author", self.author)

        _publish_categories(handler, self.categories)

        _opt_element(handler, "comments", self.comments)
        if self.enclosure is not None:
            self.enclosure.publish(handler)
        _opt_element(handler, "guid", self.guid)

        _opt_element(handler, "pubDate", _wrap_if(
            self.pubDate, datetime.datetime, DateElement, "pubDate"))

        if self.source is not None:
            self.source.publish(handler)

        handler.endElement("item")

    def publish_extensions(self, handler):
        # Derived classes can hook into this to insert
        # output after the title and link elements
        pass
# Reconstructed from the remainder of the same whitespace-collapsed patch.
# Hunks that only showed fragments of existing definitions are summarised
# here rather than reproduced:
#   * src/calibre/web/feeds/recipes/__init__.py: 'the_register' is added to
#     the recipe_modules list.
#   * src/calibre/web/feeds/recipes/recipe_security_watch.py: SecurityWatch
#     gains extra_css = 'div {text-align:left}' and a postprocess_html(self,
#     soup, first_fetch) that renames every table/tr/td tag to 'div' and
#     returns the soup.
#   * upload.py: ('upload_rss', None) is added to stage3's sub-command list.

# ---------------------------------------------------------------------------
# File: src/calibre/web/feeds/recipes/recipe_the_register.py
# ---------------------------------------------------------------------------
from calibre.web.feeds.recipes import BasicNewsRecipe


class TheRegister(BasicNewsRecipe):
    """Recipe for The Register's headline feed."""
    title = u'The Register'
    __author__ = 'vgrama'
    language = _('English')
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False

    feeds = [(u'Register', u'http://www.theregister.co.uk/headlines.atom')]

    # Page chrome identified by div id.
    remove_tags = [
        dict(name='div', attrs={'id':'leader'}),
        dict(name='div', attrs={'id':'footer'}),
        dict(name='div', attrs={'id':'masthead'})]

    def print_version(self, url):
        """Return the print-view URL derived from the article *url*."""
        return '%s/print.html' % url


# ---------------------------------------------------------------------------
# File: upload.py (import block and the class added by the patch)
# ---------------------------------------------------------------------------
import shutil, os, glob, re, cStringIO, sys, tempfile, time, textwrap, socket, \
        struct, subprocess
from datetime import datetime
from setuptools.command.build_py import build_py as _build_py, convert_path
from distutils.core import Command
from subprocess import check_call, call, Popen


class upload_rss(OptionlessCommand):
    """Build an RSS feed of calibre releases from the bzr log and upload it."""

    from bzrlib import log as blog

    class ChangelogFormatter(blog.LogFormatter):
        """Log formatter that turns release commits into RSS items.

        A commit whose message starts with ``version X.Y.Z`` opens a new
        item; the messages of the revisions that follow it become that
        item's bullet list until the next release commit is seen.
        NOTE(review): this presumes revisions are delivered newest first --
        confirm against bzrlib's ``show_log`` ordering.
        """
        supports_tags = True
        supports_merge_revisions = False

        def __init__(self, num_of_versions=20):
            from calibre.utils.rss_gen import RSS2
            self.num_of_versions = num_of_versions
            self.rss = RSS2(
                title = 'calibre releases',
                link = 'http://calibre.kovidgoyal.net/wiki/Changelog',
                description = 'Latest release of calibre',
                lastBuildDate = datetime.utcnow()
            )
            # The release item currently being filled; its description is a
            # list of bullet strings until the item is finalised.
            self.current_entry = None

        def log_revision(self, r):
            """Consume one revision *r* of the branch log."""
            from calibre.utils.rss_gen import RSSItem, Guid
            if len(self.rss.items) >= self.num_of_versions:
                return
            msg = r.rev.message
            # Every dot is escaped so that only real X.Y.Z versions match
            # (the second dot used to match any character).
            match = re.match(r'version\s+(\d+\.\d+\.\d+)', msg)

            if match:
                if self.current_entry is not None:
                    # NOTE(review): the markup literals in this method were
                    # stripped by the extraction this text came from; the
                    # <ul>/<li> tags are a reconstruction -- confirm
                    # against the repository.
                    mkup = '<ul>%s</ul>'
                    self.current_entry.description = mkup%(''.join(
                        self.current_entry.description))
                    self.rss.items.append(self.current_entry)
                # RSS dates are emitted with a literal "GMT" suffix, so use
                # the UTC timestamp as-is; adding the committer's timezone
                # offset produced local wall-clock time labelled as GMT.
                timestamp = r.rev.timestamp
                self.current_entry = RSSItem(
                    title = 'calibre %s released'%match.group(1),
                    link = 'http://calibre.kovidgoyal.net/download',
                    guid = Guid(match.group(), False),
                    pubDate = datetime(*time.gmtime(timestamp)[:6]),
                    description = []
                )
            elif self.current_entry is not None:
                if re.search(r'[a-zA-Z]', msg) and len(msg.strip()) > 5:
                    if 'translation' not in msg and not msg.startswith('IGN'):
                        # Escape markup characters before embedding the
                        # message in HTML ('&' first so the entities
                        # produced here are not escaped twice).
                        msg = msg.replace('&', '&amp;')
                        msg = msg.replace('<', '&lt;').replace('>', '&gt;')
                        # NOTE(review): as extracted, the replacement is
                        # the bare ticket number; it was presumably an
                        # anchor tag lost in extraction -- confirm.
                        msg = re.sub(r'#(\d+)', r'#\1', msg)

                        self.current_entry.description.append(
                            '<li>%s</li>'%msg.strip())

    def run(self):
        """Generate /tmp/releases.xml from the bzr log and scp it."""
        from bzrlib import log, branch
        bzr_path = os.path.expanduser('~/work/calibre')
        b = branch.Branch.open(bzr_path)
        lf = upload_rss.ChangelogFormatter()
        log.show_log(b, lf)
        # Close the file explicitly so the data is flushed to disk before
        # scp reads it.
        out = open('/tmp/releases.xml', 'wb')
        try:
            lf.rss.write_xml(out)
        finally:
            out.close()
        subprocess.check_call('scp /tmp/releases.xml divok:/var/www/calibre.kovidgoyal.net/htdocs/downloads'.split())