diff --git a/recipes/boston.com.recipe b/recipes/boston.com.recipe index 9ba4e28a6b..6515f9a4d1 100644 --- a/recipes/boston.com.recipe +++ b/recipes/boston.com.recipe @@ -1,17 +1,17 @@ -import string import re from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag from datetime import date, timedelta -from calibre.utils.magick.draw import save_cover_data_to -from calibre.ptempfile import PersistentTemporaryFile + + +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) class BostonGlobeSubscription(BasicNewsRecipe): - # logger = logging.getLogger("mechanize") - # logger.addHandler(logging.StreamHandler(sys.stdout)) - # logger.setLevel(logging.DEBUG) title = "Boston Globe Subscription" __author__ = 'Rob Freundlich' description = 'Boston Globe with full articles for subscribers' @@ -21,23 +21,17 @@ class BostonGlobeSubscription(BasicNewsRecipe): timefmt = ' [%a, %d %b, %Y]' needs_subscription = 'optional' keep_only_tags = [ - dict(attrs={'class': ['section-head', 'comic', 'article']}) + classes('main-hed lead-figure byline article-text comic'), ] remove_tags = [ - dict(attrs={"class": [ - "skip-nav article-more", "aside promo", "article-bar bar-yellow", "tools", "sticky-tools", "article-footer", "bg-footer" - ]}), - dict(attrs={"id": ["masthead", "video", "section-nav", - 'newsletter-form', "meter-limit-met-popup"]}) + classes('inline-newsletter ad skip-nav'), + dict(name=['meta', 'link']) ] + remove_attributes = ['style'] no_stylesheets = True # simultaneous_downloads = 1 - valid_filename_chars = "-_.%s%s" % (string.ascii_letters, string.digits) cover_url = "http://ecx.images-amazon.com/images/I/419qC6zeKSL._SL500_AA300_.jpg" - preprocess_regexps = [ - (re.compile(r'\ 0: # url = imageURL[0]["href"] # print "COMIC %s: %s" % (title, url) - articles.append({"title": title, "url": self.make_url( + articles.append({"title": title, "url": self.absolutize_url( url), "author": author, "date": self.todaysDate, "description": ""}) feeds.append(("Comics", articles)) @@ -268,37 +213,18 @@ class BostonGlobeSubscription(BasicNewsRecipe): def preprocess_html(self, soup): images = soup.findAll("img") - for image in images: - if (image["src"] == ""): - if (image["data-fullsrc"]): - image["src"] = image["data-fullsrc"] - elif (image["src"].startswith("//")): - image["src"] = "http://" + image["src"] - + for img in images: + fs = img.get('data-fullsrc') + if fs: + img['src'] = fs + src = img.get('src') + if src: + img['src'] = self.absolutize_url(src) return soup def postprocess_html(self, soup, first): comicsBody = soup.find( "body", {"class": re.compile(".*section-comics.*")}) - if (comicsBody): + if comicsBody: return self.postprocess_comics(soup, first) - - article = soup.find("div", "article") - if (article): - # Yay! We're getting the subscriber view. Easy to handle - articleHeader = article.find("div", "header") - articleByline = article.find("div", "byline") - articleBody = article.find("div", "article-body") - figureLead = article.find("div", "figure lead-figure full") - - body = Tag(soup, "body") - body.insert(0, articleHeader) - body.insert(1, articleByline) - body.insert(2, articleBody) - - if (figureLead): - body.insert(2, figureLead) - - soup.body.replaceWith(body) - return soup