diff --git a/recipes/boston.com.recipe b/recipes/boston.com.recipe index e8eb7d1a8b..c2ee4bf2e1 100644 --- a/recipes/boston.com.recipe +++ b/recipes/boston.com.recipe @@ -1,53 +1,418 @@ -__license__ = 'GPL v3' -__copyright__ = '2009-2010, Darko Miletic ' -''' -www.boston.com -''' - +import string, re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag +from datetime import date -class BusinessStandard(BasicNewsRecipe): - title = 'The Boston Globe' - __author__ = 'Darko Miletic' - description = 'News from Boston' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - delay = 1 - use_embedded_content = False - auto_cleanup = True - encoding = 'utf-8' - publisher = 'Boston' - category = 'news, boston, usa, world' - language = 'en' - publication_type = 'newspaper' - masthead_url = 'http://cache.boston.com/images/globe/grslider/the_boston_globe.gif' - extra_css = ' body{font-family: Georgia, serif} div#articleBodyTop{display:block} ' +class BostonGlobeSubscription(BasicNewsRecipe): - conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + # logger = logging.getLogger("mechanize") + # logger.addHandler(logging.StreamHandler(sys.stdout)) + # logger.setLevel(logging.DEBUG) + title = "Boston Globe Subscription" + __author__ = 'Rob Freundlich' + description = 'Boston Globe with full articles for subscribers' + INDEX = date.today().strftime('http://www.bostonglobe.com/todayspaper/%Y/%m/%d') + todaysDate = date.today().strftime("%d/%m/%Y") + timefmt = ' [%a, %d %b, %Y]' + needs_subscription = True + remove_tags = [dict(attrs={"class":["skip-nav article-more", + "aside promo", + "article-bar bar-yellow", + "tools", + "sticky-tools", + "article-footer", + "bg-footer"]}), + dict(attrs={"id":["masthead", + "video", + "section-nav", + "meter-limit-met-popup"]})] + no_stylesheets = True + # simultaneous_downloads = 1 + valid_filename_chars = 
"-_.%s%s" % (string.ascii_letters, string.digits) + cover_url = "http://ecx.images-amazon.com/images/I/419qC6zeKSL._SL500_AA300_.jpg" + preprocess_regexps = [ + (re.compile(r'\ 0: + # url = imageURL[0]["href"] + # print "COMIC %s: %s" % (title, url) + articles.append({"title":title, "url":self.make_url(url), "author":author, "date":self.todaysDate, "description":""}) + + feeds.append(("Comics", articles)) + + get_top_stories() + + get_section("Nation", "tp-section-thenation") + get_section("World", "tp-section-theworld") + get_section("Metro", "tp-section-metro") + getOpinionSection() + get_section("Arts & Movies", "tp-section-g:arts&movies") + get_section("Family", "tp-section-g:family") + get_section("Style", "tp-section-g:style") + get_section("Globe West", "tp-section-globewest") + get_section("Food", "tp-section-g:food") + get_section("Living", "tp-section-living") + get_section("Health", "tp-section-g:health") + get_section("Ideas", "tp-section-ideas") + get_section("Boston Globe Magazine", "tp-section-magazine") + + get_comics() + + #get_section("Business", "tp-section-business") + #get_section("Obituaries", "tp-section-obituaries") + #get_section("Sports", "tp-section-sports") + #get_section("Globe North", "tp-section-globenorth") + #get_section("Globe South", "tp-section-globesouth") + #get_section("Money & Careers", "tp-section-money&careers") + #get_section("Books", "tp-section-books") + #get_section("Travel", "tp-section-travel") + #get_section("Real Estate", "tp-section-realestate") + + for feed in feeds: + feedTitle = feed[0] + articles = feed[1] + self.log(feedTitle) + for article in articles: + self.log(" ", article["title"], ".....", article["url"]) + self.log(" ", article["description"]) + + return feeds + + def postprocess_comics(self, soup, first): + main = soup.find("div", id="main") + sectionHead = main.find("div", "section-head") + title = sectionHead.h2 + byline = sectionHead.h3 + imgLink = main.find("a", "comic") + img = imgLink.img + + 
print "title: %s" % title
+        print "byline: %s" % byline
+        print "imgLink: %s" % imgLink
+        print "img: %s" % img
+
+        # Replace the page body with one holding only the comic's title,
+        # byline and image.
+        body = Tag(soup, "body")
+        body.insert(0, title)
+        body.insert(1, byline)
+        body.insert(2, img)
+
+        soup.body.replaceWith(body)
+
+        return soup
+
+    def preprocess_html(self, soup):
+        """Give every <img> in the page a usable ``src``.
+
+        An image whose ``src`` is missing or empty takes its URL from the
+        ``data-fullsrc`` attribute when that is present.  A scheme-relative
+        URL ("//host/path") gets an explicit "http:" scheme.
+
+        Returns the same soup object, modified in place.
+        """
+        for image in soup.findAll("img"):
+            # Use Tag.get() rather than indexing: with the BeautifulSoup 3
+            # Tag API, indexing raises KeyError when the attribute is absent.
+            src = image.get("src") or image.get("data-fullsrc") or ""
+            if src.startswith("//"):
+                # Prepend only the scheme; "http://" + "//host" would give
+                # the malformed "http:////host".
+                src = "http:" + src
+            if src and src != image.get("src"):
+                image["src"] = src
+
+        return soup
+
+    def postprocess_html(self, soup, first):
+        """Reduce a downloaded page to the content worth keeping.
+
+        Comics pages (a <body> whose class matches "section-comics") are
+        handed to postprocess_comics().  Pages containing a
+        <div class="article"> have their body replaced by the article's
+        header, byline, lead figure and text, in that order.  Any other page
+        is returned unchanged.
+
+        Returns the soup, modified in place.
+        """
+        comicsBody = soup.find("body", {"class":re.compile(".*section-comics.*")})
+        if comicsBody:
+            return self.postprocess_comics(soup, first)
+
+        article = soup.find("div", "article")
+        if article:
+            # Yay! We're getting the subscriber view. Easy to handle
+            wanted = ("header", "byline", "figure lead-figure full", "article-body")
+            # Look everything up before moving anything, and skip pieces the
+            # page lacks: inserting None into a Tag fails, and previously
+            # only the lead figure was guarded.
+            parts = [part for part in
+                     (article.find("div", cls) for cls in wanted)
+                     if part is not None]
+
+            # Leave the page untouched rather than blanking it when none of
+            # the expected pieces were found.
+            if parts:
+                body = Tag(soup, "body")
+                for position, part in enumerate(parts):
+                    body.insert(position, part)
+                soup.body.replaceWith(body)
+
+        return soup