import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import date
from calibre.utils.magick.draw import save_cover_data_to
from calibre.ptempfile import PersistentTemporaryFile


class BostonGlobeSubscription(BasicNewsRecipe):
    # logger = logging.getLogger("mechanize")
    # logger.addHandler(logging.StreamHandler(sys.stdout))
    # logger.setLevel(logging.DEBUG)

    title = "Boston Globe Subscription"
    __author__ = 'Rob Freundlich'
    description = 'Boston Globe with full articles for subscribers'
    language = 'en'
    INDEX = date.today().strftime('http://www.bostonglobe.com/todayspaper/%Y/%m/%d')
    todaysDate = date.today().strftime("%d/%m/%Y")
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = 'optional'
    remove_tags = [
        dict(attrs={"class": ["skip-nav article-more", "aside promo",
                              "article-bar bar-yellow", "tools", "sticky-tools",
                              "article-footer", "bg-footer"]}),
        dict(attrs={"id": ["masthead", "video", "section-nav",
                           "meter-limit-met-popup"]}),
    ]
    no_stylesheets = True
    # simultaneous_downloads = 1
    valid_filename_chars = "-_.%s%s" % (string.ascii_letters, string.digits)
    cover_url = "http://ecx.images-amazon.com/images/I/419qC6zeKSL._SL500_AA300_.jpg"
    preprocess_regexps = [
        (re.compile(r'\

    def parse_index(self):
        feeds = []

        def get_comics():
            articles = []
            if len(imageURL) > 0:
                # url = imageURL[0]["href"]
                # print "COMIC %s: %s" % (title, url)
                articles.append({"title": title, "url": self.make_url(url),
                                 "author": author, "date": self.todaysDate,
                                 "description": ""})
            feeds.append(("Comics", articles))

        get_top_stories()
        get_section("Nation", "tp-section-thenation")
        get_section("World", "tp-section-theworld")
        get_section("Metro", "tp-section-metro")
        getOpinionSection()
        get_section("Arts & Movies", "tp-section-g:arts&movies")
        get_section("Family", "tp-section-g:family")
        get_section("Style", "tp-section-g:style")
        get_section("Globe West", "tp-section-globewest")
        get_section("Food", "tp-section-g:food")
        get_section("Living", "tp-section-living")
        get_section("Health", "tp-section-g:health")
        get_section("Ideas", "tp-section-ideas")
        get_section("Boston Globe Magazine", "tp-section-magazine")
        get_comics()
        # get_section("Business", "tp-section-business")
        # get_section("Obituaries", "tp-section-obituaries")
        # get_section("Sports", "tp-section-sports")
        # get_section("Globe North", "tp-section-globenorth")
        # get_section("Globe South", "tp-section-globesouth")
        # get_section("Money & Careers", "tp-section-money&careers")
        # get_section("Books", "tp-section-books")
        # get_section("Travel", "tp-section-travel")
        # get_section("Real Estate", "tp-section-realestate")

        for feed in feeds:
            feedTitle = feed[0]
            articles = feed[1]
            self.log(feedTitle)
            for article in articles:
                self.log(" ", article["title"], ".....", article["url"])
                self.log(" ", article["description"])

        return feeds

    def postprocess_comics(self, soup, first):
        # Reduce a comics page to just its title, byline, and image
        main = soup.find("div", id="main")
        sectionHead = main.find("div", "section-head")
        title = sectionHead.h2
        byline = sectionHead.h3
        imgLink = main.find("a", "comic")
        img = imgLink.img

        print "title: %s" % title
        print "byline: %s" % byline
        print "imgLink: %s" % imgLink
        print "img: %s" % img

        body = Tag(soup, "body")
        body.insert(0, title)
        body.insert(1, byline)
        body.insert(2, img)

        soup.body.replaceWith(body)
        return soup

    def preprocess_html(self, soup):
        # Fill in lazy-loaded images and make protocol-relative image URLs absolute
        images = soup.findAll("img")
        for image in images:
            if (image.get("src", "") == ""):
                if (image.get("data-fullsrc")):
                    image["src"] = image["data-fullsrc"]
            elif (image["src"].startswith("//")):
                image["src"] = "http:" + image["src"]
        return soup

    def postprocess_html(self, soup, first):
        comicsBody = soup.find("body",
{"class":re.compile(".*section-comics.*")}) if (comicsBody): return self.postprocess_comics(soup, first) article = soup.find("div", "article") if (article): # Yay! We're getting the subscriber view. Easy to handle articleHeader = article.find("div", "header") articleByline = article.find("div", "byline") articleBody = article.find("div", "article-body") figureLead = article.find("div", "figure lead-figure full") body = Tag(soup, "body") body.insert(0, articleHeader) body.insert(1, articleByline) body.insert(2, articleBody) if (figureLead): body.insert(2, figureLead) soup.body.replaceWith(body) return soup