import string
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import date, timedelta
from calibre.utils.magick.draw import save_cover_data_to
from calibre.ptempfile import PersistentTemporaryFile


class BostonGlobeSubscription(BasicNewsRecipe):

    # logger = logging.getLogger("mechanize")
    # logger.addHandler(logging.StreamHandler(sys.stdout))
    # logger.setLevel(logging.DEBUG)

    title = "Boston Globe Subscription"
    __author__ = 'Rob Freundlich'
    description = 'Boston Globe with full articles for subscribers'
    language = 'en'
    # strftime-style pattern for the daily "Today's Paper" index page
    INDEX = 'http://www.bostonglobe.com/todayspaper/%Y/%m/%d'
    todaysDate = date.today().strftime("%d/%m/%Y")
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = 'optional'

    keep_only_tags = [
        dict(attrs={'class': ['section-head', 'comic', 'article']})
    ]
    remove_tags = [
        dict(attrs={"class": [
            "skip-nav article-more",
            "aside promo",
            "article-bar bar-yellow",
            "tools",
            "sticky-tools",
            "article-footer",
            "bg-footer",
        ]}),
        dict(attrs={"id": ["masthead", "video", "section-nav",
                           "newsletter-form", "meter-limit-met-popup"]})
    ]
    no_stylesheets = True
    # simultaneous_downloads = 1
    valid_filename_chars = "-_.%s%s" % (string.ascii_letters, string.digits)
    cover_url = "http://ecx.images-amazon.com/images/I/419qC6zeKSL._SL500_AA300_.jpg"

    preprocess_regexps = [
        (re.compile(r'\

            # if len(imageURL) > 0:
            # url = imageURL[0]["href"]
            # print "COMIC %s: %s" % (title, url)
            articles.append({"title": title,
                             "url": self.make_url(url),
                             "author": author,
                             "date": self.todaysDate,
                             "description": ""})

            feeds.append(("Comics", articles))

        get_top_stories()

        for div in soup.findAll('div', {'class': 'tod-paper-section'}):
            get_section(div)

        get_comics()

        return feeds

    def postprocess_comics(self, soup, first):
        # Strip a comics page down to its title, byline and strip image.
        main = soup.find("div", id="main")
        sectionHead = main.find("div", "section-head")
        title = sectionHead.h2
        byline = sectionHead.h3
        imgLink = main.find("a", "comic")
        img = imgLink.img

        body = Tag(soup, "body")
        body.insert(0, title)
        body.insert(1, byline)
        body.insert(2, img)

        soup.body.replaceWith(body)

        return soup

    def preprocess_html(self, soup):
        # Some images arrive with an empty src attribute (the real URL is in
        # data-fullsrc) or with a protocol-relative "//" URL; normalize both
        # so the images can be downloaded.
        images = soup.findAll("img")
        for image in images:
            src = image.get("src", "")
            if src == "":
                if image.get("data-fullsrc"):
                    image["src"] = image["data-fullsrc"]
            elif src.startswith("//"):
                image["src"] = "http://" + src

        return soup

    def postprocess_html(self, soup, first):
        # Comics pages get rebuilt separately.
        comicsBody = soup.find(
            "body", {"class": re.compile(".*section-comics.*")})
        if comicsBody:
            return self.postprocess_comics(soup, first)

        article = soup.find("div", "article")
        if article:
            # Yay! We're getting the subscriber view. Easy to handle
            articleHeader = article.find("div", "header")
            articleByline = article.find("div", "byline")
            articleBody = article.find("div", "article-body")
            figureLead = article.find("div", "figure lead-figure full")

            body = Tag(soup, "body")
            body.insert(0, articleHeader)
            body.insert(1, articleByline)
            body.insert(2, articleBody)

            if figureLead:
                # Put the lead figure ahead of the article body.
                body.insert(2, figureLead)

            soup.body.replaceWith(body)

        return soup
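
# A quick way to exercise this recipe outside the calibre GUI (a sketch,
# assuming the calibre command-line tools are installed and this file is
# saved as BostonGlobeSubscription.recipe; both file names below are
# arbitrary examples, not part of the recipe itself):
#
#   ebook-convert BostonGlobeSubscription.recipe output.epub --test -vv
#
# --test limits the fetch to a couple of feeds and articles, and -vv prints
# verbose progress, which makes it easier to check the index parsing and the
# pre/postprocess hooks above. Since needs_subscription is 'optional', the
# subscriber view can also be tried by passing the recipe input's
# --username and --password options.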