diff --git a/resources/recipes/wapo_cartoons.recipe b/resources/recipes/wapo_cartoons.recipe index 78440aa140..09810dbc71 100644 --- a/resources/recipes/wapo_cartoons.recipe +++ b/resources/recipes/wapo_cartoons.recipe @@ -1,145 +1,145 @@ -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup -from datetime import date, timedelta - -class WaPoCartoonsRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'en' - version = 2 - - title = u'Washington Post Cartoons' - publisher = u'Washington Post' - category = u'News, Cartoons' - description = u'Cartoons from the Washington Post' - - oldest_article = 7 - max_articles_per_feed = 100 - use_embedded_content = False - no_stylesheets = True - - feeds = [] - feeds.append((u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/')) - feeds.append((u'Auth', u'http://www.uclick.com/client/wpc/ta/')) - feeds.append((u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb')) - feeds.append((u'Carlson', u'http://www.uclick.com/client/wpc/sc/')) - feeds.append((u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk')) - feeds.append((u'McCoy', u'http://www.uclick.com/client/wpc/gm/')) - feeds.append((u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/')) - feeds.append((u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/')) - feeds.append((u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/')) - - extra_css = ''' - body {font-family: verdana, arial, helvetica, geneva, sans-serif;} - h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;} - #name {margin-bottom: 0.2em} - #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;} - ''' - - def parse_index(self): - index = [] - oldestDate = date.today() - timedelta(days = self.oldest_article) - oldest = oldestDate.strftime('%Y%m%d') 
- for feed in self.feeds: - cartoons = [] - soup = self.index_to_soup(feed[1]) - - cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description' : ''} - cartoons.append(cartoon) - - select = soup.find('select', attrs = {'name': ['url', 'dest']}) - if select: - cartoonCandidates = [] - if select['name'] == 'url': - cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest) - else: - cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest) - - for cartoon in cartoonCandidates: - cartoons.append(cartoon) - - index.append([feed[0], cartoons]) - - return index - - def preprocess_html(self, soup): - freshSoup = self.getFreshSoup(soup) - - div = soup.find('div', attrs = {'id': 'name'}) - if div: - freshSoup.body.append(div) - comic = soup.find('div', attrs = {'id': 'comic_full'}) - - img = comic.find('img') - if '&' in img['src']: - img['src'], sep, bad = img['src'].rpartition('&') - - freshSoup.body.append(comic) - freshSoup.body.append(soup.find('div', attrs = {'id': 'copyright'})) - else: - span = soup.find('span', attrs = {'class': 'title'}) - if span: - del span['class'] - span['id'] = 'name' - span.name = 'div' - freshSoup.body.append(span) - - img = soup.find('img', attrs = {'class': 'pic_big'}) - if img: - td = img.parent - if td.has_key('style'): - del td['style'] - td.name = 'div' - td['id'] = 'comic_full' - freshSoup.body.append(td) - - td = soup.find('td', attrs = {'class': 'copy'}) - if td: - for a in td.find('a'): - a.extract() - del td['class'] - td['id'] = 'copyright' - td.name = 'div' - freshSoup.body.append(td) - - return freshSoup - - def getFreshSoup(self, oldSoup): - freshSoup = BeautifulSoup('') - if oldSoup.head.title: - freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title)) - return freshSoup - - def cartoonCandidatesWaPo(self, select, oldest): - opts = select.findAll('option') - for i in range(1, len(opts)): - url = opts[i]['value'].rstrip('/') - dateparts = url.split('/')[-3:] - datenum = 
str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2]) - if datenum >= oldest: - yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''} - else: - return - - def cartoonCandidatesCreatorsCom(self, select, oldest): - monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', - 'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10', - 'November': '11', 'December': '12'} - - opts = select.findAll('option') - for i in range(1, len(opts)): - if opts[i].has_key('selected'): - continue - - dateString = self.tag_to_string(opts[i]) - rest, sep, year = dateString.rpartition(', ') - parts = rest.split(' ') - day = parts[2].rjust(2, '0') - month = monthNames[parts[1]] - datenum = str(year) + month + str(day) - if datenum >= oldest: - yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''} - else: - return - - +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from datetime import date, timedelta + +class WaPoCartoonsRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'en' + version = 2 + + title = u'Washington Post Cartoons' + publisher = u'Washington Post' + category = u'News, Cartoons' + description = u'Cartoons from the Washington Post' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + no_stylesheets = True + + feeds = [] + feeds.append((u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/')) + feeds.append((u'Auth', u'http://www.uclick.com/client/wpc/ta/')) + feeds.append((u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb')) + feeds.append((u'Carlson', u'http://www.uclick.com/client/wpc/sc/')) + feeds.append((u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk')) + feeds.append((u'McCoy', u'http://www.uclick.com/client/wpc/gm/')) + 
feeds.append((u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/')) + feeds.append((u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/')) + feeds.append((u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/')) + + extra_css = ''' + body {font-family: verdana, arial, helvetica, geneva, sans-serif;} + h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;} + #name {margin-bottom: 0.2em} + #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;} + ''' + + def parse_index(self): + index = [] + oldestDate = date.today() - timedelta(days = self.oldest_article) + oldest = oldestDate.strftime('%Y%m%d') + for feed in self.feeds: + cartoons = [] + soup = self.index_to_soup(feed[1]) + + cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description' : ''} + cartoons.append(cartoon) + + select = soup.find('select', attrs = {'name': ['url', 'dest']}) + if select: + cartoonCandidates = [] + if select['name'] == 'url': + cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest) + else: + cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest) + + for cartoon in cartoonCandidates: + cartoons.append(cartoon) + + index.append([feed[0], cartoons]) + + return index + + def preprocess_html(self, soup): + freshSoup = self.getFreshSoup(soup) + + div = soup.find('div', attrs = {'id': 'name'}) + if div: + freshSoup.body.append(div) + comic = soup.find('div', attrs = {'id': 'comic_full'}) + + img = comic.find('img') + if '&' in img['src']: + img['src'], sep, bad = img['src'].rpartition('&') + + freshSoup.body.append(comic) + freshSoup.body.append(soup.find('div', attrs = {'id': 'copyright'})) + else: + span = soup.find('span', attrs = {'class': 'title'}) + if span: + del span['class'] + span['id'] = 'name' + span.name = 'div' + freshSoup.body.append(span) + + img = soup.find('img', attrs = {'class': 'pic_big'}) + if img: + td = img.parent + if td.has_key('style'): + 
del td['style'] + td.name = 'div' + td['id'] = 'comic_full' + freshSoup.body.append(td) + + td = soup.find('td', attrs = {'class': 'copy'}) + if td: + for a in td.find('a'): + a.extract() + del td['class'] + td['id'] = 'copyright' + td.name = 'div' + freshSoup.body.append(td) + + return freshSoup + + def getFreshSoup(self, oldSoup): + freshSoup = BeautifulSoup('') + if oldSoup.head.title: + freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title)) + return freshSoup + + def cartoonCandidatesWaPo(self, select, oldest): + opts = select.findAll('option') + for i in range(1, len(opts)): + url = opts[i]['value'].rstrip('/') + dateparts = url.split('/')[-3:] + datenum = str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2]) + if datenum >= oldest: + yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''} + else: + return + + def cartoonCandidatesCreatorsCom(self, select, oldest): + monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', + 'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10', + 'November': '11', 'December': '12'} + + opts = select.findAll('option') + for i in range(1, len(opts)): + if opts[i].has_key('selected'): + continue + + dateString = self.tag_to_string(opts[i]) + rest, sep, year = dateString.rpartition(', ') + parts = rest.split(' ') + day = parts[2].rjust(2, '0') + month = monthNames[parts[1]] + datenum = str(year) + month + str(day) + if datenum >= oldest: + yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''} + else: + return + + diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py index 064a1d1bdd..9ed8bb6255 100644 --- a/src/calibre/ebooks/epub/output.py +++ b/src/calibre/ebooks/epub/output.py @@ -264,6 +264,11 @@ class EPUBOutput(OutputFormatPlugin): if body: body = body[0] + # Add id attribute to <a> tags that have name + for x in XPath('//h:a[@name]')(body): + if not
x.get('id', False): + x.set('id', x.get('name')) + # Replace <br> that are children of <body> as ADE doesn't handle them if hasattr(body, 'xpath'): for br in XPath('./h:br')(body):