Fix #4474 (note of page in EPUB / picture in MOBI)

This commit is contained in:
Kovid Goyal 2010-01-10 15:40:13 -07:00
parent dbbed21599
commit 20fe1609e4
2 changed files with 150 additions and 145 deletions

View File

@@ -1,145 +1,145 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import date, timedelta from datetime import date, timedelta
class WaPoCartoonsRecipe(BasicNewsRecipe):
    """Fetch daily editorial cartoons syndicated via the Washington Post.

    Cartoons come from two back-ends: uclick/wpcomics pages (identified by a
    <select name="url"> archive drop-down) and creators.com pages (identified
    by a <select name="dest"> drop-down); each back-end gets its own candidate
    generator below.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 2

    title = u'Washington Post Cartoons'
    publisher = u'Washington Post'
    category = u'News, Cartoons'
    description = u'Cartoons from the Washington Post'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # One pseudo-feed per cartoonist; the URL is the artist's index page,
    # not an RSS feed (parse_index below scrapes the archive drop-down).
    feeds = [
        (u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'),
        (u'Auth', u'http://www.uclick.com/client/wpc/ta/'),
        (u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'),
        (u'Carlson', u'http://www.uclick.com/client/wpc/sc/'),
        (u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'),
        (u'McCoy', u'http://www.uclick.com/client/wpc/gm/'),
        (u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'),
        (u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'),
        (u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'),
    ]

    extra_css = '''
            body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
            h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
            #name {margin-bottom: 0.2em}
            #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
            '''

    def parse_index(self):
        """Build the article index: for each artist, the current cartoon plus
        every archived cartoon not older than ``oldest_article`` days.

        Returns a list of ``[feed_title, [cartoon_dict, ...]]`` entries in the
        format BasicNewsRecipe expects from parse_index.
        """
        index = []
        oldestDate = date.today() - timedelta(days=self.oldest_article)
        # Dates are compared as 'YYYYMMDD' strings, which orders the same
        # way as the dates themselves.
        oldest = oldestDate.strftime('%Y%m%d')
        for feed in self.feeds:
            cartoons = []
            soup = self.index_to_soup(feed[1])

            # The landing page itself always shows the newest cartoon.
            cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description': ''}
            cartoons.append(cartoon)

            # The archive drop-down's name tells the two site layouts apart:
            # 'url' -> uclick/wpcomics, 'dest' -> creators.com.
            select = soup.find('select', attrs={'name': ['url', 'dest']})
            if select:
                if select['name'] == 'url':
                    cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest)
                else:
                    cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest)
                for cartoon in cartoonCandidates:
                    cartoons.append(cartoon)

            index.append([feed[0], cartoons])

        return index

    def preprocess_html(self, soup):
        """Rebuild each cartoon page as a minimal document containing only the
        artist name, the cartoon image and the copyright line, with stable
        element ids ('name', 'comic_full', 'copyright') targeted by extra_css.
        """
        freshSoup = self.getFreshSoup(soup)

        div = soup.find('div', attrs={'id': 'name'})
        if div:
            # uclick/wpcomics layout: the ids we want already exist.
            freshSoup.body.append(div)
            comic = soup.find('div', attrs={'id': 'comic_full'})
            if comic:
                img = comic.find('img')
                if img and '&' in img['src']:
                    # Strip the trailing &-parameter from the image URL; it
                    # breaks image fetching in the generated book.
                    img['src'], sep, bad = img['src'].rpartition('&')
                freshSoup.body.append(comic)
            copyright = soup.find('div', attrs={'id': 'copyright'})
            if copyright:
                freshSoup.body.append(copyright)
        else:
            # creators.com layout: rename/relabel elements to match the
            # ids used by extra_css.
            span = soup.find('span', attrs={'class': 'title'})
            if span:
                del span['class']
                span['id'] = 'name'
                span.name = 'div'
                freshSoup.body.append(span)

            img = soup.find('img', attrs={'class': 'pic_big'})
            if img:
                td = img.parent
                if td.has_key('style'):
                    del td['style']
                td.name = 'div'
                td['id'] = 'comic_full'
                freshSoup.body.append(td)

            td = soup.find('td', attrs={'class': 'copy'})
            if td:
                # BUGFIX: was td.find('a'), which iterated the children of the
                # first <a> (and crashed when no <a> was present) instead of
                # removing every link from the copyright cell.
                for a in td.findAll('a'):
                    a.extract()
                del td['class']
                td['id'] = 'copyright'
                td.name = 'div'
                freshSoup.body.append(td)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document, carrying over only the page title."""
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        if oldSoup.head.title:
            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
        return freshSoup

    def cartoonCandidatesWaPo(self, select, oldest):
        """Yield archive entries from a uclick/wpcomics <select name="url">.

        Option values end in .../YYYY/MM/DD; options are newest-first, so we
        stop at the first entry older than ``oldest`` ('YYYYMMDD' string).
        Option 0 is the currently shown cartoon and is skipped.
        """
        opts = select.findAll('option')
        for i in range(1, len(opts)):
            url = opts[i]['value'].rstrip('/')
            dateparts = url.split('/')[-3:]
            datenum = str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2])
            if datenum >= oldest:
                yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''}
            else:
                return

    def cartoonCandidatesCreatorsCom(self, select, oldest):
        """Yield archive entries from a creators.com <select name="dest">.

        Option text reads like 'Mon January 4, 2010'; it is parsed into a
        'YYYYMMDD' string for the same newest-first cutoff as the WaPo
        variant. The 'selected' option is the cartoon already on the page.
        """
        monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
                      'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
                      'November': '11', 'December': '12'}
        opts = select.findAll('option')
        for i in range(1, len(opts)):
            if opts[i].has_key('selected'):
                continue
            dateString = self.tag_to_string(opts[i])
            rest, sep, year = dateString.rpartition(', ')
            parts = rest.split(' ')
            day = parts[2].rjust(2, '0')
            month = monthNames[parts[1]]
            datenum = str(year) + month + str(day)
            if datenum >= oldest:
                yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''}
            else:
                return

View File

@@ -264,6 +264,11 @@ class EPUBOutput(OutputFormatPlugin):
if body: if body:
body = body[0] body = body[0]
# Add id attribute to <a> tags that have name
for x in XPath('//h:a[@name]')(body):
if not x.get('id', False):
x.set('id', x.get('name'))
# Replace <br> that are children of <body> as ADE doesn't handle them # Replace <br> that are children of <body> as ADE doesn't handle them
if hasattr(body, 'xpath'): if hasattr(body, 'xpath'):
for br in XPath('./h:br')(body): for br in XPath('./h:br')(body):