mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
Fix #4474 (Note of page in epub/picture in mobi)
This commit is contained in:
parent
dbbed21599
commit
20fe1609e4
@ -1,145 +1,145 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
|
|
||||||
class WaPoCartoonsRecipe(BasicNewsRecipe):
    """Fetch editorial cartoons syndicated through the Washington Post.

    The cartoons live on two different hosts, each with its own page
    layout and archive drop-down:

    * uclick / wpcomics pages name their archive ``<select>`` 'url' and
      already use the element ids our ``extra_css`` targets;
    * creators.com pages name it 'dest' and need their elements rewritten
      into the same id structure in :meth:`preprocess_html`.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 2

    title = u'Washington Post Cartoons'
    publisher = u'Washington Post'
    category = u'News, Cartoons'
    description = u'Cartoons from the Washington Post'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # (feed title, index page URL) pairs; the URL host decides which
    # candidate parser parse_index dispatches to.
    feeds = []
    feeds.append((u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'))
    feeds.append((u'Auth', u'http://www.uclick.com/client/wpc/ta/'))
    feeds.append((u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'))
    feeds.append((u'Carlson', u'http://www.uclick.com/client/wpc/sc/'))
    feeds.append((u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'))
    feeds.append((u'McCoy', u'http://www.uclick.com/client/wpc/gm/'))
    feeds.append((u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'))
    feeds.append((u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'))
    feeds.append((u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
        #name {margin-bottom: 0.2em}
        #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
    '''

    def parse_index(self):
        """Build the article index: the current cartoon plus any archive
        entries newer than ``oldest_article`` days.

        Returns a list of ``[feed_title, [article_dict, ...]]`` pairs as
        expected by calibre's feed framework.
        """
        index = []
        # Archive entries are compared as compact 'YYYYMMDD' strings, so
        # plain string comparison orders them chronologically.
        oldestDate = date.today() - timedelta(days=self.oldest_article)
        oldest = oldestDate.strftime('%Y%m%d')
        for feed in self.feeds:
            cartoons = []
            soup = self.index_to_soup(feed[1])

            # The landing page itself always shows the newest cartoon.
            cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description': ''}
            cartoons.append(cartoon)

            # uclick names the archive drop-down 'url'; creators.com
            # names it 'dest'.
            select = soup.find('select', attrs={'name': ['url', 'dest']})
            if select:
                if select['name'] == 'url':
                    cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest)
                else:
                    cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest)

                for cartoon in cartoonCandidates:
                    cartoons.append(cartoon)

            index.append([feed[0], cartoons])

        return index

    def preprocess_html(self, soup):
        """Reduce a cartoon page to a minimal document containing only the
        title (#name), the image (#comic_full) and the copyright line
        (#copyright), matching ``extra_css``.
        """
        freshSoup = self.getFreshSoup(soup)

        div = soup.find('div', attrs={'id': 'name'})
        if div:
            # uclick/wpcomics layout: the elements already carry the ids
            # we style; copy them over as-is.
            freshSoup.body.append(div)
            comic = soup.find('div', attrs={'id': 'comic_full'})

            img = comic.find('img')
            # Drop the trailing '&...' query arguments so the full-size
            # image is fetched instead of a scaled preview.
            if '&' in img['src']:
                img['src'], sep, bad = img['src'].rpartition('&')

            freshSoup.body.append(comic)
            freshSoup.body.append(soup.find('div', attrs={'id': 'copyright'}))
        else:
            # creators.com layout: rename/re-id its elements into the
            # same structure the CSS expects.
            span = soup.find('span', attrs={'class': 'title'})
            if span:
                del span['class']
                span['id'] = 'name'
                span.name = 'div'
                freshSoup.body.append(span)

            img = soup.find('img', attrs={'class': 'pic_big'})
            if img:
                td = img.parent
                if td.has_key('style'):  # noqa: BS3 Tag API
                    del td['style']
                td.name = 'div'
                td['id'] = 'comic_full'
                freshSoup.body.append(td)

            td = soup.find('td', attrs={'class': 'copy'})
            if td:
                # BUGFIX: was `td.find('a')`, which returns a single tag
                # and iterates its children; findAll removes every
                # navigation link from the copyright cell.
                for a in td.findAll('a'):
                    a.extract()
                del td['class']
                td['id'] = 'copyright'
                td.name = 'div'
                freshSoup.body.append(td)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML skeleton, carrying over only the page title."""
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        if oldSoup.head.title:
            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
        return freshSoup

    def cartoonCandidatesWaPo(self, select, oldest):
        """Yield article dicts from a uclick archive drop-down.

        Option values end in '.../YYYY/MM/DD/'; entries are listed newest
        first, so iteration stops at the first one older than ``oldest``
        (a 'YYYYMMDD' string). Option 0 is the placeholder and is skipped.
        """
        opts = select.findAll('option')
        for i in range(1, len(opts)):
            url = opts[i]['value'].rstrip('/')
            dateparts = url.split('/')[-3:]
            datenum = str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2])
            if datenum >= oldest:
                yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''}
            else:
                return

    def cartoonCandidatesCreatorsCom(self, select, oldest):
        """Yield article dicts from a creators.com archive drop-down.

        Option text looks like 'Monday January 4, 2010'; it is converted
        to a 'YYYYMMDD' string for comparison against ``oldest``. The
        currently-selected option duplicates the landing page and is
        skipped. Entries are newest first, so iteration stops at the
        first one that is too old.
        """
        monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
                      'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
                      'November': '11', 'December': '12'}

        opts = select.findAll('option')
        for i in range(1, len(opts)):
            if opts[i].has_key('selected'):  # noqa: BS3 Tag API
                continue

            dateString = self.tag_to_string(opts[i])
            rest, sep, year = dateString.rpartition(', ')
            parts = rest.split(' ')
            day = parts[2].rjust(2, '0')
            month = monthNames[parts[1]]
            datenum = str(year) + month + str(day)
            if datenum >= oldest:
                yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''}
            else:
                return
|
@ -264,6 +264,11 @@ class EPUBOutput(OutputFormatPlugin):
|
|||||||
if body:
|
if body:
|
||||||
body = body[0]
|
body = body[0]
|
||||||
|
|
||||||
|
# Add id attribute to <a> tags that have name
|
||||||
|
for x in XPath('//h:a[@name]')(body):
|
||||||
|
if not x.get('id', False):
|
||||||
|
x.set('id', x.get('name'))
|
||||||
|
|
||||||
# Replace <br> that are children of <body> as ADE doesn't handle them
|
# Replace <br> that are children of <body> as ADE doesn't handle them
|
||||||
if hasattr(body, 'xpath'):
|
if hasattr(body, 'xpath'):
|
||||||
for br in XPath('./h:br')(body):
|
for br in XPath('./h:br')(body):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user