This commit is contained in:
Kovid Goyal 2009-11-22 08:14:34 -07:00
parent 2fe75e21af
commit 7ba005f3e0
4 changed files with 95 additions and 93 deletions

View File

@ -1,87 +1,87 @@
#!/usr/bin/python #!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag from calibre.ebooks.BeautifulSoup import Tag
class FokkeEnSukkeRecipe(BasicNewsRecipe):
    """Recipe that collects this week's 'Fokke en Sukke' cartoons from foksuk.nl.

    parse_index() reads the week selector on the front page and returns a
    single feed holding one article per cartoon; preprocess_html() reduces
    every cartoon page to the cartoon itself with its title above it.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    description = u'Popular Dutch daily cartoon Fokke en Sukke'

    title = u'Fokke en Sukke'
    no_stylesheets = True
    # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not
    # in the <head>. My reader (Sony PRS-600) has a serious issue with that: it treats it as content
    # and displays it as is. Setting this property to empty solves this for me.
    template_css = ''

    INDEX = u'http://foksuk.nl'

    # This cover is not as nice as it could be, needs some work
    #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'

    keep_only_tags = [dict(name='div', attrs={'class': 'cartoon'})]

    def _cartoon_article(self, title, link):
        """Build the article dict for one cartoon from its title and its <a> tag."""
        return {'title': title, 'date': u'', 'url': self.INDEX + link['href'], 'description': ''}

    def parse_index(self):
        """Return a one-feed index: [[week label, list of cartoon article dicts]].

        Raises ValueError when the front page has no 'selectcartoon' div,
        because without it no cartoon links can be located.
        """
        # A list with daynames as they _can_ appear in the index
        dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag']
        soup = self.index_to_soup(self.INDEX)

        # Find the links for the various cartoons for this week
        index = soup.find('div', attrs={'class': 'selectcartoon'})
        if index is None:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError('Could not find the cartoon index on ' + self.INDEX)
        links = index.findAll('a')
        maxIndex = len(links) - 1

        articles = []
        # The first link does not interest us, as it points to no cartoon, so start at 1.
        for i in range(1, len(links)):
            day = links[i].renderContents()
            # There can be more than one cartoon for a given day (currently either one or two).
            # If there's only one, there is just a link with the dayname. If there are two, there
            # are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're
            # interested in the last two. Links that are not daynames are handled together with
            # the dayname link that precedes them, so they are skipped here.
            if day not in dayNames:
                continue

            if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1'):
                articles.append(self._cartoon_article(day + ' 1', links[i + 1]))
                # If there is a '1', there should be a '2' as well, but better safe than sorry
                if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2'):
                    articles.append(self._cartoon_article(day + ' 2', links[i + 2]))
            else:
                # There is only one cartoon for this day.
                articles.append(self._cartoon_article(day, links[i]))

        # Might as well use the weeknumber as title; fall back to the recipe title if it is missing
        weekSpan = index.find('span', attrs={'class': 'week'})
        if weekSpan is not None:
            week = weekSpan.renderContents()
        else:
            week = self.title

        return [[week, articles]]

    def preprocess_html(self, soup):
        """Return the cartoon div (title prepended, week selector removed) or, for a TOC, the soup.

        This method is called for every page, be it cartoon or TOC, so each
        kind has to be processed in its own way.
        """
        cartoon = soup.find('div', attrs={'class': 'cartoon'})
        if not cartoon:
            # It is a TOC. Just return the whole lot.
            return soup

        # It is a cartoon. Extract the title from the first image that has an alt text.
        title = ''
        img = soup.find('img', attrs={'alt': True})
        if img:
            title = img['alt']

        # Using the 'extra_css' displays it in the <body> and not in the <head>. See the comment on
        # template_css. Setting the style inline on the tag avoids that.
        tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
        tag.insert(0, title)
        cartoon.insert(0, tag)

        # I have not quite worked out why, but we have to throw out this part of the page. It
        # contains the very same index we processed earlier, and Calibre does not like that too
        # much. As far as I can tell it goes into recursion and the result is an empty eBook.
        select = cartoon.find('div', attrs={'class': 'selectcartoon'})
        if select:
            select.extract()

        return cartoon

View File

@ -309,11 +309,13 @@ OptionRecommendation(name='remove_paragraph_spacing',
'paragraphs of 1.5em. Spacing removal will not work ' 'paragraphs of 1.5em. Spacing removal will not work '
'if the source file does not use paragraphs (<p> or <div> tags).') 'if the source file does not use paragraphs (<p> or <div> tags).')
), ),
OptionRecommendation(name='remove_paragraph_spacing_indent_size', OptionRecommendation(name='remove_paragraph_spacing_indent_size',
recommended_value=1.5, level=OptionRecommendation.LOW, recommended_value=1.5, level=OptionRecommendation.LOW,
help=_('Width of the indent used with Remove spacing between paragraphs option') help=_('When calibre removes inter paragraph spacing, it automatically '
), 'sets a paragraph indent, to ensure that paragraphs can be easily '
'distinguished. This option controls the width of that indent.')
),
OptionRecommendation(name='prefer_metadata_cover', OptionRecommendation(name='prefer_metadata_cover',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,

View File

@ -256,7 +256,7 @@ class PML_HTMLizer(object):
if code in self.SPAN_STATES: if code in self.SPAN_STATES:
del spans[spans.index(code)] del spans[spans.index(code)]
for c in divs+spans: for c in divs+spans:
if state[c][0]: if self.state[c][0]:
if c in self.STATES_VALUE_REQ: if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
else: else:
@ -265,7 +265,7 @@ class PML_HTMLizer(object):
if code in self.STATES_VALUE_REQ: if code in self.STATES_VALUE_REQ:
val = self.code_value(stream) val = self.code_value(stream)
text = self.STATES_TAGS[code][0] % val text = self.STATES_TAGS[code][0] % val
state[code][1] = val self.state[code][1] = val
else: else:
text = self.STATES_TAGS[code][0] text = self.STATES_TAGS[code][0]

View File

@ -163,7 +163,7 @@ Paragraph spacing
Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text
indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs` indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs`
forcefully ensure that all paragraphs have no inter paragraph spacing. It also sets the text forcefully ensure that all paragraphs have no inter paragraph spacing. It also sets the text
indent to 1.5em (can be changed) to mark that start of every paragraph. indent to 1.5em (can be changed) to mark the start of every paragraph.
:guilabel:`Insert blank line` does the :guilabel:`Insert blank line` does the
opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs. opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs.
Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs