mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:...
This commit is contained in:
parent
2fe75e21af
commit
7ba005f3e0
@ -1,87 +1,87 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ebooks.BeautifulSoup import Tag
|
from calibre.ebooks.BeautifulSoup import Tag
|
||||||
|
|
||||||
|
|
||||||
class FokkeEnSukkeRecipe(BasicNewsRecipe) :
|
class FokkeEnSukkeRecipe(BasicNewsRecipe) :
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__author__ = 'kwetal'
|
__author__ = 'kwetal'
|
||||||
language = 'nl'
|
language = 'nl'
|
||||||
description = u'Popular Dutch daily cartoon Fokke en Sukke'
|
description = u'Popular Dutch daily cartoon Fokke en Sukke'
|
||||||
|
|
||||||
title = u'Fokke en Sukke'
|
title = u'Fokke en Sukke'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
# For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>. My reader (Sony PRS-600) has a serious issue
|
# For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>. My reader (Sony PRS-600) has a serious issue
|
||||||
# with that: it treats it as content and displays it as is. Setting this property to empty solves this for me.
|
# with that: it treats it as content and displays it as is. Setting this property to empty solves this for me.
|
||||||
template_css = ''
|
template_css = ''
|
||||||
INDEX = u'http://foksuk.nl'
|
INDEX = u'http://foksuk.nl'
|
||||||
|
|
||||||
# This cover is not as nice as it could be, needs some work
|
# This cover is not as nice as it could be, needs some work
|
||||||
#cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
|
#cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
|
||||||
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})]
|
keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})]
|
||||||
|
|
||||||
def parse_index(self) :
|
def parse_index(self) :
|
||||||
# A list with daynames as they _can_ appear in the index
|
# A list with daynames as they _can_ appear in the index
|
||||||
dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag']
|
dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag']
|
||||||
soup = self.index_to_soup(self.INDEX)
|
soup = self.index_to_soup(self.INDEX)
|
||||||
|
|
||||||
# Find the links for the various cartoons for this week and loop through them
|
# Find the links for the various cartoons for this week and loop through them
|
||||||
index = soup.find('div', attrs={'class' : 'selectcartoon'})
|
index = soup.find('div', attrs={'class' : 'selectcartoon'})
|
||||||
links = index.findAll('a')
|
links = index.findAll('a')
|
||||||
maxIndex = len(links) - 1
|
maxIndex = len(links) - 1
|
||||||
articles = []
|
articles = []
|
||||||
for i in range(len(links)) :
|
for i in range(len(links)) :
|
||||||
# The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice.
|
# The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice.
|
||||||
if i == 0 :
|
if i == 0 :
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname.
|
# There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname.
|
||||||
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're interested in the last two.
|
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're interested in the last two.
|
||||||
if links[i].renderContents() in dayNames :
|
if links[i].renderContents() in dayNames :
|
||||||
# If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content
|
# If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content
|
||||||
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') :
|
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') :
|
||||||
# Got you! Add it to the list
|
# Got you! Add it to the list
|
||||||
article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''}
|
article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''}
|
||||||
articles.append(article)
|
articles.append(article)
|
||||||
# If there is a '1', there should be a '2' as well, but better safe than sorry
|
# If there is a '1', there should be a '2' as well, but better safe than sorry
|
||||||
if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2') :
|
if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2') :
|
||||||
# Got you! Add it to the list
|
# Got you! Add it to the list
|
||||||
article = {'title' : links[i].renderContents() + ' 2', 'date' : u'', 'url' : self.INDEX + links[i + 2]['href'], 'description' : ''}
|
article = {'title' : links[i].renderContents() + ' 2', 'date' : u'', 'url' : self.INDEX + links[i + 2]['href'], 'description' : ''}
|
||||||
articles.append(article)
|
articles.append(article)
|
||||||
else :
|
else :
|
||||||
# There is only one cartoon for this day. Add it to the list.
|
# There is only one cartoon for this day. Add it to the list.
|
||||||
article = {'title' : links[i].renderContents(), 'date' : u'', 'url' : self.INDEX + links[i]['href'], 'description' : ''}
|
article = {'title' : links[i].renderContents(), 'date' : u'', 'url' : self.INDEX + links[i]['href'], 'description' : ''}
|
||||||
articles.append(article)
|
articles.append(article)
|
||||||
# Might as well use the weeknumber as title
|
# Might as well use the weeknumber as title
|
||||||
week = index.find('span', attrs={'class' : 'week'}).renderContents()
|
week = index.find('span', attrs={'class' : 'week'}).renderContents()
|
||||||
|
|
||||||
return [[week, articles]]
|
return [[week, articles]]
|
||||||
|
|
||||||
def preprocess_html(self, soup) :
|
def preprocess_html(self, soup) :
|
||||||
# This method is called for every page, be it cartoon or TOC. We need to process each in its own way
|
# This method is called for every page, be it cartoon or TOC. We need to process each in its own way
|
||||||
cartoon = soup.find('div', attrs={'class' : 'cartoon'})
|
cartoon = soup.find('div', attrs={'class' : 'cartoon'})
|
||||||
if cartoon :
|
if cartoon :
|
||||||
# It is a cartoon. Extract the title.
|
# It is a cartoon. Extract the title.
|
||||||
title = ''
|
title = ''
|
||||||
img = soup.find('img', attrs = {'alt' : True})
|
img = soup.find('img', attrs = {'alt' : True})
|
||||||
if img :
|
if img :
|
||||||
title = img['alt']
|
title = img['alt']
|
||||||
|
|
||||||
# Using the 'extra_css' displays it in the <body> and not in the <head>. See comment at the top of this class. Setting the style this way solves that.
|
# Using the 'extra_css' displays it in the <body> and not in the <head>. See comment at the top of this class. Setting the style this way solves that.
|
||||||
tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
|
tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
|
||||||
tag.insert(0, title)
|
tag.insert(0, title)
|
||||||
cartoon.insert(0, tag)
|
cartoon.insert(0, tag)
|
||||||
|
|
||||||
# I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier,
|
# I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier,
|
||||||
# and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook.
|
# and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook.
|
||||||
select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
|
select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
|
||||||
if select :
|
if select :
|
||||||
select.extract()
|
select.extract()
|
||||||
|
|
||||||
return cartoon
|
return cartoon
|
||||||
else :
|
else :
|
||||||
# It is a TOC. Just return the whole lot.
|
# It is a TOC. Just return the whole lot.
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
@ -309,11 +309,13 @@ OptionRecommendation(name='remove_paragraph_spacing',
|
|||||||
'paragraphs of 1.5em. Spacing removal will not work '
|
'paragraphs of 1.5em. Spacing removal will not work '
|
||||||
'if the source file does not use paragraphs (<p> or <div> tags).')
|
'if the source file does not use paragraphs (<p> or <div> tags).')
|
||||||
),
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='remove_paragraph_spacing_indent_size',
|
OptionRecommendation(name='remove_paragraph_spacing_indent_size',
|
||||||
recommended_value=1.5, level=OptionRecommendation.LOW,
|
recommended_value=1.5, level=OptionRecommendation.LOW,
|
||||||
help=_('Width of the indent used with Remove spacing between paragraphs option')
|
help=_('When calibre removes inter paragraph spacing, it automatically '
|
||||||
),
|
'sets a paragraph indent, to ensure that paragraphs can be easily '
|
||||||
|
'distinguished. This option controls the width of that indent.')
|
||||||
|
),
|
||||||
|
|
||||||
OptionRecommendation(name='prefer_metadata_cover',
|
OptionRecommendation(name='prefer_metadata_cover',
|
||||||
recommended_value=False, level=OptionRecommendation.LOW,
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
@ -256,7 +256,7 @@ class PML_HTMLizer(object):
|
|||||||
if code in self.SPAN_STATES:
|
if code in self.SPAN_STATES:
|
||||||
del spans[spans.index(code)]
|
del spans[spans.index(code)]
|
||||||
for c in divs+spans:
|
for c in divs+spans:
|
||||||
if state[c][0]:
|
if self.state[c][0]:
|
||||||
if c in self.STATES_VALUE_REQ:
|
if c in self.STATES_VALUE_REQ:
|
||||||
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
|
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
|
||||||
else:
|
else:
|
||||||
@ -265,7 +265,7 @@ class PML_HTMLizer(object):
|
|||||||
if code in self.STATES_VALUE_REQ:
|
if code in self.STATES_VALUE_REQ:
|
||||||
val = self.code_value(stream)
|
val = self.code_value(stream)
|
||||||
text = self.STATES_TAGS[code][0] % val
|
text = self.STATES_TAGS[code][0] % val
|
||||||
state[code][1] = val
|
self.state[code][1] = val
|
||||||
else:
|
else:
|
||||||
text = self.STATES_TAGS[code][0]
|
text = self.STATES_TAGS[code][0]
|
||||||
|
|
||||||
|
@ -163,7 +163,7 @@ Paragraph spacing
|
|||||||
Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text
|
Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text
|
||||||
indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs`
|
indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs`
|
||||||
forcefully ensures that all paragraphs have no inter-paragraph spacing. It also sets the text
|
forcefully ensures that all paragraphs have no inter-paragraph spacing. It also sets the text
|
||||||
indent to 1.5em (can be changed) to mark that start of every paragraph.
|
indent to 1.5em (can be changed) to mark the start of every paragraph.
|
||||||
:guilabel:`Insert blank line` does the
|
:guilabel:`Insert blank line` does the
|
||||||
opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs.
|
opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs.
|
||||||
Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs
|
Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs
|
||||||
|
Loading…
x
Reference in New Issue
Block a user