Sync to trunk.

This commit is contained in:
John Schember 2009-11-22 11:31:06 -05:00
commit d396739429
8 changed files with 193 additions and 107 deletions

View File

@ -1,87 +1,87 @@
#!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class FokkeEnSukkeRecipe(BasicNewsRecipe):
    """Recipe that collects this week's Fokke en Sukke cartoons from foksuk.nl.

    The front page carries a ``selectcartoon`` div with one link per day
    (followed by numbered links when a day has more than one cartoon). Each
    linked page holds the cartoon itself inside a ``cartoon`` div.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    description = u'Popular Dutch daily cartoon Fokke en Sukke'

    title = u'Fokke en Sukke'
    no_stylesheets = True
    # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>.
    # My reader (Sony PRS-600) has a serious issue with that: it treats it as content and displays it as is.
    # Setting this property to empty solves this for me.
    template_css = ''

    INDEX = u'http://foksuk.nl'

    # This cover is not as nice as it could be, needs some work
    #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'

    keep_only_tags = [dict(name='div', attrs={'class': 'cartoon'})]

    def _article(self, title, link):
        """Build the article dict for one cartoon from its title and <a> tag."""
        return {'title': title, 'date': u'', 'url': self.INDEX + link['href'], 'description': ''}

    def parse_index(self):
        """Return a single feed holding this week's cartoons.

        The result is ``[[week, articles]]``: ``week`` is the text of the
        ``week`` span in the index (falling back to the recipe title when
        that span is absent) and ``articles`` is a list of article dicts.

        Raises ValueError when the index page has no ``selectcartoon`` div.
        """
        # Day names as they _can_ appear in the index
        day_names = ('maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag')

        soup = self.index_to_soup(self.INDEX)

        # Find the links for the various cartoons for this week
        index = soup.find('div', attrs={'class': 'selectcartoon'})
        if index is None:
            raise ValueError('Could not find the cartoon index on %s' % self.INDEX)
        links = index.findAll('a')

        articles = []
        # The first link does not interest us, as it points to no cartoon, so start at 1.
        for i in range(1, len(links)):
            day = links[i].renderContents()
            if day not in day_names:
                # A numbered link; it was already handled together with its day link.
                continue

            # There can be more than one cartoon for a given day (currently either one or two).
            # If there's only one, there is just a link with the dayname. If there are two, there
            # are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>, and we want the last two.
            # Slicing never raises, so no explicit bounds checks are needed.
            numbered = links[i + 1:i + 3]
            if numbered and numbered[0].renderContents() == '1':
                articles.append(self._article(day + ' 1', numbered[0]))
                # If there is a '1', there should be a '2' as well, but better safe than sorry
                if len(numbered) > 1 and numbered[1].renderContents() == '2':
                    articles.append(self._article(day + ' 2', numbered[1]))
            else:
                # There is only one cartoon for this day.
                articles.append(self._article(day, links[i]))

        # Might as well use the weeknumber as title
        week_span = index.find('span', attrs={'class': 'week'})
        if week_span is not None:
            week = week_span.renderContents()
        else:
            week = self.title

        return [[week, articles]]

    def preprocess_html(self, soup):
        """Reduce a cartoon page to its ``cartoon`` div, with a title on top.

        This method is called for every page, be it cartoon or TOC. A page
        without a ``cartoon`` div is treated as the TOC and returned as is.
        """
        cartoon = soup.find('div', attrs={'class': 'cartoon'})
        if not cartoon:
            # It is a TOC. Just return the whole lot.
            return soup

        # It is a cartoon. Use the alt text of the first image that has one as title.
        title = ''
        img = soup.find('img', attrs={'alt': True})
        if img:
            title = img['alt']

        # Using the 'extra_css' displays it in the <body> and not in the <head> (see the comment on
        # template_css). Setting the style inline on the tag avoids that.
        tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
        tag.insert(0, title)
        cartoon.insert(0, tag)

        # I have not quite worked out why, but we have to throw out this part of the page. It contains
        # the very same index we processed earlier, and Calibre does not like that too much. As far as
        # I can tell it goes into recursion and the result is an empty eBook.
        select = cartoon.find('div', attrs={'class': 'selectcartoon'})
        if select:
            select.extract()

        return cartoon
#!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class FokkeEnSukkeRecipe(BasicNewsRecipe):
    """Recipe that collects this week's Fokke en Sukke cartoons from foksuk.nl.

    The front page carries a ``selectcartoon`` div with one link per day
    (followed by numbered links when a day has more than one cartoon). Each
    linked page holds the cartoon itself inside a ``cartoon`` div.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    description = u'Popular Dutch daily cartoon Fokke en Sukke'

    title = u'Fokke en Sukke'
    no_stylesheets = True
    # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>.
    # My reader (Sony PRS-600) has a serious issue with that: it treats it as content and displays it as is.
    # Setting this property to empty solves this for me.
    template_css = ''

    INDEX = u'http://foksuk.nl'

    # This cover is not as nice as it could be, needs some work
    #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'

    keep_only_tags = [dict(name='div', attrs={'class': 'cartoon'})]

    def _article(self, title, link):
        """Build the article dict for one cartoon from its title and <a> tag."""
        return {'title': title, 'date': u'', 'url': self.INDEX + link['href'], 'description': ''}

    def parse_index(self):
        """Return a single feed holding this week's cartoons.

        The result is ``[[week, articles]]``: ``week`` is the text of the
        ``week`` span in the index (falling back to the recipe title when
        that span is absent) and ``articles`` is a list of article dicts.

        Raises ValueError when the index page has no ``selectcartoon`` div.
        """
        # Day names as they _can_ appear in the index
        day_names = ('maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag')

        soup = self.index_to_soup(self.INDEX)

        # Find the links for the various cartoons for this week
        index = soup.find('div', attrs={'class': 'selectcartoon'})
        if index is None:
            raise ValueError('Could not find the cartoon index on %s' % self.INDEX)
        links = index.findAll('a')

        articles = []
        # The first link does not interest us, as it points to no cartoon, so start at 1.
        for i in range(1, len(links)):
            day = links[i].renderContents()
            if day not in day_names:
                # A numbered link; it was already handled together with its day link.
                continue

            # There can be more than one cartoon for a given day (currently either one or two).
            # If there's only one, there is just a link with the dayname. If there are two, there
            # are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>, and we want the last two.
            # Slicing never raises, so no explicit bounds checks are needed.
            numbered = links[i + 1:i + 3]
            if numbered and numbered[0].renderContents() == '1':
                articles.append(self._article(day + ' 1', numbered[0]))
                # If there is a '1', there should be a '2' as well, but better safe than sorry
                if len(numbered) > 1 and numbered[1].renderContents() == '2':
                    articles.append(self._article(day + ' 2', numbered[1]))
            else:
                # There is only one cartoon for this day.
                articles.append(self._article(day, links[i]))

        # Might as well use the weeknumber as title
        week_span = index.find('span', attrs={'class': 'week'})
        if week_span is not None:
            week = week_span.renderContents()
        else:
            week = self.title

        return [[week, articles]]

    def preprocess_html(self, soup):
        """Reduce a cartoon page to its ``cartoon`` div, with a title on top.

        This method is called for every page, be it cartoon or TOC. A page
        without a ``cartoon`` div is treated as the TOC and returned as is.
        """
        cartoon = soup.find('div', attrs={'class': 'cartoon'})
        if not cartoon:
            # It is a TOC. Just return the whole lot.
            return soup

        # It is a cartoon. Use the alt text of the first image that has one as title.
        title = ''
        img = soup.find('img', attrs={'alt': True})
        if img:
            title = img['alt']

        # Using the 'extra_css' displays it in the <body> and not in the <head> (see the comment on
        # template_css). Setting the style inline on the tag avoids that.
        tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
        tag.insert(0, title)
        cartoon.insert(0, tag)

        # I have not quite worked out why, but we have to throw out this part of the page. It contains
        # the very same index we processed earlier, and Calibre does not like that too much. As far as
        # I can tell it goes into recursion and the result is an empty eBook.
        select = cartoon.find('div', attrs={'class': 'selectcartoon'})
        if select:
            select.extract()

        return cartoon

View File

@ -125,7 +125,7 @@ def add_pipeline_options(parser, plumber):
'extra_css',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'dont_justify',
'insert_blank_line', 'remove_paragraph_spacing',
'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
'asciiize', 'remove_header', 'header_regex',
'remove_footer', 'footer_regex',
]

View File

@ -310,6 +310,13 @@ OptionRecommendation(name='remove_paragraph_spacing',
'if the source file does not use paragraphs (<p> or <div> tags).')
),
OptionRecommendation(name='remove_paragraph_spacing_indent_size',
recommended_value=1.5, level=OptionRecommendation.LOW,
help=_('When calibre removes inter paragraph spacing, it automatically '
'sets a paragraph indent, to ensure that paragraphs can be easily '
'distinguished. This option controls the width of that indent.')
),
OptionRecommendation(name='prefer_metadata_cover',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Use the cover detected from the source file in preference '

View File

@ -278,7 +278,7 @@ class CSSFlattener(object):
if self.context.insert_blank_line:
cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em'
if self.context.remove_paragraph_spacing:
cssdict['text-indent'] = '1.5em'
cssdict['text-indent'] = "%1.1fem" % self.context.remove_paragraph_spacing_indent_size
if cssdict:
items = cssdict.items()
items.sort()

View File

@ -256,7 +256,7 @@ class PML_HTMLizer(object):
if code in self.SPAN_STATES:
del spans[spans.index(code)]
for c in divs+spans:
if state[c][0]:
if self.state[c][0]:
if c in self.STATES_VALUE_REQ:
text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1]
else:
@ -265,7 +265,7 @@ class PML_HTMLizer(object):
if code in self.STATES_VALUE_REQ:
val = self.code_value(stream)
text = self.STATES_TAGS[code][0] % val
state[code][1] = val
self.state[code][1] = val
else:
text = self.STATES_TAGS[code][0]

View File

@ -23,7 +23,7 @@ class LookAndFeelWidget(Widget, Ui_Form):
'font_size_mapping', 'line_height',
'linearize_tables',
'disable_font_rescaling', 'insert_blank_line',
'remove_paragraph_spacing', 'input_encoding',
'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding',
'asciiize']
)
self.db, self.book_id = db, book_id
@ -32,6 +32,8 @@ class LookAndFeelWidget(Widget, Ui_Form):
self.opt_disable_font_rescaling.toggle()
self.connect(self.button_font_key, SIGNAL('clicked()'),
self.font_key_wizard)
self.opt_remove_paragraph_spacing.toggle()
self.opt_remove_paragraph_spacing.toggle()
def font_key_wizard(self):
from calibre.gui2.convert.font_key import FontKeyChooser

View File

@ -14,7 +14,7 @@
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0" colspan="2">
<item row="0" column="0">
<widget class="QCheckBox" name="opt_disable_font_rescaling">
<property name="text">
<string>&amp;Disable font size rescaling</string>
@ -31,7 +31,7 @@
</property>
</widget>
</item>
<item row="1" column="3">
<item row="1" column="2">
<widget class="QDoubleSpinBox" name="opt_base_font_size">
<property name="suffix">
<string> pt</string>
@ -63,7 +63,7 @@
</property>
</widget>
</item>
<item row="2" column="2" colspan="2">
<item row="2" column="1" colspan="2">
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLineEdit" name="opt_font_size_mapping">
@ -107,7 +107,7 @@
</property>
</widget>
</item>
<item row="3" column="3">
<item row="3" column="2">
<widget class="QDoubleSpinBox" name="opt_line_height">
<property name="suffix">
<string> pt</string>
@ -127,12 +127,59 @@
</property>
</widget>
</item>
<item row="4" column="1" colspan="2">
<widget class="QLineEdit" name="opt_input_encoding"/>
</item>
<item row="5" column="0" colspan="3">
<widget class="QCheckBox" name="opt_remove_paragraph_spacing">
<property name="text">
<string>Remove &amp;spacing between paragraphs</string>
</property>
</widget>
<layout class="QHBoxLayout" name="horizontalLayout_3">
<item>
<widget class="QCheckBox" name="opt_remove_paragraph_spacing">
<property name="text">
<string>Remove &amp;spacing between paragraphs</string>
</property>
</widget>
</item>
<item>
<spacer name="horizontalSpacer">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>40</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
<item>
<layout class="QHBoxLayout" name="horizontalLayout_2">
<item>
<widget class="QLabel" name="label_4">
<property name="text">
<string>Indent size:</string>
</property>
<property name="alignment">
<set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
</property>
</widget>
</item>
<item>
<widget class="QDoubleSpinBox" name="opt_remove_paragraph_spacing_indent_size">
<property name="toolTip">
<string>&lt;p&gt;When calibre removes inter paragraph spacing, it automatically sets a paragraph indent, to ensure that paragraphs can be easily distinguished. This option controls the width of that indent.</string>
</property>
<property name="suffix">
<string> em</string>
</property>
<property name="decimals">
<number>1</number>
</property>
</widget>
</item>
</layout>
</item>
</layout>
</item>
<item row="6" column="0">
<widget class="QCheckBox" name="opt_insert_blank_line">
@ -155,14 +202,14 @@
</property>
</widget>
</item>
<item row="9" column="0" colspan="3">
<item row="9" column="0">
<widget class="QCheckBox" name="opt_asciiize">
<property name="text">
<string>&amp;Transliterate unicode characters to ASCII.</string>
</property>
</widget>
</item>
<item row="10" column="0" colspan="4">
<item row="10" column="0" colspan="3">
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Extra &amp;CSS</string>
@ -174,9 +221,6 @@
</layout>
</widget>
</item>
<item row="4" column="2" colspan="2">
<widget class="QLineEdit" name="opt_input_encoding"/>
</item>
</layout>
</widget>
<resources>
@ -216,5 +260,37 @@
</hint>
</hints>
</connection>
<connection>
<sender>opt_remove_paragraph_spacing</sender>
<signal>toggled(bool)</signal>
<receiver>label_4</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>20</x>
<y>20</y>
</hint>
<hint type="destinationlabel">
<x>20</x>
<y>20</y>
</hint>
</hints>
</connection>
<connection>
<sender>opt_remove_paragraph_spacing</sender>
<signal>toggled(bool)</signal>
<receiver>opt_remove_paragraph_spacing_indent_size</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>20</x>
<y>20</y>
</hint>
<hint type="destinationlabel">
<x>20</x>
<y>20</y>
</hint>
</hints>
</connection>
</connections>
</ui>

View File

@ -163,7 +163,8 @@ Paragraph spacing
Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text
indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs`
forcefully ensures that all paragraphs have no inter paragraph spacing. It also sets the text
indent to 1.5em to mark that start of every paragraph. :guilabel:`Insert blank line` does the
indent to 1.5em (can be changed) to mark the start of every paragraph.
:guilabel:`Insert blank line` does the
opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs.
Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs
(technically <p> and <div> tags). This is so that you can just set the option and be sure that