From 379c3b3465b74b8b44aed196ca4ffe050dd6d74c Mon Sep 17 00:00:00 2001 From: jason Date: Sun, 22 Nov 2009 13:00:45 +0000 Subject: [PATCH 1/3] Add item to UI to config para indent --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 5 +++ src/calibre/ebooks/oeb/transforms/flatcss.py | 2 +- src/calibre/gui2/convert/look_and_feel.py | 4 +- src/calibre/gui2/convert/look_and_feel.ui | 41 +++++++++++++++++--- 5 files changed, 46 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 75c545f8b5..178561fcb5 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -125,7 +125,7 @@ def add_pipeline_options(parser, plumber): 'extra_css', 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'dont_justify', - 'insert_blank_line', 'remove_paragraph_spacing', + 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', 'asciiize', 'remove_header', 'header_regex', 'remove_footer', 'footer_regex', ] diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 2a3dfedd65..262f64a9cc 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -309,6 +309,11 @@ OptionRecommendation(name='remove_paragraph_spacing', 'paragraphs of 1.5em. Spacing removal will not work ' 'if the source file does not use paragraphs (

<p> or <div>

tags).') ), + +OptionRecommendation(name='remove_paragraph_spacing_indent_size', + recommended_value=1.5, level=OptionRecommendation.LOW, + help=_('Width of the indent used with Remove spacing between paragraphs option') + ), OptionRecommendation(name='prefer_metadata_cover', recommended_value=False, level=OptionRecommendation.LOW, diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 464acbe0e0..ffb5364750 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -278,7 +278,7 @@ class CSSFlattener(object): if self.context.insert_blank_line: cssdict['margin-top'] = cssdict['margin-bottom'] = '0.5em' if self.context.remove_paragraph_spacing: - cssdict['text-indent'] = '1.5em' + cssdict['text-indent'] = "%1.1f em" % self.context.remove_paragraph_spacing_indent_size if cssdict: items = cssdict.items() items.sort() diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index a10a410b67..4d43f64910 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -23,7 +23,7 @@ class LookAndFeelWidget(Widget, Ui_Form): 'font_size_mapping', 'line_height', 'linearize_tables', 'disable_font_rescaling', 'insert_blank_line', - 'remove_paragraph_spacing', 'input_encoding', + 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding', 'asciiize'] ) self.db, self.book_id = db, book_id @@ -32,6 +32,8 @@ class LookAndFeelWidget(Widget, Ui_Form): self.opt_disable_font_rescaling.toggle() self.connect(self.button_font_key, SIGNAL('clicked()'), self.font_key_wizard) + self.opt_remove_paragraph_spacing.toggle() + self.opt_remove_paragraph_spacing.toggle() def font_key_wizard(self): from calibre.gui2.convert.font_key import FontKeyChooser diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index d451cd9af0..0161dfcea6 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -127,20 +127,39 @@ - + Remove &spacing between paragraphs - - - + + - Insert &blank line + Indent size: + + + Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter + + + + em + + + 1 + + + + + + + Insert &blank line + + + @@ -216,5 +235,17 @@ + + opt_remove_paragraph_spacing + toggled(bool) + label_4 + setEnabled(bool) + + + opt_remove_paragraph_spacing + toggled(bool) + opt_remove_paragraph_spacing_indent_size + setEnabled(bool) + From bf93536979998441cb936fd065790c16ff1dd4cd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Nov 2009 07:50:11 -0700 Subject: [PATCH 2/3] PDB/PML Input: All new state machine parser, should result in better conversions. 
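A minimal usage sketch of the parser this patch introduces: the entry point remains pml_to_html, now backed by the PML_HTMLizer state machine. The file name, the cp1252 decoding and the close_all value below are illustrative assumptions, not part of the patch.

    # Illustrative only: drives the PML_HTMLizer added below.
    from calibre.ebooks.pml.pmlconverter import pml_to_html

    # eReader PML is normally an 8-bit encoding; cp1252 is assumed here.
    with open('sample.pml', 'rb') as f:
        pml = f.read().decode('cp1252', 'replace')

    # close_all=True closes every open div/span whenever one of them closes,
    # then reopens the survivors, giving more strictly nested HTML.
    html = pml_to_html(pml, close_all=True)
    print(html[:200])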
--- src/calibre/ebooks/pdb/ereader/reader.py | 4 +- src/calibre/ebooks/pml/pmlconverter.py | 441 +++++++++++++++++++---- 2 files changed, 375 insertions(+), 70 deletions(-) diff --git a/src/calibre/ebooks/pdb/ereader/reader.py b/src/calibre/ebooks/pdb/ereader/reader.py index 77ca8d6933..ad1df98793 100644 --- a/src/calibre/ebooks/pdb/ereader/reader.py +++ b/src/calibre/ebooks/pdb/ereader/reader.py @@ -31,5 +31,5 @@ class Reader(FormatReader): def dump_pml(self): return self.reader.dump_pml() - def dump_images(self): - return self.reader.dump_images() + def dump_images(self, out_dir): + return self.reader.dump_images(out_dir) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 1505e5fc4b..05cf488617 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -9,85 +9,390 @@ __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' import re +import StringIO -from calibre import my_unichr +from calibre import my_unichr, prepare_string_for_xml from calibre.ebooks.pdb.ereader import image_name -PML_HTML_RULES = [ - # Any literal <, &, and > chars be escaped to avoid HTML issues (though - # and tags are handled specially later). - (re.compile(r'&'), lambda match: '&'), - (re.compile(r'<'), lambda match: '<'), - (re.compile(r'>'), lambda match: '>'), +class PML_HTMLizer(object): - # NOP-process all \x escapes, turning \\ into \ This keeps the regex - # parsing simple while making sure that we don't try to honor \\x as \x - # (and also makes sure we DO honor \\\x as \ followed by \x). - (re.compile(r'\\(.)'), lambda match: '\' if match.group(1) == '\\' else '\\' + match.group(1)), + STATES = [ + 'i', + 'u', + 'd', + 'b', + 'sp', + 'sb', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'a', + 'c', + 'r', + 't', + 's', + 'l', + 'T', + 'Fn', + 'Sd', + 'FS' + ] - (re.compile(r'\\p'), lambda match: '

'), - (re.compile(r'\\x(?P.*?)\\x', re.DOTALL), lambda match: '

%s

' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\X(?P[0-4])(?P.*?)\\X[0-4]', re.DOTALL), lambda match: '%s' % (int(match.group('val')) + 1, match.group('text'), int(match.group('val')) + 1) if match.group('text') else ''), - (re.compile(r'\\C\d=".+?"'), lambda match: ''), # This should be made to create a TOC entry - (re.compile(r'\\c(?P.*?)\\c', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\r(?P.*?)\\r', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\i(?P.*?)\\i', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\u(?P.*?)\\u', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\o(?P.*?)\\o', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\v(?P.*?)\\v', re.DOTALL), lambda match: '' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\t(?P.*?)\\t', re.DOTALL), lambda match: '
%s
' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\T="(?P\d+)%*"(?P.*?)$', re.MULTILINE), lambda match: r'
%s
' % (match.group('val'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\w="(?P\d+)%"'), lambda match: '
' % match.group('val')), - (re.compile(r'\\n'), lambda match: ''), - (re.compile(r'\\s(?P.*?)\\s', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\b(?P.*?)\\b', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), # \b is deprecated; \B should be used instead. - (re.compile(r'\\l(?P.*?)\\l', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\B(?P.*?)\\B', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\Sp(?P.*?)\\Sp', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\Sb(?P.*?)\\Sb', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), - (re.compile(r'\\k(?P.*?)\\k', re.DOTALL), lambda match: '%s' % match.group('text').upper() if match.group('text') else ''), - (re.compile(r'\\a(?P\d{3})'), lambda match: '&#%s;' % match.group('num')), - (re.compile(r'\\U(?P[0-9a-f]{4})'), lambda match: '%s' % my_unichr(int(match.group('num'), 16))), - (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), - (re.compile(r'\\q="(?P#.+?)"(?P.*?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), - (re.compile(r'\\-'), lambda match: ''), - (re.compile(r'\\Fn="(?P.+?)"(?P.*?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'\\Sd="(?P.+?)"(?P.*?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text')) if match.group('text') else ''), - # Just italicize index items as that is how the eReader software renders them. - (re.compile(r'\\I(?P.*?)\\I', re.DOTALL), lambda match: '%s' % match.group('text') if match.group('text') else ''), + STATES_VALUE_REQ = [ + 'a', + 'T', + 'FS' + ] - # Sidebar and Footnotes - (re.compile(r'<sidebar\s+id="(?P.+?)">\s*(?P.*?)\s*</sidebar>', re.DOTALL), lambda match: '
%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''), - (re.compile(r'<footnote\s+id="(?P.+?)">\s*(?P.*?)\s*</footnote>', re.DOTALL), lambda match: '
%s
' % (match.group('target'), match.group('text')) if match.group('text') else ''), + STATES_TAGS = { + 'h1': ('

', '

'), + 'h2': ('

', '

'), + 'h3': ('

', '

'), + 'h4': ('

', '

'), + 'h5': ('
', '
'), + 'h6': ('
', '
'), + 'sp': ('', ''), + 'sb': ('', ''), + 'a': ('', ''), + 'c': ('
', '
'), + 'r': ('
', '
'), + 't': ('
', '
'), + 'T': ('
', '
'), + 'i': ('', ''), + 'u': ('', ''), + 'd': ('', ''), + 'b': ('', ''), + 's': ('', ''), + 'l': ('', ''), + 'FS': ('
', '
'), + } - # eReader files are one paragraph per line. - # This forces the lines to wrap properly. - (re.compile('^(?P.+)$', re.MULTILINE), lambda match: '

%s

' % match.group('text')), - # Remove empty

's. - (re.compile('

[ ]*

'), lambda match: ''), - # Ensure empty lines carry over. - (re.compile('(\r\n|\n|\r){3}'), lambda match: '
'), + CODE_STATES = { + 'q': 'a', + 'x': 'h1', + 'X0': 'h2', + 'X1': 'h3', + 'X2': 'h4', + 'X3': 'h5', + 'X4': 'h6', + 'Sp': 'sp', + 'Sb': 'sb', + 'c': 'c', + 'r': 'r', + 't': 't', + 'T': 'T', + 'i': 'i', + 'I': 'i', + 'u': 'u', + 'o': 'd', + 'b': 'b', + 'B': 'b', + 's': 's', + 'l': 'l', + 'Fn': 'a', + 'Sd': 'a', + 'FN': 'FS', + 'SB': 'FS', + } - # Try to fix some of the misordering of character-attribute tags. - (re.compile(r'(?P(<(i|u|b|del|sup|sub)( [^>]+)?>)+)(?P()+)'), lambda match: match.group('close') + match.group('ch')), - (re.compile(r'(?P(<(i|u|b|del|sup|sub|span)( [^>]+)?>)+)(?P(<(div|h\d)( [^>]+)?>)+)'), lambda match: match.group('blk') + match.group('ch')), + DIV_STATES = [ + 'c', + 'r', + 't', + 'T', + 'FS', + ] - # Remove unmatched plm codes. - (re.compile(r'\\X[0-4]'), lambda match: ''), - (re.compile(r'\\T="\d+%*"'), lambda match: ''), - (re.compile(r'\\Sp'), lambda match: ''), - (re.compile(r'\\Sb'), lambda match: ''), - # Remove invalid single item pml codes. - (re.compile(r'\\.'), lambda match: ''), -] + SPAN_STATES = [ + 's', + 'l', + 'i', + 'u', + 'd', + 'b', + ] -def pml_to_html(pml): - html = pml - for rule in PML_HTML_RULES: - html = rule[0].sub(rule[1], html) + def __init__(self, close_all): + self.close_all = close_all + self.state = {} - return html + def prepare_pml(self, pml): + # Remove comments + pml = re.sub(r'(?mus)\\v(?P.*?)\\v', '', pml) + # Footnotes and Sidebars + pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*
', lambda match: '\\FN="fns-%s"%s\\FN' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + pml = re.sub(r'(?mus).+?)">\s*(?P.*?)\s*', lambda match: '\\SB="fns-%s"%s\\SB' % (match.group('target'), match.group('text')) if match.group('text') else '', pml) + + pml = prepare_string_for_xml(pml) + + pml = re.sub(r'\\a(?P\d{3})', lambda match: '&#%s;' % match.group('num'), pml) + pml = re.sub(r'\\U(?P[0-9a-f]{4})', lambda match: '%s' % my_unichr(int(match.group('num'), 16)), pml) + + return pml + + def prepare_line(self, line): + line = re.sub(r'[ ]{2,}', ' ', line) + line = re.sub(r'^[ ]*(?=.)', '', line) + line = re.sub(r'(?<=.)[ ]*$', '', line) + line = re.sub(r'^[ ]*$', '', line) + + return line + + def start_line(self): + start = u'' + + for key, val in self.state.items(): + if val[0]: + if key not in self.STATES_VALUE_REQ: + start += self.STATES_TAGS[key][0] + else: + start += self.STATES_TAGS[key][0] % val[1] + + return u'

<p>%s' % start + def end_line(self): + end = u'' + + for key, val in self.state.items(): + if val[0]: + if key == 'T': + self.state['T'][0] = False + end += self.STATES_TAGS[key][1] + + return u'%s</p>

' % end + + def process_code_simple(self, code): + if code not in self.CODE_STATES.keys(): + return u'' + + text = u'' + + if self.state[self.CODE_STATES[code]][0]: + text = self.STATES_TAGS[self.CODE_STATES[code]][1] + else: + text = self.STATES_TAGS[self.CODE_STATES[code]][0] + + self.state[self.CODE_STATES[code]][0] = not self.state[self.CODE_STATES[code]][0] + + return text + + def process_code_link(self, stream, pre=''): + text = u'' + + href = self.code_value(stream) + if pre: + href = '#%s-%s' % (pre, href) + + if self.state['a'][0]: + text = self.STATES_TAGS['a'][1] + else: + text = self.STATES_TAGS['a'][0] % href + self.state['a'][1] = href + + self.state['a'][0] = not self.state['a'][0] + + return text + + def process_code_div_span(self, code, stream): + if self.close_all: + return self.process_code_div_span_call(code, stream) + else: + return self.process_code_div_span_ind(code, stream) + + def process_code_div_span_ind(self, code, stream): + text = u'' + ds = [] + + code = self.CODE_STATES[code] + + if code in self.DIV_STATES: + ds = self.DIV_STATES[:] + elif code in self.SPAN_STATES: + ds = self.SPAN_STATES[:] + + if self.state[code][0]: + # Close all. + for c in ds: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Reopen the based on state. + del ds[ds.index(code)] + for c in ds: + if self.state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text = self.STATES_TAGS[code][0] % val + self.state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + self.state[code][0] = not self.state[code][0] + + return text + + def process_code_div_span_call(self, code, stream): + text = u'' + divs = self.DIV_STATES[:] + spans = self.SPAN_STATES[:] + + code = self.CODE_STATES[code] + + if self.state[code][0]: + # Close all divs then spans. + for c in spans+divs: + if self.state[c][0]: + text += self.STATES_TAGS[c][1] + # Reopen the based on state. 
Open divs then spans + if code in self.DIV_STATES: + del divs[divs.index(code)] + if code in self.SPAN_STATES: + del spans[spans.index(code)] + for c in divs+spans: + if state[c][0]: + if c in self.STATES_VALUE_REQ: + text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] + else: + text += self.STATES_TAGS[c][0] + else: + if code in self.STATES_VALUE_REQ: + val = self.code_value(stream) + text = self.STATES_TAGS[code][0] % val + state[code][1] = val + else: + text = self.STATES_TAGS[code][0] + + self.state[code][0] = not self.state[code][0] + + return text + + def code_value(self, stream): + value = u'' + open = False + + c = stream.read(1) + while c != '': + if open and c != '"': + value += c + if c == '"': + if not open: + open = True + else: + break + c = stream.read(1) + + return value.strip() + + def parse_pml(self, pml): + pml = self.prepare_pml(pml) + output = [] + + self.state = {} + for s in self.STATES: + self.state[s] = [False, '']; + + for line in pml.splitlines(): + if not line: + continue + parsed = [] + empty = True + + # Must use StringIO, cStringIO does not support unicode + line = StringIO.StringIO(self.prepare_line(line)) + parsed.append(self.start_line()) + + c = line.read(1) + while c != '': + text = u'' + + if c == '\\': + c = line.read(1) + + if c == 'x': + text = self.process_code_simple(c) + elif c in 'XS': + l = line.read(1) + if '%s%s' % (c, l) == 'Sd': + text = self.process_code_link(line, 'fns') + elif '%s%s' % (c, l) == 'SB': + text = self.process_code_div_span('SB', line) + else: + text = self.process_code_simple('%s%s' % (c, l)) + elif c == 'q': + text = self.process_code_link(line) + elif c in 'crtTiIuobB': + text = self.process_code_div_span(c, line) + elif c in 'sl': + close = u'' + if c == 's' and self.state['l']: + close = self.process_code_div_span('l', line) + if c == 'l' and self.state['s']: + close = self.process_code_div_span('s', line) + text = self.process_code_div_span(c, line) + text = close+text + elif c == 'm': + empty = False + src = self.code_value(line) + text = '' % image_name(src).strip('\x00') + elif c == 'Q': + empty = False + id = self.code_value(line) + text = '' % id + elif c == 'p': + empty = False + text = '

' + elif c == 'C': + # This should be made to create a TOC entry + line.read(1) + self.code_value(line) + elif c == 'n': + pass + elif c == 'F': + l = line.read(1) + if '%s%s' % (c, l) == 'Fn': + text = self.process_code_link(line, 'fns') + elif '%s%s' % (c, l) == 'FN': + text = self.process_code_div_span('FN', line) + elif c == 'w': + empty = False + text = '<hr width="%s%%" />
' % self.code_value(line) + elif c == '-': + empty = False + text = '­' + elif c == '\\': + empty = False + text = '\\' + else: + if c != ' ': + empty = False + text = c + parsed.append(text) + c = line.read(1) + + if not empty: + text = self.end_line() + parsed.append(text) + output.append(u''.join(parsed)) + line.close() + + return u'\n'.join(output) + + +def pml_to_html(pml, close_all=False): + ''' + close_all will close div all div and span tags when one is closed and then + re-open the appropriate ones. + ''' + + hizer = PML_HTMLizer(close_all) + return hizer.parse_pml(pml) def footnote_sidebar_to_html(id, pml): if id.startswith('\x01'): From 7ba005f3e0516aa02ff2d4d86c08bd02430eddd6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Nov 2009 08:14:34 -0700 Subject: [PATCH 3/3] IGN:... --- resources/recipes/fokkeensukke.recipe | 174 +++++++++++------------ src/calibre/ebooks/conversion/plumber.py | 8 +- src/calibre/ebooks/pml/pmlconverter.py | 4 +- src/calibre/manual/conversion.rst | 2 +- 4 files changed, 95 insertions(+), 93 deletions(-) diff --git a/resources/recipes/fokkeensukke.recipe b/resources/recipes/fokkeensukke.recipe index 5627631770..3ddbe1cfe5 100644 --- a/resources/recipes/fokkeensukke.recipe +++ b/resources/recipes/fokkeensukke.recipe @@ -1,87 +1,87 @@ -#!/usr/bin/python -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag - - -class FokkeEnSukkeRecipe(BasicNewsRecipe) : - __license__ = 'GPL v3' - __author__ = 'kwetal' - language = 'nl' - description = u'Popular Dutch daily cartoon Fokke en Sukke' - - title = u'Fokke en Sukke' - no_stylesheets = True - # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the and not in the . My reader (Sony PRS-600) has a serious issue - # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me. - template_css = '' - INDEX = u'http://foksuk.nl' - - # This cover is not as nice as it could be, needs some work - #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif' - - keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})] - - def parse_index(self) : - # A list with daynames as they _can_ appear in the index - dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag'] - soup = self.index_to_soup(self.INDEX) - - # Find the links for the various cartoons for this week and loop through them - index = soup.find('div', attrs={'class' : 'selectcartoon'}) - links = index.findAll('a') - maxIndex = len(links) - 1 - articles = [] - for i in range(len(links)) : - # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice. - if i == 0 : - continue - - # There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname. - # If there are two, there are three links in sequence: dayname 1 2. In that case we're interested in the last two. - if links[i].renderContents() in dayNames : - # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content - if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') : - # Got you! 
Add it to the list - article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''} - articles.append(article) - # If there is a '1', there should be a '2' as well, but better save than sorry - if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2') : - # Got you! Add it to the list - article = {'title' : links[i].renderContents() + ' 2', 'date' : u'', 'url' : self.INDEX + links[i + 2]['href'], 'description' : ''} - articles.append(article) - else : - # There is only one cartoon for this day. Add it to the list. - article = {'title' : links[i].renderContents(), 'date' : u'', 'url' : self.INDEX + links[i]['href'], 'description' : ''} - articles.append(article) - # Might as well use the weeknumber as title - week = index.find('span', attrs={'class' : 'week'}).renderContents() - - return [[week, articles]] - - def preprocess_html(self, soup) : - # This method is called for every page, be it cartoon or TOC. We need to process each in their own way - cartoon = soup.find('div', attrs={'class' : 'cartoon'}) - if cartoon : - # It is a cartoon. Extract the title. - title = '' - img = soup.find('img', attrs = {'alt' : True}) - if img : - title = img['alt'] - - # Using the 'extra_css' displays it in the and not in the . See comment at the top of this class. Setting the style this way solves that. - tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')]) - tag.insert(0, title) - cartoon.insert(0, tag) - - # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier, - # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook. - select = cartoon.find('div', attrs={'class' : 'selectcartoon'}) - if select : - select.extract() - - return cartoon - else : - # It is a TOC. Just return the whole lot. - return soup - - +#!/usr/bin/python +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag + + +class FokkeEnSukkeRecipe(BasicNewsRecipe) : + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 'nl' + description = u'Popular Dutch daily cartoon Fokke en Sukke' + + title = u'Fokke en Sukke' + no_stylesheets = True + # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the and not in the . My reader (Sony PRS-600) has a serious issue + # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me. + template_css = '' + INDEX = u'http://foksuk.nl' + + # This cover is not as nice as it could be, needs some work + #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif' + + keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})] + + def parse_index(self) : + # A list with daynames as they _can_ appear in the index + dayNames = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag & zondag'] + soup = self.index_to_soup(self.INDEX) + + # Find the links for the various cartoons for this week and loop through them + index = soup.find('div', attrs={'class' : 'selectcartoon'}) + links = index.findAll('a') + maxIndex = len(links) - 1 + articles = [] + for i in range(len(links)) : + # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice. + if i == 0 : + continue + + # There can be more than one cartoon for a given day (currently either one or two). 
If there's only one, there is just a link with the dayname. + # If there are two, there are three links in sequence: dayname 1 2. In that case we're interested in the last two. + if links[i].renderContents() in dayNames : + # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content + if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') : + # Got you! Add it to the list + article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''} + articles.append(article) + # If there is a '1', there should be a '2' as well, but better save than sorry + if (i + 2 <= maxIndex) and (links[i + 2].renderContents() == '2') : + # Got you! Add it to the list + article = {'title' : links[i].renderContents() + ' 2', 'date' : u'', 'url' : self.INDEX + links[i + 2]['href'], 'description' : ''} + articles.append(article) + else : + # There is only one cartoon for this day. Add it to the list. + article = {'title' : links[i].renderContents(), 'date' : u'', 'url' : self.INDEX + links[i]['href'], 'description' : ''} + articles.append(article) + # Might as well use the weeknumber as title + week = index.find('span', attrs={'class' : 'week'}).renderContents() + + return [[week, articles]] + + def preprocess_html(self, soup) : + # This method is called for every page, be it cartoon or TOC. We need to process each in their own way + cartoon = soup.find('div', attrs={'class' : 'cartoon'}) + if cartoon : + # It is a cartoon. Extract the title. + title = '' + img = soup.find('img', attrs = {'alt' : True}) + if img : + title = img['alt'] + + # Using the 'extra_css' displays it in the and not in the . See comment at the top of this class. Setting the style this way solves that. + tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')]) + tag.insert(0, title) + cartoon.insert(0, tag) + + # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier, + # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook. + select = cartoon.find('div', attrs={'class' : 'selectcartoon'}) + if select : + select.extract() + + return cartoon + else : + # It is a TOC. Just return the whole lot. + return soup + + diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 262f64a9cc..30cc42480c 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -309,11 +309,13 @@ OptionRecommendation(name='remove_paragraph_spacing', 'paragraphs of 1.5em. Spacing removal will not work ' 'if the source file does not use paragraphs (

<p> or <div>

tags).') ), - + OptionRecommendation(name='remove_paragraph_spacing_indent_size', recommended_value=1.5, level=OptionRecommendation.LOW, - help=_('Width of the indent used with Remove spacing between paragraphs option') - ), + help=_('When calibre removes inter paragraph spacing, it automatically ' + 'sets a paragraph indent, to ensure that paragraphs can be easily ' + 'distinguished. This option controls the width of that indent.') + ), OptionRecommendation(name='prefer_metadata_cover', recommended_value=False, level=OptionRecommendation.LOW, diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 05cf488617..cb8ae15298 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -256,7 +256,7 @@ class PML_HTMLizer(object): if code in self.SPAN_STATES: del spans[spans.index(code)] for c in divs+spans: - if state[c][0]: + if self.state[c][0]: if c in self.STATES_VALUE_REQ: text += self.STATES_TAGS[self.CODE_STATES[c]][0] % self.state[c][1] else: @@ -265,7 +265,7 @@ class PML_HTMLizer(object): if code in self.STATES_VALUE_REQ: val = self.code_value(stream) text = self.STATES_TAGS[code][0] % val - state[code][1] = val + self.state[code][1] = val else: text = self.STATES_TAGS[code][0] diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 64d8b7b62b..a841b9cf04 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -163,7 +163,7 @@ Paragraph spacing Normally, paragraphs in XHTML are rendered with a blank line between them and no leading text indent. |app| has a couple of options to control this. :guilabel:`Remove spacing between paragraphs` forcefully ensure that all paragraphs have no inter paragraph spacing. It also sets the text -indent to 1.5em (can be changed) to mark that start of every paragraph. +indent to 1.5em (can be changed) to mark the start of every paragraph. :guilabel:`Insert blank line` does the opposite, guaranteeing that there is exactly one blank line between each pair of paragraphs. Both these options are very comprehensive, removing spacing, or inserting it for *all* paragraphs
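To make the new indent option concrete, here is a small stand-alone sketch of the CSS it ends up producing, modelled on the flatcss.py hunk in the first patch; the Context stand-in and the default value are assumptions for illustration. Note that the patch formats the value as "%1.1f em", which yields "1.5 em" with a space before the unit; CSS requires the unit to follow the number directly, so the sketch uses '%.1fem' instead.

    # Sketch only: mirrors the text-indent logic added to CSSFlattener.
    class Context(object):
        # Defaults matching the OptionRecommendation added in plumber.py.
        remove_paragraph_spacing = True
        remove_paragraph_spacing_indent_size = 1.5  # width in em

    def paragraph_css(context):
        cssdict = {}
        if context.remove_paragraph_spacing:
            # '%.1fem' keeps the unit attached to the number, e.g. '1.5em'.
            cssdict['text-indent'] = '%.1fem' % context.remove_paragraph_spacing_indent_size
        return cssdict

    print(paragraph_css(Context()))  # -> {'text-indent': '1.5em'}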