diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index d8cee34e2d..765ff8eed5 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -293,12 +293,13 @@ class HTMLConverter(object): if self.pseudo_css.has_key(tagname): pprop.update(self.pseudo_css[tagname]) if tag.has_key("class"): - cls = tag["class"].lower() - for classname in ["."+cls, tagname+"."+cls]: - if self.css.has_key(classname): - prop.update(self.css[classname]) - if self.pseudo_css.has_key(classname): - pprop.update(self.pseudo_css[classname]) + cls = tag["class"].lower() + for cls in cls.split(): + for classname in ["."+cls, tagname+"."+cls]: + if self.css.has_key(classname): + prop.update(self.css[classname]) + if self.pseudo_css.has_key(classname): + pprop.update(self.pseudo_css[classname]) if tag.has_key("style"): prop.update(self.parse_style_properties(tag["style"])) return prop, pprop @@ -625,7 +626,6 @@ class HTMLConverter(object): unneeded.append(prop) for prop in unneeded: fp.pop(prop) - elem = Span(text=src, **fp) if (fp or force_span_use) else src self.current_para.append(elem) @@ -651,24 +651,25 @@ class HTMLConverter(object): def end_current_para(self): ''' - End current paragraph with a paragraph break after it. If the current - paragraph has no non whitespace text in it do nothing. + End current paragraph with a paragraph break after it. + ''' + if self.current_para.contents: + self.current_block.append(self.current_para) + self.current_block.append(CR()) + self.current_para = Paragraph() + + def end_current_block(self): + ''' + End current TextBlock. Create new TextBlock with the same styles. ''' - if not self.current_para.has_text(): - return if self.current_para.contents: self.current_block.append(self.current_para) self.current_para = Paragraph() - if self.current_block.contents and \ - not isinstance(self.current_block.contents[-1], CR): - self.current_block.append(CR()) - - def end_current_block(self): - self.current_para.append_to(self.current_block) - self.current_block.append_to(self.current_page) - self.current_para = Paragraph() - self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle, + if self.current_block.contents or self.current_block.must_append: + self.current_page.append(self.current_block) + self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle, blockStyle=self.current_block.blockStyle) + def process_image(self, path, tag_css, width=None, height=None, dropcaps=False): original_path = path @@ -1033,7 +1034,7 @@ class HTMLConverter(object): return fp - def process_block(self, tag, tag_css, tkey): + def process_block(self, tag, tag_css): ''' Ensure padding and text-indent properties are respected ''' text_properties = self.text_properties(tag_css) block_properties = self.block_properties(tag_css) @@ -1057,7 +1058,8 @@ class HTMLConverter(object): self.block_styles.append(bs) self.current_block = self.book.create_text_block(blockStyle=bs, textStyle=ts) - self.targets[tkey] = self.current_block + return True + return False def parse_tag(self, tag, parent_css): try: @@ -1298,20 +1300,13 @@ class HTMLConverter(object): self.current_para.append(elem(text)) elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - tkey = None - if self.anchor_ids and tag.has_key('id'): - target = self.book.create_text_block(textStyle=self.current_block.textStyle, - blockStyle=self.current_block.blockStyle) + new_block = self.process_block(tag, tag_css) + if self.anchor_ids and tag.has_key('id'): tkey = self.target_prefix+tag['id'] - self.targets[tkey] = target - - if len(self.current_block.contents) > 2: + if not new_block: self.end_current_block() - self.current_page.append(target) - self.unused_target_blocks.append(target) - else: - self.targets[tkey] = self.current_block - self.current_block.must_append = True + self.current_block.must_append = True + self.targets[tkey] = self.current_block src = self.get_text(tag, limit=1000) if not self.disable_chapter_detection and tagname.startswith('h'): if self.chapter_regex.search(src): @@ -1320,18 +1315,16 @@ class HTMLConverter(object): self.page_break_found = True if not tag.contents: self.current_block.append(CR()) - self.current_block.must_append = True return - self.process_block(tag, tag_css, tkey) - if self.current_para.contents: - self.current_block.append(self.current_para) + + if self.current_para.has_text(): + self.current_para.append_to(self.current_block) if self.current_block.contents: self.current_block.append(CR()) self.previous_text = '\n' self.current_para = Paragraph() - self.process_children(tag, tag_css, tag_pseudo_css) - if self.current_para.contents: + if self.current_para.contents : self.current_block.append(self.current_para) self.current_para = Paragraph() if tagname.startswith('h') or self.blank_after_para: @@ -1346,25 +1339,24 @@ class HTMLConverter(object): self.line_break() self.previous_text = '\n' elif tagname in ['hr', 'tr']: # tr needed for nested tables - self.end_current_para() - self.line_break() self.end_current_block() if tagname == 'hr': self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth'])) self.previous_text = '\n' self.process_children(tag, tag_css, tag_pseudo_css) elif tagname == 'td': # Needed for nested tables - self.current_para.append(' ') - self.previous_text = ' ' + if not self.in_table: + self.current_para.append(' ') + self.previous_text = ' ' self.process_children(tag, tag_css, tag_pseudo_css) elif tagname == 'table' and not self.ignore_tables and not self.in_table: tag_css = self.tag_css(tag)[0] # Table should not inherit CSS try: self.process_table(tag, tag_css) except Exception, err: - self.logger.warning('An error occurred while processing a table: %s', str(err)) + self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err)) self.logger.debug('', exc_info=True) - self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300]) + self.logger.debug('Bad table:\n%s', str(tag)[:300]) self.in_table = False self.process_children(tag, tag_css, tag_pseudo_css) finally: @@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None): fheader = re.sub(r'%%a','%a',fheader) fheader = re.sub(r'%%t','%t',fheader) header.append(fheader + " ") - book, fonts = Book(options, header=header, **args) + book, fonts = Book(options, logger, header=header, **args) le = re.compile(options.link_exclude) if options.link_exclude else \ re.compile('$') pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \ diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html index d0142ce120..3085c6a6fe 100644 --- a/src/libprs500/ebooks/lrf/html/demo/demo.html +++ b/src/libprs500/ebooks/lrf/html/demo/demo.html @@ -14,7 +14,7 @@ This file contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from libprs500. To obtain libprs500 visit
https://libprs500.kovidgoyal.net


-

Table of Contents

+

Table of Contents

-

Lists

+

Lists

Nested lists

    @@ -54,8 +54,6 @@

    Tables

    -
    - @@ -67,29 +65,27 @@

    html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells.

    -

    Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.

    -

    On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.

    Sample Complex Table of Contents

    A matrix

    Column 1Column 2Column 3
    - - - - - - - - - - - - + + + + + + + + + + + +
     PAGE
    Prefacev
    List of Works of Referencevii
    List of Illustrationsxi
    ChapterI.History of the Foundation3
    II.Exterior of the Church25
    III.Interior of the Church33
    IV.St. Bartholomew-the-Less and the Hospital63
    AppendixI.The Priory Seals73
    II.The Priors and Rectors77
    III.Inventory of Vestments, etc.79
    IV.The Organ80
    Index83
    Prefacev
    List of Works of Referencevii
    List of Illustrationsxi
    ChapterI.History of the Foundation3
    II.Exterior of the Church25
    III.Interior of the Church33
    IV.St. Bartholomew-the-Less and the Hospital63
    AppendixI.The Priory Seals73
    II.The Priors and Rectors77
    III.Inventory of Vestments, etc.79
    IV.The Organ80
    Index83

    @@ -120,8 +116,7 @@


    A very indented paragraph

    An unindented paragraph

    -

    A default indented paragraph


    -
    +

    A default indented paragraph


    Table of Contents diff --git a/src/libprs500/ebooks/lrf/html/table.py b/src/libprs500/ebooks/lrf/html/table.py index c63b1dfeba..f21dc6f3b6 100644 --- a/src/libprs500/ebooks/lrf/html/table.py +++ b/src/libprs500/ebooks/lrf/html/table.py @@ -140,7 +140,7 @@ class Cell(object): ts = tb.textStyle.attrs default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize'])) parindent = self.pts_to_pixels(ts['parindent']) - + mwidth = 0 for token, attrs in tokens(tb): font = default_font if isinstance(token, int): # Handle para and line breaks @@ -155,8 +155,10 @@ class Cell(object): continue word = token.split() word = word[0] if word else "" - width, height = font.getsize(word) - return parindent + width + 2 + width = font.getsize(word)[0] + if width > mwidth: + mwidth = width + return parindent + mwidth + 2 def text_block_size(self, tb, maxwidth=sys.maxint, debug=False): ts = tb.textStyle.attrs @@ -338,7 +340,7 @@ class Table(object): adjustable_columns.append(i) itercount = 0 - min_widths = [self.minimum_width(i) for i in xrange(cols)] + min_widths = [self.minimum_width(i)+10 for i in xrange(cols)] while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100: for i in adjustable_columns: widths[i] = ceil((95./100.)*widths[i]) if \ diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py index a3e05acd0e..9f7302e8c1 100644 --- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py +++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py @@ -249,7 +249,7 @@ class LrsContainer(object): self.parent = None self.contents = [] self.validChildren = validChildren - self.must_append = False + self.must_append = False #: If True even an empty container is appended by append_to def has_text(self): ''' Return True iff this container has non whitespace text ''' @@ -261,7 +261,7 @@ class LrsContainer(object): if child.has_text(): return True for item in self.contents: - if isinstance(item, (Plot, ImageBlock, Canvas)): + if isinstance(item, (Plot, ImageBlock, Canvas, CR)): return True return False @@ -270,7 +270,7 @@ class LrsContainer(object): Append self to C{parent} iff self has non whitespace textual content @type parent: LrsContainer ''' - if self.has_text() or self.must_append: + if self.contents or self.must_append: parent.append(self)