From 3a5663f9dfa4466670f710f82b376eceb2c7b29d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 20 Jun 2015 15:27:24 +0530 Subject: [PATCH] Edit Book: Check Book: Add a check for text that is placed directly inside the body tag, without any surrounding tags --- src/calibre/ebooks/oeb/polish/check/main.py | 3 +- .../ebooks/oeb/polish/check/parsing.py | 50 ++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py index bc135621e2..bdb54a99e5 100644 --- a/src/calibre/ebooks/oeb/polish/check/main.py +++ b/src/calibre/ebooks/oeb/polish/check/main.py @@ -14,7 +14,7 @@ from calibre.ebooks.oeb.polish.cover import is_raster_image from calibre.ebooks.oeb.polish.check.base import run_checkers, WARN from calibre.ebooks.oeb.polish.check.parsing import ( check_filenames, check_xml_parsing, check_css_parsing, fix_style_tag, - check_html_size, check_ids, EmptyFile, check_encoding_declarations) + check_html_size, check_ids, check_markup, EmptyFile, check_encoding_declarations) from calibre.ebooks.oeb.polish.check.images import check_raster_images from calibre.ebooks.oeb.polish.check.links import check_links, check_mimetypes, check_link_destinations from calibre.ebooks.oeb.polish.check.fonts import check_fonts @@ -76,6 +76,7 @@ def run_checks(container): errors += check_fonts(container) errors += check_filenames(container) errors += check_ids(container) + errors += check_markup(container) errors += check_opf(container) return errors diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py index 9d4d61b630..076741a597 100644 --- a/src/calibre/ebooks/oeb/polish/check/parsing.py +++ b/src/calibre/ebooks/oeb/polish/check/parsing.py @@ -17,7 +17,7 @@ from calibre.ebooks.html_entities import html5_entities from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type from calibre.ebooks.oeb.polish.check.base import BaseError, WARN, ERROR, INFO -from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE +from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_NS, urlquote, URL_SAFE, XHTML HTML_ENTITTIES = frozenset(html5_entities) XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'} @@ -328,6 +328,38 @@ class DuplicateId(BaseError): container.dirty(self.name) return True +class BareTextInBody(BaseError): + + INDIVIDUAL_FIX = _('Wrap the bare text in a p tag') + HELP = _('You cannot have bare text inside the body tag. The text must be placed inside some other tag, such as p or div') + has_multiple_locations = True + + def __init__(self, name, lines): + BaseError.__init__(self, _('Bare text in body tag'), name) + self.all_locations = [(name, l, None) for l in sorted(lines)] + + def __call__(self, container): + root = container.parsed(self.name) + for body in root.xpath('//*[local-name() = "body"]'): + children = tuple(body.iterchildren('*')) + if body.text and body.text.strip(): + p = body.makeelement(XHTML('p')) + p.text, body.text = body.text.strip(), '\n ' + p.tail = '\n' + if children: + p.tail += ' ' + body.insert(0, p) + for child in children: + if child.tail and child.tail.strip(): + p = body.makeelement(XHTML('p')) + p.text, child.tail = child.tail.strip(), '\n ' + p.tail = '\n' + body.insert(body.index(child) + 1, p) + if child is not children[-1]: + p.tail += ' ' + container.dirty(self.name) + return True + class ErrorHandler(object): ' Replacement logger to get useful error/warning info out of cssutils during parsing ' @@ -402,3 +434,19 @@ def check_ids(container): seen_ids[eid] = elem.sourceline errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems()) return errors + +def check_markup(container): + errors = [] + for name, mt in container.mime_map.iteritems(): + if mt in OEB_DOCS: + lines = [] + root = container.parsed(name) + for body in root.xpath('//*[local-name()="body"]'): + if body.text and body.text.strip(): + lines.append(body.sourceline) + for child in body.iterchildren('*'): + if child.tail and child.tail.strip(): + lines.append(child.sourceline) + if lines: + errors.append(BareTextInBody(name, lines)) + return errors