Remove leading <br> from top of page in CHM conversion

This commit is contained in:
James Ralston 2010-02-21 10:06:40 -08:00
parent c91f022385
commit ac8ccceef8

View File

@ -11,7 +11,7 @@ from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from pprint import PrettyPrinter from pprint import PrettyPrinter
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup, NavigableString
from lxml import html, etree from lxml import html, etree
from pychm.chm import CHMFile from pychm.chm import CHMFile
from pychm.chmlib import ( from pychm.chmlib import (
@ -35,6 +35,17 @@ def match_string(s1, s2_already_lowered):
return True return True
return False return False
def check_all_prev_empty(tag):
    """Return True if no non-blank text node is found at or before ``tag``.

    Starting at ``tag`` itself, follows ``previousSibling`` links until
    ``None`` is reached.  Only nodes whose class is exactly
    ``NavigableString`` are inspected; every other node is skipped without
    looking at its content.

    :param tag: node to start from, or ``None`` when nothing precedes.
    :return: ``False`` as soon as a ``NavigableString`` containing a
        non-whitespace character is found, ``True`` otherwise.
    """
    # Walk the sibling chain iteratively: the recursive form used one stack
    # frame per sibling and could exceed the interpreter's recursion limit
    # on a long run of sibling nodes.
    node = tag
    while node is not None:
        # Exact class comparison is kept on purpose; isinstance() would
        # also match NavigableString subclasses and change the result.
        if node.__class__ == NavigableString and not check_empty(node):
            return False
        node = node.previousSibling
    return True
def check_empty(s, rex=re.compile(r'\S')):
    """Tell whether ``s`` is blank according to ``rex``.

    :param s: text to examine.
    :param rex: compiled pattern whose match marks ``s`` as non-empty; the
        default is compiled once, at definition time, and looks for a
        non-whitespace character.
    :return: ``True`` if ``rex`` finds nothing in ``s``, else ``False``.
    """
    hit = rex.search(s)
    if hit is None:
        return True
    return False
def option_parser(): def option_parser():
parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser = OptionParser(usage=_('%prog [options] mybook.chm'))
parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
@ -160,6 +171,12 @@ class CHMReader(CHMFile):
t[-1].extract() t[-1].extract()
# for some very odd reason each page's content appears to be in a table # for some very odd reason each page's content appears to be in a table
# too. and this table has sub-tables for random asides... grr. # too. and this table has sub-tables for random asides... grr.
# remove br at top of page if present after nav bars removed
br = html('br')
if br:
if check_all_prev_empty(br[0].previousSibling):
br[0].extract()
# some images seem to be broken in some chm's :/ # some images seem to be broken in some chm's :/
for img in html('img'): for img in html('img'):