Fix #4973 (Improvement for chm conversion - remove br tag at top of page)

This commit is contained in:
Kovid Goyal 2010-02-21 12:52:41 -07:00
parent 5fd1703e30
commit 6b04e57275

View File

@ -4,11 +4,11 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.' ' and Alex Bramley <a.bramley at gmail.com>.'
import os, shutil, uuid import os, shutil, uuid, re
from tempfile import mkdtemp from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype from mimetypes import guess_type as guess_mimetype
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup, NavigableString
from lxml import html from lxml import html
from pychm.chm import CHMFile from pychm.chm import CHMFile
from pychm.chmlib import ( from pychm.chmlib import (
@ -29,6 +29,17 @@ def match_string(s1, s2_already_lowered):
return True return True
return False return False
def check_all_prev_empty(tag):
    """Return True if ``tag`` and all siblings before it carry no text.

    Walks backwards from ``tag`` along the ``previousSibling`` chain until
    the chain ends (``None``).  Only nodes whose class is exactly
    ``NavigableString`` are inspected; every other kind of node is skipped
    without looking at its contents.

    :param tag: the node to start from, or ``None`` for an empty chain.
    :return: ``False`` as soon as a ``NavigableString`` containing a
        non-whitespace character is found, ``True`` otherwise (including
        when ``tag`` is ``None``).
    """
    # Iterate instead of recursing: one stack frame per sibling would hit
    # Python's recursion limit on pages with a long run of sibling nodes.
    node = tag
    while node is not None:
        # The exact class comparison (rather than isinstance) is kept on
        # purpose so that subclasses of NavigableString are skipped exactly
        # as they were before this rewrite.
        if node.__class__ == NavigableString and not check_empty(node):
            return False
        node = node.previousSibling
    return True
def check_empty(s, rex = re.compile(r'\S')):
    """Tell whether ``s`` is empty or consists solely of whitespace.

    :param s: the string to examine.
    :param rex: compiled pattern whose match marks ``s`` as non-empty;
        by default it matches any single non-whitespace character.
    :return: ``True`` when ``rex`` finds nothing in ``s``, else ``False``.
    """
    found = rex.search(s)
    if found is None:
        return True
    return False
def option_parser(): def option_parser():
parser = OptionParser(usage=_('%prog [options] mybook.chm')) parser = OptionParser(usage=_('%prog [options] mybook.chm'))
parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output') parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
@ -155,6 +166,12 @@ class CHMReader(CHMFile):
# for some very odd reason each page's content appears to be in a table # for some very odd reason each page's content appears to be in a table
# too. and this table has sub-tables for random asides... grr. # too. and this table has sub-tables for random asides... grr.
# remove br at top of page if present after nav bars removed
br = soup('br')
if br:
if check_all_prev_empty(br[0].previousSibling):
br[0].extract()
# some images seem to be broken in some chm's :/ # some images seem to be broken in some chm's :/
for img in soup('img'): for img in soup('img'):
try: try: