mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #4973 (Improvement for chm conversion - remove br tag at top of page)
This commit is contained in:
parent
5fd1703e30
commit
6b04e57275
@ -4,11 +4,11 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
||||||
' and Alex Bramley <a.bramley at gmail.com>.'
|
' and Alex Bramley <a.bramley at gmail.com>.'
|
||||||
|
|
||||||
import os, shutil, uuid
|
import os, shutil, uuid, re
|
||||||
from tempfile import mkdtemp
|
from tempfile import mkdtemp
|
||||||
from mimetypes import guess_type as guess_mimetype
|
from mimetypes import guess_type as guess_mimetype
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup, NavigableString
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from pychm.chm import CHMFile
|
from pychm.chm import CHMFile
|
||||||
from pychm.chmlib import (
|
from pychm.chmlib import (
|
||||||
@ -29,6 +29,17 @@ def match_string(s1, s2_already_lowered):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def check_all_prev_empty(tag):
    """Return True if no non-blank text node is found at or before ``tag``.

    Starting at ``tag`` and following ``previousSibling`` links, every node
    whose class is exactly ``NavigableString`` is tested with
    ``check_empty``; the first one containing a non-whitespace character
    makes the result False. Nodes of any other class are skipped.
    ``None`` (no node at all) yields True.

    The walk is iterative rather than recursive so that a long run of
    siblings cannot exhaust the interpreter's recursion limit.
    """
    node = tag
    while node is not None:
        # Exact class comparison is kept deliberately: isinstance() would
        # also match NavigableString subclasses and change which nodes are
        # treated as content.
        if node.__class__ == NavigableString and not check_empty(node):
            return False
        node = node.previousSibling
    return True
|
||||||
|
|
||||||
|
def check_empty(s, rex=re.compile(r'\S')):
    """Return True when ``s`` contains no non-whitespace character.

    ``rex`` is compiled once, when the function is defined, and reused on
    every call; it is searched against ``s`` and the string counts as
    empty when that search finds nothing.
    """
    first_visible = rex.search(s)
    return first_visible is None
|
||||||
|
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
parser = OptionParser(usage=_('%prog [options] mybook.chm'))
|
parser = OptionParser(usage=_('%prog [options] mybook.chm'))
|
||||||
parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
|
parser.add_option('--output-dir', '-d', default='.', help=_('Output directory. Defaults to current directory'), dest='output')
|
||||||
@ -155,6 +166,12 @@ class CHMReader(CHMFile):
|
|||||||
# for some very odd reason each page's content appears to be in a table
|
# for some very odd reason each page's content appears to be in a table
|
||||||
# too. and this table has sub-tables for random asides... grr.
|
# too. and this table has sub-tables for random asides... grr.
|
||||||
|
|
||||||
|
# remove br at top of page if present after nav bars removed
|
||||||
|
br = soup('br')
|
||||||
|
if br:
|
||||||
|
if check_all_prev_empty(br[0].previousSibling):
|
||||||
|
br[0].extract()
|
||||||
|
|
||||||
# some images seem to be broken in some chm's :/
|
# some images seem to be broken in some chm's :/
|
||||||
for img in soup('img'):
|
for img in soup('img'):
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user