Fix #2193 (PRS-700 epub flow_size crash)

This commit is contained in:
Kovid Goyal 2009-04-02 11:07:49 -07:00
parent 9eceea4762
commit 62688a6822

View File

@ -8,24 +8,24 @@ Conversion of HTML/OPF files follows several stages:
* All links in the HTML files or in the OPF manifest are
followed to build up a list of HTML files to be converted.
This stage is implemented by
:func:`calibre.ebooks.html.traverse` and
:class:`calibre.ebooks.html.HTMLFile`.
* The HTML is pre-processed to make it more semantic.
All links in the HTML files to other resources like images,
stylesheets, etc. are relativized. The resources are copied
into the `resources` sub directory. This is accomplished by
:class:`calibre.ebooks.html.PreProcessor` and
:class:`calibre.ebooks.html.Parser`.
* The HTML is processed. Various operations are performed.
All style declarations are extracted and consolidated into
a single style sheet. Chapters are auto-detected and marked.
Various font related manipulations are performed. See
:class:`HTMLProcessor`.
* The processed HTML is saved and the
:mod:`calibre.ebooks.epub.split` module is used to split up
large HTML files into smaller chunks.
@ -64,7 +64,7 @@ def remove_bad_link(element, attribute, link, pos):
def check_links(opf_path, pretty_print): def check_links(opf_path, pretty_print):
''' '''
Find and remove all invalid links in the HTML files Find and remove all invalid links in the HTML files
''' '''
logger = logging.getLogger('html2epub') logger = logging.getLogger('html2epub')
logger.info('\tChecking files for bad links...') logger.info('\tChecking files for bad links...')
@ -78,7 +78,7 @@ def check_links(opf_path, pretty_print):
if isinstance(f, str): if isinstance(f, str):
f = f.decode('utf-8') f = f.decode('utf-8')
html_files.append(os.path.abspath(content(f))) html_files.append(os.path.abspath(content(f)))
for path in html_files: for path in html_files:
if not os.access(path, os.R_OK): if not os.access(path, os.R_OK):
continue continue
@ -113,27 +113,27 @@ def find_html_index(files):
return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
class HTMLProcessor(Processor, Rationalizer): class HTMLProcessor(Processor, Rationalizer):
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets):
    '''
    Initialise the underlying Processor and run the markup pipeline.

    The arguments `htmlfile`, `opts`, `tdir`, `resource_map` and
    `htmlfiles` are passed straight through to `Processor.__init__`
    (with ``name='html2epub'``); `stylesheets` is handed to
    `extract_css`.

    Steps, in order: chapter detection, CSS extraction, optional font
    rationalization, removal of <script> tags from the body and finally
    `fix_markup`.
    '''
    Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
                       name='html2epub')
    if opts.verbose > 2:
        self.debug_tree('parsed')
    self.detect_chapters()

    self.extract_css(stylesheets)
    # Font rationalization is only performed when a base font size
    # has been requested.
    if self.opts.base_font_size2 > 0:
        self.font_css = self.rationalize(
            self.external_stylesheets+[self.stylesheet],
            self.root, self.opts)
    if opts.verbose > 2:
        self.debug_tree('nocss')

    # The body may not expose xpath(), hence the guard.
    # Iterate over a copy since elements are removed while looping.
    if hasattr(self.body, 'xpath'):
        for script in list(self.body.xpath('descendant::script')):
            script.getparent().remove(script)

    self.fix_markup()
def convert_image(self, img): def convert_image(self, img):
rpath = img.get('src', '') rpath = img.get('src', '')
path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/')) path = os.path.join(os.path.dirname(self.save_path()), *rpath.split('/'))
@ -150,10 +150,10 @@ class HTMLProcessor(Processor, Rationalizer):
if val == rpath: if val == rpath:
self.resource_map[key] = rpath+'_calibre_converted.jpg' self.resource_map[key] = rpath+'_calibre_converted.jpg'
img.set('src', rpath+'_calibre_converted.jpg') img.set('src', rpath+'_calibre_converted.jpg')
def fix_markup(self): def fix_markup(self):
''' '''
Perform various markup transforms to get the output to render correctly Perform various markup transforms to get the output to render correctly
in the quirky ADE. in the quirky ADE.
''' '''
# Replace <br> that are children of <body> as ADE doesn't handle them # Replace <br> that are children of <body> as ADE doesn't handle them
@ -179,8 +179,8 @@ class HTMLProcessor(Processor, Rationalizer):
if not br.tail: if not br.tail:
br.tail = '' br.tail = ''
br.tail += sibling.tail br.tail += sibling.tail
if self.opts.profile.remove_object_tags: if self.opts.profile.remove_object_tags:
for tag in self.root.xpath('//embed'): for tag in self.root.xpath('//embed'):
tag.getparent().remove(tag) tag.getparent().remove(tag)
@ -188,42 +188,46 @@ class HTMLProcessor(Processor, Rationalizer):
if tag.get('type', '').lower().strip() in ('image/svg+xml',): if tag.get('type', '').lower().strip() in ('image/svg+xml',):
continue continue
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in self.root.xpath('//title|//style'): for tag in self.root.xpath('//title|//style'):
if not tag.text: if not tag.text:
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in self.root.xpath('//script'): for tag in self.root.xpath('//script'):
if not tag.text and not tag.get('src', False): if not tag.text and not tag.get('src', False):
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in self.root.xpath('//form'): for tag in self.root.xpath('//form'):
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in self.root.xpath('//center'): for tag in self.root.xpath('//center'):
tag.tag = 'div' tag.tag = 'div'
tag.set('style', 'text-align:center') tag.set('style', 'text-align:center')
if self.opts.linearize_tables: if self.opts.linearize_tables:
for tag in self.root.xpath('//table | //tr | //th | //td'): for tag in self.root.xpath('//table | //tr | //th | //td'):
tag.tag = 'div' tag.tag = 'div'
# ADE can't handle &amp; in an img url
for tag in self.root.xpath('//img[@src]'):
tag.set('src', tag.get('src', '').replace('&', ''))
def save(self):
    '''
    Remove every <meta> element from the tree and then save it via
    `Processor.save` with comment stripping enabled.
    '''
    # Iterate over a copy since elements are removed while looping.
    for meta in list(self.root.xpath('//meta')):
        meta.getparent().remove(meta)
    # Strip all comments since Adobe DE is petrified of them
    Processor.save(self, strip_comments=True)
def remove_first_image(self):
    '''
    Remove the first <img> element found in the document.

    Returns True if an image was found and removed, False if the
    document contains no <img> elements.
    '''
    images = self.root.xpath('//img')
    if images:
        # NOTE(review): if self.root is an lxml tree, remove() also drops
        # the element's tail text -- confirm that losing any text that
        # directly follows the first image is acceptable.
        images[0].getparent().remove(images[0])
        return True
    return False
def config(defaults=None):
    '''
    Return the result of `common_config`, forwarding `defaults` to it
    unchanged.
    '''
    return common_config(defaults=defaults)
@ -235,7 +239,7 @@ def option_parser():
Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file. Convert a HTML file to an EPUB ebook. Recursively follows links in the HTML file.
If you specify an OPF file instead of an HTML file, the list of links is takes from If you specify an OPF file instead of an HTML file, the list of links is takes from
the <spine> element of the OPF file. the <spine> element of the OPF file.
''')) '''))
def parse_content(filelist, opts, tdir): def parse_content(filelist, opts, tdir):
@ -246,7 +250,7 @@ def parse_content(filelist, opts, tdir):
first_image_removed = False first_image_removed = False
for htmlfile in filelist: for htmlfile in filelist:
logging.getLogger('html2epub').debug('Processing %s...'%htmlfile) logging.getLogger('html2epub').debug('Processing %s...'%htmlfile)
hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
resource_map, filelist, stylesheets) resource_map, filelist, stylesheets)
if not first_image_removed and opts.remove_first_image: if not first_image_removed and opts.remove_first_image:
first_image_removed = hp.remove_first_image() first_image_removed = hp.remove_first_image()
@ -254,7 +258,7 @@ def parse_content(filelist, opts, tdir):
hp.save() hp.save()
stylesheet_map[os.path.basename(hp.save_path())] = \ stylesheet_map[os.path.basename(hp.save_path())] = \
[s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None] [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None]
logging.getLogger('html2epub').debug('Saving stylesheets...') logging.getLogger('html2epub').debug('Saving stylesheets...')
if opts.base_font_size2 > 0: if opts.base_font_size2 > 0:
Rationalizer.remove_font_size_information(stylesheets.values()) Rationalizer.remove_font_size_information(stylesheets.values())
@ -268,7 +272,7 @@ def parse_content(filelist, opts, tdir):
if toc.count('chapter') + toc.count('file') > opts.toc_threshold: if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
toc.purge(['link', 'unknown']) toc.purge(['link', 'unknown'])
toc.purge(['link'], max=opts.max_toc_links) toc.purge(['link'], max=opts.max_toc_links)
return resource_map, hp.htmlfile_map, toc, stylesheet_map return resource_map, hp.htmlfile_map, toc, stylesheet_map
TITLEPAGE = '''\ TITLEPAGE = '''\
@ -325,26 +329,26 @@ def process_title_page(mi, filelist, htmlfilemap, opts, tdir):
metadata_cover = mi.cover metadata_cover = mi.cover
if metadata_cover and not os.path.exists(metadata_cover): if metadata_cover and not os.path.exists(metadata_cover):
metadata_cover = None metadata_cover = None
cpath = '/'.join(('resources', '_cover_.jpg')) cpath = '/'.join(('resources', '_cover_.jpg'))
cover_dest = os.path.join(tdir, 'content', *cpath.split('/')) cover_dest = os.path.join(tdir, 'content', *cpath.split('/'))
if metadata_cover is not None: if metadata_cover is not None:
if not create_cover_image(metadata_cover, cover_dest, if not create_cover_image(metadata_cover, cover_dest,
opts.profile.screen_size): opts.profile.screen_size):
metadata_cover = None metadata_cover = None
specified_cover = opts.cover specified_cover = opts.cover
if specified_cover and not os.path.exists(specified_cover): if specified_cover and not os.path.exists(specified_cover):
specified_cover = None specified_cover = None
if specified_cover is not None: if specified_cover is not None:
if not create_cover_image(specified_cover, cover_dest, if not create_cover_image(specified_cover, cover_dest,
opts.profile.screen_size): opts.profile.screen_size):
specified_cover = None specified_cover = None
cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover cover = metadata_cover if specified_cover is None or (opts.prefer_metadata_cover and metadata_cover is not None) else specified_cover
if cover is not None: if cover is not None:
titlepage = TITLEPAGE%cpath titlepage = TITLEPAGE%cpath
tp = 'calibre_title_page.html' if old_title_page is None else old_title_page tp = 'calibre_title_page.html' if old_title_page is None else old_title_page
tppath = os.path.join(tdir, 'content', tp) tppath = os.path.join(tdir, 'content', tp)
with open(tppath, 'wb') as f: with open(tppath, 'wb') as f:
f.write(titlepage) f.write(titlepage)
@ -370,7 +374,7 @@ def condense_ncx(ncx_path):
compressed = etree.tostring(tree.getroot(), encoding='utf-8') compressed = etree.tostring(tree.getroot(), encoding='utf-8')
open(ncx_path, 'wb').write(compressed) open(ncx_path, 'wb').write(compressed)
def convert(htmlfile, opts, notification=None, create_epub=True, def convert(htmlfile, opts, notification=None, create_epub=True,
oeb_cover=False, extract_to=None): oeb_cover=False, extract_to=None):
htmlfile = os.path.abspath(htmlfile) htmlfile = os.path.abspath(htmlfile)
if opts.output is None: if opts.output is None:
@ -399,16 +403,16 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
else: else:
opf, filelist = get_filelist(htmlfile, opts) opf, filelist = get_filelist(htmlfile, opts)
mi = merge_metadata(htmlfile, opf, opts) mi = merge_metadata(htmlfile, opf, opts)
opts.chapter = XPath(opts.chapter, opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'}) namespaces={'re':'http://exslt.org/regular-expressions'})
for x in (1, 2, 3): for x in (1, 2, 3):
attr = 'level%d_toc'%x attr = 'level%d_toc'%x
if getattr(opts, attr): if getattr(opts, attr):
setattr(opts, attr, XPath(getattr(opts, attr), setattr(opts, attr, XPath(getattr(opts, attr),
namespaces={'re':'http://exslt.org/regular-expressions'})) namespaces={'re':'http://exslt.org/regular-expressions'}))
else: else:
setattr(opts, attr, None) setattr(opts, attr, None)
with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir: with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
if opts.keep_intermediate: if opts.keep_intermediate:
print 'Intermediate files in', tdir print 'Intermediate files in', tdir
@ -416,16 +420,16 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
parse_content(filelist, opts, tdir) parse_content(filelist, opts, tdir)
logger = logging.getLogger('html2epub') logger = logging.getLogger('html2epub')
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir) title_page, has_title_page = process_title_page(mi, filelist, htmlfile_map, opts, tdir)
spine = [htmlfile_map[f.path] for f in filelist] spine = [htmlfile_map[f.path] for f in filelist]
if not oeb_cover and title_page is not None: if not oeb_cover and title_page is not None:
spine = [title_page] + spine spine = [title_page] + spine
mi.cover = None mi.cover = None
mi.cover_data = (None, None) mi.cover_data = (None, None)
mi = create_metadata(tdir, mi, spine, resources) mi = create_metadata(tdir, mi, spine, resources)
buf = cStringIO.StringIO() buf = cStringIO.StringIO()
if mi.toc: if mi.toc:
@ -453,7 +457,7 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
logger.info('\tBuilding page map...') logger.info('\tBuilding page map...')
add_page_map(opf_path, opts) add_page_map(opf_path, opts)
check_links(opf_path, opts.pretty_print) check_links(opf_path, opts.pretty_print)
opf = OPF(opf_path, tdir) opf = OPF(opf_path, tdir)
opf.remove_guide() opf.remove_guide()
oeb_cover_file = None oeb_cover_file = None
@ -465,7 +469,7 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
opf.add_guide_item('cover', 'Cover', 'content/'+spine[0]) opf.add_guide_item('cover', 'Cover', 'content/'+spine[0])
if oeb_cover and oeb_cover_file: if oeb_cover and oeb_cover_file:
opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file) opf.add_guide_item('cover', 'Cover', 'content/'+oeb_cover_file)
cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg') cpath = os.path.join(tdir, 'content', 'resources', '_cover_.jpg')
if os.path.exists(cpath): if os.path.exists(cpath):
opf.add_path_to_manifest(cpath, 'image/jpeg') opf.add_path_to_manifest(cpath, 'image/jpeg')
@ -477,29 +481,29 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
condense_ncx(ncx_path) condense_ncx(ncx_path)
if os.stat(ncx_path).st_size > opts.profile.flow_size: if os.stat(ncx_path).st_size > opts.profile.flow_size:
logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size) logger.warn('NCX still larger than allowed size at %d bytes. Menu based Table of Contents may not work on device.'%os.stat(ncx_path).st_size)
if create_epub: if create_epub:
epub = initialize_container(opts.output) epub = initialize_container(opts.output)
epub.add_dir(tdir) epub.add_dir(tdir)
epub.close() epub.close()
run_plugins_on_postprocess(opts.output, 'epub') run_plugins_on_postprocess(opts.output, 'epub')
logger.info(_('Output written to ')+opts.output) logger.info(_('Output written to ')+opts.output)
if opts.show_opf: if opts.show_opf:
print open(opf_path, 'rb').read() print open(opf_path, 'rb').read()
if opts.extract_to is not None: if opts.extract_to is not None:
if os.path.exists(opts.extract_to): if os.path.exists(opts.extract_to):
shutil.rmtree(opts.extract_to) shutil.rmtree(opts.extract_to)
shutil.copytree(tdir, opts.extract_to) shutil.copytree(tdir, opts.extract_to)
if extract_to is not None: if extract_to is not None:
if os.path.exists(extract_to): if os.path.exists(extract_to):
shutil.rmtree(extract_to) shutil.rmtree(extract_to)
shutil.copytree(tdir, extract_to) shutil.copytree(tdir, extract_to)
def main(args=sys.argv): def main(args=sys.argv):
parser = option_parser() parser = option_parser()
opts, args = parser.parse_args(args) opts, args = parser.parse_args(args)
@ -509,6 +513,6 @@ def main(args=sys.argv):
return 1 return 1
convert(args[1], opts) convert(args[1], opts)
return 0 return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    sys.exit(main())