Fix an encoding error in html2epub and make splitting code more robust

2025-07-09 03:04:10 -04:00 · 2008-12-11 17:44:18 -08:00 · 2008-12-11 17:44:18 -08:00 · f976ad63be
commit f976ad63be
parent 371c1bee5b
2 changed files with 18 additions and 7 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -48,8 +48,7 @@ from calibre.ebooks.epub import initialize_container, PROFILES
 from calibre.ebooks.epub.split import split
 from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
-from calibre import walk
+from calibre import walk, CurrentDir, to_unicode
 from calibre import CurrentDir
 content = functools.partial(os.path.join, u'content')
@@ -79,7 +78,7 @@ def check(opf_path, pretty_print):
            base = os.path.dirname(path)
            root = html.fromstring(open(content(path), 'rb').read())
            for element, attribute, link, pos in list(root.iterlinks()):
-                link = link.decode('utf-8')
+                link = to_unicode(link)
                plink = Link(link, base)
                bad = False
                if plink.path is not None and not os.path.exists(plink.path):
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@@ -138,7 +138,9 @@ class Splitter(LoggingInterface):
        for t in self.do_split(tree, split_point, before):
            r = t.getroot()
-            size = len(tostring(r)) 
+            if self.is_page_empty(r):
                continue
            size = len(tostring(r))
            if size <= self.opts.profile.flow_size:
                self.trees.append(t)
                #print tostring(t.getroot(), pretty_print=True)
@@ -384,6 +386,9 @@ class Splitter(LoggingInterface):
                frag = None
                if len(href) > 1:
                    frag = href[1]
                if frag not in self.anchor_map:
                    self.log_warning('\t\tUnable to re-map OPF link', href)
                    continue
                new_file = self.anchor_map[frag]
                ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
@@ -410,7 +415,11 @@ def fix_content_links(html_files, changes, opts):
                anchor = href[1] if len(href) > 1 else None
                href = href[0]
                if href in split_files:
-                    newf = anchor_maps[split_files.index(href)][anchor]
+                    try:
                        newf = anchor_maps[split_files.index(href)][anchor]
                    except:
                        print '\t\tUnable to remap HTML link:', href, anchor
                        continue
                    frag = ('#'+anchor) if anchor else ''
                    a.set('href', newf+frag)
                    changed = True
@@ -431,7 +440,10 @@ def fix_ncx(path, changes):
            anchor = href[1] if len(href) > 1 else None
            href = href[0].split('/')[-1]
            if href in split_files:
-                newf = anchor_maps[split_files.index(href)][anchor]
+                try:
                    newf = anchor_maps[split_files.index(href)][anchor]
                except:
                    print 'Unable to remap NCX link:', href, anchor
                frag = ('#'+anchor) if anchor else ''
                content.set('src', 'content/'+newf+frag)
                changed = True
@@ -470,4 +482,4 @@ def split(pathtoopf, opts, stylesheet_map):
                fix_ncx(item.get('href'), changes)
                break 
-        open(pathtoopf, 'wb').write(opf.render())
+        open(pathtoopf, 'wb').write(opf.render())