Fix bug in transformation of non-JPEG images. html2lrf internals refactored to support --use-spine.

Kovid Goyal 2007-10-12 17:43:47 +00:00
parent 9e1498969d
commit df82180209


@@ -14,14 +14,14 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-import tempfile
 """
 Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the initial CSS->Xylog Style conversion code
 and to Falstaff for pylrs.
 """
-import os, re, sys, copy, glob, logging
+import os, re, sys, copy, glob, logging, tempfile
+from collections import deque
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -55,6 +55,15 @@ def update_css(ncss, ocss):
         else:
             ocss[key] = ncss[key]

+def munge_paths(basepath, url):
+    purl = urlparse(url,)
+    path, fragment = purl[2], purl[5]
+    if not path:
+        path = basepath
+    elif not os.path.isabs(path):
+        path = os.path.join(os.path.dirname(basepath), path)
+    return os.path.normpath(path), fragment
+
 class HTMLConverter(object):
     SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
     PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
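The new module-level munge_paths helper centralizes the href/src resolution that was previously done ad hoc with urlparse, unquote and os.chdir: it splits off the fragment and resolves a relative path against the HTML file currently being converted. A small usage sketch, repeating the definition from this hunk; the example paths are made up:

    from urlparse import urlparse
    import os

    def munge_paths(basepath, url):
        # basepath is the HTML file the link appears in, url is the raw href/src value
        purl = urlparse(url,)
        path, fragment = purl[2], purl[5]
        if not path:                      # e.g. href="#notes": stay in the same file
            path = basepath
        elif not os.path.isabs(path):     # relative link: resolve against the current file
            path = os.path.join(os.path.dirname(basepath), path)
        return os.path.normpath(path), fragment

    print munge_paths('/books/demo/index.html', 'ch1.html#top')
    # -> ('/books/demo/ch1.html', 'top')
    print munge_paths('/books/demo/index.html', '#notes')
    # -> ('/books/demo/index.html', 'notes')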
@@ -161,11 +170,10 @@ class HTMLConverter(object):
         'u'     : {'text-decoration': 'underline'},
         }

-    def __init__(self, book, fonts, options, logger, path):
+    def __init__(self, book, fonts, options, logger, paths):
         '''
-        Convert HTML file at C{path} and add it to C{book}. After creating
-        the object, you must call L{self.process_links} on it to create the links and
-        then L{self.writeto} to output the LRF/S file.
+        Convert HTML files at C{paths} and add to C{book}. After creating
+        the object, you must call L{self.writeto} to output the LRF/S file.

         @param book: The LRF book
         @type book: L{libprs500.lrf.pylrs.Book}
@@ -180,12 +188,12 @@ class HTMLConverter(object):
         self.rotated_images = {}       #: Temporary files with rotated version of images
         self.text_styles    = []       #: Keep track of already used textstyles
         self.block_styles   = []       #: Keep track of already used blockstyles
         self.images  = {}              #: Images referenced in the HTML document
         self.targets = {}              #: <a name=...> and id elements
-        self.links   = {}              #: <a href=...> elements
+        self.links   = deque()         #: <a href=...> elements
         self.processed_files = []
         self.unused_target_blocks = [] #: Used to remove extra TextBlocks
         self.link_level = 0            #: Current link level
         self.memory = []               #: Used to ensure that duplicate CSS unhandled erros are not reported
         self.tops = {}                 #: element representing the top of each HTML file in the LRF file
         self.previous_text = ''        #: Used to figure out when to lstrip
@@ -209,6 +217,7 @@ class HTMLConverter(object):
         self.override_css = {}
         self.override_pcss = {}

         if self._override_css is not None:
             if os.access(self._override_css, os.R_OK):
                 src = open(self._override_css, 'rb').read()
@@ -223,7 +232,16 @@ class HTMLConverter(object):
             if npcss:
                 update_css(npcss, self.override_pcss)

-        self.start_on_file(path, is_root=True)
+        paths = [os.path.normpath(os.path.abspath(path)) for path in paths]
+        self.base_files = copy.copy(paths)
+        while len(paths) > 0 and self.link_level <= self.link_levels:
+            for path in paths:
+                self.add_file(path)
+            self.links = self.process_links()
+            self.link_level += 1
+            paths = [link['path'] for link in self.links]

     def is_baen(self, soup):
         return bool(soup.find('meta', attrs={'name':'Publisher',
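The constructor now drives the whole conversion: it converts every file at the current link level, then follows links that point outside the set of already-converted files, one level at a time, until self.link_levels is exhausted. Since it accepts a list of paths, a caller can hand it an entire spine, which is what --use-spine builds on. A simplified sketch of the traversal; crawl, add_file and process_links are stand-ins here, not the real HTMLConverter methods:

    from collections import deque

    def crawl(start_files, link_levels, add_file, process_links):
        paths, level = list(start_files), 0
        pending = deque()
        while len(paths) > 0 and level <= link_levels:
            for path in paths:
                add_file(path)            # parse this HTML file and append it to the book
            pending = process_links()     # wire up links whose targets are already in the
                                          # book; returns the links that point elsewhere
            level += 1
            paths = [link['path'] for link in pending]   # follow those one level deeper
        return pending                    # whatever is left here was never followed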
@@ -281,33 +299,25 @@ class HTMLConverter(object):
         #print soup
         return soup

-    def start_on_file(self, path, is_root=True, link_level=0):
+    def add_file(self, path):
         self.css = HTMLConverter.CSS.copy()
         self.pseudo_css = self.override_pcss.copy()
         self.css.update(self.override_css)
-        path = os.path.abspath(path)
-        os.chdir(os.path.dirname(path))
+        path = os.path.normpath(os.path.abspath(path))
         self.file_name = os.path.basename(path)
         self.logger.info('Processing %s', self.file_name)
-        sys.stdout.flush()
-        soup = self.preprocess(open(self.file_name, 'rb').read())
+        raw = open(path, 'rb').read()
+        soup = self.preprocess(raw)
         self.logger.info('\tConverting to BBeB...')
-        sys.stdout.flush()
         self.current_page = None
         self.current_para = None
         self.current_style = {}
         self.page_break_found = False
-        match = self.PAGE_BREAK_PAT.search(unicode(soup))
-        if match and not re.match('avoid', match.group(1), re.IGNORECASE):
-            self.page_break_found = True
         self.target_prefix = path
-        self.links[path] = []
         self.previous_text = '\n'
-        self.tops[path] = self.parse_file(soup, is_root)
+        self.tops[path] = self.parse_file(soup)
         self.processed_files.append(path)
-        self.process_links(is_root, path, link_level=link_level)

     def parse_css(self, style):
         """
@@ -394,7 +404,7 @@ class HTMLConverter(object):
             prop.update(self.parse_style_properties(tag["style"]))
         return prop, pprop

-    def parse_file(self, soup, is_root):
+    def parse_file(self, soup):
         def get_valid_block(page):
             for item in page.contents:
                 if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
@@ -405,8 +415,9 @@ class HTMLConverter(object):
         self.current_page = self.book.create_page()
         self.current_block = self.book.create_text_block()
         self.current_para = Paragraph()
-        if self.cover and is_root:
+        if self.cover:
             self.add_image_page(self.cover)
+            self.cover = None
         top = self.current_block

         self.process_children(soup, {}, {})
@@ -462,8 +473,9 @@ class HTMLConverter(object):
             except KeyError:
                 pass
-        url = urlparse(tag['href'])
-        return {'para':para, 'text':text, 'url':url}
+        path, fragment = munge_paths(self.target_prefix, tag['href'])
+        return {'para':para, 'text':text, 'path':os.path.normpath(path),
+                'fragment':fragment}

     def get_text(self, tag, limit=None):
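create_link now stores the resolved filesystem path and fragment in the link record, so process_links no longer has to re-parse URLs or guess the base directory. A hypothetical example of one record as it sits in the self.links deque (all values made up):

    para = None   # stands in for the pylrs Paragraph holding the link text
    link = {
        'para'    : para,                     # paragraph the JumpButton span belongs to
        'text'    : u'Chapter 1',             # text reused for the TOC entry
        'path'    : '/books/demo/ch1.html',   # normalized absolute path of the target file
        'fragment': 'top',                    # <a name=...>/id inside that file, '' if none
    }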
@@ -489,7 +501,7 @@ class HTMLConverter(object):
             text = rule.sub(sub, text)
         return text

-    def process_links(self, is_root, selfpath, link_level=0):
+    def process_links(self):
         def add_toc_entry(text, target):
             # TextBlocks in Canvases have a None parent or an Objects Parent
             if target.parent != None and \
@@ -531,37 +543,19 @@ class HTMLConverter(object):
                     page.contents.remove(bs)
             return ans

-        cwd = os.getcwd()
-        for link in self.links[selfpath]:
-            try:
-                para, text, purl = link['para'], link['text'], link['url']
-                # Needed for TOC entries due to bug in LRF
-                ascii_text = text.encode('ascii', 'replace')
-                if purl[1]: # Not a link to a file on the local filesystem
-                    continue
-                basepath, fragment = unquote(purl[2]), purl[5]
-                if not basepath:
-                    basepath = selfpath
-                path = os.path.abspath(basepath)
-                if link_level < self.link_levels and path not in self.processed_files:
-                    try:
-                        self.start_on_file(path, is_root=False, link_level=link_level+1)
-                    except Exception:
-                        self.logger.warning('Unable to process %s', path)
-                        if self.verbose:
-                            self.logger.exception(' ')
-                        continue
-                    finally:
-                        os.chdir(cwd)
-                if path+fragment in self.targets.keys():
-                    tb = get_target_block(path+fragment, self.targets)
-                else:
-                    try:
-                        tb = self.tops[path]
-                    except KeyError:
-                        return
-                if is_root:
-                    add_toc_entry(ascii_text, tb)
-                jb = JumpButton(tb)
-                self.book.append(jb)
+        outside_links = deque()
+        while len(self.links) > 0:
+            link = self.links.popleft()
+            para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
+            # Needed for TOC entries due to bug in LRF
+            ascii_text = text.encode('ascii', 'ignore')
+            if path in self.processed_files:
+                if path+fragment in self.targets.keys():
+                    tb = get_target_block(path+fragment, self.targets)
+                else:
+                    tb = self.tops[path]
+                if self.link_level == 0 and len(self.base_files) == 1:
+                    add_toc_entry(ascii_text, tb)
+                jb = JumpButton(tb)
+                self.book.append(jb)
@@ -572,8 +566,11 @@ class HTMLConverter(object):
                     self.unused_target_blocks.remove(tb)
                 except ValueError:
                     pass
-            finally:
-                os.chdir(cwd)
+            else:
+                outside_links.append(link)
+        return outside_links

     def end_page(self):
         """
@@ -785,6 +782,12 @@ class HTMLConverter(object):
     def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
+        def detect_encoding(im):
+            fmt = im.format
+            if fmt == 'JPG':
+                fmt = 'JPEG'
+            return fmt
+
         original_path = path
         if self.rotated_images.has_key(path):
             path = self.rotated_images[path].name
@@ -793,15 +796,10 @@ class HTMLConverter(object):
         try:
             im = PILImage.open(path)
-            encoding = im.format
-            if encoding:
-                encoding = encoding.upper()
-                if encoding == 'JPG':
-                    encoding = 'JPEG'
         except IOError, err:
             self.logger.warning('Unable to process image: %s\n%s', original_path, err)
             return
+        encoding = detect_encoding(im)

         if width == None or height == None:
             width, height = im.size
@@ -809,12 +807,11 @@ class HTMLConverter(object):
         factor = 720./self.profile.dpi

         def scale_image(width, height):
-            pt = PersistentTemporaryFile(suffix='.jpeg')
+            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
             try:
-                im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
+                im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
                 pt.close()
                 self.scaled_images[path] = pt
-                encoding = 'JPEG'
                 return pt.name
             except IOError: # PIL chokes on interlaced PNG images
                 self.logger.warning('Unable to process interlaced PNG %s', path)
@@ -847,12 +844,11 @@ class HTMLConverter(object):
             return

         if not self.disable_autorotation and width > pwidth and width > height:
-            pt = PersistentTemporaryFile(suffix='.jpeg')
+            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
             try:
                 im = im.rotate(90)
-                im.convert('RGB').save(pt, 'JPEG')
+                im.save(pt, encoding)
                 path = pt.name
-                encoding = 'JPEG'
                 self.rotated_images[path] = pt
                 width, height = im.size
             except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
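This is the fix the commit message refers to: the old code converted every scaled or rotated image to RGB and wrote it out as JPEG, and because scale_image is a nested function its encoding = 'JPEG' assignment never rebound the enclosing variable, so non-JPEG images could end up declared with the wrong encoding downstream. The new code detects the source format once and keeps it for every derived copy. A standalone sketch of the idea; the import style and file paths are assumptions, not taken from the repository:

    from PIL import Image as PILImage   # assumed import; the converter already has PILImage in scope

    def detect_encoding(im):
        fmt = im.format                  # 'PNG', 'GIF', 'JPEG', ... as reported by PIL
        if fmt == 'JPG':                 # defensive mapping, mirroring the commit
            fmt = 'JPEG'
        return fmt

    def save_scaled(src_path, width, height, out_path):
        im = PILImage.open(src_path)
        encoding = detect_encoding(im)
        # A palette or transparent PNG stays a PNG; a JPEG stays a JPEG.
        im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(out_path, encoding)
        return encoding                  # callers can now report the true stream type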
@@ -1245,35 +1241,39 @@ class HTMLConverter(object):
                 pass
         elif tagname == 'a' and self.link_levels >= 0:
             if tag.has_key('href') and not self.link_exclude.match(tag['href']):
-                purl = urlparse(tag['href'])
-                path = unquote(purl[2])
+                path = munge_paths(self.target_prefix, tag['href'])[0]
                 ext = os.path.splitext(path)[1]
                 if ext: ext = ext[1:].lower()
-                if path and os.access(path, os.R_OK) and ext and \
-                   ext in ['png', 'jpg', 'bmp', 'jpeg']:
-                    self.process_image(path, tag_css)
-                else:
-                    text = self.get_text(tag, limit=1000)
-                    if not text.strip():
-                        text = "Link"
-                    self.add_text(text, tag_css, {}, force_span_use=True)
-                    self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
-                    if tag.has_key('id') or tag.has_key('name'):
-                        key = 'name' if tag.has_key('name') else 'id'
-                        self.targets[self.target_prefix+tag[key]] = self.current_block
+                if os.access(path, os.R_OK):
+                    if ext in ['png', 'jpg', 'bmp', 'jpeg']:
+                        self.process_image(path, tag_css)
+                    else:
+                        text = self.get_text(tag, limit=1000)
+                        if not text.strip():
+                            text = "Link"
+                        self.add_text(text, tag_css, {}, force_span_use=True)
+                        self.links.append(self.create_link(self.current_para.contents, tag))
+                        if tag.has_key('id') or tag.has_key('name'):
+                            key = 'name' if tag.has_key('name') else 'id'
+                            self.targets[self.target_prefix+tag[key]] = self.current_block
+                else:
+                    self.logger.warn('Could not follow link to '+tag['href'])
             elif tag.has_key('name') or tag.has_key('id'):
                 self.process_anchor(tag, tag_css, tag_pseudo_css)
         elif tagname == 'img':
-            if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
-                path = os.path.abspath(unquote(tag['src']))
-                width, height = None, None
-                try:
-                    width = int(tag['width'])
-                    height = int(tag['height'])
-                except:
-                    pass
-                dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
-                self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
+            if tag.has_key('src'):
+                path = munge_paths(self.target_prefix, tag['src'])[0]
+                if os.access(path, os.R_OK):
+                    width, height = None, None
+                    try:
+                        width = int(tag['width'])
+                        height = int(tag['height'])
+                    except:
+                        pass
+                    dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
+                    self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
+                else:
+                    self.logger.warn('Could not find image: '+tag['src'])
             else:
                 self.logger.debug("Failed to process: %s", str(tag))
         elif tagname in ['style', 'link']:
@@ -1286,8 +1286,7 @@ class HTMLConverter(object):
                     npcss.update(pcss)
             elif tag.has_key('type') and tag['type'] == "text/css" \
                  and tag.has_key('href'):
-                purl = urlparse(tag['href'])
-                path = unquote(purl[2])
+                path = munge_paths(self.target_prefix, tag['href'])[0]
                 try:
                     f = open(path, 'rb')
                     src = f.read()
@@ -1297,7 +1296,7 @@ class HTMLConverter(object):
                             self.page_break_found = True
                     ncss, npcss = self.parse_css(src)
                 except IOError:
-                    pass
+                    self.logger.warn('Could not read stylesheet: '+tag['href'])
                 if ncss:
                     update_css(ncss, self.css)
                     self.css.update(self.override_css)
@@ -1609,7 +1608,7 @@ def process_file(path, options, logger=None):
                         re.compile(fpba[2], re.IGNORECASE)]
     if not hasattr(options, 'anchor_ids'):
         options.anchor_ids = True
-    conv = HTMLConverter(book, fonts, options, logger, path)
+    conv = HTMLConverter(book, fonts, options, logger, [path])
     oname = options.output
     if not oname:
         suffix = '.lrs' if options.lrs else '.lrf'