Fix bug in transformation of non-jpeg images. html2lrf internals refactored to support --use-spine.

2025-07-09 03:04:10 -04:00 · 2007-10-12 17:43:47 +00:00 · 2007-10-12 17:43:47 +00:00 · df82180209
commit df82180209
parent 9e1498969d
1 changed files with 98 additions and 99 deletions
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -14,14 +14,14 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-import tempfile
 """ 
 Code to convert HTML ebooks into LRF ebooks.

 I am indebted to esperanc for the initial CSS->Xylog Style conversion code
 and to Falstaff for pylrs.
 """
-import os, re, sys, copy, glob, logging
+import os, re, sys, copy, glob, logging, tempfile
+from collections import deque
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@ -55,6 +55,15 @@ def update_css(ncss, ocss):
        else:
            ocss[key] = ncss[key]
            
+def munge_paths(basepath, url):
+    purl = urlparse(url,)
+    path, fragment = purl[2], purl[5]
+    if not path:
+        path = basepath
+    elif not os.path.isabs(path):
+        path = os.path.join(os.path.dirname(basepath), path)
+    return os.path.normpath(path), fragment
+
 class HTMLConverter(object):
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
@ -161,11 +170,10 @@ class HTMLConverter(object):
           'u'      : {'text-decoration': 'underline'}, 
           }
    
-    def __init__(self, book, fonts, options, logger, path):
+    def __init__(self, book, fonts, options, logger, paths):
        '''
-        Convert HTML file at C{path} and add it to C{book}. After creating
-        the object, you must call L{self.process_links} on it to create the links and
-        then L{self.writeto} to output the LRF/S file.
+        Convert HTML files at C{paths} and add to C{book}. After creating
+        the object, you must call L{self.writeto} to output the LRF/S file.
        
        @param book: The LRF book 
        @type book:  L{libprs500.lrf.pylrs.Book}
@ -180,12 +188,12 @@ class HTMLConverter(object):
        self.rotated_images = {}  #: Temporary files with rotated version of images        
        self.text_styles      = []#: Keep track of already used textstyles
        self.block_styles     = []#: Keep track of already used blockstyles
-        self.images  = {}     #: Images referenced in the HTML document
-        self.targets = {}         #: <a name=...> and id elements
-        self.links   = {}         #: <a href=...> elements        
+        self.images  = {}      #: Images referenced in the HTML document
+        self.targets = {}      #: <a name=...> and id elements
+        self.links   = deque() #: <a href=...> elements        
        self.processed_files = []
        self.unused_target_blocks = [] #: Used to remove extra TextBlocks
-        self.link_level  = 0  #: Current link level
+        self.link_level  = 0    #: Current link level
        self.memory = []        #: Used to ensure that duplicate CSS unhandled erros are not reported
        self.tops = {}          #: element representing the top of each HTML file in the LRF file
        self.previous_text = '' #: Used to figure out when to lstrip
@ -209,6 +217,7 @@ class HTMLConverter(object):
        
        self.override_css = {}
        self.override_pcss = {}
+         
        if self._override_css is not None:
            if os.access(self._override_css, os.R_OK):
                src = open(self._override_css, 'rb').read()
@ -223,7 +232,16 @@ class HTMLConverter(object):
            if npcss:
                update_css(npcss, self.override_pcss)
        
-        self.start_on_file(path, is_root=True)
+        paths = [os.path.normpath(os.path.abspath(path)) for path in paths]
+        self.base_files = copy.copy(paths)
+        while len(paths) > 0 and self.link_level <= self.link_levels:
+            for path in paths:
+                self.add_file(path)
+            self.links = self.process_links()
+            self.link_level += 1
+            paths = [link['path'] for link in self.links]
+        
+        
        
    def is_baen(self, soup):
        return bool(soup.find('meta', attrs={'name':'Publisher', 
@ -281,33 +299,25 @@ class HTMLConverter(object):
        #print soup
        return soup
    
-    def start_on_file(self, path, is_root=True, link_level=0):
+    def add_file(self, path):
        self.css = HTMLConverter.CSS.copy()
        self.pseudo_css = self.override_pcss.copy()
        self.css.update(self.override_css)
        
-        path = os.path.abspath(path)
-        os.chdir(os.path.dirname(path))
+        path = os.path.normpath(os.path.abspath(path))
        self.file_name = os.path.basename(path)
        self.logger.info('Processing %s', self.file_name)
-        sys.stdout.flush()
-        soup = self.preprocess(open(self.file_name, 'rb').read())
+        raw = open(path, 'rb').read()
+        soup = self.preprocess(raw)
        self.logger.info('\tConverting to BBeB...')
-        sys.stdout.flush()        
        self.current_page = None
        self.current_para = None
        self.current_style = {}
        self.page_break_found = False
-        match = self.PAGE_BREAK_PAT.search(unicode(soup))
-        if match and not re.match('avoid', match.group(1), re.IGNORECASE):
-            self.page_break_found = True
        self.target_prefix = path
-        self.links[path] = []
        self.previous_text = '\n'
-        self.tops[path] = self.parse_file(soup, is_root)
+        self.tops[path] = self.parse_file(soup)
        self.processed_files.append(path)            
-        self.process_links(is_root, path, link_level=link_level)
-            
        
    def parse_css(self, style):
        """
@ -394,7 +404,7 @@ class HTMLConverter(object):
            prop.update(self.parse_style_properties(tag["style"]))
        return prop, pprop
        
-    def parse_file(self, soup, is_root):
+    def parse_file(self, soup):
        def get_valid_block(page):
            for item in page.contents:
                if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
@ -405,8 +415,9 @@ class HTMLConverter(object):
        self.current_page = self.book.create_page()
        self.current_block = self.book.create_text_block()
        self.current_para = Paragraph()
-        if self.cover and is_root:
+        if self.cover:
            self.add_image_page(self.cover)
+            self.cover = None
        top = self.current_block
        
        self.process_children(soup, {}, {})
@ -462,8 +473,9 @@ class HTMLConverter(object):
                except KeyError:
                    pass
        
-        url = urlparse(tag['href'])
-        return {'para':para, 'text':text, 'url':url}
+        path, fragment = munge_paths(self.target_prefix, tag['href'])
+        return {'para':para, 'text':text, 'path':os.path.normpath(path), 
+                'fragment':fragment}
        
    
    def get_text(self, tag, limit=None):
@ -489,7 +501,7 @@ class HTMLConverter(object):
                    text = rule.sub(sub, text)
            return text
    
-    def process_links(self, is_root, selfpath, link_level=0):
+    def process_links(self):
        def add_toc_entry(text, target):
            # TextBlocks in Canvases have a None parent or an Objects Parent
            if target.parent != None and \
@ -531,37 +543,19 @@ class HTMLConverter(object):
                page.contents.remove(bs)
            return ans
        
-        cwd = os.getcwd()
-        for link in self.links[selfpath]:
-            try:
-                para, text, purl = link['para'], link['text'], link['url']
-                # Needed for TOC entries due to bug in LRF
-                ascii_text = text.encode('ascii', 'replace')
-                if purl[1]: # Not a link to a file on the local filesystem
-                    continue
-                basepath, fragment = unquote(purl[2]), purl[5]
-                if not basepath:
-                    basepath = selfpath
-                path = os.path.abspath(basepath)
+        outside_links = deque() 
+        while len(self.links) > 0:
+            link = self.links.popleft()
+            para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
+            # Needed for TOC entries due to bug in LRF
+            ascii_text = text.encode('ascii', 'ignore')
            
-                if link_level < self.link_levels and path not in self.processed_files:                
-                    try:
-                        self.start_on_file(path, is_root=False, link_level=link_level+1)
-                    except Exception:
-                        self.logger.warning('Unable to process %s', path)
-                        if self.verbose:
-                            self.logger.exception(' ')
-                        continue
-                    finally:
-                        os.chdir(cwd)            
+            if path in self.processed_files:
                if path+fragment in self.targets.keys():                    
                    tb = get_target_block(path+fragment, self.targets)
                else:
-                    try:
-                        tb = self.tops[path]
-                    except KeyError:
-                        return
-                if is_root:
+                    tb = self.tops[path]
+                if self.link_level == 0 and len(self.base_files) == 1:
                    add_toc_entry(ascii_text, tb)  
                jb = JumpButton(tb)                
                self.book.append(jb)
@ -572,8 +566,11 @@ class HTMLConverter(object):
                    self.unused_target_blocks.remove(tb)
                except ValueError:
                    pass
-            finally:
-                os.chdir(cwd)
+            else:
+                outside_links.append(link)
+        
+        return outside_links
+            
            
    def end_page(self):
        """
@ -785,6 +782,12 @@ class HTMLConverter(object):
        
    
    def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
+        def detect_encoding(im):
+            fmt = im.format
+            if fmt == 'JPG':
+                fmt = 'JPEG'
+            return fmt
+            
        original_path = path
        if self.rotated_images.has_key(path):
            path = self.rotated_images[path].name
@ -793,15 +796,10 @@ class HTMLConverter(object):
        
        try:
            im = PILImage.open(path)
-            encoding = im.format
-            if encoding:
-                encoding = encoding.upper()
-                if encoding == 'JPG':
-                    encoding = 'JPEG'
        except IOError, err:
            self.logger.warning('Unable to process image: %s\n%s', original_path, err)
            return
-
+        encoding = detect_encoding(im)

        if width == None or height == None:            
            width, height = im.size
@ -809,12 +807,11 @@ class HTMLConverter(object):
        factor = 720./self.profile.dpi
        
        def scale_image(width, height):
-            pt = PersistentTemporaryFile(suffix='.jpeg')
+            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
            try:
-                im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
+                im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
                pt.close()
                self.scaled_images[path] = pt
-                encoding = 'JPEG'
                return pt.name
            except IOError: # PIL chokes on interlaced PNG images
                self.logger.warning('Unable to process interlaced PNG %s', path)
@ -847,12 +844,11 @@ class HTMLConverter(object):
            return
            
        if not self.disable_autorotation and width > pwidth and width > height:
-            pt = PersistentTemporaryFile(suffix='.jpeg')
+            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
            try:
                im = im.rotate(90)
-                im.convert('RGB').save(pt, 'JPEG')
+                im.save(pt, encoding)
                path = pt.name
-                encoding = 'JPEG'
                self.rotated_images[path] = pt
                width, height = im.size
            except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
@ -1245,35 +1241,39 @@ class HTMLConverter(object):
            pass
        elif tagname == 'a' and self.link_levels >= 0:
            if tag.has_key('href') and not self.link_exclude.match(tag['href']):
-                purl = urlparse(tag['href'])
-                path = unquote(purl[2])
+                path = munge_paths(self.target_prefix, tag['href'])[0]
                ext = os.path.splitext(path)[1]
                if ext: ext = ext[1:].lower()
-                if path and os.access(path, os.R_OK) and ext and \
-                                        ext in ['png', 'jpg', 'bmp', 'jpeg']:
-                    self.process_image(path, tag_css)
+                if os.access(path, os.R_OK):
+                    if ext in ['png', 'jpg', 'bmp', 'jpeg']:
+                        self.process_image(path, tag_css)
+                    else:
+                        text = self.get_text(tag, limit=1000)
+                        if not text.strip():
+                            text = "Link"
+                        self.add_text(text, tag_css, {}, force_span_use=True)
+                        self.links.append(self.create_link(self.current_para.contents, tag))
+                        if tag.has_key('id') or tag.has_key('name'):
+                            key = 'name' if tag.has_key('name') else 'id'
+                            self.targets[self.target_prefix+tag[key]] = self.current_block
                else:
-                    text = self.get_text(tag, limit=1000)
-                    if not text.strip():
-                        text = "Link"
-                    self.add_text(text, tag_css, {}, force_span_use=True)
-                    self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
-                    if tag.has_key('id') or tag.has_key('name'):
-                        key = 'name' if tag.has_key('name') else 'id'
-                        self.targets[self.target_prefix+tag[key]] = self.current_block
+                    self.logger.warn('Could not follow link to '+tag['href'])
            elif tag.has_key('name') or tag.has_key('id'):
                self.process_anchor(tag, tag_css, tag_pseudo_css)                            
        elif tagname == 'img':
-            if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
-                path = os.path.abspath(unquote(tag['src']))
-                width, height = None, None
-                try:
-                    width = int(tag['width'])
-                    height = int(tag['height'])
-                except:
-                    pass
-                dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
-                self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
+            if tag.has_key('src'):
+                path = munge_paths(self.target_prefix, tag['src'])[0]
+                if os.access(path, os.R_OK):
+                    width, height = None, None
+                    try:
+                        width = int(tag['width'])
+                        height = int(tag['height'])
+                    except:
+                        pass
+                    dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
+                    self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
+                else:
+                    self.logger.warn('Could not find image: '+tag['src'])
            else:
                self.logger.debug("Failed to process: %s", str(tag))
        elif tagname in ['style', 'link']:
@ -1286,8 +1286,7 @@ class HTMLConverter(object):
                        npcss.update(pcss)
            elif tag.has_key('type') and tag['type'] == "text/css" \
                    and tag.has_key('href'):
-                purl = urlparse(tag['href'])
-                path = unquote(purl[2])                
+                path = munge_paths(self.target_prefix, tag['href'])[0]           
                try:
                    f = open(path, 'rb')
                    src = f.read()
@ -1297,7 +1296,7 @@ class HTMLConverter(object):
                        self.page_break_found = True
                    ncss, npcss = self.parse_css(src)
                except IOError:
-                    pass
+                    self.logger.warn('Could not read stylesheet: '+tag['href'])
            if ncss:
                update_css(ncss, self.css)
                self.css.update(self.override_css)
@ -1609,7 +1608,7 @@ def process_file(path, options, logger=None):
                                         re.compile(fpba[2], re.IGNORECASE)]
        if not hasattr(options, 'anchor_ids'):
            options.anchor_ids = True
-        conv = HTMLConverter(book, fonts, options, logger, path)
+        conv = HTMLConverter(book, fonts, options, logger, [path])
        oname = options.output
        if not oname:
            suffix = '.lrs' if options.lrs else '.lrf'