diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index e1abc970e8..f45276c127 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -14,14 +14,14 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -import tempfile """ Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. """ -import os, re, sys, copy, glob, logging +import os, re, sys, copy, glob, logging, tempfile +from collections import deque from htmlentitydefs import name2codepoint from urllib import unquote from urlparse import urlparse @@ -54,6 +54,15 @@ def update_css(ncss, ocss): ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] + +def munge_paths(basepath, url): + purl = urlparse(url,) + path, fragment = purl[2], purl[5] + if not path: + path = basepath + elif not os.path.isabs(path): + path = os.path.join(os.path.dirname(basepath), path) + return os.path.normpath(path), fragment class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") @@ -161,11 +170,10 @@ class HTMLConverter(object): 'u' : {'text-decoration': 'underline'}, } - def __init__(self, book, fonts, options, logger, path): + def __init__(self, book, fonts, options, logger, paths): ''' - Convert HTML file at C{path} and add it to C{book}. After creating - the object, you must call L{self.process_links} on it to create the links and - then L{self.writeto} to output the LRF/S file. + Convert HTML files at C{paths} and add to C{book}. After creating + the object, you must call L{self.writeto} to output the LRF/S file. @param book: The LRF book @type book: L{libprs500.lrf.pylrs.Book} @@ -180,12 +188,12 @@ class HTMLConverter(object): self.rotated_images = {} #: Temporary files with rotated version of images self.text_styles = []#: Keep track of already used textstyles self.block_styles = []#: Keep track of already used blockstyles - self.images = {} #: Images referenced in the HTML document - self.targets = {} #: and id elements - self.links = {} #: elements + self.images = {} #: Images referenced in the HTML document + self.targets = {} #: and id elements + self.links = deque() #: elements self.processed_files = [] self.unused_target_blocks = [] #: Used to remove extra TextBlocks - self.link_level = 0 #: Current link level + self.link_level = 0 #: Current link level self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported self.tops = {} #: element representing the top of each HTML file in the LRF file self.previous_text = '' #: Used to figure out when to lstrip @@ -209,6 +217,7 @@ class HTMLConverter(object): self.override_css = {} self.override_pcss = {} + if self._override_css is not None: if os.access(self._override_css, os.R_OK): src = open(self._override_css, 'rb').read() @@ -223,7 +232,16 @@ class HTMLConverter(object): if npcss: update_css(npcss, self.override_pcss) - self.start_on_file(path, is_root=True) + paths = [os.path.normpath(os.path.abspath(path)) for path in paths] + self.base_files = copy.copy(paths) + while len(paths) > 0 and self.link_level <= self.link_levels: + for path in paths: + self.add_file(path) + self.links = self.process_links() + self.link_level += 1 + paths = [link['path'] for link in self.links] + + def is_baen(self, soup): return bool(soup.find('meta', attrs={'name':'Publisher', @@ -281,33 +299,25 @@ class HTMLConverter(object): #print soup return soup - def start_on_file(self, path, is_root=True, link_level=0): + def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() self.css.update(self.override_css) - path = os.path.abspath(path) - os.chdir(os.path.dirname(path)) + path = os.path.normpath(os.path.abspath(path)) self.file_name = os.path.basename(path) self.logger.info('Processing %s', self.file_name) - sys.stdout.flush() - soup = self.preprocess(open(self.file_name, 'rb').read()) + raw = open(path, 'rb').read() + soup = self.preprocess(raw) self.logger.info('\tConverting to BBeB...') - sys.stdout.flush() self.current_page = None self.current_para = None self.current_style = {} self.page_break_found = False - match = self.PAGE_BREAK_PAT.search(unicode(soup)) - if match and not re.match('avoid', match.group(1), re.IGNORECASE): - self.page_break_found = True self.target_prefix = path - self.links[path] = [] self.previous_text = '\n' - self.tops[path] = self.parse_file(soup, is_root) - self.processed_files.append(path) - self.process_links(is_root, path, link_level=link_level) - + self.tops[path] = self.parse_file(soup) + self.processed_files.append(path) def parse_css(self, style): """ @@ -394,7 +404,7 @@ class HTMLConverter(object): prop.update(self.parse_style_properties(tag["style"])) return prop, pprop - def parse_file(self, soup, is_root): + def parse_file(self, soup): def get_valid_block(page): for item in page.contents: if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)): @@ -405,8 +415,9 @@ class HTMLConverter(object): self.current_page = self.book.create_page() self.current_block = self.book.create_text_block() self.current_para = Paragraph() - if self.cover and is_root: + if self.cover: self.add_image_page(self.cover) + self.cover = None top = self.current_block self.process_children(soup, {}, {}) @@ -462,8 +473,9 @@ class HTMLConverter(object): except KeyError: pass - url = urlparse(tag['href']) - return {'para':para, 'text':text, 'url':url} + path, fragment = munge_paths(self.target_prefix, tag['href']) + return {'para':para, 'text':text, 'path':os.path.normpath(path), + 'fragment':fragment} def get_text(self, tag, limit=None): @@ -489,7 +501,7 @@ class HTMLConverter(object): text = rule.sub(sub, text) return text - def process_links(self, is_root, selfpath, link_level=0): + def process_links(self): def add_toc_entry(text, target): # TextBlocks in Canvases have a None parent or an Objects Parent if target.parent != None and \ @@ -531,37 +543,19 @@ class HTMLConverter(object): page.contents.remove(bs) return ans - cwd = os.getcwd() - for link in self.links[selfpath]: - try: - para, text, purl = link['para'], link['text'], link['url'] - # Needed for TOC entries due to bug in LRF - ascii_text = text.encode('ascii', 'replace') - if purl[1]: # Not a link to a file on the local filesystem - continue - basepath, fragment = unquote(purl[2]), purl[5] - if not basepath: - basepath = selfpath - path = os.path.abspath(basepath) - - if link_level < self.link_levels and path not in self.processed_files: - try: - self.start_on_file(path, is_root=False, link_level=link_level+1) - except Exception: - self.logger.warning('Unable to process %s', path) - if self.verbose: - self.logger.exception(' ') - continue - finally: - os.chdir(cwd) + outside_links = deque() + while len(self.links) > 0: + link = self.links.popleft() + para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment'] + # Needed for TOC entries due to bug in LRF + ascii_text = text.encode('ascii', 'ignore') + + if path in self.processed_files: if path+fragment in self.targets.keys(): tb = get_target_block(path+fragment, self.targets) else: - try: - tb = self.tops[path] - except KeyError: - return - if is_root: + tb = self.tops[path] + if self.link_level == 0 and len(self.base_files) == 1: add_toc_entry(ascii_text, tb) jb = JumpButton(tb) self.book.append(jb) @@ -572,8 +566,11 @@ class HTMLConverter(object): self.unused_target_blocks.remove(tb) except ValueError: pass - finally: - os.chdir(cwd) + else: + outside_links.append(link) + + return outside_links + def end_page(self): """ @@ -785,6 +782,12 @@ class HTMLConverter(object): def process_image(self, path, tag_css, width=None, height=None, dropcaps=False): + def detect_encoding(im): + fmt = im.format + if fmt == 'JPG': + fmt = 'JPEG' + return fmt + original_path = path if self.rotated_images.has_key(path): path = self.rotated_images[path].name @@ -793,28 +796,22 @@ class HTMLConverter(object): try: im = PILImage.open(path) - encoding = im.format - if encoding: - encoding = encoding.upper() - if encoding == 'JPG': - encoding = 'JPEG' except IOError, err: self.logger.warning('Unable to process image: %s\n%s', original_path, err) return + encoding = detect_encoding(im) - if width == None or height == None: width, height = im.size factor = 720./self.profile.dpi def scale_image(width, height): - pt = PersistentTemporaryFile(suffix='.jpeg') + pt = PersistentTemporaryFile(suffix='.'+encoding.lower()) try: - im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG') + im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding) pt.close() self.scaled_images[path] = pt - encoding = 'JPEG' return pt.name except IOError: # PIL chokes on interlaced PNG images self.logger.warning('Unable to process interlaced PNG %s', path) @@ -847,12 +844,11 @@ class HTMLConverter(object): return if not self.disable_autorotation and width > pwidth and width > height: - pt = PersistentTemporaryFile(suffix='.jpeg') + pt = PersistentTemporaryFile(suffix='.'+encoding.lower()) try: im = im.rotate(90) - im.convert('RGB').save(pt, 'JPEG') + im.save(pt, encoding) path = pt.name - encoding = 'JPEG' self.rotated_images[path] = pt width, height = im.size except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error @@ -1245,35 +1241,39 @@ class HTMLConverter(object): pass elif tagname == 'a' and self.link_levels >= 0: if tag.has_key('href') and not self.link_exclude.match(tag['href']): - purl = urlparse(tag['href']) - path = unquote(purl[2]) + path = munge_paths(self.target_prefix, tag['href'])[0] ext = os.path.splitext(path)[1] if ext: ext = ext[1:].lower() - if path and os.access(path, os.R_OK) and ext and \ - ext in ['png', 'jpg', 'bmp', 'jpeg']: - self.process_image(path, tag_css) + if os.access(path, os.R_OK): + if ext in ['png', 'jpg', 'bmp', 'jpeg']: + self.process_image(path, tag_css) + else: + text = self.get_text(tag, limit=1000) + if not text.strip(): + text = "Link" + self.add_text(text, tag_css, {}, force_span_use=True) + self.links.append(self.create_link(self.current_para.contents, tag)) + if tag.has_key('id') or tag.has_key('name'): + key = 'name' if tag.has_key('name') else 'id' + self.targets[self.target_prefix+tag[key]] = self.current_block else: - text = self.get_text(tag, limit=1000) - if not text.strip(): - text = "Link" - self.add_text(text, tag_css, {}, force_span_use=True) - self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag)) - if tag.has_key('id') or tag.has_key('name'): - key = 'name' if tag.has_key('name') else 'id' - self.targets[self.target_prefix+tag[key]] = self.current_block + self.logger.warn('Could not follow link to '+tag['href']) elif tag.has_key('name') or tag.has_key('id'): self.process_anchor(tag, tag_css, tag_pseudo_css) elif tagname == 'img': - if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK): - path = os.path.abspath(unquote(tag['src'])) - width, height = None, None - try: - width = int(tag['width']) - height = int(tag['height']) - except: - pass - dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps' - self.process_image(path, tag_css, width, height, dropcaps=dropcaps) + if tag.has_key('src'): + path = munge_paths(self.target_prefix, tag['src'])[0] + if os.access(path, os.R_OK): + width, height = None, None + try: + width = int(tag['width']) + height = int(tag['height']) + except: + pass + dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps' + self.process_image(path, tag_css, width, height, dropcaps=dropcaps) + else: + self.logger.warn('Could not find image: '+tag['src']) else: self.logger.debug("Failed to process: %s", str(tag)) elif tagname in ['style', 'link']: @@ -1286,8 +1286,7 @@ class HTMLConverter(object): npcss.update(pcss) elif tag.has_key('type') and tag['type'] == "text/css" \ and tag.has_key('href'): - purl = urlparse(tag['href']) - path = unquote(purl[2]) + path = munge_paths(self.target_prefix, tag['href'])[0] try: f = open(path, 'rb') src = f.read() @@ -1297,7 +1296,7 @@ class HTMLConverter(object): self.page_break_found = True ncss, npcss = self.parse_css(src) except IOError: - pass + self.logger.warn('Could not read stylesheet: '+tag['href']) if ncss: update_css(ncss, self.css) self.css.update(self.override_css) @@ -1609,7 +1608,7 @@ def process_file(path, options, logger=None): re.compile(fpba[2], re.IGNORECASE)] if not hasattr(options, 'anchor_ids'): options.anchor_ids = True - conv = HTMLConverter(book, fonts, options, logger, path) + conv = HTMLConverter(book, fonts, options, logger, [path]) oname = options.output if not oname: suffix = '.lrs' if options.lrs else '.lrf'