Fix bug in transformation of non-JPEG images. html2lrf internals refactored to support --use-spine.

Kovid Goyal 2007-10-12 17:43:47 +00:00
parent 9e1498969d
commit df82180209


@@ -14,14 +14,14 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-import tempfile
 """
 Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the initial CSS->Xylog Style conversion code
 and to Falstaff for pylrs.
 """
-import os, re, sys, copy, glob, logging
+import os, re, sys, copy, glob, logging, tempfile
+from collections import deque
 from htmlentitydefs import name2codepoint
 from urllib import unquote
 from urlparse import urlparse
@@ -55,6 +55,15 @@ def update_css(ncss, ocss):
         else:
             ocss[key] = ncss[key]

+def munge_paths(basepath, url):
+    purl = urlparse(url,)
+    path, fragment = purl[2], purl[5]
+    if not path:
+        path = basepath
+    elif not os.path.isabs(path):
+        path = os.path.join(os.path.dirname(basepath), path)
+    return os.path.normpath(path), fragment
+
 class HTMLConverter(object):
     SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
     PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
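The new module-level munge_paths helper centralizes the href/src resolution that was previously done ad hoc with urlparse, unquote and os.chdir: it splits off the fragment and resolves a relative path against the HTML file currently being converted. A small usage sketch, repeating the definition from this hunk; the example paths are made up:

    from urlparse import urlparse
    import os

    def munge_paths(basepath, url):
        # basepath is the HTML file the link appears in, url is the raw href/src value
        purl = urlparse(url,)
        path, fragment = purl[2], purl[5]
        if not path:                      # e.g. href="#notes": stay in the same file
            path = basepath
        elif not os.path.isabs(path):     # relative link: resolve against the current file
            path = os.path.join(os.path.dirname(basepath), path)
        return os.path.normpath(path), fragment

    print munge_paths('/books/demo/index.html', 'ch1.html#top')
    # -> ('/books/demo/ch1.html', 'top')
    print munge_paths('/books/demo/index.html', '#notes')
    # -> ('/books/demo/index.html', 'notes')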
@@ -161,11 +170,10 @@ class HTMLConverter(object):
         'u'     : {'text-decoration': 'underline'},
         }

-    def __init__(self, book, fonts, options, logger, path):
+    def __init__(self, book, fonts, options, logger, paths):
         '''
-        Convert HTML file at C{path} and add it to C{book}. After creating
-        the object, you must call L{self.process_links} on it to create the links and
-        then L{self.writeto} to output the LRF/S file.
+        Convert HTML files at C{paths} and add to C{book}. After creating
+        the object, you must call L{self.writeto} to output the LRF/S file.

         @param book: The LRF book
         @type book: L{libprs500.lrf.pylrs.Book}
@@ -180,12 +188,12 @@ class HTMLConverter(object):
         self.rotated_images = {}       #: Temporary files with rotated version of images
         self.text_styles    = []       #: Keep track of already used textstyles
         self.block_styles   = []       #: Keep track of already used blockstyles
         self.images  = {}              #: Images referenced in the HTML document
         self.targets = {}              #: <a name=...> and id elements
-        self.links   = {}              #: <a href=...> elements
+        self.links   = deque()         #: <a href=...> elements
         self.processed_files = []
         self.unused_target_blocks = [] #: Used to remove extra TextBlocks
         self.link_level = 0            #: Current link level
         self.memory = []               #: Used to ensure that duplicate CSS unhandled erros are not reported
         self.tops = {}                 #: element representing the top of each HTML file in the LRF file
         self.previous_text = ''        #: Used to figure out when to lstrip
@@ -209,6 +217,7 @@ class HTMLConverter(object):
         self.override_css = {}
         self.override_pcss = {}

         if self._override_css is not None:
             if os.access(self._override_css, os.R_OK):
                 src = open(self._override_css, 'rb').read()
@@ -223,7 +232,16 @@ class HTMLConverter(object):
             if npcss:
                 update_css(npcss, self.override_pcss)

-        self.start_on_file(path, is_root=True)
+        paths = [os.path.normpath(os.path.abspath(path)) for path in paths]
+        self.base_files = copy.copy(paths)
+        while len(paths) > 0 and self.link_level <= self.link_levels:
+            for path in paths:
+                self.add_file(path)
+            self.links = self.process_links()
+            self.link_level += 1
+            paths = [link['path'] for link in self.links]

     def is_baen(self, soup):
         return bool(soup.find('meta', attrs={'name':'Publisher',
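The constructor now drives the whole conversion: it converts every file at the current link level, then follows links that point outside the set of already-converted files, one level at a time, until self.link_levels is exhausted. Since it accepts a list of paths, a caller can hand it an entire spine, which is what --use-spine builds on. A simplified sketch of the traversal; crawl, add_file and process_links are stand-ins here, not the real HTMLConverter methods:

    from collections import deque

    def crawl(start_files, link_levels, add_file, process_links):
        paths, level = list(start_files), 0
        pending = deque()
        while len(paths) > 0 and level <= link_levels:
            for path in paths:
                add_file(path)            # parse this HTML file and append it to the book
            pending = process_links()     # wire up links whose targets are already in the
                                          # book; returns the links that point elsewhere
            level += 1
            paths = [link['path'] for link in pending]   # follow those one level deeper
        return pending                    # whatever is left here was never followed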
@@ -281,33 +299,25 @@ class HTMLConverter(object):
         #print soup
         return soup

-    def start_on_file(self, path, is_root=True, link_level=0):
+    def add_file(self, path):
         self.css = HTMLConverter.CSS.copy()
         self.pseudo_css = self.override_pcss.copy()
         self.css.update(self.override_css)
-        path = os.path.abspath(path)
-        os.chdir(os.path.dirname(path))
+        path = os.path.normpath(os.path.abspath(path))
         self.file_name = os.path.basename(path)
         self.logger.info('Processing %s', self.file_name)
-        sys.stdout.flush()
-        soup = self.preprocess(open(self.file_name, 'rb').read())
+        raw = open(path, 'rb').read()
+        soup = self.preprocess(raw)
         self.logger.info('\tConverting to BBeB...')
-        sys.stdout.flush()
         self.current_page = None
         self.current_para = None
         self.current_style = {}
         self.page_break_found = False
-        match = self.PAGE_BREAK_PAT.search(unicode(soup))
-        if match and not re.match('avoid', match.group(1), re.IGNORECASE):
-            self.page_break_found = True
         self.target_prefix = path
-        self.links[path] = []
         self.previous_text = '\n'
-        self.tops[path] = self.parse_file(soup, is_root)
+        self.tops[path] = self.parse_file(soup)
         self.processed_files.append(path)
-        self.process_links(is_root, path, link_level=link_level)

     def parse_css(self, style):
         """
@@ -394,7 +404,7 @@ class HTMLConverter(object):
             prop.update(self.parse_style_properties(tag["style"]))
         return prop, pprop

-    def parse_file(self, soup, is_root):
+    def parse_file(self, soup):
         def get_valid_block(page):
             for item in page.contents:
                 if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
@@ -405,8 +415,9 @@ class HTMLConverter(object):
         self.current_page = self.book.create_page()
         self.current_block = self.book.create_text_block()
         self.current_para = Paragraph()
-        if self.cover and is_root:
+        if self.cover:
             self.add_image_page(self.cover)
+            self.cover = None
         top = self.current_block

         self.process_children(soup, {}, {})
@@ -462,8 +473,9 @@ class HTMLConverter(object):
             except KeyError:
                 pass
-        url = urlparse(tag['href'])
-        return {'para':para, 'text':text, 'url':url}
+        path, fragment = munge_paths(self.target_prefix, tag['href'])
+        return {'para':para, 'text':text, 'path':os.path.normpath(path),
+                'fragment':fragment}

     def get_text(self, tag, limit=None):
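create_link now stores the resolved filesystem path and fragment in the link record, so process_links no longer has to re-parse URLs or guess the base directory. A hypothetical example of one record as it sits in the self.links deque (all values made up):

    para = None   # stands in for the pylrs Paragraph holding the link text
    link = {
        'para'    : para,                     # paragraph the JumpButton span belongs to
        'text'    : u'Chapter 1',             # text reused for the TOC entry
        'path'    : '/books/demo/ch1.html',   # normalized absolute path of the target file
        'fragment': 'top',                    # <a name=...>/id inside that file, '' if none
    }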
@@ -489,7 +501,7 @@ class HTMLConverter(object):
             text = rule.sub(sub, text)
         return text

-    def process_links(self, is_root, selfpath, link_level=0):
+    def process_links(self):
         def add_toc_entry(text, target):
             # TextBlocks in Canvases have a None parent or an Objects Parent
             if target.parent != None and \
@@ -531,37 +543,19 @@ class HTMLConverter(object):
                     page.contents.remove(bs)
             return ans

-        cwd = os.getcwd()
-        for link in self.links[selfpath]:
-            try:
-                para, text, purl = link['para'], link['text'], link['url']
-                # Needed for TOC entries due to bug in LRF
-                ascii_text = text.encode('ascii', 'replace')
-                if purl[1]: # Not a link to a file on the local filesystem
-                    continue
-                basepath, fragment = unquote(purl[2]), purl[5]
-                if not basepath:
-                    basepath = selfpath
-                path = os.path.abspath(basepath)
-                if link_level < self.link_levels and path not in self.processed_files:
-                    try:
-                        self.start_on_file(path, is_root=False, link_level=link_level+1)
-                    except Exception:
-                        self.logger.warning('Unable to process %s', path)
-                        if self.verbose:
-                            self.logger.exception(' ')
-                        continue
-                    finally:
-                        os.chdir(cwd)
-                if path+fragment in self.targets.keys():
-                    tb = get_target_block(path+fragment, self.targets)
-                else:
-                    try:
-                        tb = self.tops[path]
-                    except KeyError:
-                        return
-                if is_root:
-                    add_toc_entry(ascii_text, tb)
-                jb = JumpButton(tb)
-                self.book.append(jb)
+        outside_links = deque()
+        while len(self.links) > 0:
+            link = self.links.popleft()
+            para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
+            # Needed for TOC entries due to bug in LRF
+            ascii_text = text.encode('ascii', 'ignore')
+            if path in self.processed_files:
+                if path+fragment in self.targets.keys():
+                    tb = get_target_block(path+fragment, self.targets)
+                else:
+                    tb = self.tops[path]
+                if self.link_level == 0 and len(self.base_files) == 1:
+                    add_toc_entry(ascii_text, tb)
+                jb = JumpButton(tb)
+                self.book.append(jb)
@@ -572,8 +566,11 @@ class HTMLConverter(object):
                     self.unused_target_blocks.remove(tb)
                 except ValueError:
                     pass
-            finally:
-                os.chdir(cwd)
+            else:
+                outside_links.append(link)
+        return outside_links

     def end_page(self):
         """
@@ -785,6 +782,12 @@ class HTMLConverter(object):
     def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
+        def detect_encoding(im):
+            fmt = im.format
+            if fmt == 'JPG':
+                fmt = 'JPEG'
+            return fmt
+
         original_path = path
         if self.rotated_images.has_key(path):
             path = self.rotated_images[path].name
@@ -793,15 +796,10 @@ class HTMLConverter(object):
         try:
             im = PILImage.open(path)
-            encoding = im.format
-            if encoding:
-                encoding = encoding.upper()
-                if encoding == 'JPG':
-                    encoding = 'JPEG'
         except IOError, err:
             self.logger.warning('Unable to process image: %s\n%s', original_path, err)
             return
+        encoding = detect_encoding(im)

         if width == None or height == None:
             width, height = im.size
@@ -809,12 +807,11 @@ class HTMLConverter(object):
         factor = 720./self.profile.dpi

         def scale_image(width, height):
-            pt = PersistentTemporaryFile(suffix='.jpeg')
+            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
             try:
-                im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
+                im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
                 pt.close()
                 self.scaled_images[path] = pt
-                encoding = 'JPEG'
                 return pt.name
             except IOError: # PIL chokes on interlaced PNG images
                 self.logger.warning('Unable to process interlaced PNG %s', path)
@@ -847,12 +844,11 @@ class HTMLConverter(object):
             return

         if not self.disable_autorotation and width > pwidth and width > height:
-            pt = PersistentTemporaryFile(suffix='.jpeg')
+            pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
             try:
                 im = im.rotate(90)
-                im.convert('RGB').save(pt, 'JPEG')
+                im.save(pt, encoding)
                 path = pt.name
-                encoding = 'JPEG'
                 self.rotated_images[path] = pt
                 width, height = im.size
             except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
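This is the fix the commit message refers to: the old code converted every scaled or rotated image to RGB and wrote it out as JPEG, and because scale_image is a nested function its encoding = 'JPEG' assignment never rebound the enclosing variable, so non-JPEG images could end up declared with the wrong encoding downstream. The new code detects the source format once and keeps it for every derived copy. A standalone sketch of the idea; the import style and file paths are assumptions, not taken from the repository:

    from PIL import Image as PILImage   # assumed import; the converter already has PILImage in scope

    def detect_encoding(im):
        fmt = im.format                  # 'PNG', 'GIF', 'JPEG', ... as reported by PIL
        if fmt == 'JPG':                 # defensive mapping, mirroring the commit
            fmt = 'JPEG'
        return fmt

    def save_scaled(src_path, width, height, out_path):
        im = PILImage.open(src_path)
        encoding = detect_encoding(im)
        # A palette or transparent PNG stays a PNG; a JPEG stays a JPEG.
        im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(out_path, encoding)
        return encoding                  # callers can now report the true stream type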
@@ -1245,35 +1241,39 @@ class HTMLConverter(object):
                 pass
         elif tagname == 'a' and self.link_levels >= 0:
             if tag.has_key('href') and not self.link_exclude.match(tag['href']):
-                purl = urlparse(tag['href'])
-                path = unquote(purl[2])
+                path = munge_paths(self.target_prefix, tag['href'])[0]
                 ext = os.path.splitext(path)[1]
                 if ext: ext = ext[1:].lower()
-                if path and os.access(path, os.R_OK) and ext and \
-                   ext in ['png', 'jpg', 'bmp', 'jpeg']:
-                    self.process_image(path, tag_css)
-                else:
-                    text = self.get_text(tag, limit=1000)
-                    if not text.strip():
-                        text = "Link"
-                    self.add_text(text, tag_css, {}, force_span_use=True)
-                    self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
-                    if tag.has_key('id') or tag.has_key('name'):
-                        key = 'name' if tag.has_key('name') else 'id'
-                        self.targets[self.target_prefix+tag[key]] = self.current_block
+                if os.access(path, os.R_OK):
+                    if ext in ['png', 'jpg', 'bmp', 'jpeg']:
+                        self.process_image(path, tag_css)
+                    else:
+                        text = self.get_text(tag, limit=1000)
+                        if not text.strip():
+                            text = "Link"
+                        self.add_text(text, tag_css, {}, force_span_use=True)
+                        self.links.append(self.create_link(self.current_para.contents, tag))
+                        if tag.has_key('id') or tag.has_key('name'):
+                            key = 'name' if tag.has_key('name') else 'id'
+                            self.targets[self.target_prefix+tag[key]] = self.current_block
+                else:
+                    self.logger.warn('Could not follow link to '+tag['href'])
             elif tag.has_key('name') or tag.has_key('id'):
                 self.process_anchor(tag, tag_css, tag_pseudo_css)
         elif tagname == 'img':
-            if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
-                path = os.path.abspath(unquote(tag['src']))
-                width, height = None, None
-                try:
-                    width = int(tag['width'])
-                    height = int(tag['height'])
-                except:
-                    pass
-                dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
-                self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
+            if tag.has_key('src'):
+                path = munge_paths(self.target_prefix, tag['src'])[0]
+                if os.access(path, os.R_OK):
+                    width, height = None, None
+                    try:
+                        width = int(tag['width'])
+                        height = int(tag['height'])
+                    except:
+                        pass
+                    dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
+                    self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
+                else:
+                    self.logger.warn('Could not find image: '+tag['src'])
             else:
                 self.logger.debug("Failed to process: %s", str(tag))
         elif tagname in ['style', 'link']:
@@ -1286,8 +1286,7 @@ class HTMLConverter(object):
                     npcss.update(pcss)
             elif tag.has_key('type') and tag['type'] == "text/css" \
                  and tag.has_key('href'):
-                purl = urlparse(tag['href'])
-                path = unquote(purl[2])
+                path = munge_paths(self.target_prefix, tag['href'])[0]
                 try:
                     f = open(path, 'rb')
                     src = f.read()
@@ -1297,7 +1296,7 @@ class HTMLConverter(object):
                             self.page_break_found = True
                     ncss, npcss = self.parse_css(src)
                 except IOError:
-                    pass
+                    self.logger.warn('Could not read stylesheet: '+tag['href'])
                 if ncss:
                     update_css(ncss, self.css)
                     self.css.update(self.override_css)
@@ -1609,7 +1608,7 @@ def process_file(path, options, logger=None):
                         re.compile(fpba[2], re.IGNORECASE)]
     if not hasattr(options, 'anchor_ids'):
         options.anchor_ids = True
-    conv = HTMLConverter(book, fonts, options, logger, path)
+    conv = HTMLConverter(book, fonts, options, logger, [path])
     oname = options.output
     if not oname:
         suffix = '.lrs' if options.lrs else '.lrf'