mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix bug in transformation of non-jpeg images. html2lrf internals refactored to support --use-spine.
This commit is contained in:
parent
9e1498969d
commit
df82180209
@ -14,14 +14,14 @@
|
|||||||
## You should have received a copy of the GNU General Public License along
|
## You should have received a copy of the GNU General Public License along
|
||||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
import tempfile
|
|
||||||
"""
|
"""
|
||||||
Code to convert HTML ebooks into LRF ebooks.
|
Code to convert HTML ebooks into LRF ebooks.
|
||||||
|
|
||||||
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
|
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
|
||||||
and to Falstaff for pylrs.
|
and to Falstaff for pylrs.
|
||||||
"""
|
"""
|
||||||
import os, re, sys, copy, glob, logging
|
import os, re, sys, copy, glob, logging, tempfile
|
||||||
|
from collections import deque
|
||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
@ -55,6 +55,15 @@ def update_css(ncss, ocss):
|
|||||||
else:
|
else:
|
||||||
ocss[key] = ncss[key]
|
ocss[key] = ncss[key]
|
||||||
|
|
||||||
|
def munge_paths(basepath, url):
    '''
    Resolve the link C{url}, found in the file C{basepath}, into a file
    system path and a fragment.

    Only the path and fragment components of C{url} are used; any scheme,
    host, parameters or query string present in it are ignored.

    @param basepath: Path of the file in which C{url} appears. It is returned
                     (normalized) when C{url} has no path component, e.g.
                     for a pure C{#fragment} link.
    @param url: The link target. May be relative and may contain %-escapes.
    @return: C{(path, fragment)}. C{path} is normalized with
             C{os.path.normpath}; a relative C{url} is resolved against the
             directory of C{basepath}. C{fragment} is returned exactly as it
             appears in C{url} (empty string when absent).
    '''
    purl = urlparse(url)
    # Decode %-escapes (e.g. "my%20file.html") so the result names the real
    # file on disk. This is done after splitting the URL so that an escaped
    # '#' or '?' inside a file name is not mistaken for a URL delimiter.
    path, fragment = unquote(purl[2]), purl[5]
    if not path:
        # Link points into the current document
        path = basepath
    elif not os.path.isabs(path):
        path = os.path.join(os.path.dirname(basepath), path)
    return os.path.normpath(path), fragment
|
||||||
|
|
||||||
class HTMLConverter(object):
|
class HTMLConverter(object):
|
||||||
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
||||||
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
|
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
|
||||||
@ -161,11 +170,10 @@ class HTMLConverter(object):
|
|||||||
'u' : {'text-decoration': 'underline'},
|
'u' : {'text-decoration': 'underline'},
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, book, fonts, options, logger, path):
|
def __init__(self, book, fonts, options, logger, paths):
|
||||||
'''
|
'''
|
||||||
Convert HTML file at C{path} and add it to C{book}. After creating
|
Convert HTML files at C{paths} and add to C{book}. After creating
|
||||||
the object, you must call L{self.process_links} on it to create the links and
|
the object, you must call L{self.writeto} to output the LRF/S file.
|
||||||
then L{self.writeto} to output the LRF/S file.
|
|
||||||
|
|
||||||
@param book: The LRF book
|
@param book: The LRF book
|
||||||
@type book: L{libprs500.lrf.pylrs.Book}
|
@type book: L{libprs500.lrf.pylrs.Book}
|
||||||
@ -182,7 +190,7 @@ class HTMLConverter(object):
|
|||||||
self.block_styles = []#: Keep track of already used blockstyles
|
self.block_styles = []#: Keep track of already used blockstyles
|
||||||
self.images = {} #: Images referenced in the HTML document
|
self.images = {} #: Images referenced in the HTML document
|
||||||
self.targets = {} #: <a name=...> and id elements
|
self.targets = {} #: <a name=...> and id elements
|
||||||
self.links = {} #: <a href=...> elements
|
self.links = deque() #: <a href=...> elements
|
||||||
self.processed_files = []
|
self.processed_files = []
|
||||||
self.unused_target_blocks = [] #: Used to remove extra TextBlocks
|
self.unused_target_blocks = [] #: Used to remove extra TextBlocks
|
||||||
self.link_level = 0 #: Current link level
|
self.link_level = 0 #: Current link level
|
||||||
@ -209,6 +217,7 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
self.override_css = {}
|
self.override_css = {}
|
||||||
self.override_pcss = {}
|
self.override_pcss = {}
|
||||||
|
|
||||||
if self._override_css is not None:
|
if self._override_css is not None:
|
||||||
if os.access(self._override_css, os.R_OK):
|
if os.access(self._override_css, os.R_OK):
|
||||||
src = open(self._override_css, 'rb').read()
|
src = open(self._override_css, 'rb').read()
|
||||||
@ -223,7 +232,16 @@ class HTMLConverter(object):
|
|||||||
if npcss:
|
if npcss:
|
||||||
update_css(npcss, self.override_pcss)
|
update_css(npcss, self.override_pcss)
|
||||||
|
|
||||||
self.start_on_file(path, is_root=True)
|
paths = [os.path.normpath(os.path.abspath(path)) for path in paths]
|
||||||
|
self.base_files = copy.copy(paths)
|
||||||
|
while len(paths) > 0 and self.link_level <= self.link_levels:
|
||||||
|
for path in paths:
|
||||||
|
self.add_file(path)
|
||||||
|
self.links = self.process_links()
|
||||||
|
self.link_level += 1
|
||||||
|
paths = [link['path'] for link in self.links]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def is_baen(self, soup):
|
def is_baen(self, soup):
|
||||||
return bool(soup.find('meta', attrs={'name':'Publisher',
|
return bool(soup.find('meta', attrs={'name':'Publisher',
|
||||||
@ -281,33 +299,25 @@ class HTMLConverter(object):
|
|||||||
#print soup
|
#print soup
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def start_on_file(self, path, is_root=True, link_level=0):
|
def add_file(self, path):
|
||||||
self.css = HTMLConverter.CSS.copy()
|
self.css = HTMLConverter.CSS.copy()
|
||||||
self.pseudo_css = self.override_pcss.copy()
|
self.pseudo_css = self.override_pcss.copy()
|
||||||
self.css.update(self.override_css)
|
self.css.update(self.override_css)
|
||||||
|
|
||||||
path = os.path.abspath(path)
|
path = os.path.normpath(os.path.abspath(path))
|
||||||
os.chdir(os.path.dirname(path))
|
|
||||||
self.file_name = os.path.basename(path)
|
self.file_name = os.path.basename(path)
|
||||||
self.logger.info('Processing %s', self.file_name)
|
self.logger.info('Processing %s', self.file_name)
|
||||||
sys.stdout.flush()
|
raw = open(path, 'rb').read()
|
||||||
soup = self.preprocess(open(self.file_name, 'rb').read())
|
soup = self.preprocess(raw)
|
||||||
self.logger.info('\tConverting to BBeB...')
|
self.logger.info('\tConverting to BBeB...')
|
||||||
sys.stdout.flush()
|
|
||||||
self.current_page = None
|
self.current_page = None
|
||||||
self.current_para = None
|
self.current_para = None
|
||||||
self.current_style = {}
|
self.current_style = {}
|
||||||
self.page_break_found = False
|
self.page_break_found = False
|
||||||
match = self.PAGE_BREAK_PAT.search(unicode(soup))
|
|
||||||
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
|
|
||||||
self.page_break_found = True
|
|
||||||
self.target_prefix = path
|
self.target_prefix = path
|
||||||
self.links[path] = []
|
|
||||||
self.previous_text = '\n'
|
self.previous_text = '\n'
|
||||||
self.tops[path] = self.parse_file(soup, is_root)
|
self.tops[path] = self.parse_file(soup)
|
||||||
self.processed_files.append(path)
|
self.processed_files.append(path)
|
||||||
self.process_links(is_root, path, link_level=link_level)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_css(self, style):
|
def parse_css(self, style):
|
||||||
"""
|
"""
|
||||||
@ -394,7 +404,7 @@ class HTMLConverter(object):
|
|||||||
prop.update(self.parse_style_properties(tag["style"]))
|
prop.update(self.parse_style_properties(tag["style"]))
|
||||||
return prop, pprop
|
return prop, pprop
|
||||||
|
|
||||||
def parse_file(self, soup, is_root):
|
def parse_file(self, soup):
|
||||||
def get_valid_block(page):
|
def get_valid_block(page):
|
||||||
for item in page.contents:
|
for item in page.contents:
|
||||||
if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
|
if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
|
||||||
@ -405,8 +415,9 @@ class HTMLConverter(object):
|
|||||||
self.current_page = self.book.create_page()
|
self.current_page = self.book.create_page()
|
||||||
self.current_block = self.book.create_text_block()
|
self.current_block = self.book.create_text_block()
|
||||||
self.current_para = Paragraph()
|
self.current_para = Paragraph()
|
||||||
if self.cover and is_root:
|
if self.cover:
|
||||||
self.add_image_page(self.cover)
|
self.add_image_page(self.cover)
|
||||||
|
self.cover = None
|
||||||
top = self.current_block
|
top = self.current_block
|
||||||
|
|
||||||
self.process_children(soup, {}, {})
|
self.process_children(soup, {}, {})
|
||||||
@ -462,8 +473,9 @@ class HTMLConverter(object):
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
url = urlparse(tag['href'])
|
path, fragment = munge_paths(self.target_prefix, tag['href'])
|
||||||
return {'para':para, 'text':text, 'url':url}
|
return {'para':para, 'text':text, 'path':os.path.normpath(path),
|
||||||
|
'fragment':fragment}
|
||||||
|
|
||||||
|
|
||||||
def get_text(self, tag, limit=None):
|
def get_text(self, tag, limit=None):
|
||||||
@ -489,7 +501,7 @@ class HTMLConverter(object):
|
|||||||
text = rule.sub(sub, text)
|
text = rule.sub(sub, text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def process_links(self, is_root, selfpath, link_level=0):
|
def process_links(self):
|
||||||
def add_toc_entry(text, target):
|
def add_toc_entry(text, target):
|
||||||
# TextBlocks in Canvases have a None parent or an Objects Parent
|
# TextBlocks in Canvases have a None parent or an Objects Parent
|
||||||
if target.parent != None and \
|
if target.parent != None and \
|
||||||
@ -531,37 +543,19 @@ class HTMLConverter(object):
|
|||||||
page.contents.remove(bs)
|
page.contents.remove(bs)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
cwd = os.getcwd()
|
outside_links = deque()
|
||||||
for link in self.links[selfpath]:
|
while len(self.links) > 0:
|
||||||
try:
|
link = self.links.popleft()
|
||||||
para, text, purl = link['para'], link['text'], link['url']
|
para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
|
||||||
# Needed for TOC entries due to bug in LRF
|
# Needed for TOC entries due to bug in LRF
|
||||||
ascii_text = text.encode('ascii', 'replace')
|
ascii_text = text.encode('ascii', 'ignore')
|
||||||
if purl[1]: # Not a link to a file on the local filesystem
|
|
||||||
continue
|
|
||||||
basepath, fragment = unquote(purl[2]), purl[5]
|
|
||||||
if not basepath:
|
|
||||||
basepath = selfpath
|
|
||||||
path = os.path.abspath(basepath)
|
|
||||||
|
|
||||||
if link_level < self.link_levels and path not in self.processed_files:
|
if path in self.processed_files:
|
||||||
try:
|
|
||||||
self.start_on_file(path, is_root=False, link_level=link_level+1)
|
|
||||||
except Exception:
|
|
||||||
self.logger.warning('Unable to process %s', path)
|
|
||||||
if self.verbose:
|
|
||||||
self.logger.exception(' ')
|
|
||||||
continue
|
|
||||||
finally:
|
|
||||||
os.chdir(cwd)
|
|
||||||
if path+fragment in self.targets.keys():
|
if path+fragment in self.targets.keys():
|
||||||
tb = get_target_block(path+fragment, self.targets)
|
tb = get_target_block(path+fragment, self.targets)
|
||||||
else:
|
else:
|
||||||
try:
|
|
||||||
tb = self.tops[path]
|
tb = self.tops[path]
|
||||||
except KeyError:
|
if self.link_level == 0 and len(self.base_files) == 1:
|
||||||
return
|
|
||||||
if is_root:
|
|
||||||
add_toc_entry(ascii_text, tb)
|
add_toc_entry(ascii_text, tb)
|
||||||
jb = JumpButton(tb)
|
jb = JumpButton(tb)
|
||||||
self.book.append(jb)
|
self.book.append(jb)
|
||||||
@ -572,8 +566,11 @@ class HTMLConverter(object):
|
|||||||
self.unused_target_blocks.remove(tb)
|
self.unused_target_blocks.remove(tb)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
finally:
|
else:
|
||||||
os.chdir(cwd)
|
outside_links.append(link)
|
||||||
|
|
||||||
|
return outside_links
|
||||||
|
|
||||||
|
|
||||||
def end_page(self):
|
def end_page(self):
|
||||||
"""
|
"""
|
||||||
@ -785,6 +782,12 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
|
|
||||||
def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
|
def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
|
||||||
|
def detect_encoding(im):
|
||||||
|
fmt = im.format
|
||||||
|
if fmt == 'JPG':
|
||||||
|
fmt = 'JPEG'
|
||||||
|
return fmt
|
||||||
|
|
||||||
original_path = path
|
original_path = path
|
||||||
if self.rotated_images.has_key(path):
|
if self.rotated_images.has_key(path):
|
||||||
path = self.rotated_images[path].name
|
path = self.rotated_images[path].name
|
||||||
@ -793,15 +796,10 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
im = PILImage.open(path)
|
im = PILImage.open(path)
|
||||||
encoding = im.format
|
|
||||||
if encoding:
|
|
||||||
encoding = encoding.upper()
|
|
||||||
if encoding == 'JPG':
|
|
||||||
encoding = 'JPEG'
|
|
||||||
except IOError, err:
|
except IOError, err:
|
||||||
self.logger.warning('Unable to process image: %s\n%s', original_path, err)
|
self.logger.warning('Unable to process image: %s\n%s', original_path, err)
|
||||||
return
|
return
|
||||||
|
encoding = detect_encoding(im)
|
||||||
|
|
||||||
if width == None or height == None:
|
if width == None or height == None:
|
||||||
width, height = im.size
|
width, height = im.size
|
||||||
@ -809,12 +807,11 @@ class HTMLConverter(object):
|
|||||||
factor = 720./self.profile.dpi
|
factor = 720./self.profile.dpi
|
||||||
|
|
||||||
def scale_image(width, height):
|
def scale_image(width, height):
|
||||||
pt = PersistentTemporaryFile(suffix='.jpeg')
|
pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
|
||||||
try:
|
try:
|
||||||
im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
|
im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
|
||||||
pt.close()
|
pt.close()
|
||||||
self.scaled_images[path] = pt
|
self.scaled_images[path] = pt
|
||||||
encoding = 'JPEG'
|
|
||||||
return pt.name
|
return pt.name
|
||||||
except IOError: # PIL chokes on interlaced PNG images
|
except IOError: # PIL chokes on interlaced PNG images
|
||||||
self.logger.warning('Unable to process interlaced PNG %s', path)
|
self.logger.warning('Unable to process interlaced PNG %s', path)
|
||||||
@ -847,12 +844,11 @@ class HTMLConverter(object):
|
|||||||
return
|
return
|
||||||
|
|
||||||
if not self.disable_autorotation and width > pwidth and width > height:
|
if not self.disable_autorotation and width > pwidth and width > height:
|
||||||
pt = PersistentTemporaryFile(suffix='.jpeg')
|
pt = PersistentTemporaryFile(suffix='.'+encoding.lower())
|
||||||
try:
|
try:
|
||||||
im = im.rotate(90)
|
im = im.rotate(90)
|
||||||
im.convert('RGB').save(pt, 'JPEG')
|
im.save(pt, encoding)
|
||||||
path = pt.name
|
path = pt.name
|
||||||
encoding = 'JPEG'
|
|
||||||
self.rotated_images[path] = pt
|
self.rotated_images[path] = pt
|
||||||
width, height = im.size
|
width, height = im.size
|
||||||
except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
|
except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
|
||||||
@ -1245,27 +1241,29 @@ class HTMLConverter(object):
|
|||||||
pass
|
pass
|
||||||
elif tagname == 'a' and self.link_levels >= 0:
|
elif tagname == 'a' and self.link_levels >= 0:
|
||||||
if tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
if tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
||||||
purl = urlparse(tag['href'])
|
path = munge_paths(self.target_prefix, tag['href'])[0]
|
||||||
path = unquote(purl[2])
|
|
||||||
ext = os.path.splitext(path)[1]
|
ext = os.path.splitext(path)[1]
|
||||||
if ext: ext = ext[1:].lower()
|
if ext: ext = ext[1:].lower()
|
||||||
if path and os.access(path, os.R_OK) and ext and \
|
if os.access(path, os.R_OK):
|
||||||
ext in ['png', 'jpg', 'bmp', 'jpeg']:
|
if ext in ['png', 'jpg', 'bmp', 'jpeg']:
|
||||||
self.process_image(path, tag_css)
|
self.process_image(path, tag_css)
|
||||||
else:
|
else:
|
||||||
text = self.get_text(tag, limit=1000)
|
text = self.get_text(tag, limit=1000)
|
||||||
if not text.strip():
|
if not text.strip():
|
||||||
text = "Link"
|
text = "Link"
|
||||||
self.add_text(text, tag_css, {}, force_span_use=True)
|
self.add_text(text, tag_css, {}, force_span_use=True)
|
||||||
self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
|
self.links.append(self.create_link(self.current_para.contents, tag))
|
||||||
if tag.has_key('id') or tag.has_key('name'):
|
if tag.has_key('id') or tag.has_key('name'):
|
||||||
key = 'name' if tag.has_key('name') else 'id'
|
key = 'name' if tag.has_key('name') else 'id'
|
||||||
self.targets[self.target_prefix+tag[key]] = self.current_block
|
self.targets[self.target_prefix+tag[key]] = self.current_block
|
||||||
|
else:
|
||||||
|
self.logger.warn('Could not follow link to '+tag['href'])
|
||||||
elif tag.has_key('name') or tag.has_key('id'):
|
elif tag.has_key('name') or tag.has_key('id'):
|
||||||
self.process_anchor(tag, tag_css, tag_pseudo_css)
|
self.process_anchor(tag, tag_css, tag_pseudo_css)
|
||||||
elif tagname == 'img':
|
elif tagname == 'img':
|
||||||
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
|
if tag.has_key('src'):
|
||||||
path = os.path.abspath(unquote(tag['src']))
|
path = munge_paths(self.target_prefix, tag['src'])[0]
|
||||||
|
if os.access(path, os.R_OK):
|
||||||
width, height = None, None
|
width, height = None, None
|
||||||
try:
|
try:
|
||||||
width = int(tag['width'])
|
width = int(tag['width'])
|
||||||
@ -1274,6 +1272,8 @@ class HTMLConverter(object):
|
|||||||
pass
|
pass
|
||||||
dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
|
dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
|
||||||
self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
|
self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
|
||||||
|
else:
|
||||||
|
self.logger.warn('Could not find image: '+tag['src'])
|
||||||
else:
|
else:
|
||||||
self.logger.debug("Failed to process: %s", str(tag))
|
self.logger.debug("Failed to process: %s", str(tag))
|
||||||
elif tagname in ['style', 'link']:
|
elif tagname in ['style', 'link']:
|
||||||
@ -1286,8 +1286,7 @@ class HTMLConverter(object):
|
|||||||
npcss.update(pcss)
|
npcss.update(pcss)
|
||||||
elif tag.has_key('type') and tag['type'] == "text/css" \
|
elif tag.has_key('type') and tag['type'] == "text/css" \
|
||||||
and tag.has_key('href'):
|
and tag.has_key('href'):
|
||||||
purl = urlparse(tag['href'])
|
path = munge_paths(self.target_prefix, tag['href'])[0]
|
||||||
path = unquote(purl[2])
|
|
||||||
try:
|
try:
|
||||||
f = open(path, 'rb')
|
f = open(path, 'rb')
|
||||||
src = f.read()
|
src = f.read()
|
||||||
@ -1297,7 +1296,7 @@ class HTMLConverter(object):
|
|||||||
self.page_break_found = True
|
self.page_break_found = True
|
||||||
ncss, npcss = self.parse_css(src)
|
ncss, npcss = self.parse_css(src)
|
||||||
except IOError:
|
except IOError:
|
||||||
pass
|
self.logger.warn('Could not read stylesheet: '+tag['href'])
|
||||||
if ncss:
|
if ncss:
|
||||||
update_css(ncss, self.css)
|
update_css(ncss, self.css)
|
||||||
self.css.update(self.override_css)
|
self.css.update(self.override_css)
|
||||||
@ -1609,7 +1608,7 @@ def process_file(path, options, logger=None):
|
|||||||
re.compile(fpba[2], re.IGNORECASE)]
|
re.compile(fpba[2], re.IGNORECASE)]
|
||||||
if not hasattr(options, 'anchor_ids'):
|
if not hasattr(options, 'anchor_ids'):
|
||||||
options.anchor_ids = True
|
options.anchor_ids = True
|
||||||
conv = HTMLConverter(book, fonts, options, logger, path)
|
conv = HTMLConverter(book, fonts, options, logger, [path])
|
||||||
oname = options.output
|
oname = options.output
|
||||||
if not oname:
|
if not oname:
|
||||||
suffix = '.lrs' if options.lrs else '.lrf'
|
suffix = '.lrs' if options.lrs else '.lrf'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user