calibre/src/libprs500/ebooks/lrf/html/convert_from.py
2007-08-23 06:31:30 +00:00

1582 lines
68 KiB
Python

## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, copy, glob, logging
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from math import ceil, floor
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Comment, Tag, \
NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
TextBlock, ImageBlock, JumpButton, CharButton, \
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas, DropCaps, \
LrsError, Sup, Sub, properties_different
from libprs500.ebooks.lrf.pylrs.pylrs import Span
from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import filename_to_utf8, setup_cli_handlers, __appname__
from libprs500.ptempfile import PersistentTemporaryFile
from libprs500.ebooks.metadata.opf import OPFReader
class HTMLConverter(object):
# Matches one CSS rule: group 1 is the (possibly comma separated) selector
# list, group 2 is everything between the braces.
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
# Captures the value of a page-break-after/page-break-before declaration.
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
# Node types that are skipped while walking the parsed document.
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Named entities that are replaced by the character they stand for.
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo']
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
# (pattern, replacement) pairs applied to text. The apostrophe entity is
# listed explicitly instead of going through the name2codepoint lookup.
ENTITY_RULES = zip(patterns, targets) + [(re.compile('&apos;'), "'")]
# (pattern, replacement function) pairs that are added to the parser's
# markup massage list before the HTML is parsed.
MARKUP_MASSAGE = [
    # Close <a /> tags. Both alternatives end in "/>", so dropping the
    # last two characters of the whole match yields the opening tag even
    # for the bare "<a/>" form, where group(1) is None.
    (re.compile(r"(<a\s+.*?)/>|<a/>", re.IGNORECASE),
     lambda match: match.group()[:-2] + "></a>"),
    # Strip comments from <style> tags. This is needed as
    # sometimes there are unterminated comments
    (re.compile(r"<\s*style.*?>(.*?)<\/\s*style\s*>", re.DOTALL|re.IGNORECASE),
     lambda match: match.group().replace('<!--', '').replace('-->', '')),
    # remove <p> tags from within <a> tags
    (re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE),
     lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())),
]
# Extra markup massage rules that are applied to Baen files
BAEN = [
    # Drop page-break-before declarations, keeping only the character
    # (whitespace, ';' or '}') that terminated them
    (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.I),
     lambda m: m.group(1)),
    # Unwrap paragraphs whose only content is an empty <a id...> anchor
    (re.compile(r'<p>\s*(<a id.*?>\s*</a>)\s*</p>', re.I),
     lambda m: m.group(1)),
    # Delete the empty anchors whose id and name are of the form p<digits>
    (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.I),
     lambda m: ''),
]
# Fix pdftohtml markup
PDFTOHTML = [
    # Replace <hr> tags with an element that forces a page break
    (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<span style="page-break-after:always"> </span>'),
    # Remove page numbers
    (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
    # Remove <br> and replace <br><br> with <p>
    (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
    # A <br> is kept when the text before it starts with a tag or is
    # shorter than 40 characters, otherwise only the text is kept
    (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
     lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                   else match.group(1)),
    # Remove hyphenation. The former second alternative "-\n\r" could never
    # match because "-\n" always matched first; all of "-\n", "-\r\n" and
    # "-\n\r" are removed now.
    (re.compile(r'-(?:\r\n|\n\r?)'), lambda match: ''),
]
def __hasattr__(self, attr):
if hasattr(self.options, attr):
return True
return object.__hasattr__(self, attr)
def __getattr__(self, attr):
if hasattr(self.options, attr):
return getattr(self.options, attr)
return object.__getattr__(self, attr)
def __setattr__(self, attr, val):
if hasattr(self.options, attr):
setattr(self.options, attr, val)
else:
object.__setattr__(self, attr, val)
# Built-in style sheet: maps a selector (a tag name or a .class name) to a
# dict of CSS properties. start_on_file() copies this table into self.css
# for every file that is processed.
CSS = {
    # Headings
    'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'},
    'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'},
    'h3' : {"font-size" : "large", "font-weight":"bold", 'text-indent':'0pt'},
    'h4' : {"font-size" : "large", 'text-indent':'0pt'},
    'h5' : {"font-weight" : "bold", 'text-indent':'0pt'},
    # Bold and italic inline tags
    'b' : {"font-weight" : "bold"},
    'strong' : {"font-weight" : "bold"},
    'i' : {"font-style" : "italic"},
    'cite' : {'font-style' : 'italic'},
    'em' : {"font-style" : "italic"},
    'small' : {'font-size' : 'small'},
    # Monospaced tags
    'pre' : {'font-family' : 'monospace', 'white-space': 'pre' },
    'code' : {'font-family' : 'monospace' },
    'tt' : {'font-family' : 'monospace'},
    'center' : {'text-align' : 'center'},
    'th' : {'font-size' : 'large', 'font-weight':'bold'},
    'big' : {'font-size' : 'large', 'font-weight':'bold'},
    # Class selector
    '.libprs500_dropcaps' : {'font-size': 'xx-large'},
}
def __init__(self, book, fonts, options, logger, path):
    '''
    Initialise the conversion state and start converting the HTML file at
    C{path} into C{book}.

    @param book: The book object that receives the converted content
    @param fonts: dict specifying the font families to use
    @param options: Object holding the conversion settings. Attribute
                    access on the converter falls back to this object.
    @param logger: Logger used for progress messages and warnings
    @param path: Path to the root HTML file
    '''
    # Stored through object.__setattr__ because our own __setattr__
    # needs self.options to exist already.
    object.__setattr__(self, 'options', options)
    self.logger = logger
    self.fonts = fonts #: dict specifying font families to use

    # Caches and bookkeeping
    self.scaled_images = {} #: Temporary files with scaled version of images
    self.rotated_images = {} #: Temporary files with rotated version of images
    self.text_styles = [] #: Text styles that have already been used
    self.block_styles = [] #: Block styles that have already been used
    self.images = {} #: Images referenced in the HTML document
    self.targets = {} #: <a name=...> and id elements
    self.links = {} #: <a href=...> elements
    self.processed_files = []
    self.unused_target_blocks = [] #: Used to remove extra TextBlocks
    self.link_level = 0 #: Current link level
    self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported
    self.tops = {} #: element representing the top of each HTML file in the LRF file
    self.previous_text = '' #: Used to figure out when to lstrip

    # Predefined styles
    self.blockquote_style = book.create_block_style(sidemargin=60,
                                                    topskip=20, footskip=20)
    self.unindented_style = book.create_text_style(parindent=0)

    # Set by the table processing code so that any <a name> within the
    # table points to the previous element
    self.anchor_to_previous = None
    self.in_table = False

    # State used while processing lists
    self.list_level = 0
    self.list_indent = 20
    self.list_counter = 1

    self.book = book #: The Book object representing a BBeB book
    self.start_on_file(path, is_root=True)
def is_baen(self, soup):
    '''
    Return True if C{soup} has a <meta name="Publisher"> tag whose content
    contains "Baen" (case insensitive), False otherwise.
    '''
    publisher = {'name': 'Publisher',
                 'content': re.compile('Baen', re.IGNORECASE)}
    meta = soup.find('meta', attrs=publisher)
    if meta:
        return True
    return False
def start_on_file(self, path, is_root=True, link_level=0):
    '''
    Parse the HTML file at C{path}, convert it and then process the links
    found in it.

    Note that this changes the current working directory to the directory
    that contains C{path}.

    @param path: Path to the HTML file
    @param is_root: Passed on to L{parse_file} and L{process_links}
    @param link_level: Link depth of this file, passed on to L{process_links}
    '''
    path = os.path.abspath(path)
    os.chdir(os.path.dirname(path))
    self.file_name = os.path.basename(path)
    self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
    sys.stdout.flush()
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
    if self.baen:
        nmassage.extend(HTMLConverter.BAEN)

    # Close the file explicitly instead of leaving the handle to the
    # garbage collector.
    f = open(self.file_name, 'rb')
    try:
        raw = f.read()
    finally:
        f.close()
    if self.pdftohtml:
        nmassage.extend(HTMLConverter.PDFTOHTML)
        raw = unicode(raw, 'utf8', 'replace')
    soup = BeautifulSoup(raw,
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=nmassage)

    if not self.baen and self.is_baen(soup):
        # The Baen specific rules have to be applied before parsing, so
        # the file is processed again from the start.
        self.baen = True
        self.logger.info('Baen file detected. Re-parsing...')
        return self.start_on_file(path, is_root=is_root, link_level=link_level)

    self.logger.info('\tConverting to BBeB...')
    sys.stdout.flush()
    self.current_page = None
    self.current_para = None
    self.current_style = {}
    # Record whether the document contains a page break declaration whose
    # value is something other than "avoid"
    self.page_break_found = False
    match = self.PAGE_BREAK_PAT.search(unicode(soup))
    if match and not re.match('avoid', match.group(1), re.IGNORECASE):
        self.page_break_found = True
    self.css = HTMLConverter.CSS.copy()
    self.pseudo_css = {}
    self.target_prefix = path
    self.links[path] = []
    self.previous_text = '\n'
    self.tops[path] = self.parse_file(soup, is_root)
    self.processed_files.append(path)
    self.process_links(is_root, path, link_level=link_level)
def parse_css(self, style):
"""
Parse the contents of a <style> tag or .css file.
@param style: C{str(style)} should be the CSS to parse.
@return: A dictionary with one entry per selector where the key is the
selector name and the value is a dictionary of properties
"""
sdict, pdict = {}, {}
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
for key in sel[0].split(','):
val = self.parse_style_properties(sel[1])
key = key.strip().lower()
if ':' in key:
key, sep, pseudo = key.partition(':')
if key in pdict:
if pseudo in pdict[key]:
pdict[key][pseudo].update(val)
else:
pdict[key][pseudo] = val
else:
pdict[key] = {pseudo:val}
else:
if key in sdict:
sdict[key].update(val)
else:
sdict[key] = val
return sdict, pdict
def parse_style_properties(self, props):
    """
    Parse a list of C{name: value} declarations separated by semicolons,
    i.e. the body of a CSS rule or the style attribute of an HTML element.

    @return: A dictionary that maps each lower cased property name to its
    value. Declarations that do not contain a colon are ignored.
    """
    parsed = {}
    for declaration in props.split(';'):
        name, colon, value = declaration.partition(':')
        if not colon:
            continue
        parsed[str(name.strip()).lower()] = value.strip()
    return parsed
def tag_css(self, tag, parent_css={}):
    """
    Collect the style properties that apply to C{tag}.

    Later sources override earlier ones: inherited parent properties, the
    align attribute, rules for the tag name, rules for each class and
    finally the style attribute.

    @return: A tuple C{(prop, pprop)} of the properties and the pseudo
    element properties applicable to C{tag}.
    """
    # float should not be inherited according to the CSS spec
    # however we need to as we don't do alignment at a block level.
    # float is removed by the get_alignment function.
    inherited = ('text-align', 'float', 'white-space')

    prop, pprop = {}, {}
    tagname = tag.name.lower()
    if parent_css:
        for key in parent_css.keys():
            chk = key.lower()
            if chk.startswith('font') or chk in inherited:
                prop[key] = parent_css[key]
    if tag.has_key("align"):
        prop["text-align"] = tag["align"]

    # Selectors in increasing order of precedence
    selectors = [tagname]
    if tag.has_key("class"):
        for cls in tag["class"].lower().split():
            selectors.append("." + cls)
            selectors.append(tagname + "." + cls)
    for selector in selectors:
        if selector in self.css:
            prop.update(self.css[selector])
        if selector in self.pseudo_css:
            pprop.update(self.pseudo_css[selector])

    if tag.has_key("style"):
        prop.update(self.parse_style_properties(tag["style"]))
    return prop, pprop
def parse_file(self, soup, is_root):
    '''
    Convert the parsed document C{soup} and append the resulting pages to
    the book.

    @param soup: The parsed HTML document
    @param is_root: If True and a cover is set, a cover page is added first
    @return: The block that represents the top of this file
    @raise ConversionError: If no block with content could be found
    '''
    def get_valid_block(page):
        # Return the first Canvas, ImageBlock, RuledLine or non-empty
        # TextBlock in page. Implicitly returns None if there is none.
        for item in page.contents:
            if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
                if isinstance(item, TextBlock) and not item.contents:
                    continue
                return item
    # Remember the last page present before this file is converted
    previous = self.book.last_page()
    self.current_page = self.book.create_page()
    self.current_block = self.book.create_text_block()
    self.current_para = Paragraph()
    if self.cover and is_root:
        self.add_image_page(self.cover)
    top = self.current_block
    self.process_children(soup, {}, {})
    # Flush whatever is still pending into the book
    if self.current_para and self.current_block:
        self.current_para.append_to(self.current_block)
    if self.current_block and self.current_page:
        self.current_block.append_to(self.current_page)
    if self.current_page and self.current_page.has_text():
        self.book.append(self.current_page)
    # The initial block did not make it into the book or is empty, so a
    # different block has to be found to represent the top of the file.
    if not top.parent or not top.contents:
        if not previous:
            # No page existed before this file: use the first page
            try:
                previous = self.book.pages()[0]
            except IndexError:
                raise ConversionError, self.file_name + ' does not seem to have any content'
            top = get_valid_block(previous)
            if not top or not top.parent:
                raise ConversionError, self.file_name + ' does not seem to have any content'
            return top
        # Otherwise use the first valid block on the pages that follow
        # the previously last page
        found = False
        for page in self.book.pages():
            if page == previous:
                found = True
                continue
            if found:
                top = get_valid_block(page)
                if not top:
                    continue
                break
        if not top or not top.parent:
            raise ConversionError, 'Could not parse ' + self.file_name
    return top
def create_link(self, children, tag):
    '''
    Build the record that describes the link C{tag}.

    @param children: List that is searched from the end for the Span that
                     holds the text of the link
    @param tag: The <a href=...> tag
    @return: A dict with the keys 'para' (the Span), 'text' (the link
             text) and 'url' (the parsed href)
    @raise ConversionError: If C{children} contains no Span
    '''
    para = None
    for child in reversed(children):
        if isinstance(child, Span):
            para = child
            break
    if para is None:
        raise ConversionError('Failed to parse link %s'%(tag,))

    text = self.get_text(tag, 1000)
    if not text:
        # Fall back to the alt text of an image, if there is one
        text = 'Link'
        img = tag.find('img')
        if img:
            try:
                text = img['alt']
            except KeyError:
                pass
    return dict(para=para, text=text, url=urlparse(tag['href']))
def get_text(self, tag, limit=None):
    '''
    Return the text contained in C{tag} and its descendants.

    Hidden tags (display: none or visibility: hidden) yield an empty
    string. When an <img> with an alt attribute is found, its alt text is
    appended and the text collected so far is returned immediately.

    @param limit: If not None, stop collecting once the text is longer
                  than this many characters. Not applied to nested tags.
    @return: The text with the entities in ENTITY_RULES replaced
    '''
    css = self.tag_css(tag)[0]
    if ('display' in css and css['display'].lower() == 'none') or \
       ('visibility' in css and css['visibility'].lower() == 'hidden'):
        return ''
    text = u''
    for c in tag.contents:
        if limit != None and len(text) > limit:
            break
        if isinstance(c, HTMLConverter.IGNORED_TAGS):
            # Skip comments, declarations and processing instructions.
            # Returning here would throw away the text that surrounds them.
            continue
        if isinstance(c, NavigableString):
            text += unicode(c)
        elif isinstance(c, Tag):
            if c.name.lower() == 'img' and c.has_key('alt'):
                text += c['alt']
                return text
            text += self.get_text(c)
    if text:
        for rule, sub in self.__class__.ENTITY_RULES:
            text = rule.sub(sub, text)
    return text
def process_links(self, is_root, selfpath, link_level=0):
    '''
    Turn the links recorded for the file C{selfpath} into buttons that
    jump to their targets. Linked local files that have not been processed
    yet are converted first, as long as C{link_level} is below
    C{self.link_levels}.

    @param is_root: If True, a TOC entry is added for every link
    @param selfpath: Key into C{self.links} of the file being processed
    @param link_level: Link depth of the file being processed
    '''
    def add_toc_entry(text, target):
        # TextBlocks in Canvases have a None parent or an Objects Parent
        if target.parent != None and \
           hasattr(target.parent, 'objId'):
            self.book.addTocEntry(text, target)
        else:
            self.logger.debug("Cannot add link %s to TOC", text)

    def get_target_block(fragment, targets):
        '''Return the correct block for the <a name> element'''
        bs = targets[fragment]
        if not isinstance(bs, BlockSpace):
            return bs
        ans, found, page = None, False, bs.parent
        # Prefer the first block that follows the BlockSpace
        for item in page.contents:
            if found:
                if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
                    ans = item
                    break
            if item == bs:
                found = True
                continue

        # Otherwise use the last block on the page
        if not ans:
            for i in range(len(page.contents)-1, -1, -1):
                if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
                    ans = page.contents[i]
                    break

        # The page has no suitable block at all, so create one
        if not ans:
            ntb = self.book.create_text_block()
            ntb.Paragraph(' ')
            page.append(ntb)
            ans = ntb

        if found:
            targets[fragment] = ans
            page.contents.remove(bs)
        return ans

    cwd = os.getcwd()
    for link in self.links[selfpath]:
        try:
            para, text, purl = link['para'], link['text'], link['url']
            # Needed for TOC entries due to bug in LRF
            ascii_text = text.encode('ascii', 'replace')
            if purl[1]: # Not a link to a file on the local filesystem
                continue
            basepath, fragment = unquote(purl[2]), purl[5]
            if not basepath:
                basepath = selfpath
            path = os.path.abspath(basepath)
            if link_level < self.link_levels and path not in self.processed_files:
                try:
                    self.start_on_file(path, is_root=False, link_level=link_level+1)
                except Exception:
                    self.logger.warning('Unable to process %s', path)
                    if self.verbose:
                        self.logger.exception(' ')
                    continue
                finally:
                    # start_on_file changes the working directory
                    os.chdir(cwd)
            if path+fragment in self.targets:
                tb = get_target_block(path+fragment, self.targets)
            else:
                try:
                    tb = self.tops[path]
                except KeyError:
                    # NOTE(review): this abandons all remaining links,
                    # not just the current one - confirm that is intended.
                    return
            if is_root:
                add_toc_entry(ascii_text, tb)
            jb = JumpButton(tb)
            self.book.append(jb)
            cb = CharButton(jb, text=text)
            para.contents = []
            para.append(cb)
            try:
                self.unused_target_blocks.remove(tb)
            except ValueError:
                pass
        finally:
            os.chdir(cwd)
def end_page(self):
    """
    Finish the current paragraph and block. If the current page has text,
    it is added to the book and a new page is started, so that further
    content is displayed on a new page.
    """
    self.current_para.append_to(self.current_block)
    self.current_para = Paragraph()
    self.current_block.append_to(self.current_page)
    self.current_block = self.book.create_text_block()
    if not self.current_page.has_text():
        return
    self.book.append(self.current_page)
    self.current_page = self.book.create_page()
def add_image_page(self, path):
    '''
    Add a page without margins, headers or footers that shows the image at
    C{path} at the size of the screen. Does nothing if C{path} is not
    readable.
    '''
    if not os.access(path, os.R_OK):
        return
    self.end_page()
    page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
                                 topmargin=0, textwidth=self.profile.screen_width,
                                 headheight=0, headsep=0, footspace=0,
                                 footheight=0,
                                 textheight=self.profile.screen_height)
    # Reuse the stream if this image has been loaded before
    if path not in self.images:
        self.images[path] = ImageStream(path)
    stream = self.images[path]
    page.append(ImageBlock(stream, x1=self.profile.screen_width,
                           y1=self.profile.screen_height,
                           blockwidth=self.profile.screen_width,
                           blockheight=self.profile.screen_height))
    self.book.append(page)
def process_children(self, ptag, pcss, ppcss=None):
    """
    Process the children of ptag.

    @param ptag: The parent tag
    @param pcss: CSS properties passed on to the children
    @param ppcss: Pseudo element CSS passed on to the text children.
                  Defaults to an empty dict.
    """
    if ppcss is None:
        # A fresh dict per call instead of a shared mutable default
        ppcss = {}
    # Need to make a copy of contents as when
    # extract is called on a child, it will
    # mess up the iteration.
    for c in list(ptag.contents):
        if isinstance(c, HTMLConverter.IGNORED_TAGS):
            continue
        elif isinstance(c, Tag):
            self.parse_tag(c, pcss)
        elif isinstance(c, NavigableString):
            self.add_text(c, pcss, ppcss)
    if not self.in_table:
        try:
            ptag.extract()
        except AttributeError:
            # Best effort only: report through the logger rather than
            # printing to stdout.
            self.logger.debug('Unable to extract %s of type %s', ptag, type(ptag))
def get_alignment(self, css):
    '''
    Work out the alignment ('head', 'center' or 'foot') requested by
    C{css}. The text-align property is looked at first, a float of left or
    right overrides it. The float property is removed from C{css}.
    '''
    align = 'head'
    if 'text-align' in css:
        text_align = css['text-align'].lower()
        if text_align in ("right", "foot"):
            align = "foot"
        elif text_align == "center":
            align = "center"
    if 'float' in css:
        side = css.pop('float').lower()
        align = {'left': 'head', 'right': 'foot'}.get(side, align)
    return align
def process_alignment(self, css):
    '''
    Start a new TextBlock if the alignment requested by C{css} differs
    from the alignment of the current block.

    @type css: dict
    @return: True if a new block was started, False otherwise
    '''
    align = self.get_alignment(css)
    if align == self.current_block.textStyle.attrs['align']:
        return False

    # Close the current paragraph and block
    self.current_para.append_to(self.current_block)
    self.current_block.append_to(self.current_page)

    # Same text style as before, except for the alignment. An equal
    # style that was created earlier is reused.
    ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
    ts.attrs['align'] = align
    try:
        ts = self.text_styles[self.text_styles.index(ts)]
    except ValueError:
        self.text_styles.append(ts)

    self.current_block = self.book.create_text_block(
                            blockStyle=self.current_block.blockStyle,
                            textStyle=ts)
    self.current_para = Paragraph()
    return True
def add_text(self, tag, css, pseudo_css, force_span_use=False):
    '''
    Add text to the current paragraph taking CSS into account.

    @param tag: Either a BeautifulSoup tag or a string
    @param css: A dict
    @param pseudo_css: A dict. A 'first-letter' entry is removed from it
                       once it has been applied to the first character.
    @param force_span_use: If True the text is always wrapped in a Span
    '''
    src = tag.string if hasattr(tag, 'string') else tag
    src = src.replace('\r\n', '\n').replace('\r', '\n')
    if 'first-letter' in pseudo_css:
        src = src.lstrip()
        # Whitespace only text has no first letter. The rule is left in
        # place in that case instead of failing on src[0].
        if src:
            f = src[0]
            src = src[1:]
            ncss = css.copy()
            ncss.update(pseudo_css.pop('first-letter'))
            self.add_text(f, ncss, {}, force_span_use)
    collapse_whitespace = 'white-space' not in css or css['white-space'] != 'pre'
    if self.process_alignment(css) and collapse_whitespace:
        # Dont want leading blanks in a new paragraph
        src = src.lstrip()

    def append_text(src):
        fp, key, variant = self.font_properties(css)
        for pat, repl in self.__class__.ENTITY_RULES:
            src = pat.sub(repl, src)
        src = src.replace(u'\xa0', ' ')# nbsp is replaced with \xa0 by BeautifulSoup
        normal_font_size = int(fp['fontsize'])
        if variant == 'small-caps':
            # Upper case letters keep the normal size, everything else
            # is upper cased and put into a Span with a smaller font
            dump = Span(fontsize=normal_font_size-30)
            temp = []
            for c in src:
                if c.isupper():
                    if temp:
                        dump.append(''.join(temp))
                        temp = []
                    dump.append(Span(c, fontsize=normal_font_size))
                else:
                    temp.append(c.upper())
            src = dump
            if temp:
                src.append(''.join(temp))

        if key in ['italic', 'bi']:
            # Only wrap in Italic if the selected font face is not
            # already one of the italic faces
            already_italic = False
            for fonts in self.fonts.values():
                it = fonts['italic'][1] if fonts.has_key('italic') else ''
                bi = fonts['bi'][1] if fonts.has_key('bi') else ''
                if fp['fontfacename'] in (it, bi):
                    already_italic = True
                    break
            if not already_italic:
                src = Italic(src)

        # Drop the properties that the current block already provides
        unneeded = []
        for prop in fp:
            if fp[prop] == self.current_block.textStyle.attrs[prop]:
                unneeded.append(prop)
        for prop in unneeded:
            fp.pop(prop)
        elem = Span(text=src, **fp) if (fp or force_span_use) else src
        self.current_para.append(elem)

    if collapse_whitespace:
        src = re.sub(r'\s{1,}', ' ', src)
        # Avoid two consecutive blanks across text nodes
        if len(self.previous_text) != len(self.previous_text.rstrip()):
            src = src.lstrip()
        if len(src):
            self.previous_text = src
            append_text(src)
    else:
        srcs = src.split('\n')
        for src in srcs:
            if src:
                append_text(src)
            if len(srcs) > 1:
                self.line_break()
def line_break(self):
    '''
    Append a line break to the current paragraph and record a newline as
    the previously added text.
    '''
    br = CR()
    self.current_para.append(br)
    self.previous_text = '\n'
def end_current_para(self):
    '''
    Close the current paragraph, follow it with a paragraph break and
    start a new, empty paragraph. A paragraph without contents is not
    added to the block, the break still is.
    '''
    finished = self.current_para
    if finished.contents:
        self.current_block.append(finished)
    self.current_block.append(CR())
    self.current_para = Paragraph()
def end_current_block(self):
    '''
    Close the current TextBlock and start a new one that uses the same
    text and block styles. An empty paragraph or block is left untouched,
    unless the block is flagged as must_append.
    '''
    para = self.current_para
    if para.contents:
        self.current_block.append(para)
        self.current_para = Paragraph()
    block = self.current_block
    if block.contents or block.must_append:
        self.current_page.append(block)
        self.current_block = self.book.create_text_block(
                                textStyle=block.textStyle,
                                blockStyle=block.blockStyle)
def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
original_path = path
if self.rotated_images.has_key(path):
path = self.rotated_images[path].name
if self.scaled_images.has_key(path):
path = self.scaled_images[path].name
try:
im = PILImage.open(path)
encoding = im.format
if encoding:
encoding = encoding.upper()
if encoding == 'JPG':
encoding = 'JPEG'
except IOError, err:
self.logger.warning('Unable to process image: %s\n%s', original_path, err)
return
if width == None or height == None:
width, height = im.size
factor = 720./self.profile.dpi
def scale_image(width, height):
pt = PersistentTemporaryFile(suffix='.jpeg')
try:
im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
pt.close()
self.scaled_images[path] = pt
encoding = 'JPEG'
return pt.name
except IOError: # PIL chokes on interlaced PNG images
self.logger.warning('Unable to process interlaced PNG %s', path)
return None
pheight = int(self.current_page.pageStyle.attrs['textheight'])
pwidth = int(self.current_page.pageStyle.attrs['textwidth'])
if dropcaps:
scale = False
if width > 0.75*pwidth:
width = int(0.75*pwidth)
scale = True
if height > 0.75*pheight:
height = int(0.75*pheight)
scale = True
if scale:
path = scale_image(width, height)
if not self.images.has_key(path):
self.images[path] = ImageStream(path)
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
xsize=width, ysize=height)
line_height = (int(self.current_block.textStyle.attrs['baselineskip']) +
int(self.current_block.textStyle.attrs['linespace']))//10
line_height *= self.profile.dpi/72.
lines = int(ceil(float(height)/line_height))
dc = DropCaps(lines)
dc.append(Plot(im, xsize=ceil(width*factor), ysize=ceil(height*factor)))
self.current_para.append(dc)
return
if not self.disable_autorotation and width > pwidth and width > height:
pt = PersistentTemporaryFile(suffix='.jpeg')
try:
im = im.rotate(90)
im.convert('RGB').save(pt, 'JPEG')
path = pt.name
encoding = 'JPEG'
self.rotated_images[path] = pt
width, height = im.size
except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
self.logger.debug('Unable to process interlaced PNG %s', original_path)
finally:
pt.close()
if height > pheight:
corrf = pheight/(1.*height)
width, height = floor(corrf*width), pheight-1
if width > pwidth:
corrf = (pwidth)/(1.*width)
width, height = pwidth-1, floor(corrf*height)
path = scale_image(width, height)
if width > pwidth:
corrf = pwidth/(1.*width)
width, height = pwidth-1, floor(corrf*height)
if height > pheight:
corrf = (pheight)/(1.*height)
width, height = floor(corrf*width), pheight-1
path = scale_image(width, height)
width, height = int(width), int(height)
if not path:
return
if not self.images.has_key(path):
try:
self.images[path] = ImageStream(path, encoding=encoding)
except LrsError, err:
self.logger.warning('Could not process image: %s\n%s', original_path, err)
return
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
xsize=width, ysize=height)
self.process_alignment(tag_css)
if max(width, height) <= min(pwidth, pheight)/5.:
self.current_para.append(Plot(im, xsize=ceil(width*factor),
ysize=ceil(height*factor)))
elif height <= int(floor((2/3.)*pheight)):
pb = self.current_block
self.end_current_para()
self.process_alignment(tag_css)
self.current_para.append(Plot(im, xsize=width*factor,
ysize=height*factor))
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
self.current_block = self.book.create_text_block(
textStyle=pb.textStyle,
blockStyle=pb.blockStyle)
self.current_para = Paragraph()
else:
self.end_page()
self.current_page.append(Canvas(width=pwidth,
height=height))
left = int(floor((pwidth - width)/2.))
self.current_page.contents[-1].put_object(
ImageBlock(self.images[path], xsize=pwidth,
ysize=pheight, x1=pwidth, y1=pheight,
blockwidth=pwidth, blockheight=pheight),
left, 0)
def process_page_breaks(self, tag, tagname, tag_css):
    '''
    Handle the page break rules that apply to C{tag}. Page breaks before
    the tag are carried out right away by calling L{end_page}.

    The page-break-before property is always removed from C{tag_css}, the
    page-break-after property only if its value is not "avoid".

    @return: True if a page break is needed after the tag
    '''
    if 'page-break-before' in tag_css:
        if tag_css['page-break-before'].lower() != 'avoid':
            self.end_page()
        del tag_css['page-break-before']

    end_page = False
    if 'page-break-after' in tag_css and \
       tag_css['page-break-after'].lower() != 'avoid':
        end_page = True
        del tag_css['page-break-after']

    # Breaks forced through the options: either a tag/attribute/value
    # combination or just the tag name
    fpba = self.force_page_break_attr
    forced = fpba[0].match(tagname) and tag.has_key(fpba[1]) and \
             fpba[2].match(tag[fpba[1]])
    if not forced:
        forced = self.force_page_break.match(tagname)
    if forced:
        self.end_page()
        self.page_break_found = True

    # Heuristic break, only used while no page break has been found and
    # the current page has more than 3 items
    if not self.page_break_found and self.page_break.match(tagname):
        if len(self.current_page.contents) > 3:
            self.end_page()
            self.logger.debug('Forcing page break at %s', tagname)
    return end_page
def block_properties(self, tag_css):
    '''
    Convert the padding properties in C{tag_css} into block style
    attributes. The top padding becomes topskip, the bottom padding
    footskip and the left padding sidemargin.

    The padding shorthand is expanded according to the CSS rules (one
    value for all sides; vertical and horizontal; top, horizontal and
    bottom; or one value per side). The padding-top/right/bottom/left
    properties override the shorthand.

    @return: A dict with the keys topskip, footskip and sidemargin. The
             defaults of the book are used for every value that is not
             specified or cannot be converted.
    '''
    keys = ('topskip', 'footskip', 'sidemargin')
    defaults = self.book.defaultBlockStyle.attrs
    ans = {}
    for key in keys:
        ans[key] = defaults[key]

    # Padding values in the order top, right, bottom, left
    src = [None, None, None, None]
    if 'padding' in tag_css:
        vals = tag_css['padding'].split()[:4] # ignore surplus values
        if len(vals) == 1:
            src = [vals[0]] * 4
        elif len(vals) == 2:
            src = [vals[0], vals[1], vals[0], vals[1]]
        elif len(vals) == 3:
            src = [vals[0], vals[1], vals[2], vals[1]]
        elif len(vals) == 4:
            src = list(vals)
    for i, side in enumerate(('top', 'right', 'bottom', 'left')):
        if 'padding-'+side in tag_css:
            src[i] = tag_css['padding-'+side]

    for key, val in zip(keys, (src[0], src[2], src[3])):
        if val is None:
            continue
        converted = self.unit_convert(val)
        # Keep the default if the value could not be converted
        if converted is not None:
            ans[key] = converted
    return ans
def font_properties(self, css):
    '''
    Translate the font properties in css into the corresponding text style
    attributes. Attributes for which css has no property keep the value
    from self.book.defaultTextStyle.

    @return: dict, key, variant. The dict contains the text style
    attributes. key indicates the font type ('normal', 'bold', 'italic' or
    'bi') and variant is None or 'small-caps'
    '''
    defaults = self.book.defaultTextStyle.attrs
    t = {}
    for name in ('fontwidth', 'fontsize', 'wordspace', 'fontfacename',
                 'fontweight', 'baselineskip'):
        t[name] = defaults[name]

    def font_weight(val):
        # A number decides if there is one, otherwise the keywords do
        m = re.search("([0-9]+)", val)
        if m:
            heavy = int(m.group(1)) >= 700
        else:
            heavy = 'bold' in val or 'strong' in val
        return 'bold' if heavy else 'normal'

    def font_style(val):
        if 'italic' in val or 'oblique' in val:
            return 'italic'
        return 'normal'

    def font_family(val):
        if any(name in val for name in
               ("courier", "mono", "fixed", "typewriter")):
            return 'mono'
        if any(name in val for name in
               ("arial", "helvetica", "verdana", "trebuchet", "sans")):
            return 'sans'
        return 'serif'

    def font_variant(val):
        if 'small-caps' in val.lower():
            return 'small-caps'
        return None

    def font_key(family, style, weight):
        table = {('italic', 'normal'): 'italic',
                 ('normal', 'bold'): 'bold',
                 ('italic', 'bold'): 'bi'}
        return table.get((style, weight), 'normal')

    def font_size(val):
        normal = 100 #10*pts
        ans = self.unit_convert(val, pts=True)
        if ans:
            # Negative sizes are taken relative to the normal size
            if ans < 0:
                ans += normal
                if ans < 0:
                    ans = normal
        else:
            # The order matters as the names contain each other
            for name, size in (("xx-small", 40), ("x-small", 60),
                               ("small", 80), ("xx-large", 180),
                               ("x-large", 140), ("large", 120)):
                if name in val:
                    ans = size
                    break
        if ans is not None:
            ans = str(ans + int(self.font_delta * 20))
        return ans

    def first_hit(values, convert, neutral, current):
        # Convert the values one by one and stop at the first result that
        # differs from neutral. current is returned if values is empty.
        for value in values:
            current = convert(value)
            if current != neutral:
                break
        return current

    family, weight, style, variant = 'serif', 'normal', 'normal', None
    for key in css.keys():
        val = css[key].lower()
        if key == 'font':
            # Shorthand: every part is tried for every property
            vals = val.split()
            family = first_hit(vals, font_family, 'serif', family)
            weight = first_hit(vals, font_weight, 'normal', weight)
            style = first_hit(vals, font_style, 'normal', style)
            sz = first_hit(vals, font_size, None, None)
            if sz:
                t['fontsize'] = sz
            variant = first_hit(vals, font_variant, None, variant)
            if variant:
                t['fontvariant'] = variant
        elif key in ('font-family', 'font-name'):
            family = font_family(val)
        elif key == "font-size":
            sz = font_size(val)
            if sz:
                t['fontsize'] = sz
        elif key == 'font-weight':
            weight = font_weight(val)
        elif key == 'font-style':
            style = font_style(val)
        elif key == 'font-variant':
            variant = font_variant(val)

    key = font_key(family, style, weight)
    faces = self.fonts[family]
    # Use the normal face if the family has no face for this key
    if key in faces:
        t['fontfacename'] = faces[key][1]
    else:
        t['fontfacename'] = faces['normal'][1]
    if key in ('bold', 'bi'):
        t['fontweight'] = 700

    fs = int(t['fontsize'])
    if fs > 120:
        t['wordspace'] = int(fs/4.)
    t['baselineskip'] = fs + 20
    return t, key, variant
def unit_convert(self, val, pts=False):
    '''
    Tries to convert html units in C{val} to pixels.
    Assumes: 1em = 100% = 10pts

    @param val: A string like '12', '10pt' or '1.5em'. A number without a
                unit is taken to be in pixels.
    @param pts: If True return 10*pts instead of pixels.
    @return: The number of pixels (an int) if successful. Otherwise, returns None.
    '''
    dpi = self.profile.dpi
    result = None
    try:
        result = int(val)
    except ValueError:
        pass
    m = re.match(r"\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
    if m is not None:
        try:
            unit = float(m.group(1))
        except ValueError:
            # The number part may be empty or malformed, e.g. a bare "em"
            unit = None
        if unit is not None:
            if m.group(2) == '%':
                normal = self.unit_convert('10pt')
                result = int((unit/100.0)*normal)
            elif m.group(2) == 'px':
                result = int(unit)
            elif m.group(2) == 'in':
                result = int(unit * dpi)
            elif m.group(2) == 'pt':
                result = int(unit * dpi/72.)
            elif m.group(2)== 'em':
                result = int(unit * (dpi/72.) * 10)
            elif m.group(2)== 'pc':
                result = int(unit * (dpi/72.) * 12)
            elif m.group(2)== 'mm':
                # 25.4 mm to the inch
                result = int(unit * dpi / 25.4)
            elif m.group(2)== 'cm':
                result = int(unit * dpi / 2.54)
    if pts:
        if result is not None:
            result = int((float(result)/dpi)*720)
    return result
def text_properties(self, tag_css):
    '''
    Return the font properties for C{tag_css} together with the paragraph
    indent under the key 'parindent'.

    The indent is taken from the text-indent property, 0 is used if it
    cannot be converted. A positive indent that is smaller than
    self.minimum_indent (if that is set) is raised to the minimum. Without
    a text-indent property the default of the book is used.
    '''
    if 'text-indent' in tag_css:
        indent = self.unit_convert(str(tag_css['text-indent']), pts=True) or 0
        if hasattr(self, 'minimum_indent') and 0 < indent < self.minimum_indent:
            indent = self.minimum_indent
    else:
        indent = self.book.defaultTextStyle.attrs['parindent']
    fp = self.font_properties(tag_css)[0]
    fp['parindent'] = indent
    return fp
def process_block(self, tag, tag_css):
    '''
    Make sure that the padding, text-indent, font and alignment properties
    in C{tag_css} are respected, by starting a new TextBlock whenever they
    differ from those of the current block.

    @return: True if a new block was started, False otherwise
    '''
    text_properties = self.text_properties(tag_css)
    block_properties = self.block_properties(tag_css)
    align = self.get_alignment(tag_css)

    unchanged = not (
        properties_different(self.current_block.blockStyle.attrs, block_properties) or
        properties_different(self.current_block.textStyle.attrs, text_properties) or
        align != self.current_block.textStyle.attrs['align'])
    if unchanged:
        return False

    # Derive the new styles from the styles of the current block
    ts = self.current_block.textStyle.copy()
    ts.attrs.update(text_properties)
    ts.attrs['align'] = align
    bs = self.current_block.blockStyle.copy()
    bs.attrs.update(block_properties)
    self.current_block.append_to(self.current_page)

    # Reuse equal styles that have been created before
    try:
        ts = self.text_styles[self.text_styles.index(ts)]
    except ValueError:
        self.text_styles.append(ts)
    try:
        bs = self.block_styles[self.block_styles.index(bs)]
    except ValueError:
        self.block_styles.append(bs)

    self.current_block = self.book.create_text_block(blockStyle=bs,
                                                     textStyle=ts)
    return True
def parse_tag(self, tag, parent_css):
    '''
    Convert one node of the parsed HTML tree into LRF content.

    Nodes whose C{name} cannot be lower-cased (text-like nodes) are handed
    to add_text unless they are instances of HTMLConverter.IGNORED_TAGS.
    Element nodes get their CSS computed via tag_css, are skipped when the
    CSS says display:none / visibility:hidden, and are then dispatched on
    their lower-cased tag name. A page break requested by
    process_page_breaks is applied after the element has been processed.

    tag        -- the node to convert
    parent_css -- CSS properties of the parent, passed on to tag_css
    '''
    try:
        tagname = tag.name.lower()
    except AttributeError:
        if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
            self.add_text(tag, parent_css, {})
        return
    tag_css, tag_pseudo_css = self.tag_css(tag, parent_css=parent_css)
    try: # Skip element if its display attribute is set to none
        if tag_css['display'].lower() == 'none' or \
           tag_css['visibility'].lower() == 'hidden':
            return
    except KeyError:
        pass
    end_page = self.process_page_breaks(tag, tagname, tag_css)
    if tagname in ["title", "script", "meta", 'del', 'frameset']:
        pass
    elif tagname == 'a' and self.link_levels >= 0:
        if tag.has_key('href') and not self.link_exclude.match(tag['href']):
            purl = urlparse(tag['href'])
            path = unquote(purl[2])
            ext = os.path.splitext(path)[1]
            if ext: ext = ext[1:].lower()
            if path and os.access(path, os.R_OK) and ext and \
               ext in ['png', 'jpg', 'bmp', 'jpeg']:
                # Link to a readable image file: embed the image itself
                self.process_image(path, tag_css)
            else:
                text = self.get_text(tag, limit=1000)
                if not text.strip():
                    text = "Link"
                self.add_text(text, tag_css, {}, force_span_use=True)
                self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
                if tag.has_key('id') or tag.has_key('name'):
                    key = 'name' if tag.has_key('name') else 'id'
                    self.targets[self.target_prefix+tag[key]] = self.current_block
        elif tag.has_key('name') or tag.has_key('id'):
            key = 'name' if tag.has_key('name') else 'id'
            name = tag[key].replace('#', '')
            if self.anchor_to_previous:
                self.process_children(tag, tag_css, tag_pseudo_css)
                for c in self.anchor_to_previous.contents:
                    if isinstance(c, (TextBlock, ImageBlock)):
                        # Use the '#'-stripped name, like every other
                        # registration of this anchor below
                        self.targets[self.target_prefix+name] = c
                        return
                tb = self.book.create_text_block()
                tb.Paragraph(" ")
                self.anchor_to_previous.append(tb)
                self.targets[self.target_prefix+name] = tb
                return
            previous = self.current_block
            self.process_children(tag, tag_css, tag_pseudo_css)
            target = None
            if self.current_block == previous:
                if self.current_para.has_text():
                    self.current_para.append_to(self.current_block)
                    self.current_para = Paragraph()
                    target = self.current_block
                else: # Empty <a> element
                    self.current_page.append(self.current_block)
                    self.current_block = self.book.create_text_block(
                                                textStyle=self.current_block.textStyle,
                                                blockStyle=self.current_block.blockStyle)
                    target = self.book.create_text_block()
                    self.current_page.append(target)
            else:
                # The children started a new block: target the first item
                # on the page that follows the block we started from
                found = False
                for item in self.current_page.contents:
                    if item == previous:
                        found = True
                        continue
                    if found:
                        target = item
                        break
                if target and not isinstance(target, (TextBlock, ImageBlock)):
                    if isinstance(target, RuledLine):
                        target = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                             blockStyle=self.current_block.blockStyle)
                        target.Paragraph(' ')
                        self.current_page.append(target)
                    else:
                        target = BlockSpace()
                        self.current_page.append(target)
                if target == None:
                    if self.current_block.has_text():
                        target = self.current_block
                    else:
                        target = BlockSpace()
                        self.current_page.append(target)
            self.targets[self.target_prefix+name] = target
    elif tagname == 'img':
        if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
            path = os.path.abspath(unquote(tag['src']))
            width, height = None, None
            try:
                width = int(tag['width'])
                height = int(tag['height'])
            except:
                pass
            dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
            self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
        else:
            self.logger.debug("Failed to process: %s", str(tag))
    elif tagname in ['style', 'link']:
        def update_css(ncss, ocss):
            # Merge the new rules into the old ones, selector by selector
            for key in ncss.keys():
                if key in ocss:
                    ocss[key].update(ncss[key])
                else:
                    ocss[key] = ncss[key]
        ncss, npcss = {}, {}
        if tagname == 'style':
            for c in tag.contents:
                if isinstance(c, NavigableString):
                    css, pcss = self.parse_css(str(c))
                    ncss.update(css)
                    npcss.update(pcss)
        elif tag.has_key('type') and tag['type'] == "text/css" \
                and tag.has_key('href'):
            purl = urlparse(tag['href'])
            path = unquote(purl[2])
            try:
                f = open(path, 'rb')
                try:
                    src = f.read()
                finally:
                    # Close the stylesheet even when reading fails
                    f.close()
                match = self.PAGE_BREAK_PAT.search(src)
                if match and not re.match('avoid', match.group(1), re.IGNORECASE):
                    self.page_break_found = True
                ncss, npcss = self.parse_css(src)
            except IOError:
                pass
        if ncss:
            update_css(ncss, self.css)
        if npcss:
            # Must merge the accumulated pseudo rules (npcss); the loop
            # variable pcss is unbound for <link> and only holds the last
            # child's rules for <style>
            update_css(npcss, self.pseudo_css)
    elif tagname == 'pre':
        self.end_current_para()
        self.end_current_block()
        self.current_block = self.book.create_text_block()
        self.current_block.textStyle.attrs['parindent'] = '0'
        if tag.contents:
            c = tag.contents[0]
            if isinstance(c, NavigableString):
                c = str(c).replace('\r\n', '\n').replace('\r', '\n')
                if c.startswith('\n'):
                    # Drop the newline that directly follows <pre>
                    c = c[1:]
                    tag.contents[0] = NavigableString(c)
                    tag.contents[0].setup(tag)
        self.process_children(tag, tag_css, tag_pseudo_css)
        self.end_current_block()
    elif tagname in ['ul', 'ol', 'dl']:
        self.list_level += 1
        if tagname == 'ol':
            old_counter = self.list_counter
            self.list_counter = 1
        prev_bs = self.current_block.blockStyle
        self.end_current_block()
        attrs = self.current_block.blockStyle.attrs
        attrs = attrs.copy()
        attrs['sidemargin'] = self.list_indent*self.list_level
        bs = self.book.create_block_style(**attrs)
        self.current_block = self.book.create_text_block(
                                    blockStyle=bs,
                                    textStyle=self.unindented_style)
        self.process_children(tag, tag_css, tag_pseudo_css)
        self.end_current_block()
        self.current_block.blockStyle = prev_bs
        self.list_level -= 1
        if tagname == 'ol':
            self.list_counter = old_counter
    elif tagname in ['li', 'dt', 'dd']:
        margin = self.list_indent*self.list_level
        if tagname == 'dd':
            margin += 80
        if int(self.current_block.blockStyle.attrs['sidemargin']) != margin:
            self.end_current_block()
            attrs = self.current_block.blockStyle.attrs
            attrs = attrs.copy()
            attrs['sidemargin'] = margin
            attrs['blockwidth'] = int(attrs['blockwidth']) + margin
            bs = self.book.create_block_style(**attrs)
            self.current_block = self.book.create_text_block(
                                        blockStyle=bs,
                                        textStyle=self.unindented_style)
        if self.current_para.has_text():
            self.line_break()
            self.current_block.append(self.current_para)
        self.current_para = Paragraph()
        self.previous_text = '\n'
        if tagname == 'li':
            # Numbered only when the nearest enclosing list is an <ol>
            in_ol, parent = True, tag.parent
            while parent:
                if parent.name and parent.name.lower() in ['ul', 'ol']:
                    in_ol = parent.name.lower() == 'ol'
                    break
                parent = parent.parent
            prepend = str(self.list_counter)+'. ' if in_ol else u'\u2022' + ' '
            self.current_para.append(Span(prepend))
            self.process_children(tag, tag_css, tag_pseudo_css)
            if in_ol:
                self.list_counter += 1
        else:
            self.process_children(tag, tag_css, tag_pseudo_css)
    elif tagname == 'blockquote':
        self.current_para.append_to(self.current_block)
        self.current_block.append_to(self.current_page)
        pb = self.current_block
        self.current_para = Paragraph()
        ts = self.book.create_text_style()
        ts.attrs['parindent'] = 0
        try:
            index = self.text_styles.index(ts)
            ts = self.text_styles[index]
        except ValueError:
            self.text_styles.append(ts)
        bs = self.book.create_block_style()
        bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
            60, 20, 20
        try:
            index = self.block_styles.index(bs)
            bs = self.block_styles[index]
        except ValueError:
            self.block_styles.append(bs)
        self.current_block = self.book.create_text_block(
                                blockStyle=bs, textStyle=ts)
        self.previous_text = '\n'
        self.process_children(tag, tag_css, tag_pseudo_css)
        self.current_para.append_to(self.current_block)
        self.current_block.append_to(self.current_page)
        self.current_para = Paragraph()
        # Continue after the quote with the styles of the block before it
        self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
                                                         blockStyle=pb.blockStyle)
    elif tagname in ['sub', 'sup']:
        text = self.get_text(tag)
        elem = Sub if tagname == 'sub' else Sup
        self.current_para.append(elem(text))
    elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        new_block = self.process_block(tag, tag_css)
        if self.anchor_ids and tag.has_key('id'):
            tkey = self.target_prefix+tag['id']
            if not new_block:
                self.end_current_block()
            self.current_block.must_append = True
            self.targets[tkey] = self.current_block
        src = self.get_text(tag, limit=1000)
        if not self.disable_chapter_detection and tagname.startswith('h'):
            if self.chapter_regex.search(src):
                self.logger.debug('Detected chapter %s', src)
                self.end_page()
                self.page_break_found = True
        if not tag.contents:
            self.current_block.append(CR())
            return
        if self.current_para.has_text():
            self.current_para.append_to(self.current_block)
        if self.current_block.contents:
            self.current_block.append(CR())
        self.previous_text = '\n'
        self.current_para = Paragraph()
        self.process_children(tag, tag_css, tag_pseudo_css)
        if self.current_para.contents :
            self.current_block.append(self.current_para)
        self.current_para = Paragraph()
        if tagname.startswith('h') or self.blank_after_para:
            self.current_block.append(CR())
    elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
        self.process_children(tag, tag_css, tag_pseudo_css)
    elif tagname == 'font':
        if tag.has_key('face'):
            tag_css['font-family'] = tag['face']
        self.process_children(tag, tag_css, tag_pseudo_css)
    elif tagname in ['br']:
        self.line_break()
        self.previous_text = '\n'
    elif tagname in ['hr', 'tr']: # tr needed for nested tables
        self.end_current_block()
        if tagname == 'hr':
            self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
        self.previous_text = '\n'
        self.process_children(tag, tag_css, tag_pseudo_css)
    elif tagname == 'td': # Needed for nested tables
        if not self.in_table:
            self.current_para.append(' ')
            self.previous_text = ' '
        self.process_children(tag, tag_css, tag_pseudo_css)
    elif tagname == 'table' and not self.ignore_tables and not self.in_table:
        tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
        try:
            self.process_table(tag, tag_css)
        except Exception:
            err = sys.exc_info()[1]
            self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
            self.logger.debug('', exc_info=True)
            self.logger.debug('Bad table:\n%s', str(tag)[:300])
            # Fall back to rendering the table's contents as ordinary flow
            self.in_table = False
            self.process_children(tag, tag_css, tag_pseudo_css)
        finally:
            tag.extract()
    else:
        self.process_children(tag, tag_css, tag_pseudo_css)
    if end_page:
        self.end_page()
def process_table(self, tag, tag_css):
    '''
    Lay out the table rooted at C{tag} and append it to the current page.

    The table is rendered through Table.blocks(), which yields
    (block, xpos, ypos, delta) tuples. A tuple with a false block opens a
    new Canvas of height ypos+rowpad; every other tuple places its block
    on the most recently opened Canvas. An Exception is raised when a new
    canvas is requested with ypos larger than the page's text height.
    '''
    self.end_current_block()
    self.current_block = self.book.create_text_block()
    rowpad = 10
    layout = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
    page_attrs = self.current_page.pageStyle.attrs
    canvases = []
    for block, xpos, ypos, delta in layout.blocks(int(page_attrs['textwidth']),
                                                  int(page_attrs['textheight'])):
        if block:
            canvases[-1].put_object(block, xpos + int(delta/2.), ypos)
            continue
        if ypos > int(page_attrs['textheight']):
            raise Exception('Table has cell that is too large')
        width = int(self.current_page.pageStyle.attrs['textwidth'])
        canvases.append(Canvas(width, ypos+rowpad, blockrule='block-fixed'))
    for finished in canvases:
        self.current_page.append(finished)
    self.end_current_block()
def remove_unused_target_blocks(self):
    '''
    Detach every block listed in self.unused_target_blocks from its parent:
    the block is removed from the parent's contents and its parent
    reference is cleared.
    '''
    for orphan in self.unused_target_blocks:
        siblings = orphan.parent.contents
        siblings.remove(orphan)
        orphan.parent = None
def writeto(self, path, lrs=False):
    '''
    Remove the unused target blocks and render the book to C{path}:
    via renderLrs when C{lrs} is true, via renderLrf otherwise.
    '''
    self.remove_unused_target_blocks()
    if lrs:
        self.book.renderLrs(path)
    else:
        self.book.renderLrf(path)
def cleanup(self):
    '''
    Call __del__() on every value of self.scaled_images and then on every
    value of self.rotated_images.
    '''
    for image_map in (self.scaled_images, self.rotated_images):
        for _file in image_map.values():
            _file.__del__()
def process_file(path, options, logger=None):
    '''
    Convert the HTML file at C{path} and return the absolute path of the
    LRF (or LRS, when options.lrs is set) file that was written.

    path    -- path of the HTML file; http:// and https:// URLs are rejected
    options -- parsed conversion options. Several attributes are rewritten
               in place (cover, title, the regular-expression options are
               replaced by compiled patterns, anchor_ids gets a default).
    logger  -- logger to use; when None the 'html2lrf' logger is set up
               with DEBUG or INFO level depending on options.verbose.

    Raises ConversionError for URL input and for an unreadable cover.
    The working directory in effect on entry is restored on exit.
    '''
    if re.match('http://|https://', path):
        raise ConversionError('You have to save the website %s as an html file first and then run html2lrf on it.'%(path,))
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('html2lrf')
        setup_cli_handlers(logger, level)
    cwd = os.getcwd()
    default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
    dirpath = os.path.dirname(path)
    try:
        cpath, tpath = '', ''
        # Fill in metadata that was not given explicitly from an OPF file
        try_opf(path, options, logger)
        if options.cover:
            # Prefer a cover next to the HTML file, else the path as given
            cpath = os.path.join(dirpath, os.path.basename(options.cover))
            if not os.path.exists(cpath):
                cpath = os.path.abspath(os.path.expanduser(options.cover))
            options.cover = cpath
            if os.access(options.cover, os.R_OK):
                from libprs500.devices.prs500.driver import PRS500
                im = PILImage.open(os.path.join(cwd, cpath))
                # Cover scaled to the screen size of the output profile
                cim = im.resize((options.profile.screen_width,
                                 options.profile.screen_height),
                                PILImage.BICUBIC).convert('RGB')
                cf = PersistentTemporaryFile(prefix=__appname__+"_", suffix=".jpg")
                cf.close()
                cim.save(cf.name)
                cpath = cf.name
                # Thumbnail with a 3:4 width:height ratio
                th = PRS500.THUMBNAIL_HEIGHT
                tim = im.resize((int(0.75*th), th), PILImage.ANTIALIAS).convert('RGB')
                tf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
                tf.close()
                tim.save(tf.name)
                tpath = tf.name
            else:
                raise ConversionError('Cannot read from: %s'% (options.cover,))
        if not options.title:
            options.title = default_title
        title = (options.title, options.title_sort)
        author = (options.author, options.author_sort)
        args = dict(font_delta=options.font_delta, title=title, \
                    author=author, sourceencoding='utf8',\
                    freetext=options.freetext, category=options.category,
                    publisher=options.publisher,
                    booksetting=BookSetting(dpi=10*options.profile.dpi,
                                            screenheight=options.profile.screen_height,
                                            screenwidth=options.profile.screen_width))
        if tpath:
            args['thumbnail'] = tpath
        header = None
        if options.header:
            header = Paragraph()
            fheader = options.headerformat
            # Expand %t / %a unless escaped as %%t / %%a. The replacement is
            # a callable so that the title/author are inserted literally:
            # a replacement *string* would have backslashes and digits in
            # them interpreted as escapes and group references.
            fheader = re.sub(r'(?<!%)%t', lambda m: options.title, fheader)
            fheader = re.sub(r'(?<!%)%a', lambda m: options.author, fheader)
            fheader = re.sub(r'%%a','%a',fheader)
            fheader = re.sub(r'%%t','%t',fheader)
            header.append(fheader + " ")
        book, fonts = Book(options, logger, header=header, **args)
        # A missing pattern is replaced by '$'
        le = re.compile(options.link_exclude) if options.link_exclude else \
            re.compile('$')
        pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
            re.compile('$')
        fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
            re.compile('$')
        options.cover = cpath
        options.force_page_break = fpb
        options.link_exclude = le
        options.page_break = pb
        options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
        # Expected form: three comma separated fields
        fpba = options.force_page_break_attr.split(',')
        if len(fpba) != 3:
            fpba = ['$', '', '$']
        options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                         re.compile(fpba[2], re.IGNORECASE)]
        if not hasattr(options, 'anchor_ids'):
            options.anchor_ids = True
        conv = HTMLConverter(book, fonts, options, logger, path)
        oname = options.output
        if not oname:
            # Default output: input base name with .lrs/.lrf in the cwd
            suffix = '.lrs' if options.lrs else '.lrf'
            name = os.path.splitext(os.path.basename(path))[0] + suffix
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
        logger.info('Output written to %s', oname)
        conv.cleanup()
        return oname
    finally:
        os.chdir(cwd)
def try_opf(path, options, logger):
    '''
    Best-effort import of metadata from the first *.opf file found in the
    directory of C{path}. Does nothing when there is no such file.

    Only values that are still unset in C{options} are filled in: title,
    author/author_sort (when the author is 'Unknown'), publisher (when
    'Unknown'), category and cover. A cover candidate is accepted only if
    PIL can open it. Any error while reading or interpreting the OPF file
    is logged through C{logger} and otherwise ignored.
    '''
    try:
        opf_path = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
    except IndexError:
        return
    stream = open(opf_path, 'rb')
    try:
        # Constructing the reader is inside the try block as well, so that
        # a malformed OPF file is logged instead of aborting the conversion
        opf = OPFReader(stream)
        title = opf.title
        if title and not options.title:
            options.title = title
        if options.author == 'Unknown':
            if opf.authors:
                options.author = ', '.join(opf.authors)
            if opf.author_sort:
                options.author_sort = opf.author_sort
        if options.publisher == 'Unknown':
            publisher = opf.publisher
            if publisher:
                options.publisher = publisher
        if not options.category:
            category = opf.category
            if category:
                options.category = category
        if not options.cover:
            cover = opf.cover
            if cover:
                cover = os.path.join(os.path.dirname(path), cover)
                if os.access(cover, os.R_OK):
                    try:
                        PILImage.open(cover)
                        options.cover = cover
                    except Exception:
                        # Not an image PIL can open: keep looking
                        pass
        if not options.cover:
            # Try the cover file name prefixes suggested by the OPF reader
            for prefix in opf.possible_cover_prefixes():
                if options.cover:
                    break
                for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']:
                    cpath = os.path.join(os.path.dirname(path), prefix+suffix)
                    try:
                        PILImage.open(cpath)
                        options.cover = cpath
                        break
                    except Exception:
                        continue
    except Exception:
        logger.exception('Failed to process opf file')
    finally:
        # Always release the OPF file handle
        stream.close()
def option_parser():
    '''
    Return the LRF option parser configured with the usage text of the
    HTML converter.
    '''
    usage = ('''Usage: %prog [options] mybook.html\n\n'''
             '''%prog converts mybook.html to mybook.lrf''')
    return lrf_option_parser(usage)
def main(args=sys.argv):
    '''
    Command line entry point. C{args} is the full argument vector, program
    name included; exactly one positional argument (the HTML file) is
    expected.

    Returns 1 when the arguments are unusable (help is printed) or when
    option handling raises (the error is printed to stderr), otherwise
    converts the file with process_file and returns 0.
    '''
    try:
        parser = option_parser()
        options, args = parser.parse_args(args)
        if options.output:
            expanded = os.path.expanduser(options.output)
            options.output = os.path.abspath(expanded)
        if len(args) != 2:
            parser.print_help()
            return 1
        src = args[1]
        if options.verbose:
            import warnings
            warnings.defaultaction = 'error'
    except Exception:
        failure = sys.exc_info()[1]
        sys.stderr.write(str(failure) + '\n')
        return 1
    process_file(src, options)
    return 0
if __name__ == '__main__':
    # Run as a script: the process exit status is the value main() returns
    exit_status = main()
    sys.exit(exit_status)