## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.

I am indebted to esperanc for the initial CSS->Xylog Style conversion routines
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy, glob
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor
try:
    from PIL import Image as PILImage
except ImportError:
    import Image as PILImage

from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, \
     Comment, Tag, NavigableString, Declaration, ProcessingInstruction
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, \
     TextBlock, ImageBlock, JumpButton, CharButton, Bold, Space, \
     Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile

class Span(_Span):
|
|
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
|
|
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
|
|
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
|
|
rules = zip(patterns, targets)
|
|
|
|
|
|
@staticmethod
|
|
def unit_convert(val, ref=80):
|
|
"""
|
|
Tries to convert html units stored in C{val} to pixels. C{ref} contains
|
|
the reference value for relative units. Returns the number of pixels
|
|
(an int) if successful. Otherwise, returns None.
|
|
Assumes: 1 pixel is 1/4 mm. One em is 10pts
|
|
"""
|
|
result = None
|
|
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
|
|
if m is not None:
|
|
unit = float(m.group(1))
|
|
if m.group(2) == '%':
|
|
result = int(unit/100.0*ref)
|
|
elif m.group(2) == 'px':
|
|
result = int(unit)
|
|
elif m.group(2) == 'in':
|
|
result = int(unit * 25.4 * 4)
|
|
elif m.group(2) == 'pt':
|
|
result = int(unit * 25.4 * 4 / 72)
|
|
elif m.group(2)== 'em':
|
|
result = int(unit * 25.4 * 4 / 72 * 10)
|
|
elif m.group(2)== 'pc':
|
|
result = int(unit * 25.4 * 4 / 72 * 12)
|
|
elif m.group(2)== 'mm':
|
|
result = int(unit * 4)
|
|
elif m.group(2)== 'cm':
|
|
result = int(unit * 10 * 4)
|
|
return result
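    # Illustrative sketch of what unit_convert returns, given the 1 px == 1/4 mm
    # and 1 em == 10 pt assumptions documented above (reference only, not called
    # anywhere):
    #   Span.unit_convert('10pt')        -> 14    (10 * 25.4 * 4 / 72)
    #   Span.unit_convert('1in')         -> 101   (1 * 25.4 * 4)
    #   Span.unit_convert('50%', ref=80) -> 40    (50% of the reference value)
    #   Span.unit_convert('huge')        -> None  (unrecognized unit)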
|
|
|
|
@staticmethod
|
|
def translate_attrs(d, font_delta=0, memory=None):
|
|
"""
|
|
Receives a dictionary of html attributes and styles and returns
|
|
approximate Xylog equivalents in a new dictionary
|
|
"""
|
|
def font_weight(val):
|
|
ans = None
|
|
m = re.search("([0-9]+)", val)
|
|
if m:
|
|
ans = str(int(m.group(1)))
|
|
elif val.find("bold") >= 0 or val.find("strong") >= 0:
|
|
ans = "1000"
|
|
return ans
|
|
|
|
def font_family(val):
|
|
ans = None
|
|
if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
|
|
ans = "Courier10 BT Roman"
|
|
elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
|
|
val.find("trebuchet"), val.find("sans")) >= 0:
|
|
ans = "Swis721 BT Roman"
|
|
return ans
|
|
|
|
def font_size(val):
|
|
ans = None
|
|
unit = Span.unit_convert(val, 14)
|
|
if unit:
|
|
# Assume a 10 pt font (14 pixels) has fontsize 100
|
|
ans = int (unit / 14.0 * 100)
|
|
else:
|
|
if "xx-small" in val:
|
|
ans = 40
|
|
elif "x-small" in val >= 0:
|
|
ans = 60
|
|
elif "small" in val:
|
|
ans = 80
|
|
elif "xx-large" in val:
|
|
ans = 180
|
|
elif "x-large" in val >= 0:
|
|
ans = 140
|
|
elif "large" in val >= 0:
|
|
ans = 120
|
|
if ans is not None:
|
|
ans += font_delta * 20
|
|
ans = str(ans)
|
|
return ans
|
|
|
|
t = dict()
|
|
for key in d.keys():
|
|
val = d[key].lower()
|
|
if key == 'font':
|
|
val = val.split()
|
|
val.reverse()
|
|
for sval in val:
|
|
ans = font_family(sval)
|
|
if ans:
|
|
t['fontfacename'] = ans
|
|
else:
|
|
ans = font_size(sval)
|
|
if ans:
|
|
t['fontsize'] = ans
|
|
else:
|
|
ans = font_weight(sval)
|
|
if ans:
|
|
t['fontweight'] = ans
|
|
elif key in ['font-family', 'font-name']:
|
|
ans = font_family(val)
|
|
if ans:
|
|
t['fontfacename'] = ans
|
|
elif key == "font-size":
|
|
ans = font_size(val)
|
|
if ans:
|
|
t['fontsize'] = ans
|
|
elif key == 'font-weight':
|
|
ans = font_weight(val)
|
|
if ans:
|
|
t['fontweight'] = ans
|
|
if int(ans) > 140:
|
|
t['wordspace'] = '50'
|
|
else:
|
|
report = True
|
|
if memory != None:
|
|
if key in memory:
|
|
report = False
|
|
else:
|
|
memory.append(key)
|
|
if report:
|
|
print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
|
|
return t
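    # For reference, a typical mapping performed by translate_attrs with
    # font_delta=0 (values follow from the helpers above; illustrative only):
    #   {'font-weight': 'bold', 'font-size': '12pt'}
    #       -> {'fontweight': '1000', 'wordspace': '50', 'fontsize': '114'}
    # Unrecognized keys are reported once and remembered in `memory` so repeated
    # occurrences do not flood stderr.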
|
|
|
|
def __init__(self, ns, css, memory, font_delta=0):
|
|
src = ns.string if hasattr(ns, 'string') else ns
|
|
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
|
|
for pat, repl in Span.rules:
|
|
src = pat.sub(repl, src)
|
|
if not src:
|
|
raise ConversionError('No point in adding an empty string to a Span')
|
|
if 'font-style' in css.keys():
|
|
fs = css.pop('font-style')
|
|
if fs.lower() == 'italic':
|
|
src = Italic(src)
|
|
attrs = Span.translate_attrs(css, font_delta=font_delta, memory=memory)
|
|
if 'fontsize' in attrs.keys():
|
|
attrs['baselineskip'] = int(attrs['fontsize']) + 20
|
|
_Span.__init__(self, text=src, **attrs)
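    # A Span is typically built from a NavigableString plus the CSS computed for
    # its enclosing tag, e.g. (illustrative):
    #   Span(u'Hello world', {'font-weight': 'bold'}, [], font_delta=0)
    # 'font-style: italic' is handled here by wrapping the text in Italic() before
    # the remaining properties are translated by translate_attrs().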
|
|
|
|
|
|
|
|
class HTMLConverter(object):
|
|
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
|
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
|
|
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
|
|
# Fix <a /> elements
|
|
MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"), #Close <a /> tags
|
|
lambda match: match.group(1)+"></a>"),
|
|
# Strip comments from <style> tags. This is needed as
|
|
# sometimes there are unterminated comments
|
|
(re.compile(r"<\s*style.*?>(.*?)<\/\s*style\s*>", re.DOTALL|re.IGNORECASE),
|
|
lambda match: match.group().replace('<!--', '').replace('-->', '')),
|
|
]
|
|
# Fix Baen markup
|
|
BAEN_SANCTIFY = [(re.compile(r'<\s*[Aa]\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*<\/[Aa]>'),
|
|
lambda match: ''),
|
|
(re.compile(r'page-break-before:\s*\w+([\s;\}])'),
|
|
lambda match: match.group(1)) ]
|
|
|
|
class Link(object):
|
|
def __init__(self, para, tag):
|
|
self.para = para
|
|
self.tag = tag
|
|
|
|
processed_files = {} #: Files that have been processed
|
|
|
|
def __init__(self, book, path,
|
|
font_delta=0, verbose=False, cover=None,
|
|
max_link_levels=sys.maxint, link_level=0,
|
|
is_root=True, baen=False, chapter_detection=True,
|
|
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
|
|
link_exclude=re.compile('$'),
|
|
page_break=re.compile('h[12]', re.IGNORECASE),
|
|
profile=PRS500_PROFILE,
|
|
disable_autorotation=False):
|
|
'''
|
|
Convert HTML file at C{path} and add it to C{book}. After creating
|
|
the object, you must call L{self.process_links} on it to create the links and
|
|
then L{self.writeto} to output the LRF/S file.
|
|
|
|
@param book: The LRF book
|
|
@type book: L{libprs500.lrf.pylrs.Book}
|
|
@param path: path to the HTML file to process
|
|
@type path: C{str}
|
|
@param font_delta: The amount in pts by which all fonts should be changed
|
|
@type font_delta: C{int}
|
|
@param verbose: Whether processing should be verbose or not
|
|
@type verbose: C{bool}
|
|
@param cover: Path to an image to use as the cover of this book
|
|
@type cover: C{str}
|
|
@param max_link_levels: Number of link levels to process recursively
|
|
@type max_link_levels: C{int}
|
|
@param link_level: Current link level
|
|
@type link_level: C{int}
|
|
@param is_root: True iff this object is converting the root HTML file
|
|
@type is_root: C{bool}
|
|
@param chapter_detection: Insert page breaks before what looks like
|
|
the start of a chapter
|
|
@type chapter_detection: C{bool}
|
|
@param chapter_regex: The compiled regular expression used to search for chapter titles
|
|
@param link_exclude: Compiled regex. Matching hrefs are ignored.
|
|
@param page_break: Compiled regex. Page breaks are inserted before matching
|
|
tags if no page-breaks are found and no chapter headings
|
|
are detected.
|
|
@param profile: Defines the geometry of the display device
|
|
@param disable_autorotation: Don't autorotate very wide images
|
|
'''
|
|
# Defaults for various formatting tags
|
|
self.css = dict(
|
|
h1 = {"font-size" :"xx-large", "font-weight":"bold", 'text-indent':'0pt'},
|
|
h2 = {"font-size" :"x-large", "font-weight":"bold", 'text-indent':'0pt'},
|
|
h3 = {"font-size" :"large", "font-weight":"bold", 'text-indent':'0pt'},
|
|
h4 = {"font-size" :"large", 'text-indent':'0pt'},
|
|
h5 = {"font-weight" :"bold", 'text-indent':'0pt'},
|
|
b = {"font-weight" :"bold"},
|
|
strong = {"font-weight" :"bold"},
|
|
i = {"font-style" :"italic"},
|
|
em = {"font-style" :"italic"},
|
|
small = {'font-size' :'small'},
|
|
pre = {'font-family' :'monospace' },
|
|
tt = {'font-family' :'monospace'},
|
|
center = {'text-align' : 'center'}
|
|
)
|
|
self.profile = profile #: Defines the geometry of the display device
|
|
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
|
|
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
|
|
self.link_exclude = link_exclude #: Ignore matching hrefs
|
|
self.scaled_images = {} #: Temporary files with scaled version of images
|
|
self.rotated_images = {} #: Temporary files with rotated version of images
|
|
self.max_link_levels = max_link_levels #: Number of link levels to process recursively
|
|
self.link_level = link_level #: Current link level
|
|
self.disable_autorotation = disable_autorotation
|
|
self.blockquote_style = book.create_block_style(sidemargin=60,
|
|
topskip=20, footskip=20)
|
|
self.unindented_style = book.create_text_style(parindent=0)
|
|
self.page_break = page_break #: Regex controlling forced page-break behavior
|
|
self.text_styles = []#: Keep track of already used textstyles
|
|
self.block_styles = []#: Keep track of already used blockstyles
|
|
self.images = {} #: Images referenced in the HTML document
|
|
self.targets = {} #: <a name=...> elements
|
|
self.links = [] #: <a href=...> elements
|
|
self.files = {} #: links that point to other files
|
|
        self.links_processed = False #: Whether process_links has been called on this object
|
|
self.font_delta = font_delta
|
|
# Set by table processing code so that any <a name> within the table
|
|
# point to the previous element
|
|
self.anchor_to_previous = None
|
|
self.cover = cover
|
|
self.in_table = False
|
|
        self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported
|
|
self.in_ol = False #: Flag indicating we're in an <ol> element
|
|
self.book = book #: The Book object representing a BBeB book
|
|
self.is_root = is_root #: Are we converting the root HTML file
|
|
self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
|
|
path = os.path.abspath(path)
|
|
os.chdir(os.path.dirname(path))
|
|
self.file_name = os.path.basename(path)
|
|
print "Processing", self.file_name
|
|
print '\tParsing HTML...',
|
|
sys.stdout.flush()
|
|
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
|
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
|
|
self.baen = baen
|
|
if baen:
|
|
nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
|
|
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
|
|
convertEntities=BeautifulSoup.HTML_ENTITIES,
|
|
markupMassage=nmassage)
|
|
print 'done\n\tConverting to BBeB...',
|
|
sys.stdout.flush()
|
|
self.verbose = verbose
|
|
self.current_page = None
|
|
self.current_para = None
|
|
self.current_style = {}
|
|
self.page_break_found = False
|
|
match = self.PAGE_BREAK_PAT.search(unicode(self.soup))
|
|
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
|
|
self.page_break_found = True
|
|
self.parse_file()
|
|
HTMLConverter.processed_files[path] = self
|
|
print 'done'
|
|
|
|
def parse_css(self, style):
|
|
"""
|
|
Parse the contents of a <style> tag or .css file.
|
|
@param style: C{str(style)} should be the CSS to parse.
|
|
@return: A dictionary with one entry per selector where the key is the
|
|
selector name and the value is a dictionary of properties
|
|
"""
|
|
sdict = dict()
|
|
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
|
|
for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
|
|
for key in sel[0].split(','):
|
|
key = key.strip().lower()
|
|
val = self.parse_style_properties(sel[1])
|
|
if key in sdict:
|
|
sdict[key].update(val)
|
|
else:
|
|
sdict[key] = val
|
|
return sdict
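    # Example of the shape returned (illustrative; /*...*/ comments are stripped
    # first and selectors are lowercased):
    #   parse_css("h1, .title { font-size: x-large; font-weight: bold }")
    #     -> {'h1':     {'font-size': 'x-large', 'font-weight': 'bold'},
    #         '.title': {'font-size': 'x-large', 'font-weight': 'bold'}}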
|
|
|
|
def parse_style_properties(self, props):
|
|
"""
|
|
Parses a style attribute. The code within a CSS selector block or in
|
|
the style attribute of an HTML element.
|
|
@return: A dictionary with one entry for each property where the key
|
|
is the property name and the value is the property value.
|
|
"""
|
|
prop = dict()
|
|
for s in props.split(';'):
|
|
l = s.split(':',1)
|
|
if len(l)==2:
|
|
key = str(l[0].strip()).lower()
|
|
val = l[1].strip()
|
|
prop [key] = val
|
|
return prop
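    # e.g. (illustrative)
    #   parse_style_properties("Font-Weight: bold; text-align: center ")
    #     -> {'font-weight': 'bold', 'text-align': 'center'}
    # Keys are lowercased; values are only stripped of surrounding whitespace.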
|
|
|
|
def tag_css(self, tag, parent_css={}):
|
|
"""
|
|
Return a dictionary of style properties applicable to Tag tag.
|
|
"""
|
|
def merge_parent_css(prop, pcss):
|
|
temp = {}
|
|
for key in pcss.keys():
|
|
chk = key.lower()
|
|
# float should not be inherited according to the CSS spec
|
|
# however we need to as we don't do alignment at a block level.
|
|
# float is removed by the process_alignment function.
|
|
if chk.startswith('font') or chk == 'text-align' or \
|
|
chk == 'float':
|
|
temp[key] = pcss[key]
|
|
prop.update(temp)
|
|
|
|
prop = dict()
|
|
if tag.has_key("align"):
|
|
prop["text-align"] = tag["align"]
|
|
if self.css.has_key(tag.name):
|
|
prop.update(self.css[tag.name])
|
|
if tag.has_key("class"):
|
|
cls = tag["class"].lower()
|
|
for classname in ["."+cls, tag.name+"."+cls]:
|
|
if self.css.has_key(classname):
|
|
prop.update(self.css[classname])
|
|
if parent_css:
|
|
merge_parent_css(prop, parent_css)
|
|
if tag.has_key("style"):
|
|
prop.update(self.parse_style_properties(tag["style"]))
|
|
return prop
|
|
|
|
def parse_file(self):
|
|
def get_valid_block(page):
|
|
for item in page.contents:
|
|
if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
|
|
return item
|
|
previous = self.book.last_page()
|
|
self.current_page = self.book.create_page()
|
|
self.current_block = self.book.create_text_block()
|
|
self.current_para = Paragraph()
|
|
if self.cover:
|
|
self.add_image_page(self.cover)
|
|
self.top = self.current_block
|
|
|
|
self.process_children(self.soup, {})
|
|
|
|
if self.current_para and self.current_block:
|
|
self.current_para.append_to(self.current_block)
|
|
if self.current_block and self.current_page:
|
|
self.current_block.append_to(self.current_page)
|
|
if self.current_page and self.current_page.has_text():
|
|
self.book.append(self.current_page)
|
|
|
|
if not self.top.parent:
|
|
if not previous:
|
|
try:
|
|
previous = self.book.pages()[0]
|
|
except IndexError:
|
|
raise ConversionError, self.file_name + ' does not seem to have any content'
|
|
self.top = get_valid_block(previous)
|
|
if not self.top or not self.top.parent:
|
|
raise ConversionError, self.file_name + ' does not seem to have any content'
|
|
return
|
|
|
|
found = False
|
|
for page in self.book.pages():
|
|
if page == previous:
|
|
found = True
|
|
continue
|
|
if found:
|
|
self.top = get_valid_block(page)
|
|
if not self.top:
|
|
continue
|
|
break
|
|
|
|
if not self.top or not self.top.parent:
|
|
raise ConversionError, 'Could not parse ' + self.file_name
|
|
|
|
|
|
def get_text(self, tag):
|
|
css = self.tag_css(tag)
|
|
if (css.has_key('display') and css['display'].lower() == 'none') or \
|
|
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
|
|
return ''
|
|
text = ''
|
|
for c in tag.contents:
|
|
if isinstance(c, HTMLConverter.IGNORED_TAGS):
|
|
return ''
|
|
if isinstance(c, NavigableString):
|
|
text += str(c)
|
|
elif isinstance(c, Tag):
|
|
if c.name.lower() == 'img' and c.has_key('alt'):
|
|
text += c['alt']
|
|
return text
|
|
text += self.get_text(c)
|
|
return text
|
|
|
|
def process_links(self):
|
|
        def add_toc_entry(text, target):
            # TextBlocks in Canvases have a None parent or an Objects Parent
            if target.parent != None and \
               hasattr(target.parent, 'objId'):
                self.book.addTocEntry(text, target)
            elif self.verbose:
                print "Cannot add link", text, "to TOC"
|
|
|
|
|
|
def get_target_block(fragment, targets):
|
|
'''Return the correct block for the <a name> element'''
|
|
bs = targets[fragment]
|
|
if not isinstance(bs, BlockSpace):
|
|
return bs
|
|
ans, found, page = None, False, bs.parent
|
|
for item in page.contents:
|
|
if found:
|
|
if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
|
|
ans = item
|
|
break
|
|
if item == bs:
|
|
found = True
|
|
continue
|
|
|
|
if not ans:
|
|
for i in range(len(page.contents)-1, -1, -1):
|
|
if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
|
|
ans = page.contents[i]
|
|
break
|
|
|
|
if not ans:
|
|
ntb = self.book.create_text_block()
|
|
ntb.Paragraph(' ')
|
|
page.append(ntb)
|
|
ans = ntb
|
|
|
|
if found:
|
|
targets[fragment] = ans
|
|
page.contents.remove(bs)
|
|
return ans
|
|
|
|
cwd = os.getcwd()
|
|
for link in self.links:
|
|
para, tag = link.para, link.tag
|
|
text = self.get_text(tag)
|
|
# Needed for TOC entries due to bug in LRF
|
|
ascii_text = text.decode('utf8', 'replace').encode('ascii', 'replace')
|
|
if not text:
|
|
text = 'Link'
|
|
img = tag.find('img')
|
|
if img:
|
|
try:
|
|
text = img['alt']
|
|
except KeyError:
|
|
pass
|
|
purl = urlparse(link.tag['href'])
|
|
if purl[1]: # Not a link to a file on the local filesystem
|
|
continue
|
|
path, fragment = unquote(purl[2]), purl[5]
|
|
if not path or os.path.basename(path) == self.file_name:
|
|
if fragment in self.targets.keys():
|
|
tb = get_target_block(fragment, self.targets)
|
|
if self.is_root:
|
|
add_toc_entry(ascii_text, tb)
|
|
sys.stdout.flush()
|
|
jb = JumpButton(tb)
|
|
self.book.append(jb)
|
|
cb = CharButton(jb, text=text)
|
|
para.contents = []
|
|
para.append(cb)
|
|
elif self.link_level < self.max_link_levels:
|
|
                try: # os.access raises Exceptions if path has null bytes
|
|
if not os.access(path.encode('utf8', 'replace'), os.R_OK):
|
|
raise Exception()
|
|
except Exception:
|
|
if self.verbose:
|
|
print "Skipping", link
|
|
continue
|
|
path = os.path.abspath(path)
|
|
if not path in HTMLConverter.processed_files.keys():
|
|
try:
|
|
self.files[path] = HTMLConverter(self.book, path,
|
|
profile=self.profile,
|
|
font_delta=self.font_delta, verbose=self.verbose,
|
|
link_level=self.link_level+1,
|
|
max_link_levels=self.max_link_levels,
|
|
is_root = False, baen=self.baen,
|
|
chapter_detection=self.chapter_detection,
|
|
chapter_regex=self.chapter_regex,
|
|
link_exclude=self.link_exclude,
|
|
page_break=self.page_break,
|
|
disable_autorotation=self.disable_autorotation)
|
|
HTMLConverter.processed_files[path] = self.files[path]
|
|
except Exception:
|
|
print >>sys.stderr, 'Unable to process', path
|
|
if self.verbose:
|
|
traceback.print_exc()
|
|
continue
|
|
finally:
|
|
os.chdir(cwd)
|
|
else:
|
|
self.files[path] = HTMLConverter.processed_files[path]
|
|
conv = self.files[path]
|
|
if fragment in conv.targets.keys():
|
|
tb = get_target_block(fragment, conv.targets)
|
|
else:
|
|
tb = conv.top
|
|
if self.is_root:
|
|
add_toc_entry(ascii_text, tb)
|
|
jb = JumpButton(tb)
|
|
self.book.append(jb)
|
|
cb = CharButton(jb, text=text)
|
|
para.contents = []
|
|
para.append(cb)
|
|
|
|
self.links_processed = True
|
|
|
|
for path in self.files.keys():
|
|
if self.files[path].links_processed:
|
|
continue
|
|
try:
|
|
os.chdir(os.path.dirname(path))
|
|
self.files[path].process_links()
|
|
finally:
|
|
os.chdir(cwd)
|
|
|
|
def end_page(self):
|
|
"""
|
|
End the current page, ensuring that any further content is displayed
|
|
on a new page.
|
|
"""
|
|
self.current_para.append_to(self.current_block)
|
|
self.current_para = Paragraph()
|
|
self.current_block.append_to(self.current_page)
|
|
self.current_block = self.book.create_text_block()
|
|
if self.current_page.has_text():
|
|
self.book.append(self.current_page)
|
|
self.current_page = self.book.create_page()
|
|
|
|
|
|
def add_image_page(self, path):
|
|
if os.access(path, os.R_OK):
|
|
self.end_page()
|
|
page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
|
|
topmargin=0, textwidth=self.profile.screen_width,
|
|
headheight=0, headsep=0, footspace=0,
|
|
footheight=0,
|
|
textheight=self.profile.screen_height)
|
|
if not self.images.has_key(path):
|
|
self.images[path] = ImageStream(path)
|
|
ib = ImageBlock(self.images[path])
|
|
page.append(ib)
|
|
self.book.append(page)
|
|
|
|
def process_children(self, ptag, pcss):
|
|
""" Process the children of ptag """
|
|
for c in ptag.contents:
|
|
if isinstance(c, HTMLConverter.IGNORED_TAGS):
|
|
continue
|
|
elif isinstance(c, Tag):
|
|
self.parse_tag(c, pcss)
|
|
elif isinstance(c, NavigableString):
|
|
self.add_text(c, pcss)
|
|
|
|
def process_alignment(self, css):
|
|
'''
|
|
Create a new TextBlock only if necessary as indicated by css
|
|
@type css: dict
|
|
'''
|
|
align = 'head'
|
|
if css.has_key('text-align'):
|
|
val = css['text-align'].lower()
|
|
if val in ["right", "foot"]:
|
|
align = "foot"
|
|
elif val == "center":
|
|
align = "center"
|
|
if css.has_key('float'):
|
|
val = css['float'].lower()
|
|
if val == 'left':
|
|
align = 'head'
|
|
if val == 'right':
|
|
align = 'foot'
|
|
css.pop('float')
|
|
if align != self.current_block.textStyle.attrs['align']:
|
|
self.current_para.append_to(self.current_block)
|
|
self.current_block.append_to(self.current_page)
|
|
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
|
ts.attrs['align'] = align
|
|
try:
|
|
index = self.text_styles.index(ts)
|
|
ts = self.text_styles[index]
|
|
except ValueError:
|
|
self.text_styles.append(ts)
|
|
self.current_block = self.book.create_text_block(
|
|
blockStyle=self.current_block.blockStyle,
|
|
textStyle=ts)
|
|
self.current_para = Paragraph()
|
|
|
|
def add_text(self, tag, css):
|
|
'''
|
|
Add text to the current paragraph taking CSS into account.
|
|
@param tag: Either a BeautifulSoup tag or a string
|
|
@param css:
|
|
@type css:
|
|
'''
|
|
src = tag.string if hasattr(tag, 'string') else tag
|
|
if self.lstrip_toggle:
|
|
src = src.lstrip()
|
|
self.lstrip_toggle = False
|
|
if not src.strip():
|
|
self.current_para.append(' ')
|
|
else:
|
|
self.process_alignment(css)
|
|
try:
|
|
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
|
|
font_delta=self.font_delta))
|
|
except ConversionError, err:
|
|
if self.verbose:
|
|
print >>sys.stderr, err
|
|
|
|
def sanctify_css(self, css):
|
|
""" Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
|
|
css = copy.copy(css)
|
|
for key in css.keys():
|
|
test = key.lower()
|
|
if test.startswith('margin') or test.startswith('text') or \
|
|
'padding' in test or 'border' in test or 'page-break' in test \
|
|
or test.startswith('mso') or test.startswith('background')\
|
|
or test.startswith('line') or test in ['color', 'display', \
|
|
'letter-spacing', 'font-variant', 'position']:
|
|
css.pop(key)
|
|
return css
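    # For instance (illustrative), given
    #   {'font-style': 'italic', 'color': 'red', 'text-indent': '10pt'}
    # only {'font-style': 'italic'} survives: 'color' is explicitly blacklisted
    # and anything starting with 'text' (such as text-indent) is dropped because
    # it cannot be expressed on a Span.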
|
|
|
|
def end_current_para(self):
|
|
'''
|
|
End current paragraph with a paragraph break after it. If the current
|
|
paragraph has no non whitespace text in it do nothing.
|
|
'''
|
|
if not self.current_para.has_text():
|
|
return
|
|
if self.current_para.contents:
|
|
self.current_block.append(self.current_para)
|
|
self.current_para = Paragraph()
|
|
if self.current_block.contents and \
|
|
not isinstance(self.current_block.contents[-1], CR):
|
|
self.current_block.append(CR())
|
|
|
|
def end_current_block(self):
|
|
self.current_para.append_to(self.current_block)
|
|
self.current_block.append_to(self.current_page)
|
|
self.current_para = Paragraph()
|
|
self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
|
|
blockStyle=self.current_block.blockStyle)
|
|
|
|
def process_image(self, path, tag_css, width=None, height=None):
|
|
if self.rotated_images.has_key(path):
|
|
path = self.rotated_images[path].name
|
|
if self.scaled_images.has_key(path):
|
|
path = self.scaled_images[path].name
|
|
|
|
im = PILImage.open(path)
|
|
|
|
if width == None or height == None:
|
|
width, height = im.size
|
|
|
|
def scale_image(width, height):
|
|
pt = PersistentTemporaryFile(suffix='.jpeg')
|
|
im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
|
|
pt.close()
|
|
self.scaled_images[path] = pt
|
|
return pt.name
|
|
|
|
pheight = int(self.current_page.pageStyle.attrs['textheight'])
|
|
pwidth = int(self.current_page.pageStyle.attrs['textwidth'])
|
|
|
|
if not self.disable_autorotation and width > pwidth and width > height:
|
|
pt = PersistentTemporaryFile(suffix='.jpeg')
|
|
im = im.rotate(90)
|
|
im.convert('RGB').save(pt, 'JPEG')
|
|
path = pt.name
|
|
pt.close()
|
|
self.rotated_images[path] = pt
|
|
width, height = im.size
|
|
|
|
|
|
if height > pheight:
|
|
corrf = pheight/(1.*height)
|
|
width, height = floor(corrf*width), pheight-1
|
|
if width > pwidth:
|
|
corrf = (pwidth)/(1.*width)
|
|
width, height = pwidth-1, floor(corrf*height)
|
|
path = scale_image(width, height)
|
|
if width > pwidth:
|
|
corrf = pwidth/(1.*width)
|
|
width, height = pwidth-1, floor(corrf*height)
|
|
if height > pheight:
|
|
corrf = (pheight)/(1.*height)
|
|
width, height = floor(corrf*width), pheight-1
|
|
path = scale_image(width, height)
|
|
width, height = int(width), int(height)
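        # Scaling sketch with a hypothetical 600x800 text area: a 1200x2000 image
        # first hits the height branch (corrf = 800/2000), giving 480x799, which
        # already fits the width, so a single scaled temporary JPEG is produced.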
|
|
|
|
if not self.images.has_key(path):
|
|
self.images[path] = ImageStream(path)
|
|
|
|
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
|
|
xsize=width, ysize=height)
|
|
factor = 720./self.profile.dpi
|
|
|
|
self.process_alignment(tag_css)
|
|
|
|
if max(width, height) <= min(pwidth, pheight)/5.:
|
|
self.current_para.append(Plot(im, xsize=ceil(width*factor),
|
|
ysize=ceil(height*factor)))
|
|
elif height <= int(floor((2/3.)*pheight)):
|
|
pb = self.current_block
|
|
self.end_current_para()
|
|
self.process_alignment(tag_css)
|
|
self.current_para.append(Plot(im, xsize=width*factor,
|
|
ysize=height*factor))
|
|
self.current_block.append(self.current_para)
|
|
self.current_page.append(self.current_block)
|
|
self.current_block = self.book.create_text_block(
|
|
textStyle=pb.textStyle,
|
|
blockStyle=pb.blockStyle)
|
|
self.current_para = Paragraph()
|
|
else:
|
|
self.end_page()
|
|
self.current_page.append(Canvas(width=pwidth,
|
|
height=height))
|
|
left = int(floor((pwidth - width)/2.))
|
|
self.current_page.contents[-1].put_object(
|
|
ImageBlock(self.images[path], xsize=pwidth,
|
|
ysize=pheight, x1=pwidth, y1=pheight,
|
|
blockwidth=pwidth, blockheight=pheight),
|
|
left, 0)
|
|
|
|
def parse_tag(self, tag, parent_css):
|
|
try:
|
|
tagname = tag.name.lower()
|
|
except AttributeError:
|
|
if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
|
|
self.add_text(tag, parent_css)
|
|
return
|
|
tag_css = self.tag_css(tag, parent_css=parent_css)
|
|
try: # Skip element if its display attribute is set to none
|
|
if tag_css['display'].lower() == 'none' or \
|
|
tag_css['visibility'].lower() == 'hidden':
|
|
return
|
|
except KeyError:
|
|
pass
|
|
if 'page-break-before' in tag_css.keys():
|
|
if tag_css['page-break-before'].lower() != 'avoid':
|
|
self.end_page()
|
|
tag_css.pop('page-break-before')
|
|
end_page = False
|
|
if 'page-break-after' in tag_css.keys() and \
|
|
tag_css['page-break-after'].lower() != 'avoid':
|
|
end_page = True
|
|
tag_css.pop('page-break-after')
|
|
if not self.page_break_found and self.page_break.match(tagname):
|
|
if len(self.current_page.contents) > 3:
|
|
self.end_page()
|
|
if self.verbose:
|
|
print 'Forcing page break at', tagname
|
|
if tagname in ["title", "script", "meta", 'del', 'frameset']:
|
|
pass
|
|
elif tagname == 'a' and self.max_link_levels >= 0:
|
|
if tag.has_key('name'):
|
|
if self.anchor_to_previous:
|
|
self.process_children(tag, tag_css)
|
|
for c in self.anchor_to_previous.contents:
|
|
if isinstance(c, (TextBlock, ImageBlock)):
|
|
self.targets[tag['name']] = c
|
|
return
|
|
tb = self.book.create_text_block()
|
|
tb.Paragraph(" ")
|
|
self.anchor_to_previous.append(tb)
|
|
self.targets[tag['name']] = tb
|
|
return
|
|
previous = self.current_block
|
|
self.process_children(tag, tag_css)
|
|
target = None
|
|
if self.current_block == previous:
|
|
self.current_para.append_to(self.current_block)
|
|
self.current_para = Paragraph()
|
|
if self.current_block.has_text():
|
|
target = self.current_block
|
|
else:
|
|
target = BlockSpace()
|
|
self.current_page.append(target)
|
|
else:
|
|
found = False
|
|
for item in self.current_page.contents:
|
|
if item == previous:
|
|
found = True
|
|
continue
|
|
if found:
|
|
target = item
|
|
break
|
|
if target and not isinstance(target, (TextBlock, ImageBlock)):
|
|
if isinstance(target, RuledLine):
|
|
target = self.book.create_text_block(textStyle=self.current_block.textStyle,
|
|
blockStyle=self.current_block.blockStyle)
|
|
target.Paragraph(' ')
|
|
self.current_page.append(target)
|
|
else:
|
|
target = BlockSpace()
|
|
self.current_page.append(target)
|
|
if target == None:
|
|
if self.current_block.has_text():
|
|
target = self.current_block
|
|
else:
|
|
target = BlockSpace()
|
|
self.current_page.append(target)
|
|
self.targets[tag['name']] = target
|
|
elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
|
purl = urlparse(tag['href'])
|
|
path = unquote(purl[2])
|
|
if path and os.path.splitext(path)[1][1:].lower() in \
|
|
['png', 'jpg', 'bmp', 'jpeg']:
|
|
self.process_image(path, tag_css)
|
|
else:
|
|
text = self.get_text(tag)
|
|
if not text:
|
|
text = "Link"
|
|
self.add_text(text, tag_css)
|
|
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
|
|
elif tagname == 'img':
|
|
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
|
|
path = os.path.abspath(unquote(tag['src']))
|
|
width, height = None, None
|
|
try:
|
|
width = int(tag['width'])
|
|
height = int(tag['height'])
|
|
                except (KeyError, ValueError, TypeError): # width/height may be missing or malformed
|
|
pass
|
|
self.process_image(path, tag_css, width, height)
|
|
|
|
else:
|
|
print >>sys.stderr, "Failed to process:", tag
|
|
elif tagname in ['style', 'link']:
|
|
def update_css(ncss):
|
|
for key in ncss.keys():
|
|
if self.css.has_key(key):
|
|
self.css[key].update(ncss[key])
|
|
else:
|
|
self.css[key] = ncss[key]
|
|
ncss = {}
|
|
if tagname == 'style':
|
|
for c in tag.contents:
|
|
if isinstance(c, NavigableString):
|
|
ncss.update(self.parse_css(str(c)))
|
|
elif tag.has_key('type') and tag['type'] == "text/css" \
|
|
and tag.has_key('href'):
|
|
purl = urlparse(tag['href'])
|
|
path = unquote(purl[2])
|
|
try:
|
|
f = open(path, 'rb')
|
|
src = f.read()
|
|
f.close()
|
|
match = self.PAGE_BREAK_PAT.search(src)
|
|
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
|
|
self.page_break_found = True
|
|
ncss = self.parse_css(src)
|
|
except IOError:
|
|
pass
|
|
if ncss:
|
|
update_css(ncss)
|
|
elif tagname == 'pre':
|
|
self.end_current_para()
|
|
self.current_block.append_to(self.current_page)
|
|
attrs = Span.translate_attrs(tag_css, self.font_delta, self.memory)
|
|
ts = self.book.create_text_style(**self.unindented_style.attrs)
|
|
ts.attrs.update(attrs)
|
|
self.current_block = self.book.create_text_block(
|
|
blockStyle=self.current_block.blockStyle,
|
|
textStyle=ts)
|
|
src = ''.join([str(i) for i in tag.contents])
|
|
lines = src.split('\n')
|
|
for line in lines:
|
|
try:
|
|
self.current_para.append(Span(line, tag_css, self.memory))
|
|
self.current_para.CR()
|
|
except ConversionError:
|
|
pass
|
|
self.end_current_block()
|
|
elif tagname in ['ul', 'ol']:
|
|
self.in_ol = 1 if tagname == 'ol' else 0
|
|
self.end_current_block()
|
|
self.current_block = self.book.create_text_block(
|
|
blockStyle=self.current_block.blockStyle,
|
|
textStyle=self.unindented_style)
|
|
self.process_children(tag, tag_css)
|
|
self.in_ol = 0
|
|
self.end_current_block()
|
|
elif tagname == 'li':
|
|
prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
|
|
if self.current_para.has_text():
|
|
self.current_para.append(CR())
|
|
self.current_block.append(self.current_para)
|
|
self.current_para = Paragraph()
|
|
self.current_para.append(Space(xsize=100))
|
|
self.current_para.append(prepend)
|
|
self.process_children(tag, tag_css)
|
|
if self.in_ol:
|
|
self.in_ol += 1
|
|
elif tagname == 'blockquote':
|
|
self.current_para.append_to(self.current_block)
|
|
self.current_block.append_to(self.current_page)
|
|
pb = self.current_block
|
|
self.current_para = Paragraph()
|
|
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
|
ts.attrs['parindent'] = 0
|
|
try:
|
|
index = self.text_styles.index(ts)
|
|
ts = self.text_styles[index]
|
|
except ValueError:
|
|
self.text_styles.append(ts)
|
|
bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
|
|
bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
|
|
60, 20, 20
|
|
try:
|
|
index = self.block_styles.index(bs)
|
|
bs = self.block_styles[index]
|
|
except ValueError:
|
|
self.block_styles.append(bs)
|
|
self.current_block = self.book.create_text_block(
|
|
blockStyle=bs, textStyle=ts)
|
|
self.process_children(tag, tag_css)
|
|
self.current_para.append_to(self.current_block)
|
|
self.current_block.append_to(self.current_page)
|
|
self.current_para = Paragraph()
|
|
self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
|
|
blockStyle=pb.blockStyle)
|
|
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
|
if self.chapter_detection and tagname.startswith('h'):
|
|
src = self.get_text(tag)
|
|
if self.chapter_regex.search(src):
|
|
if self.verbose:
|
|
print 'Detected chapter', src
|
|
self.end_page()
|
|
self.page_break_found = True
|
|
self.end_current_para()
|
|
self.lstrip_toggle = True
|
|
if tag_css.has_key('text-indent'):
|
|
indent = Span.unit_convert(tag_css['text-indent'])
|
|
if not indent:
|
|
indent=0
|
|
else:
|
|
indent = self.book.defaultTextStyle.attrs['parindent']
|
|
if indent != self.current_block.textStyle.attrs['parindent']:
|
|
self.current_block.append_to(self.current_page)
|
|
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
|
|
ts.attrs['parindent'] = indent
|
|
try:
|
|
index = self.text_styles.index(ts)
|
|
ts = self.text_styles[index]
|
|
except ValueError:
|
|
self.text_styles.append(ts)
|
|
self.current_block = self.book.create_text_block(blockStyle=self.current_block.blockStyle,
|
|
textStyle=ts)
|
|
self.process_children(tag, tag_css)
|
|
self.end_current_para()
|
|
if tagname.startswith('h'):
|
|
self.current_block.append(CR())
|
|
elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt']:
|
|
self.process_children(tag, tag_css)
|
|
elif tagname == 'font':
|
|
if tag.has_key('face'):
|
|
tag_css['font-family'] = tag['face']
|
|
self.process_children(tag, tag_css)
|
|
elif tagname in ['br']:
|
|
self.current_para.append(CR())
|
|
elif tagname in ['hr', 'tr']: # tr needed for nested tables
|
|
self.end_current_para()
|
|
self.current_block.append(CR())
|
|
self.end_current_block()
|
|
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
|
|
elif tagname == 'td': # Needed for nested tables
|
|
self.current_para.append(" ")
|
|
self.process_children(tag, tag_css)
|
|
elif tagname == 'table' and not self.in_table:
|
|
tag_css = self.tag_css(tag) # Table should not inherit CSS
|
|
self.process_table(tag, tag_css)
|
|
else:
|
|
self.process_children(tag, tag_css)
|
|
if end_page:
|
|
self.end_page()
|
|
|
|
def process_table(self, tag, tag_css):
|
|
self.end_current_block()
|
|
colpad = 10
|
|
table = Table(self, tag, tag_css, rowpad=10, colpad=10)
|
|
canvases = []
|
|
for block, xpos, ypos, delta in table.blocks(int(self.current_page.pageStyle.attrs['textwidth'])):
|
|
if not block:
|
|
canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+colpad,
|
|
blockrule='block-fixed'))
|
|
else:
|
|
canvases[-1].put_object(block, xpos + int(delta/2.), 0)
|
|
|
|
for canvas in canvases:
|
|
self.current_page.append(canvas)
|
|
self.end_current_block()
|
|
|
|
|
|
def writeto(self, path, lrs=False):
|
|
self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
|
|
|
|
def cleanup(self):
|
|
for _file in self.scaled_images.values() + self.rotated_images.values():
|
|
_file.__del__()
|
|
|
|
def process_file(path, options):
|
|
cwd = os.getcwd()
|
|
dirpath = None
|
|
default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
|
|
try:
|
|
dirpath, path = get_path(path)
|
|
cpath, tpath = '', ''
|
|
isbn = try_opf(path, options)
|
|
if not options.cover and isbn:
|
|
for item in isbn:
|
|
matches = glob.glob(re.sub('-', '', item[1])+'.*')
|
|
for match in matches:
|
|
if match.lower().endswith('.jpeg') or match.lower().endswith('.jpg') or \
|
|
match.lower().endswith('.gif') or match.lower().endswith('.png'):
|
|
options.cover = match
|
|
break
|
|
if options.cover:
|
|
options.cover = os.path.abspath(os.path.expanduser(options.cover))
|
|
cpath = options.cover
|
|
if os.access(options.cover, os.R_OK):
|
|
from libprs500.devices.prs500.driver import PRS500
|
|
im = PILImage.open(os.path.join(cwd, cpath))
|
|
cim = im.resize((options.profile.screen_width,
|
|
options.profile.screen_height),
|
|
PILImage.BICUBIC)
|
|
cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
|
|
cf.close()
|
|
cim.save(cf.name)
|
|
cpath = cf.name
|
|
th = PRS500.THUMBNAIL_HEIGHT
|
|
tim = im.resize((int(0.75*th), th), PILImage.ANTIALIAS)
|
|
tf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
|
|
tf.close()
|
|
tim.save(tf.name)
|
|
tpath = tf.name
|
|
else:
|
|
raise ConversionError, 'Cannot read from: %s'% (options.cover,)
|
|
|
|
|
|
if not options.title:
|
|
options.title = default_title
|
|
title = (options.title, options.title_sort)
|
|
author = (options.author, options.author_sort)
|
|
args = dict(font_delta=options.font_delta, title=title, \
|
|
author=author, sourceencoding='utf8',\
|
|
freetext=options.freetext, category=options.category,
|
|
publisher=options.publisher,
|
|
booksetting=BookSetting(dpi=10*options.profile.dpi,
|
|
screenheight=options.profile.screen_height,
|
|
screenwidth=options.profile.screen_width))
|
|
if tpath:
|
|
args['thumbnail'] = tpath
|
|
header = None
|
|
if options.header:
|
|
header = Paragraph()
|
|
header.append(Bold(options.title))
|
|
header.append(' by ')
|
|
header.append(Italic(options.author+" "))
|
|
book = Book(options, header=header, **args)
|
|
le = re.compile(options.link_exclude) if options.link_exclude else \
|
|
re.compile('$')
|
|
pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
|
|
re.compile('$')
|
|
conv = HTMLConverter(book, path, profile=options.profile,
|
|
font_delta=options.font_delta,
|
|
cover=cpath, max_link_levels=options.link_levels,
|
|
verbose=options.verbose, baen=options.baen,
|
|
chapter_detection=options.chapter_detection,
|
|
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
|
|
link_exclude=re.compile(le), page_break=pb,
|
|
disable_autorotation=options.disable_autorotation)
|
|
conv.process_links()
|
|
oname = options.output
|
|
if not oname:
|
|
suffix = '.lrs' if options.lrs else '.lrf'
|
|
name = os.path.splitext(os.path.basename(path))[0] + suffix
|
|
oname = os.path.join(cwd,name)
|
|
oname = os.path.abspath(os.path.expanduser(oname))
|
|
conv.writeto(oname, lrs=options.lrs)
|
|
print 'Output written to', oname
|
|
conv.cleanup()
|
|
finally:
|
|
os.chdir(cwd)
|
|
if dirpath:
|
|
shutil.rmtree(dirpath, True)
|
|
|
|
def try_opf(path, options):
|
|
try:
|
|
opf = glob.glob('*.opf')[0]
|
|
except IndexError:
|
|
return
|
|
soup = BeautifulStoneSoup(open(opf).read())
|
|
try:
|
|
title = soup.package.metadata.find('dc:title')
|
|
if title and not options.title:
|
|
options.title = title.string
|
|
creators = soup.package.metadata.findAll('dc:creator')
|
|
if options.author == 'Unknown':
|
|
for author in creators:
|
|
role = author.get('role')
|
|
if not role:
|
|
role = author.get('opf:role')
|
|
if role == 'aut':
|
|
options.author = author.string
|
|
fa = author.get('file-as')
|
|
if fa:
|
|
options.author_sort = fa
|
|
if options.publisher == 'Unknown':
|
|
publisher = soup.package.metadata.find('dc:publisher')
|
|
if publisher:
|
|
options.publisher = publisher.string
|
|
if not options.category.strip():
|
|
category = soup.package.metadata.find('dc:type')
|
|
if category:
|
|
options.category = category.string
|
|
isbn = []
|
|
for item in soup.package.metadata.findAll('dc:identifier'):
|
|
scheme = item.get('scheme')
|
|
if not scheme:
|
|
scheme = item.get('opf:scheme')
|
|
isbn.append((scheme, item.string))
|
|
return isbn
|
|
except Exception, err:
|
|
if options.verbose:
|
|
print >>sys.stderr, 'Failed to process opf file', err
|
|
pass
|
|
|
|
|
|
|
|
def parse_options(argv=None, cli=True):
|
|
""" CLI for html -> lrf conversions """
|
|
if not argv:
|
|
argv = sys.argv[1:]
|
|
parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
|
|
|
|
%prog converts mybook.html to mybook.lrf""")
|
|
laf = parser.add_option_group('LOOK AND FEEL')
|
|
laf.add_option('--cover', action='store', dest='cover', default=None, \
|
|
help='Path to file containing image to be used as cover')
|
|
laf.add_option('--font-delta', action='store', type='int', default=0, \
|
|
help="""Increase the font size by 2 * FONT_DELTA pts and """
|
|
'''the line spacing by FONT_DELTA pts. '''
|
|
"""If FONT_DELTA is negative, the font size is decreased.""",
|
|
dest='font_delta')
|
|
laf.add_option('--disable-autorotation', action='store_true', default=False,
|
|
help='Disable autorotation of images.', dest='disable_autorotation')
|
|
link = parser.add_option_group('LINK PROCESSING OPTIONS')
|
|
link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
|
|
dest='link_levels',
|
|
help=r'''The maximum number of levels to recursively process '''
|
|
                    '''links. A value of 0 means that links are not followed. '''
|
|
'''A negative value means that <a> tags are ignored.''')
|
|
link.add_option('--link-exclude', dest='link_exclude', default='$',
|
|
                    help='''A regular expression. <a> tags whose href '''
|
|
'''matches will be ignored. Defaults to %default''')
|
|
chapter = parser.add_option_group('CHAPTER OPTIONS')
|
|
chapter.add_option('--disable-chapter-detection', action='store_false',
|
|
default=True, dest='chapter_detection',
|
|
help='''Prevent html2lrf from automatically inserting page breaks'''
|
|
''' before what it thinks are chapters.''')
|
|
chapter.add_option('--chapter-regex', dest='chapter_regex',
|
|
default='chapter|book|appendix',
|
|
help='''The regular expression used to detect chapter titles.'''
|
|
''' It is searched for in heading tags. Defaults to %default''')
|
|
chapter.add_option('--page-break-before', dest='page_break', default='h[12]',
|
|
help='''If html2lrf does not find any page breaks in the '''
|
|
'''html file and cannot detect chapter headings, it will '''
|
|
'''automatically insert page-breaks before the tags whose '''
|
|
'''names match this regular expression. Defaults to %default. '''
|
|
'''You can disable it by setting the regexp to "$". '''
|
|
'''The purpose of this option is to try to ensure that '''
|
|
'''there are no really long pages as this degrades the page '''
|
|
'''turn performance of the LRF. Thus this option is ignored '''
|
|
'''if the current page has only a few elements.''')
|
|
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
|
|
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
|
|
help='''Preprocess Baen HTML files to improve generated LRF.''')
|
|
options, args = parser.parse_args(args=argv)
|
|
if len(args) != 1:
|
|
if cli:
|
|
parser.print_help()
|
|
raise ConversionError, 'no filename specified'
|
|
if options.output:
|
|
options.output = os.path.abspath(os.path.expanduser(options.output))
|
|
return options, args, parser
|
|
|
|
|
|
def main():
|
|
try:
|
|
options, args, parser = parse_options()
|
|
src = args[0]
|
|
if options.verbose:
|
|
import warnings
|
|
warnings.defaultaction = 'error'
|
|
except:
|
|
sys.exit(1)
|
|
process_file(src, options)
|
|
|
|
def console_query(dirpath, candidate, docs):
|
|
if len(docs) == 1:
|
|
return 0
|
|
try:
|
|
import readline
|
|
except ImportError:
|
|
pass
|
|
i = 0
|
|
for doc in docs:
|
|
prefix = '>' if i == candidate else ''
|
|
print prefix+str(i)+'.\t', doc[0]
|
|
i += 1
|
|
print
|
|
while True:
|
|
try:
|
|
choice = raw_input('Choose file to convert (0-'+str(i-1) + \
|
|
'). Current choice is ['+ str(candidate) + ']:')
|
|
if not choice:
|
|
return candidate
|
|
choice = int(choice)
|
|
if choice < 0 or choice >= i:
|
|
continue
|
|
candidate = choice
|
|
        except (EOFError, KeyboardInterrupt):
|
|
sys.exit()
|
|
except:
|
|
continue
|
|
break
|
|
return candidate
|
|
|
|
|
|
def get_path(path, query=console_query):
|
|
path = os.path.abspath(os.path.expanduser(path))
|
|
ext = os.path.splitext(path)[1][1:].lower()
|
|
if ext in ['htm', 'html', 'xhtml']:
|
|
return None, path
|
|
dirpath = mkdtemp('','html2lrf')
|
|
extract(path, dirpath)
|
|
candidate, docs = None, []
|
|
for root, dirs, files in os.walk(dirpath):
|
|
for name in files:
|
|
ext = os.path.splitext(name)[1][1:].lower()
|
|
if ext not in ['html', 'xhtml', 'htm', 'xhtm']:
|
|
continue
|
|
docs.append((name, root, os.stat(os.path.join(root, name)).st_size))
|
|
if 'toc' in name.lower():
|
|
candidate = name
|
|
docs.sort(key=itemgetter(2))
|
|
if candidate:
|
|
for i in range(len(docs)):
|
|
if docs[i][0] == candidate:
|
|
candidate = i
|
|
break
|
|
else:
|
|
candidate = len(docs) - 1
|
|
if len(docs) == 0:
|
|
raise ConversionError('No suitable files found in archive')
|
|
if len(docs) > 0:
|
|
candidate = query(dirpath, candidate, docs)
|
|
return dirpath, os.path.join(docs[candidate][1], docs[candidate][0])
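    # e.g. (illustrative) get_path('~/book.html') -> (None, '/home/user/book.html'),
    # while an archive such as book.zip is unpacked into a temporary directory and
    # the most plausible HTML file inside it (preferring one whose name contains
    # 'toc', otherwise the largest) is offered for confirmation via `query`.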
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main() |