Rationalized CLI of html2lrf

Fixed link handling to show text rather than href by default
Fine-tuned image handling
Added automatic page breaks when no page-break is found
This commit is contained in:
Kovid Goyal 2007-05-12 21:21:21 +00:00
parent 69f20f634d
commit a29bf8eea0
5 changed files with 194 additions and 123 deletions

View File

@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
suit your distribution. suit your distribution.
""" """
__version__ = "0.3.25" __version__ = "0.3.26"
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -17,7 +17,7 @@ This package contains logic to read and write LRF files. The LRF file format is
At the time fo writing, this package only supports reading and writing LRF meat information. See L{meta}. At the time fo writing, this package only supports reading and writing LRF meat information. See L{meta}.
""" """
from optparse import OptionParser from optparse import OptionParser, OptionValueError
from libprs500.lrf.pylrs.pylrs import Book as _Book from libprs500.lrf.pylrs.pylrs import Book as _Book
from libprs500.lrf.pylrs.pylrs import TextBlock, Header, PutObj, Paragraph, TextStyle from libprs500.lrf.pylrs.pylrs import TextBlock, Header, PutObj, Paragraph, TextStyle
@ -26,31 +26,53 @@ from libprs500 import __version__ as VERSION
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
class PRS500_PROFILE(object):
    """
    Device profile for the Sony PRS 500.

    Groups the display geometry that the converters read when laying out
    a book, so that it is defined in one place instead of being repeated
    as literals. The lengths are presumably in pixels -- TODO confirm.
    """
    screen_width = 600   #: Full width of the screen
    screen_height = 800  #: Full height of the screen
    page_width = 575     #: Width of the text area of a page
    page_height = 747    #: Height of the text area of a page
    dpi = 166            #: Resolution of the display
def profile_from_string(option, opt_str, value, parser):
    """
    optparse callback that turns a profile name into a profile class.

    @param option: The Option instance that triggered the callback
    @param opt_str: The option string seen on the command line (unused)
    @param value: The profile name supplied by the user
    @param parser: The OptionParser; the profile class is stored on
                   C{parser.values} under C{option.dest}
    @raise OptionValueError: If C{value} does not name an implemented profile
    """
    if value == 'prs500':
        setattr(parser.values, option.dest, PRS500_PROFILE)
    else:
        # %-formatting rather than concatenation, so that a non-string value
        # is reported as an OptionValueError instead of raising a TypeError.
        raise OptionValueError('Profile: %s is not implemented' % (value,))
class ConversionError(Exception):
    """Raised when a conversion to LRF cannot proceed, e.g. no input file was given."""
    pass
def option_parser(usage): def option_parser(usage):
parser = OptionParser(usage=usage, version='libprs500 '+VERSION) parser = OptionParser(usage=usage, version='libprs500 '+VERSION,
parser.add_option('--header', action='store_true', default=False, dest='header', epilog='html2lrf created by Kovid Goyal')
metadata = parser.add_option_group('METADATA OPTIONS')
metadata.add_option('--header', action='store_true', default=False, dest='header',
help='Add a header to all the pages with title and author.') help='Add a header to all the pages with title and author.')
parser.add_option("-t", "--title", action="store", type="string", \ metadata.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the title") dest="title", help="Set the title. Default: filename.")
parser.add_option("-a", "--author", action="store", type="string", \ metadata.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown') dest="author", help="Set the author. Default: %default", default='Unknown')
parser.add_option("--freetext", action="store", type="string", \ metadata.add_option("--freetext", action="store", type="string", \
dest="freetext", help="Set the comments in the metadata", default=' ') dest="freetext", help="Set the comments.", default=' ')
parser.add_option("--category", action="store", type="string", \ metadata.add_option("--category", action="store", type="string", \
dest="category", help="Set the category", default=' ') dest="category", help="Set the category", default=' ')
metadata.add_option('--title-sort', action='store', default='', dest='title_sort',
help='Sort key for the title')
metadata.add_option('--author-sort', action='store', default='', dest='author_sort',
help='Sort key for the author')
profiles=['prs500']
parser.add_option('-o', '--output', action='store', default=None, \ parser.add_option('-o', '--output', action='store', default=None, \
help='Output file name. Default is derived from input filename') help='Output file name. Default is derived from input filename')
parser.add_option('--title-sort', action='store', default='', dest='title_sort', parser.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
help='Sort key for the title') choices=profiles, action='callback', callback=profile_from_string,
parser.add_option('--author-sort', action='store', default='', dest='author_sort', help='''Profile of the target device for which this LRF is '''
help='Sort key for the author') '''being generated. Default: ''' + profiles[0] + '''
Supported profiles: '''+', '.join(profiles))
return parser return parser
def Book(font_delta=0, header=None, **settings): def Book(font_delta=0, header=None, profile=PRS500_PROFILE, **settings):
ps = dict(textwidth=575, textheight=747) ps = dict(textwidth=profile.page_width,
textheight=profile.page_height)
if header: if header:
hdr = Header() hdr = Header()
hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60)) hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
@ -62,5 +84,4 @@ def Book(font_delta=0, header=None, **settings):
ps['topmargin'] = 10 ps['topmargin'] = 10
return _Book(textstyledefault=dict(fontsize=100+font_delta*20, return _Book(textstyledefault=dict(fontsize=100+font_delta*20,
parindent=80, linespace=12), \ parindent=80, linespace=12), \
pagestyledefault=ps, \ pagestyledefault=ps, **settings)
**settings)

View File

@ -39,7 +39,7 @@ from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBl
Bold, Space, Plot, Image, BlockSpace,\ Bold, Space, Plot, Image, BlockSpace,\
RuledLine, BookSetting RuledLine, BookSetting
from libprs500.lrf.pylrs.pylrs import Span as _Span from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError, option_parser, Book from libprs500.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
from libprs500 import extract, filename_to_utf8 from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile from libprs500.ptempfile import PersistentTemporaryFile
@ -158,7 +158,7 @@ class Span(_Span):
ans = font_weight(val) ans = font_weight(val)
if ans: if ans:
t['fontweight'] = ans t['fontweight'] = ans
if int(ans) > 1400: if int(ans) > 140:
t['wordspace'] = '50' t['wordspace'] = '50'
elif key.startswith("margin"): elif key.startswith("margin"):
if key == "margin": if key == "margin":
@ -214,8 +214,9 @@ class Span(_Span):
class HTMLConverter(object): class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements # Fix <a /> elements
MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"), MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>")] lambda match: match.group(1)+"></a>")]
@ -234,12 +235,14 @@ class HTMLConverter(object):
processed_files = {} #: Files that have been processed processed_files = {} #: Files that have been processed
def __init__(self, book, path, dpi=166, width=575, height=747, def __init__(self, book, path,
font_delta=0, verbose=False, cover=None, font_delta=0, verbose=False, cover=None,
max_link_levels=sys.maxint, link_level=0, max_link_levels=sys.maxint, link_level=0,
is_root=True, baen=False, chapter_detection=True, is_root=True, baen=False, chapter_detection=True,
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE), chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
link_exclude=re.compile('$')): link_exclude=re.compile('$'),
page_break=re.compile('h[12]', re.IGNORECASE),
profile=PRS500_PROFILE, hide_broken_links=False):
''' '''
Convert HTML file at C{path} and add it to C{book}. After creating Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and the object, you must call L{self.process_links} on it to create the links and
@ -270,6 +273,11 @@ class HTMLConverter(object):
@type chapter_detection: C{bool} @type chapter_detection: C{bool}
@param chapter_regex: The compiled regular expression used to search for chapter titles @param chapter_regex: The compiled regular expression used to search for chapter titles
@param link_exclude: Compiled regex. Matching hrefs are ignored. @param link_exclude: Compiled regex. Matching hrefs are ignored.
@param page_break: Compiled regex. Page breaks are inserted before matching
tags if no page-breaks are found and no chapter headings
are detected.
@param profile: Defines the geometry of the display device
@param hide_broken_links: Don't display broken links
''' '''
# Defaults for various formatting tags # Defaults for various formatting tags
self.css = dict( self.css = dict(
@ -285,10 +293,8 @@ class HTMLConverter(object):
small = {'font-size' :'small'}, small = {'font-size' :'small'},
pre = {'font-family' :'monospace' }, pre = {'font-family' :'monospace' },
center = {'text-align' : 'center'} center = {'text-align' : 'center'}
) )
self.page_width = width #: The width of the page self.profile = profile #: Defines the geometry of the display device
self.page_height = height #: The height of the page
self.dpi = dpi #: The DPI of the intended display device
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
self.link_exclude = link_exclude #: Ignore matching hrefs self.link_exclude = link_exclude #: Ignore matching hrefs
@ -298,6 +304,7 @@ class HTMLConverter(object):
self.blockquote_style = book.create_block_style(sidemargin=60, self.blockquote_style = book.create_block_style(sidemargin=60,
topskip=20, footskip=20) topskip=20, footskip=20)
self.unindented_style = book.create_text_style(parindent=0) self.unindented_style = book.create_text_style(parindent=0)
self.page_break = page_break #: Regex controlling forced page-break behavior
self.text_styles = []#: Keep track of already used textstyles self.text_styles = []#: Keep track of already used textstyles
self.block_styles = []#: Keep track of already used blockstyles self.block_styles = []#: Keep track of already used blockstyles
self.images = {} #: Images referenced in the HTML document self.images = {} #: Images referenced in the HTML document
@ -311,7 +318,8 @@ class HTMLConverter(object):
self.in_ol = False #: Flag indicating we're in an <ol> element self.in_ol = False #: Flag indicating we're in an <ol> element
self.book = book #: The Book object representing a BBeB book self.book = book #: The Book object representing a BBeB book
self.is_root = is_root #: Are we converting the root HTML file self.is_root = is_root #: Are we converting the root HTML file
self.lstrip_toggle = False #; If true the next add_text call will do an lstrip self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
self.hide_broken_links = hide_broken_links
path = os.path.abspath(path) path = os.path.abspath(path)
os.chdir(os.path.dirname(path)) os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path) self.file_name = os.path.basename(path)
@ -331,7 +339,11 @@ class HTMLConverter(object):
self.verbose = verbose self.verbose = verbose
self.current_page = None self.current_page = None
self.current_para = None self.current_para = None
self.current_style = {} self.current_style = {}
self.page_break_found = False
match = self.PAGE_BREAK_PAT.search(unicode(self.soup))
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
self.parse_file() self.parse_file()
HTMLConverter.processed_files[path] = self HTMLConverter.processed_files[path] = self
print 'done' print 'done'
@ -440,7 +452,8 @@ class HTMLConverter(object):
def get_text(self, tag): def get_text(self, tag):
css = self.tag_css(tag) css = self.tag_css(tag)
if css.has_key('display') and css['display'].lower() == 'none': if (css.has_key('display') and css['display'].lower() == 'none') or \
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
return '' return ''
text = '' text = ''
for c in tag.contents: for c in tag.contents:
@ -485,22 +498,26 @@ class HTMLConverter(object):
page.contents.remove(bs) page.contents.remove(bs)
return ans return ans
cwd = os.getcwd() cwd = os.getcwd()
for link in self.links: for link in self.links:
para, tag = link.para, link.tag
text = self.get_text(tag)
if self.hide_broken_links:
para.contents = []
para.append(_Span(text=text))
purl = urlparse(link.tag['href']) purl = urlparse(link.tag['href'])
if purl[1]: # Not a link to a file on the local filesystem if purl[1]: # Not a link to a file on the local filesystem
continue continue
path, fragment = unquote(purl[2]), purl[5] path, fragment = unquote(purl[2]), purl[5]
para, tag = link.para, link.tag
if not path or os.path.basename(path) == self.file_name: if not path or os.path.basename(path) == self.file_name:
if fragment in self.targets.keys(): if fragment in self.targets.keys():
tb = get_target_block(fragment, self.targets) tb = get_target_block(fragment, self.targets)
if self.is_root: if self.is_root:
self.book.addTocEntry(self.get_text(tag), tb) self.book.addTocEntry(text, tb)
sys.stdout.flush() sys.stdout.flush()
jb = JumpButton(tb) jb = JumpButton(tb)
self.book.append(jb) self.book.append(jb)
cb = CharButton(jb, text=self.get_text(tag)) cb = CharButton(jb, text=text)
para.contents = [] para.contents = []
para.append(cb) para.append(cb)
elif self.link_level < self.max_link_levels: elif self.link_level < self.max_link_levels:
@ -515,15 +532,16 @@ class HTMLConverter(object):
if not path in HTMLConverter.processed_files.keys(): if not path in HTMLConverter.processed_files.keys():
try: try:
self.files[path] = HTMLConverter(self.book, path, self.files[path] = HTMLConverter(self.book, path,
width=self.page_width, height=self.page_height, profile=self.profile,
dpi=self.dpi,
font_delta=self.font_delta, verbose=self.verbose, font_delta=self.font_delta, verbose=self.verbose,
link_level=self.link_level+1, link_level=self.link_level+1,
max_link_levels=self.max_link_levels, max_link_levels=self.max_link_levels,
is_root = False, baen=self.baen, is_root = False, baen=self.baen,
chapter_detection=self.chapter_detection, chapter_detection=self.chapter_detection,
chapter_regex=self.chapter_regex, chapter_regex=self.chapter_regex,
link_exclude=self.link_exclude) link_exclude=self.link_exclude,
page_break=self.page_break,
hide_broken_links=self.hide_broken_links)
HTMLConverter.processed_files[path] = self.files[path] HTMLConverter.processed_files[path] = self.files[path]
except Exception: except Exception:
print >>sys.stderr, 'Unable to process', path print >>sys.stderr, 'Unable to process', path
@ -540,10 +558,10 @@ class HTMLConverter(object):
else: else:
tb = conv.top tb = conv.top
if self.is_root: if self.is_root:
self.book.addTocEntry(self.get_text(tag), tb) self.book.addTocEntry(text, tb)
jb = JumpButton(tb) jb = JumpButton(tb)
self.book.append(jb) self.book.append(jb)
cb = CharButton(jb, text=self.get_text(tag)) cb = CharButton(jb, text=text)
para.contents = [] para.contents = []
para.append(cb) para.append(cb)
@ -574,10 +592,12 @@ class HTMLConverter(object):
def add_image_page(self, path): def add_image_page(self, path):
if os.access(path, os.R_OK): if os.access(path, os.R_OK):
self.end_page() self.end_page()
page = self.book.create_page(evensidemargin=0, oddsidemargin=0, page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
topmargin=0, textwidth=self.page_width, topmargin=0, textwidth=self.profile.screen_width,
textheight=self.page_height) headheight=0, headsep=0, footspace=0,
footheight=0,
textheight=self.profile.screen_height)
if not self.images.has_key(path): if not self.images.has_key(path):
self.images[path] = ImageStream(path) self.images[path] = ImageStream(path)
page.append(ImageBlock(self.images[path])) page.append(ImageBlock(self.images[path]))
@ -651,11 +671,8 @@ class HTMLConverter(object):
'padding' in test or 'border' in test or 'page-break' in test \ 'padding' in test or 'border' in test or 'page-break' in test \
or test.startswith('mso') or test.startswith('background')\ or test.startswith('mso') or test.startswith('background')\
or test.startswith('line') or test in ['color', 'display', \ or test.startswith('line') or test in ['color', 'display', \
'letter-spacing', 'letter-spacing', 'font-variant']:
'font-variant']: css.pop(key)
css.pop(key)
if self.verbose:
print 'Ignoring CSS key:', key
return css return css
def end_current_para(self): def end_current_para(self):
@ -688,7 +705,8 @@ class HTMLConverter(object):
return return
tag_css = self.tag_css(tag, parent_css=parent_css) tag_css = self.tag_css(tag, parent_css=parent_css)
try: # Skip element if its display attribute is set to none try: # Skip element if its display attribute is set to none
if tag_css['display'].lower() == 'none': if tag_css['display'].lower() == 'none' or \
tag_css['visibility'].lower() == 'hidden':
return return
except KeyError: except KeyError:
pass pass
@ -701,7 +719,11 @@ class HTMLConverter(object):
tag_css['page-break-after'].lower() != 'avoid': tag_css['page-break-after'].lower() != 'avoid':
end_page = True end_page = True
tag_css.pop('page-break-after') tag_css.pop('page-break-after')
if not self.page_break_found and self.page_break.match(tagname):
if len(self.current_page.contents) > 3:
self.end_page()
if self.verbose:
print 'Forcing page break at', tagname
if tagname in ["title", "script", "meta", 'del', 'frameset']: if tagname in ["title", "script", "meta", 'del', 'frameset']:
pass pass
elif tagname == 'a' and self.max_link_levels >= 0: elif tagname == 'a' and self.max_link_levels >= 0:
@ -744,12 +766,12 @@ class HTMLConverter(object):
self.targets[tag['name']] = target self.targets[tag['name']] = target
elif tag.has_key('href') and not self.link_exclude.match(tag['href']): elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
purl = urlparse(tag['href']) purl = urlparse(tag['href'])
path = purl[2] path = unquote(purl[2])
if path and os.path.splitext(path)[1][1:].lower() in \ if path and os.path.splitext(path)[1][1:].lower() in \
['png', 'jpg', 'bmp', 'jpeg']: ['png', 'jpg', 'bmp', 'jpeg']:
self.add_image_page(path) self.add_image_page(path)
else: else:
self.add_text('Link: '+tag['href'], tag_css) self.add_text('Link: ' + tag['href'], tag_css)
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag)) self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
elif tagname == 'img': elif tagname == 'img':
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK): if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
@ -772,31 +794,32 @@ class HTMLConverter(object):
return pt.name return pt.name
if height > self.page_height: if height > self.profile.page_height:
corrf = self.page_height/(1.*height) corrf = self.profile.page_height/(1.*height)
width, height = floor(corrf*width), self.page_height-1 width, height = floor(corrf*width), self.profile.page_height-1
if width > self.page_width: if width > self.profile.page_width:
corrf = (self.page_width)/(1.*width) corrf = (self.profile.page_width)/(1.*width)
width, height = self.page_width-1, floor(corrf*height) width, height = self.profile.page_width-1, floor(corrf*height)
path = scale_image(width, height) path = scale_image(width, height)
if width > self.page_width: if width > self.profile.page_width:
corrf = self.page_width/(1.*width) corrf = self.profile.page_width/(1.*width)
width, height = self.page_width-1, floor(corrf*height) width, height = self.profile.page_width-1, floor(corrf*height)
if height > self.page_height: if height > self.profile.page_height:
corrf = (self.page_height)/(1.*height) corrf = (self.profile.page_height)/(1.*height)
width, height = floor(corrf*width), self.page_height-1 width, height = floor(corrf*width), self.profile.page_height-1
path = scale_image(width, height) path = scale_image(width, height)
width, height = int(width), int(height) width, height = int(width), int(height)
if not self.images.has_key(path): if not self.images.has_key(path):
self.images[path] = ImageStream(path) self.images[path] = ImageStream(path)
factor = 720./self.dpi factor = 720./self.profile.dpi
if max(width, height) <= min(self.page_width, self.page_height)/5.: if max(width, height) <= min(self.profile.page_width,
self.profile.page_height)/5.:
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\ im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
xsize=width, ysize=height) xsize=width, ysize=height)
self.current_para.append(Plot(im, xsize=ceil(width*factor), self.current_para.append(Plot(im, xsize=ceil(width*factor),
ysize=ceil(height*factor))) ysize=ceil(height*factor)))
elif height <= self.page_height/1.5: else:
pb = self.current_block pb = self.current_block
self.end_current_para() self.end_current_para()
self.process_alignment(tag_css) self.process_alignment(tag_css)
@ -809,16 +832,7 @@ class HTMLConverter(object):
self.current_block = self.book.create_text_block( self.current_block = self.book.create_text_block(
textStyle=pb.textStyle, textStyle=pb.textStyle,
blockStyle=pb.blockStyle) blockStyle=pb.blockStyle)
self.current_para = Paragraph() self.current_para = Paragraph()
else:
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
self.current_para = Paragraph()
self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
im = ImageBlock(self.images[path], x1=width, y1=height,
xsize=width, ysize=height)
self.current_page.append(im)
else: else:
print >>sys.stderr, "Failed to process:", tag print >>sys.stderr, "Failed to process:", tag
elif tagname in ['style', 'link']: elif tagname in ['style', 'link']:
@ -835,14 +849,16 @@ class HTMLConverter(object):
ncss.update(self.parse_css(str(c))) ncss.update(self.parse_css(str(c)))
elif tag.has_key('type') and tag['type'] == "text/css" \ elif tag.has_key('type') and tag['type'] == "text/css" \
and tag.has_key('href'): and tag.has_key('href'):
url = tag['href'] purl = urlparse(tag['href'])
path = unquote(purl[2])
try: try:
if url.startswith('http://'): f = open(path, 'rb')
f = urlopen(url) src = f.read()
else:
f = open(unquote(url))
ncss = self.parse_css(f.read())
f.close() f.close()
match = self.PAGE_BREAK_PAT.search(src)
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
ncss = self.parse_css(f.read())
except IOError: except IOError:
pass pass
if ncss: if ncss:
@ -917,6 +933,7 @@ class HTMLConverter(object):
if self.verbose: if self.verbose:
print 'Detected chapter', src print 'Detected chapter', src
self.end_page() self.end_page()
self.page_break_found = True
self.end_current_para() self.end_current_para()
self.lstrip_toggle = True self.lstrip_toggle = True
if tag_css.has_key('text-indent'): if tag_css.has_key('text-indent'):
@ -953,7 +970,7 @@ class HTMLConverter(object):
self.end_current_para() self.end_current_para()
self.current_block.append(CR()) self.current_block.append(CR())
self.end_current_block() self.end_current_block()
self.current_page.RuledLine(linelength=self.page_width) self.current_page.RuledLine(linelength=self.profile.page_width)
else: else:
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
@ -967,18 +984,21 @@ class HTMLConverter(object):
for _file in self.scaled_images.values(): for _file in self.scaled_images.values():
_file.__del__() _file.__del__()
def process_file(path, options): def process_file(path, options):
cwd = os.getcwd() cwd = os.getcwd()
dirpath = None dirpath = None
try: try:
dirpath, path = get_path(path) dirpath, path = get_path(path)
cpath, tpath = options.cover, '' cpath, tpath = '', ''
if options.cover and os.access(options.cover, os.R_OK): if options.cover:
try: options.cover = os.path.abspath(os.path.expanduser(options.cover))
cpath = options.cover
if os.access(options.cover, os.R_OK):
from libprs500.prs500 import PRS500 from libprs500.prs500 import PRS500
im = PILImage.open(os.path.join(cwd, cpath)) im = PILImage.open(os.path.join(cwd, cpath))
cim = im.resize((600, 800), PILImage.BICUBIC) cim = im.resize((options.profile.screen_width,
options.profile.screen_height),
PILImage.BICUBIC)
cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg") cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
cf.close() cf.close()
cim.save(cf.name) cim.save(cf.name)
@ -989,17 +1009,16 @@ def process_file(path, options):
tf.close() tf.close()
tim.save(tf.name) tim.save(tf.name)
tpath = tf.name tpath = tf.name
except ImportError: else:
print >>sys.stderr, "WARNING: You don't have PIL installed. ", raise ConversionError, 'Cannot read from: %s', (options.cover,)
'Cover and thumbnails wont work'
pass
title = (options.title, options.title_sort) title = (options.title, options.title_sort)
author = (options.author, options.author_sort) author = (options.author, options.author_sort)
args = dict(font_delta=options.font_delta, title=title, \ args = dict(font_delta=options.font_delta, title=title, \
author=author, sourceencoding='utf8',\ author=author, sourceencoding='utf8',\
freetext=options.freetext, category=options.category, freetext=options.freetext, category=options.category,
booksetting=BookSetting(dpi=10*options.dpi,screenheight=800, booksetting=BookSetting(dpi=10*options.profile.dpi,
screenwidth=600)) screenheight=options.profile.screen_height,
screenwidth=options.profile.screen_width))
if tpath: if tpath:
args['thumbnail'] = tpath args['thumbnail'] = tpath
header = None header = None
@ -1011,13 +1030,16 @@ def process_file(path, options):
book = Book(header=header, **args) book = Book(header=header, **args)
le = re.compile(options.link_exclude) if options.link_exclude else \ le = re.compile(options.link_exclude) if options.link_exclude else \
re.compile('$') re.compile('$')
conv = HTMLConverter(book, path, dpi=options.dpi, pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
re.compile('$')
conv = HTMLConverter(book, path, profile=options.profile,
font_delta=options.font_delta, font_delta=options.font_delta,
cover=cpath, max_link_levels=options.link_levels, cover=cpath, max_link_levels=options.link_levels,
baen=options.baen, verbose=options.verbose, baen=options.baen,
chapter_detection=options.chapter_detection, chapter_detection=options.chapter_detection,
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE), chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
link_exclude=re.compile(le)) link_exclude=re.compile(le), page_break=pb,
hide_broken_links=not options.show_broken_links)
conv.process_links() conv.process_links()
oname = options.output oname = options.output
if not oname: if not oname:
@ -1033,47 +1055,73 @@ def process_file(path, options):
if dirpath: if dirpath:
shutil.rmtree(dirpath, True) shutil.rmtree(dirpath, True)
def main(): def parse_options(argv=None, cli=True):
""" CLI for html -> lrf conversions """ """ CLI for html -> lrf conversions """
if not argv:
argv = sys.argv[1:]
parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip] parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
%prog converts mybook.html to mybook.lrf""") %prog converts mybook.html to mybook.lrf""")
parser.add_option('--cover', action='store', dest='cover', default=None, \ parser.add_option('--cover', action='store', dest='cover', default=None, \
help='Path to file containing image to be used as cover') help='Path to file containing image to be used as cover')
parser.add_option('--lrs', action='store_true', dest='lrs', \
help='Convert to LRS', default=False)
parser.add_option('--font-delta', action='store', type='int', default=0, \ parser.add_option('--font-delta', action='store', type='int', default=0, \
help="""Increase the font size by 2 * FONT_DELTA pts. help="""Increase the font size by 2 * FONT_DELTA pts.
If FONT_DELTA is negative, the font size is decreased.""", If FONT_DELTA is negative, the font size is decreased.""",
dest='font_delta') dest='font_delta')
parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \ link = parser.add_option_group('LINK PROCESSING OPTIONS')
link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
dest='link_levels', dest='link_levels',
help=r'''The maximum number of levels to recursively process ''' help=r'''The maximum number of levels to recursively process '''
'''links. A value of 0 means thats links are not followed. ''' '''links. A value of 0 means thats links are not followed. '''
'''A negative value means that <a> tags are ignored.''') '''A negative value means that <a> tags are ignored.''')
parser.add_option('--baen', action='store_true', default=False, dest='baen', link.add_option('--link-exclude', dest='link_exclude', default='$',
help='''Preprocess Baen HTML files to improve generated LRF.''') help='''A regular expression. <a> tags whoose href '''
parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi', '''matches will be ignored. Defaults to %default''')
help='''The DPI of the target device. Default is 166 for the chapter = parser.add_option_group('CHAPTER OPTIONS')
Sony PRS 500''') chapter.add_option('--disable-chapter-detection', action='store_false',
parser.add_option('--disable-chapter-detection', action='store_false',
default=True, dest='chapter_detection', default=True, dest='chapter_detection',
help='''Prevent html2lrf from automatically inserting page breaks''' help='''Prevent html2lrf from automatically inserting page breaks'''
'''before what it thinks are chapters.''') '''before what it thinks are chapters.''')
parser.add_option('--chapter-regex', dest='chapter_regex', chapter.add_option('--chapter-regex', dest='chapter_regex',
default='chapter|book|appendix', default='chapter|book|appendix',
help='''The regular expression used to detect chapter titles.''' help='''The regular expression used to detect chapter titles.'''
'''It is searched for in heading tags. Default is chapter|book|appendix''') '''It is searched for in heading tags. Defaults to %default''')
parser.add_option('--link-exclude', dest='link_exclude', default='', chapter.add_option('--page-break-before', dest='page_break', default='h[12]',
help='''A regular expression. <a> tags whoose href ''' help='''If html2lrf does not find any page breaks in the '''
'''matches will be ignored''') '''html file and cannot detect chapter headings, it will '''
options, args = parser.parse_args() '''automatically insert page-breaks before the tags whose '''
'''names match this regular expression. Defaults to %default. '''
'''You can disable it by setting the regexp to "$". '''
'''The purpose of this option is to try to ensure that '''
'''there are no really long pages as this degrades the page '''
'''turn performance of the LRF. Thus this option is ignored '''
'''if the current page has only a few elements.''')
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
help='''Preprocess Baen HTML files to improve generated LRF.''')
debug = parser.add_option_group('DEBUG OPTIONS')
debug.add_option('--verbose', dest='verbose', action='store_true', default=False,
help='''Be verbose while processing''')
debug.add_option('--lrs', action='store_true', dest='lrs', \
help='Convert to LRS', default=False)
debug.add_option('--show-broken-links', dest='show_broken_links', action='store_true',
default=False, help='''Show the href of broken links in generated LRF''')
options, args = parser.parse_args(args=argv)
if len(args) != 1: if len(args) != 1:
parser.print_help() if cli:
sys.exit(1) parser.print_help()
src = args[0] raise ConversionError, 'no filename specified'
if options.title == None: if options.title == None:
options.title = filename_to_utf8(os.path.splitext(os.path.basename(src))[0]) options.title = filename_to_utf8(os.path.splitext(os.path.basename(args[0]))[0])
return options, args
def main():
    """
    Entry point of the html2lrf command line tool.

    Parses the command line and converts the single input file named on it.
    Exits with status 1 when the command line does not name exactly one
    input file. Exits requested by the option parser itself (for example
    for C{--help}) are left untouched.
    """
    try:
        options, args = parse_options()
    except ConversionError:
        # parse_options has already printed the usage text in CLI mode.
        # Only this error is caught: a bare except would also swallow
        # SystemExit and hide programming errors behind a silent exit.
        sys.exit(1)
    src = args[0]
    process_file(src, options)
def console_query(dirpath, candidate, docs): def console_query(dirpath, candidate, docs):

View File

@ -70,7 +70,7 @@
<h2><a name='images'>Inline images</a></h2> <h2><a name='images'>Inline images</a></h2>
<p> <p>
Here I demonstrate the use of inline images in the midst of text. Here is a small image <img src='small.jpg' /> embedded in a sentence. Now we have a slightly larger image that is automatically put in its own block <img style="text-align:center" src='medium.jpg' /> and finally we have a large image which is automatically placed on a page by itself and prevented from being autoscaled when the user changes from S to M to L. Try changing sizes and see how the different embedding styles behave. <img src='large.jpg' /> Here I demonstrate the use of inline images in the midst of text. Here is a small image <img src='small.jpg' /> embedded in a sentence. Now we have a slightly larger image that is automatically put in its own block <img style="text-align:center" src='medium.jpg' /> and finally we have a large image which wont fit on this page. Try changing sizes from S to M to L and see how the images behave. <img src='large.jpg' />
</p> </p>
<p class='toc'> <p class='toc'>
<hr /> <hr />

View File

@ -69,7 +69,9 @@ def convert_txt(path, options):
book = Book(header=header, title=title, author=author, \ book = Book(header=header, title=title, author=author, \
sourceencoding=options.encoding, freetext=options.freetext, \ sourceencoding=options.encoding, freetext=options.freetext, \
category=options.category, booksetting=BookSetting category=options.category, booksetting=BookSetting
(dpi=10*options.dpi,screenheight=800, screenwidth=600)) (dpi=10*options.profile.dpi,
screenheight=options.profile.screen_height,
screenwidth=options.profile.screen_height))
buffer = '' buffer = ''
pg = book.create_page() pg = book.create_page()
block = book.create_text_block() block = book.create_text_block()