mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Rationalized CLI of html2lrf
Fixed link handling to show text rather than href by default. Fine-tuned image handling. Added automatic page breaks if page-break not found.
This commit is contained in:
parent
69f20f634d
commit
a29bf8eea0
@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
|
|||||||
suit your distribution.
|
suit your distribution.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__version__ = "0.3.25"
|
__version__ = "0.3.26"
|
||||||
__docformat__ = "epytext"
|
__docformat__ = "epytext"
|
||||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ This package contains logic to read and write LRF files. The LRF file format is
|
|||||||
At the time of writing, this package only supports reading and writing LRF meta information. See L{meta}.
|
At the time of writing, this package only supports reading and writing LRF meta information. See L{meta}.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser, OptionValueError
|
||||||
|
|
||||||
from libprs500.lrf.pylrs.pylrs import Book as _Book
|
from libprs500.lrf.pylrs.pylrs import Book as _Book
|
||||||
from libprs500.lrf.pylrs.pylrs import TextBlock, Header, PutObj, Paragraph, TextStyle
|
from libprs500.lrf.pylrs.pylrs import TextBlock, Header, PutObj, Paragraph, TextStyle
|
||||||
@ -26,31 +26,53 @@ from libprs500 import __version__ as VERSION
|
|||||||
__docformat__ = "epytext"
|
__docformat__ = "epytext"
|
||||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||||
|
|
||||||
|
class PRS500_PROFILE(object):
    '''
    Geometry constants for the C{prs500} device profile.

    The class is used as a plain namespace: callers read the attributes
    directly off the class and never instantiate it.
    '''
    #: Full screen size; the cover image is resized to these dimensions
    screen_width, screen_height = 600, 800
    #: Size of the text area; used as textwidth/textheight of a page and as
    #: the upper bound when scaling inline images
    page_width, page_height = 575, 747
    #: Resolution of the display device
    dpi = 166
|
||||||
|
|
||||||
|
def profile_from_string(option, opt_str, value, parser):
    '''
    C{optparse} callback that turns a profile name into a profile class.

    @param option: The option that triggered the callback; its C{dest}
                   names the attribute that receives the profile
    @param opt_str: The option string as seen on the command line (unused)
    @param value: The profile name supplied by the user
    @param parser: The parser being run; the profile is stored on
                   C{parser.values}
    @raise OptionValueError: If C{value} is anything other than C{'prs500'}
    '''
    # Guard clause: only a single profile is known at present.
    if value != 'prs500':
        raise OptionValueError('Profile: '+value+' is not implemented')
    setattr(parser.values, option.dest, PRS500_PROFILE)
|
||||||
|
|
||||||
class ConversionError(Exception):
    '''Error raised when a conversion cannot proceed, e.g. an unreadable cover file.'''
|
||||||
|
|
||||||
def option_parser(usage):
    '''
    Build the base command line parser used for LRF conversion.

    The returned parser has a C{METADATA OPTIONS} group (header, title,
    author, comments, category and the two sort keys) together with the
    general C{--output} and C{--profile} options. Callers can add further
    options of their own to the returned parser.

    @param usage: Usage string handed to C{OptionParser}
    @return: The configured C{OptionParser} instance
    '''
    profiles = ['prs500']

    # (flags, keyword arguments) for every metadata option, in display order.
    metadata_specs = (
        (('--header',),
         dict(action='store_true', default=False, dest='header',
              help='Add a header to all the pages with title and author.')),
        (('-t', '--title'),
         dict(action='store', type='string', dest='title',
              help='Set the title. Default: filename.')),
        (('-a', '--author'),
         dict(action='store', type='string', dest='author',
              help='Set the author. Default: %default', default='Unknown')),
        (('--freetext',),
         dict(action='store', type='string', dest='freetext',
              help='Set the comments.', default=' ')),
        (('--category',),
         dict(action='store', type='string', dest='category',
              help='Set the category', default=' ')),
        (('--title-sort',),
         dict(action='store', default='', dest='title_sort',
              help='Sort key for the title')),
        (('--author-sort',),
         dict(action='store', default='', dest='author_sort',
              help='Sort key for the author')),
    )

    # NOTE(review): the diff view this was recovered from drops leading
    # whitespace, so any indentation that followed the newline inside the
    # original multi-line literal is not visible here -- confirm against
    # the repository.
    profile_help = ('Profile of the target device for which this LRF is '
                    'being generated. Default: ' + profiles[0] +
                    '\nSupported profiles: ' + ', '.join(profiles))

    parser = OptionParser(usage=usage, version='libprs500 '+VERSION,
                          epilog='html2lrf created by Kovid Goyal')
    group = parser.add_option_group('METADATA OPTIONS')
    for flags, settings in metadata_specs:
        group.add_option(*flags, **settings)

    parser.add_option('-o', '--output', action='store', default=None,
                      help='Output file name. Default is derived from input filename')
    # The default is already the profile class; when the option is given,
    # the callback stores the class that corresponds to the chosen name.
    parser.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile',
                      type='choice', choices=profiles, action='callback',
                      callback=profile_from_string, help=profile_help)
    return parser
|
||||||
|
|
||||||
def Book(font_delta=0, header=None, **settings):
|
def Book(font_delta=0, header=None, profile=PRS500_PROFILE, **settings):
|
||||||
ps = dict(textwidth=575, textheight=747)
|
ps = dict(textwidth=profile.page_width,
|
||||||
|
textheight=profile.page_height)
|
||||||
if header:
|
if header:
|
||||||
hdr = Header()
|
hdr = Header()
|
||||||
hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
|
hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
|
||||||
@ -62,5 +84,4 @@ def Book(font_delta=0, header=None, **settings):
|
|||||||
ps['topmargin'] = 10
|
ps['topmargin'] = 10
|
||||||
return _Book(textstyledefault=dict(fontsize=100+font_delta*20,
|
return _Book(textstyledefault=dict(fontsize=100+font_delta*20,
|
||||||
parindent=80, linespace=12), \
|
parindent=80, linespace=12), \
|
||||||
pagestyledefault=ps, \
|
pagestyledefault=ps, **settings)
|
||||||
**settings)
|
|
@ -39,7 +39,7 @@ from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBl
|
|||||||
Bold, Space, Plot, Image, BlockSpace,\
|
Bold, Space, Plot, Image, BlockSpace,\
|
||||||
RuledLine, BookSetting
|
RuledLine, BookSetting
|
||||||
from libprs500.lrf.pylrs.pylrs import Span as _Span
|
from libprs500.lrf.pylrs.pylrs import Span as _Span
|
||||||
from libprs500.lrf import ConversionError, option_parser, Book
|
from libprs500.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
|
||||||
from libprs500 import extract, filename_to_utf8
|
from libprs500 import extract, filename_to_utf8
|
||||||
from libprs500.ptempfile import PersistentTemporaryFile
|
from libprs500.ptempfile import PersistentTemporaryFile
|
||||||
|
|
||||||
@ -158,7 +158,7 @@ class Span(_Span):
|
|||||||
ans = font_weight(val)
|
ans = font_weight(val)
|
||||||
if ans:
|
if ans:
|
||||||
t['fontweight'] = ans
|
t['fontweight'] = ans
|
||||||
if int(ans) > 1400:
|
if int(ans) > 140:
|
||||||
t['wordspace'] = '50'
|
t['wordspace'] = '50'
|
||||||
elif key.startswith("margin"):
|
elif key.startswith("margin"):
|
||||||
if key == "margin":
|
if key == "margin":
|
||||||
@ -214,8 +214,9 @@ class Span(_Span):
|
|||||||
|
|
||||||
|
|
||||||
class HTMLConverter(object):
|
class HTMLConverter(object):
|
||||||
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
|
||||||
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
|
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
|
||||||
|
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
|
||||||
# Fix <a /> elements
|
# Fix <a /> elements
|
||||||
MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
|
MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
|
||||||
lambda match: match.group(1)+"></a>")]
|
lambda match: match.group(1)+"></a>")]
|
||||||
@ -234,12 +235,14 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
processed_files = {} #: Files that have been processed
|
processed_files = {} #: Files that have been processed
|
||||||
|
|
||||||
def __init__(self, book, path, dpi=166, width=575, height=747,
|
def __init__(self, book, path,
|
||||||
font_delta=0, verbose=False, cover=None,
|
font_delta=0, verbose=False, cover=None,
|
||||||
max_link_levels=sys.maxint, link_level=0,
|
max_link_levels=sys.maxint, link_level=0,
|
||||||
is_root=True, baen=False, chapter_detection=True,
|
is_root=True, baen=False, chapter_detection=True,
|
||||||
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
|
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
|
||||||
link_exclude=re.compile('$')):
|
link_exclude=re.compile('$'),
|
||||||
|
page_break=re.compile('h[12]', re.IGNORECASE),
|
||||||
|
profile=PRS500_PROFILE, hide_broken_links=False):
|
||||||
'''
|
'''
|
||||||
Convert HTML file at C{path} and add it to C{book}. After creating
|
Convert HTML file at C{path} and add it to C{book}. After creating
|
||||||
the object, you must call L{self.process_links} on it to create the links and
|
the object, you must call L{self.process_links} on it to create the links and
|
||||||
@ -270,6 +273,11 @@ class HTMLConverter(object):
|
|||||||
@type chapter_detection: C{bool}
|
@type chapter_detection: C{bool}
|
||||||
@param chapter_regex: The compiled regular expression used to search for chapter titles
|
@param chapter_regex: The compiled regular expression used to search for chapter titles
|
||||||
@param link_exclude: Compiled regex. Matching hrefs are ignored.
|
@param link_exclude: Compiled regex. Matching hrefs are ignored.
|
||||||
|
@param page_break: Compiled regex. Page breaks are inserted before matching
|
||||||
|
tags if no page-breaks are found and no chapter headings
|
||||||
|
are detected.
|
||||||
|
@param profile: Defines the geometry of the display device
|
||||||
|
@param hide_broken_links: Don't display broken links
|
||||||
'''
|
'''
|
||||||
# Defaults for various formatting tags
|
# Defaults for various formatting tags
|
||||||
self.css = dict(
|
self.css = dict(
|
||||||
@ -285,10 +293,8 @@ class HTMLConverter(object):
|
|||||||
small = {'font-size' :'small'},
|
small = {'font-size' :'small'},
|
||||||
pre = {'font-family' :'monospace' },
|
pre = {'font-family' :'monospace' },
|
||||||
center = {'text-align' : 'center'}
|
center = {'text-align' : 'center'}
|
||||||
)
|
)
|
||||||
self.page_width = width #: The width of the page
|
self.profile = profile #: Defines the geometry of the display device
|
||||||
self.page_height = height #: The height of the page
|
|
||||||
self.dpi = dpi #: The DPI of the intended display device
|
|
||||||
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
|
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
|
||||||
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
|
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
|
||||||
self.link_exclude = link_exclude #: Ignore matching hrefs
|
self.link_exclude = link_exclude #: Ignore matching hrefs
|
||||||
@ -298,6 +304,7 @@ class HTMLConverter(object):
|
|||||||
self.blockquote_style = book.create_block_style(sidemargin=60,
|
self.blockquote_style = book.create_block_style(sidemargin=60,
|
||||||
topskip=20, footskip=20)
|
topskip=20, footskip=20)
|
||||||
self.unindented_style = book.create_text_style(parindent=0)
|
self.unindented_style = book.create_text_style(parindent=0)
|
||||||
|
self.page_break = page_break #: Regex controlling forced page-break behavior
|
||||||
self.text_styles = []#: Keep track of already used textstyles
|
self.text_styles = []#: Keep track of already used textstyles
|
||||||
self.block_styles = []#: Keep track of already used blockstyles
|
self.block_styles = []#: Keep track of already used blockstyles
|
||||||
self.images = {} #: Images referenced in the HTML document
|
self.images = {} #: Images referenced in the HTML document
|
||||||
@ -311,7 +318,8 @@ class HTMLConverter(object):
|
|||||||
self.in_ol = False #: Flag indicating we're in an <ol> element
|
self.in_ol = False #: Flag indicating we're in an <ol> element
|
||||||
self.book = book #: The Book object representing a BBeB book
|
self.book = book #: The Book object representing a BBeB book
|
||||||
self.is_root = is_root #: Are we converting the root HTML file
|
self.is_root = is_root #: Are we converting the root HTML file
|
||||||
self.lstrip_toggle = False #; If true the next add_text call will do an lstrip
|
self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
|
||||||
|
self.hide_broken_links = hide_broken_links
|
||||||
path = os.path.abspath(path)
|
path = os.path.abspath(path)
|
||||||
os.chdir(os.path.dirname(path))
|
os.chdir(os.path.dirname(path))
|
||||||
self.file_name = os.path.basename(path)
|
self.file_name = os.path.basename(path)
|
||||||
@ -331,7 +339,11 @@ class HTMLConverter(object):
|
|||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.current_page = None
|
self.current_page = None
|
||||||
self.current_para = None
|
self.current_para = None
|
||||||
self.current_style = {}
|
self.current_style = {}
|
||||||
|
self.page_break_found = False
|
||||||
|
match = self.PAGE_BREAK_PAT.search(unicode(self.soup))
|
||||||
|
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
|
||||||
|
self.page_break_found = True
|
||||||
self.parse_file()
|
self.parse_file()
|
||||||
HTMLConverter.processed_files[path] = self
|
HTMLConverter.processed_files[path] = self
|
||||||
print 'done'
|
print 'done'
|
||||||
@ -440,7 +452,8 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
def get_text(self, tag):
|
def get_text(self, tag):
|
||||||
css = self.tag_css(tag)
|
css = self.tag_css(tag)
|
||||||
if css.has_key('display') and css['display'].lower() == 'none':
|
if (css.has_key('display') and css['display'].lower() == 'none') or \
|
||||||
|
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
|
||||||
return ''
|
return ''
|
||||||
text = ''
|
text = ''
|
||||||
for c in tag.contents:
|
for c in tag.contents:
|
||||||
@ -485,22 +498,26 @@ class HTMLConverter(object):
|
|||||||
page.contents.remove(bs)
|
page.contents.remove(bs)
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
cwd = os.getcwd()
|
cwd = os.getcwd()
|
||||||
for link in self.links:
|
for link in self.links:
|
||||||
|
para, tag = link.para, link.tag
|
||||||
|
text = self.get_text(tag)
|
||||||
|
if self.hide_broken_links:
|
||||||
|
para.contents = []
|
||||||
|
para.append(_Span(text=text))
|
||||||
purl = urlparse(link.tag['href'])
|
purl = urlparse(link.tag['href'])
|
||||||
if purl[1]: # Not a link to a file on the local filesystem
|
if purl[1]: # Not a link to a file on the local filesystem
|
||||||
continue
|
continue
|
||||||
path, fragment = unquote(purl[2]), purl[5]
|
path, fragment = unquote(purl[2]), purl[5]
|
||||||
para, tag = link.para, link.tag
|
|
||||||
if not path or os.path.basename(path) == self.file_name:
|
if not path or os.path.basename(path) == self.file_name:
|
||||||
if fragment in self.targets.keys():
|
if fragment in self.targets.keys():
|
||||||
tb = get_target_block(fragment, self.targets)
|
tb = get_target_block(fragment, self.targets)
|
||||||
if self.is_root:
|
if self.is_root:
|
||||||
self.book.addTocEntry(self.get_text(tag), tb)
|
self.book.addTocEntry(text, tb)
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
jb = JumpButton(tb)
|
jb = JumpButton(tb)
|
||||||
self.book.append(jb)
|
self.book.append(jb)
|
||||||
cb = CharButton(jb, text=self.get_text(tag))
|
cb = CharButton(jb, text=text)
|
||||||
para.contents = []
|
para.contents = []
|
||||||
para.append(cb)
|
para.append(cb)
|
||||||
elif self.link_level < self.max_link_levels:
|
elif self.link_level < self.max_link_levels:
|
||||||
@ -515,15 +532,16 @@ class HTMLConverter(object):
|
|||||||
if not path in HTMLConverter.processed_files.keys():
|
if not path in HTMLConverter.processed_files.keys():
|
||||||
try:
|
try:
|
||||||
self.files[path] = HTMLConverter(self.book, path,
|
self.files[path] = HTMLConverter(self.book, path,
|
||||||
width=self.page_width, height=self.page_height,
|
profile=self.profile,
|
||||||
dpi=self.dpi,
|
|
||||||
font_delta=self.font_delta, verbose=self.verbose,
|
font_delta=self.font_delta, verbose=self.verbose,
|
||||||
link_level=self.link_level+1,
|
link_level=self.link_level+1,
|
||||||
max_link_levels=self.max_link_levels,
|
max_link_levels=self.max_link_levels,
|
||||||
is_root = False, baen=self.baen,
|
is_root = False, baen=self.baen,
|
||||||
chapter_detection=self.chapter_detection,
|
chapter_detection=self.chapter_detection,
|
||||||
chapter_regex=self.chapter_regex,
|
chapter_regex=self.chapter_regex,
|
||||||
link_exclude=self.link_exclude)
|
link_exclude=self.link_exclude,
|
||||||
|
page_break=self.page_break,
|
||||||
|
hide_broken_links=self.hide_broken_links)
|
||||||
HTMLConverter.processed_files[path] = self.files[path]
|
HTMLConverter.processed_files[path] = self.files[path]
|
||||||
except Exception:
|
except Exception:
|
||||||
print >>sys.stderr, 'Unable to process', path
|
print >>sys.stderr, 'Unable to process', path
|
||||||
@ -540,10 +558,10 @@ class HTMLConverter(object):
|
|||||||
else:
|
else:
|
||||||
tb = conv.top
|
tb = conv.top
|
||||||
if self.is_root:
|
if self.is_root:
|
||||||
self.book.addTocEntry(self.get_text(tag), tb)
|
self.book.addTocEntry(text, tb)
|
||||||
jb = JumpButton(tb)
|
jb = JumpButton(tb)
|
||||||
self.book.append(jb)
|
self.book.append(jb)
|
||||||
cb = CharButton(jb, text=self.get_text(tag))
|
cb = CharButton(jb, text=text)
|
||||||
para.contents = []
|
para.contents = []
|
||||||
para.append(cb)
|
para.append(cb)
|
||||||
|
|
||||||
@ -574,10 +592,12 @@ class HTMLConverter(object):
|
|||||||
|
|
||||||
def add_image_page(self, path):
|
def add_image_page(self, path):
|
||||||
if os.access(path, os.R_OK):
|
if os.access(path, os.R_OK):
|
||||||
self.end_page()
|
self.end_page()
|
||||||
page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
|
page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
|
||||||
topmargin=0, textwidth=self.page_width,
|
topmargin=0, textwidth=self.profile.screen_width,
|
||||||
textheight=self.page_height)
|
headheight=0, headsep=0, footspace=0,
|
||||||
|
footheight=0,
|
||||||
|
textheight=self.profile.screen_height)
|
||||||
if not self.images.has_key(path):
|
if not self.images.has_key(path):
|
||||||
self.images[path] = ImageStream(path)
|
self.images[path] = ImageStream(path)
|
||||||
page.append(ImageBlock(self.images[path]))
|
page.append(ImageBlock(self.images[path]))
|
||||||
@ -651,11 +671,8 @@ class HTMLConverter(object):
|
|||||||
'padding' in test or 'border' in test or 'page-break' in test \
|
'padding' in test or 'border' in test or 'page-break' in test \
|
||||||
or test.startswith('mso') or test.startswith('background')\
|
or test.startswith('mso') or test.startswith('background')\
|
||||||
or test.startswith('line') or test in ['color', 'display', \
|
or test.startswith('line') or test in ['color', 'display', \
|
||||||
'letter-spacing',
|
'letter-spacing', 'font-variant']:
|
||||||
'font-variant']:
|
css.pop(key)
|
||||||
css.pop(key)
|
|
||||||
if self.verbose:
|
|
||||||
print 'Ignoring CSS key:', key
|
|
||||||
return css
|
return css
|
||||||
|
|
||||||
def end_current_para(self):
|
def end_current_para(self):
|
||||||
@ -688,7 +705,8 @@ class HTMLConverter(object):
|
|||||||
return
|
return
|
||||||
tag_css = self.tag_css(tag, parent_css=parent_css)
|
tag_css = self.tag_css(tag, parent_css=parent_css)
|
||||||
try: # Skip element if its display attribute is set to none
|
try: # Skip element if its display attribute is set to none
|
||||||
if tag_css['display'].lower() == 'none':
|
if tag_css['display'].lower() == 'none' or \
|
||||||
|
tag_css['visibility'].lower() == 'hidden':
|
||||||
return
|
return
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
@ -701,7 +719,11 @@ class HTMLConverter(object):
|
|||||||
tag_css['page-break-after'].lower() != 'avoid':
|
tag_css['page-break-after'].lower() != 'avoid':
|
||||||
end_page = True
|
end_page = True
|
||||||
tag_css.pop('page-break-after')
|
tag_css.pop('page-break-after')
|
||||||
|
if not self.page_break_found and self.page_break.match(tagname):
|
||||||
|
if len(self.current_page.contents) > 3:
|
||||||
|
self.end_page()
|
||||||
|
if self.verbose:
|
||||||
|
print 'Forcing page break at', tagname
|
||||||
if tagname in ["title", "script", "meta", 'del', 'frameset']:
|
if tagname in ["title", "script", "meta", 'del', 'frameset']:
|
||||||
pass
|
pass
|
||||||
elif tagname == 'a' and self.max_link_levels >= 0:
|
elif tagname == 'a' and self.max_link_levels >= 0:
|
||||||
@ -744,12 +766,12 @@ class HTMLConverter(object):
|
|||||||
self.targets[tag['name']] = target
|
self.targets[tag['name']] = target
|
||||||
elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
|
||||||
purl = urlparse(tag['href'])
|
purl = urlparse(tag['href'])
|
||||||
path = purl[2]
|
path = unquote(purl[2])
|
||||||
if path and os.path.splitext(path)[1][1:].lower() in \
|
if path and os.path.splitext(path)[1][1:].lower() in \
|
||||||
['png', 'jpg', 'bmp', 'jpeg']:
|
['png', 'jpg', 'bmp', 'jpeg']:
|
||||||
self.add_image_page(path)
|
self.add_image_page(path)
|
||||||
else:
|
else:
|
||||||
self.add_text('Link: '+tag['href'], tag_css)
|
self.add_text('Link: ' + tag['href'], tag_css)
|
||||||
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
|
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
|
||||||
elif tagname == 'img':
|
elif tagname == 'img':
|
||||||
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
|
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
|
||||||
@ -772,31 +794,32 @@ class HTMLConverter(object):
|
|||||||
return pt.name
|
return pt.name
|
||||||
|
|
||||||
|
|
||||||
if height > self.page_height:
|
if height > self.profile.page_height:
|
||||||
corrf = self.page_height/(1.*height)
|
corrf = self.profile.page_height/(1.*height)
|
||||||
width, height = floor(corrf*width), self.page_height-1
|
width, height = floor(corrf*width), self.profile.page_height-1
|
||||||
if width > self.page_width:
|
if width > self.profile.page_width:
|
||||||
corrf = (self.page_width)/(1.*width)
|
corrf = (self.profile.page_width)/(1.*width)
|
||||||
width, height = self.page_width-1, floor(corrf*height)
|
width, height = self.profile.page_width-1, floor(corrf*height)
|
||||||
path = scale_image(width, height)
|
path = scale_image(width, height)
|
||||||
if width > self.page_width:
|
if width > self.profile.page_width:
|
||||||
corrf = self.page_width/(1.*width)
|
corrf = self.profile.page_width/(1.*width)
|
||||||
width, height = self.page_width-1, floor(corrf*height)
|
width, height = self.profile.page_width-1, floor(corrf*height)
|
||||||
if height > self.page_height:
|
if height > self.profile.page_height:
|
||||||
corrf = (self.page_height)/(1.*height)
|
corrf = (self.profile.page_height)/(1.*height)
|
||||||
width, height = floor(corrf*width), self.page_height-1
|
width, height = floor(corrf*width), self.profile.page_height-1
|
||||||
path = scale_image(width, height)
|
path = scale_image(width, height)
|
||||||
width, height = int(width), int(height)
|
width, height = int(width), int(height)
|
||||||
|
|
||||||
if not self.images.has_key(path):
|
if not self.images.has_key(path):
|
||||||
self.images[path] = ImageStream(path)
|
self.images[path] = ImageStream(path)
|
||||||
factor = 720./self.dpi
|
factor = 720./self.profile.dpi
|
||||||
if max(width, height) <= min(self.page_width, self.page_height)/5.:
|
if max(width, height) <= min(self.profile.page_width,
|
||||||
|
self.profile.page_height)/5.:
|
||||||
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
|
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
|
||||||
xsize=width, ysize=height)
|
xsize=width, ysize=height)
|
||||||
self.current_para.append(Plot(im, xsize=ceil(width*factor),
|
self.current_para.append(Plot(im, xsize=ceil(width*factor),
|
||||||
ysize=ceil(height*factor)))
|
ysize=ceil(height*factor)))
|
||||||
elif height <= self.page_height/1.5:
|
else:
|
||||||
pb = self.current_block
|
pb = self.current_block
|
||||||
self.end_current_para()
|
self.end_current_para()
|
||||||
self.process_alignment(tag_css)
|
self.process_alignment(tag_css)
|
||||||
@ -809,16 +832,7 @@ class HTMLConverter(object):
|
|||||||
self.current_block = self.book.create_text_block(
|
self.current_block = self.book.create_text_block(
|
||||||
textStyle=pb.textStyle,
|
textStyle=pb.textStyle,
|
||||||
blockStyle=pb.blockStyle)
|
blockStyle=pb.blockStyle)
|
||||||
self.current_para = Paragraph()
|
self.current_para = Paragraph()
|
||||||
else:
|
|
||||||
self.current_block.append(self.current_para)
|
|
||||||
self.current_page.append(self.current_block)
|
|
||||||
self.current_para = Paragraph()
|
|
||||||
self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
|
|
||||||
blockStyle=self.current_block.blockStyle)
|
|
||||||
im = ImageBlock(self.images[path], x1=width, y1=height,
|
|
||||||
xsize=width, ysize=height)
|
|
||||||
self.current_page.append(im)
|
|
||||||
else:
|
else:
|
||||||
print >>sys.stderr, "Failed to process:", tag
|
print >>sys.stderr, "Failed to process:", tag
|
||||||
elif tagname in ['style', 'link']:
|
elif tagname in ['style', 'link']:
|
||||||
@ -835,14 +849,16 @@ class HTMLConverter(object):
|
|||||||
ncss.update(self.parse_css(str(c)))
|
ncss.update(self.parse_css(str(c)))
|
||||||
elif tag.has_key('type') and tag['type'] == "text/css" \
|
elif tag.has_key('type') and tag['type'] == "text/css" \
|
||||||
and tag.has_key('href'):
|
and tag.has_key('href'):
|
||||||
url = tag['href']
|
purl = urlparse(tag['href'])
|
||||||
|
path = unquote(purl[2])
|
||||||
try:
|
try:
|
||||||
if url.startswith('http://'):
|
f = open(path, 'rb')
|
||||||
f = urlopen(url)
|
src = f.read()
|
||||||
else:
|
|
||||||
f = open(unquote(url))
|
|
||||||
ncss = self.parse_css(f.read())
|
|
||||||
f.close()
|
f.close()
|
||||||
|
match = self.PAGE_BREAK_PAT.search(src)
|
||||||
|
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
|
||||||
|
self.page_break_found = True
|
||||||
|
ncss = self.parse_css(f.read())
|
||||||
except IOError:
|
except IOError:
|
||||||
pass
|
pass
|
||||||
if ncss:
|
if ncss:
|
||||||
@ -917,6 +933,7 @@ class HTMLConverter(object):
|
|||||||
if self.verbose:
|
if self.verbose:
|
||||||
print 'Detected chapter', src
|
print 'Detected chapter', src
|
||||||
self.end_page()
|
self.end_page()
|
||||||
|
self.page_break_found = True
|
||||||
self.end_current_para()
|
self.end_current_para()
|
||||||
self.lstrip_toggle = True
|
self.lstrip_toggle = True
|
||||||
if tag_css.has_key('text-indent'):
|
if tag_css.has_key('text-indent'):
|
||||||
@ -953,7 +970,7 @@ class HTMLConverter(object):
|
|||||||
self.end_current_para()
|
self.end_current_para()
|
||||||
self.current_block.append(CR())
|
self.current_block.append(CR())
|
||||||
self.end_current_block()
|
self.end_current_block()
|
||||||
self.current_page.RuledLine(linelength=self.page_width)
|
self.current_page.RuledLine(linelength=self.profile.page_width)
|
||||||
else:
|
else:
|
||||||
self.process_children(tag, tag_css)
|
self.process_children(tag, tag_css)
|
||||||
|
|
||||||
@ -967,18 +984,21 @@ class HTMLConverter(object):
|
|||||||
for _file in self.scaled_images.values():
|
for _file in self.scaled_images.values():
|
||||||
_file.__del__()
|
_file.__del__()
|
||||||
|
|
||||||
|
|
||||||
def process_file(path, options):
|
def process_file(path, options):
|
||||||
cwd = os.getcwd()
|
cwd = os.getcwd()
|
||||||
dirpath = None
|
dirpath = None
|
||||||
try:
|
try:
|
||||||
dirpath, path = get_path(path)
|
dirpath, path = get_path(path)
|
||||||
cpath, tpath = options.cover, ''
|
cpath, tpath = '', ''
|
||||||
if options.cover and os.access(options.cover, os.R_OK):
|
if options.cover:
|
||||||
try:
|
options.cover = os.path.abspath(os.path.expanduser(options.cover))
|
||||||
|
cpath = options.cover
|
||||||
|
if os.access(options.cover, os.R_OK):
|
||||||
from libprs500.prs500 import PRS500
|
from libprs500.prs500 import PRS500
|
||||||
im = PILImage.open(os.path.join(cwd, cpath))
|
im = PILImage.open(os.path.join(cwd, cpath))
|
||||||
cim = im.resize((600, 800), PILImage.BICUBIC)
|
cim = im.resize((options.profile.screen_width,
|
||||||
|
options.profile.screen_height),
|
||||||
|
PILImage.BICUBIC)
|
||||||
cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
|
cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
|
||||||
cf.close()
|
cf.close()
|
||||||
cim.save(cf.name)
|
cim.save(cf.name)
|
||||||
@ -989,17 +1009,16 @@ def process_file(path, options):
|
|||||||
tf.close()
|
tf.close()
|
||||||
tim.save(tf.name)
|
tim.save(tf.name)
|
||||||
tpath = tf.name
|
tpath = tf.name
|
||||||
except ImportError:
|
else:
|
||||||
print >>sys.stderr, "WARNING: You don't have PIL installed. ",
|
raise ConversionError, 'Cannot read from: %s', (options.cover,)
|
||||||
'Cover and thumbnails wont work'
|
|
||||||
pass
|
|
||||||
title = (options.title, options.title_sort)
|
title = (options.title, options.title_sort)
|
||||||
author = (options.author, options.author_sort)
|
author = (options.author, options.author_sort)
|
||||||
args = dict(font_delta=options.font_delta, title=title, \
|
args = dict(font_delta=options.font_delta, title=title, \
|
||||||
author=author, sourceencoding='utf8',\
|
author=author, sourceencoding='utf8',\
|
||||||
freetext=options.freetext, category=options.category,
|
freetext=options.freetext, category=options.category,
|
||||||
booksetting=BookSetting(dpi=10*options.dpi,screenheight=800,
|
booksetting=BookSetting(dpi=10*options.profile.dpi,
|
||||||
screenwidth=600))
|
screenheight=options.profile.screen_height,
|
||||||
|
screenwidth=options.profile.screen_width))
|
||||||
if tpath:
|
if tpath:
|
||||||
args['thumbnail'] = tpath
|
args['thumbnail'] = tpath
|
||||||
header = None
|
header = None
|
||||||
@ -1011,13 +1030,16 @@ def process_file(path, options):
|
|||||||
book = Book(header=header, **args)
|
book = Book(header=header, **args)
|
||||||
le = re.compile(options.link_exclude) if options.link_exclude else \
|
le = re.compile(options.link_exclude) if options.link_exclude else \
|
||||||
re.compile('$')
|
re.compile('$')
|
||||||
conv = HTMLConverter(book, path, dpi=options.dpi,
|
pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
|
||||||
|
re.compile('$')
|
||||||
|
conv = HTMLConverter(book, path, profile=options.profile,
|
||||||
font_delta=options.font_delta,
|
font_delta=options.font_delta,
|
||||||
cover=cpath, max_link_levels=options.link_levels,
|
cover=cpath, max_link_levels=options.link_levels,
|
||||||
baen=options.baen,
|
verbose=options.verbose, baen=options.baen,
|
||||||
chapter_detection=options.chapter_detection,
|
chapter_detection=options.chapter_detection,
|
||||||
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
|
chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
|
||||||
link_exclude=re.compile(le))
|
link_exclude=re.compile(le), page_break=pb,
|
||||||
|
hide_broken_links=not options.show_broken_links)
|
||||||
conv.process_links()
|
conv.process_links()
|
||||||
oname = options.output
|
oname = options.output
|
||||||
if not oname:
|
if not oname:
|
||||||
@ -1033,47 +1055,73 @@ def process_file(path, options):
|
|||||||
if dirpath:
|
if dirpath:
|
||||||
shutil.rmtree(dirpath, True)
|
shutil.rmtree(dirpath, True)
|
||||||
|
|
||||||
def main():
|
def parse_options(argv=None, cli=True):
|
||||||
""" CLI for html -> lrf conversions """
|
""" CLI for html -> lrf conversions """
|
||||||
|
if not argv:
|
||||||
|
argv = sys.argv[1:]
|
||||||
parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
|
parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
|
||||||
|
|
||||||
%prog converts mybook.html to mybook.lrf""")
|
%prog converts mybook.html to mybook.lrf""")
|
||||||
parser.add_option('--cover', action='store', dest='cover', default=None, \
|
parser.add_option('--cover', action='store', dest='cover', default=None, \
|
||||||
help='Path to file containing image to be used as cover')
|
help='Path to file containing image to be used as cover')
|
||||||
parser.add_option('--lrs', action='store_true', dest='lrs', \
|
|
||||||
help='Convert to LRS', default=False)
|
|
||||||
parser.add_option('--font-delta', action='store', type='int', default=0, \
|
parser.add_option('--font-delta', action='store', type='int', default=0, \
|
||||||
help="""Increase the font size by 2 * FONT_DELTA pts.
|
help="""Increase the font size by 2 * FONT_DELTA pts.
|
||||||
If FONT_DELTA is negative, the font size is decreased.""",
|
If FONT_DELTA is negative, the font size is decreased.""",
|
||||||
dest='font_delta')
|
dest='font_delta')
|
||||||
parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
|
link = parser.add_option_group('LINK PROCESSING OPTIONS')
|
||||||
|
link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
|
||||||
dest='link_levels',
|
dest='link_levels',
|
||||||
help=r'''The maximum number of levels to recursively process '''
|
help=r'''The maximum number of levels to recursively process '''
|
||||||
'''links. A value of 0 means that links are not followed. '''
|
'''links. A value of 0 means that links are not followed. '''
|
||||||
'''A negative value means that <a> tags are ignored.''')
|
'''A negative value means that <a> tags are ignored.''')
|
||||||
parser.add_option('--baen', action='store_true', default=False, dest='baen',
|
link.add_option('--link-exclude', dest='link_exclude', default='$',
|
||||||
help='''Preprocess Baen HTML files to improve generated LRF.''')
|
help='''A regular expression. <a> tags whose href '''
|
||||||
parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi',
|
'''matches will be ignored. Defaults to %default''')
|
||||||
help='''The DPI of the target device. Default is 166 for the
|
chapter = parser.add_option_group('CHAPTER OPTIONS')
|
||||||
Sony PRS 500''')
|
chapter.add_option('--disable-chapter-detection', action='store_false',
|
||||||
parser.add_option('--disable-chapter-detection', action='store_false',
|
|
||||||
default=True, dest='chapter_detection',
|
default=True, dest='chapter_detection',
|
||||||
help='''Prevent html2lrf from automatically inserting page breaks'''
|
help='''Prevent html2lrf from automatically inserting page breaks'''
|
||||||
'''before what it thinks are chapters.''')
|
'''before what it thinks are chapters.''')
|
||||||
parser.add_option('--chapter-regex', dest='chapter_regex',
|
chapter.add_option('--chapter-regex', dest='chapter_regex',
|
||||||
default='chapter|book|appendix',
|
default='chapter|book|appendix',
|
||||||
help='''The regular expression used to detect chapter titles.'''
|
help='''The regular expression used to detect chapter titles.'''
|
||||||
'''It is searched for in heading tags. Default is chapter|book|appendix''')
|
'''It is searched for in heading tags. Defaults to %default''')
|
||||||
parser.add_option('--link-exclude', dest='link_exclude', default='',
|
chapter.add_option('--page-break-before', dest='page_break', default='h[12]',
|
||||||
help='''A regular expression. <a> tags whose href '''
|
help='''If html2lrf does not find any page breaks in the '''
|
||||||
'''matches will be ignored''')
|
'''html file and cannot detect chapter headings, it will '''
|
||||||
options, args = parser.parse_args()
|
'''automatically insert page-breaks before the tags whose '''
|
||||||
|
'''names match this regular expression. Defaults to %default. '''
|
||||||
|
'''You can disable it by setting the regexp to "$". '''
|
||||||
|
'''The purpose of this option is to try to ensure that '''
|
||||||
|
'''there are no really long pages as this degrades the page '''
|
||||||
|
'''turn performance of the LRF. Thus this option is ignored '''
|
||||||
|
'''if the current page has only a few elements.''')
|
||||||
|
prepro = parser.add_option_group('PREPROCESSING OPTIONS')
|
||||||
|
prepro.add_option('--baen', action='store_true', default=False, dest='baen',
|
||||||
|
help='''Preprocess Baen HTML files to improve generated LRF.''')
|
||||||
|
debug = parser.add_option_group('DEBUG OPTIONS')
|
||||||
|
debug.add_option('--verbose', dest='verbose', action='store_true', default=False,
|
||||||
|
help='''Be verbose while processing''')
|
||||||
|
debug.add_option('--lrs', action='store_true', dest='lrs', \
|
||||||
|
help='Convert to LRS', default=False)
|
||||||
|
debug.add_option('--show-broken-links', dest='show_broken_links', action='store_true',
|
||||||
|
default=False, help='''Show the href of broken links in generated LRF''')
|
||||||
|
options, args = parser.parse_args(args=argv)
|
||||||
if len(args) != 1:
|
if len(args) != 1:
|
||||||
parser.print_help()
|
if cli:
|
||||||
sys.exit(1)
|
parser.print_help()
|
||||||
src = args[0]
|
raise ConversionError, 'no filename specified'
|
||||||
if options.title == None:
|
if options.title == None:
|
||||||
options.title = filename_to_utf8(os.path.splitext(os.path.basename(src))[0])
|
options.title = filename_to_utf8(os.path.splitext(os.path.basename(args[0]))[0])
|
||||||
|
return options, args
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
try:
|
||||||
|
options, args = parse_options()
|
||||||
|
src = args[0]
|
||||||
|
except:
|
||||||
|
sys.exit(1)
|
||||||
process_file(src, options)
|
process_file(src, options)
|
||||||
|
|
||||||
def console_query(dirpath, candidate, docs):
|
def console_query(dirpath, candidate, docs):
|
||||||
|
@ -70,7 +70,7 @@
|
|||||||
|
|
||||||
<h2><a name='images'>Inline images</a></h2>
|
<h2><a name='images'>Inline images</a></h2>
|
||||||
<p>
|
<p>
|
||||||
Here I demonstrate the use of inline images in the midst of text. Here is a small image <img src='small.jpg' /> embedded in a sentence. Now we have a slightly larger image that is automatically put in its own block <img style="text-align:center" src='medium.jpg' /> and finally we have a large image which is automatically placed on a page by itself and prevented from being autoscaled when the user changes from S to M to L. Try changing sizes and see how the different embedding styles behave. <img src='large.jpg' />
|
Here I demonstrate the use of inline images in the midst of text. Here is a small image <img src='small.jpg' /> embedded in a sentence. Now we have a slightly larger image that is automatically put in its own block <img style="text-align:center" src='medium.jpg' /> and finally we have a large image which won't fit on this page. Try changing sizes from S to M to L and see how the images behave. <img src='large.jpg' />
|
||||||
</p>
|
</p>
|
||||||
<p class='toc'>
|
<p class='toc'>
|
||||||
<hr />
|
<hr />
|
||||||
|
@ -69,7 +69,9 @@ def convert_txt(path, options):
|
|||||||
book = Book(header=header, title=title, author=author, \
|
book = Book(header=header, title=title, author=author, \
|
||||||
sourceencoding=options.encoding, freetext=options.freetext, \
|
sourceencoding=options.encoding, freetext=options.freetext, \
|
||||||
category=options.category, booksetting=BookSetting
|
category=options.category, booksetting=BookSetting
|
||||||
(dpi=10*options.dpi,screenheight=800, screenwidth=600))
|
(dpi=10*options.profile.dpi,
|
||||||
|
screenheight=options.profile.screen_height,
|
||||||
|
screenwidth=options.profile.screen_width))
|
||||||
buffer = ''
|
buffer = ''
|
||||||
pg = book.create_page()
|
pg = book.create_page()
|
||||||
block = book.create_text_block()
|
block = book.create_text_block()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user