Bug fixes in html2lrf too numerous to enumerate ;-) Also changed paragraph indentation behavior.

This commit is contained in:
Kovid Goyal 2007-05-03 21:20:52 +00:00
parent f403787fd5
commit 9d407875bd
4 changed files with 116 additions and 57 deletions

View File

@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
suit your distribution. suit your distribution.
""" """
__version__ = "0.3.19" __version__ = "0.3.20"
__docformat__ = "epytext" __docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>" __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -49,7 +49,7 @@ def Book(font_delta=0, header=None, **settings):
ps = dict(textwidth=575, textheight=747) ps = dict(textwidth=575, textheight=747)
if header: if header:
hdr = Header() hdr = Header()
hb = TextBlock(TextStyle(align='foot', fontsize=60)) hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
hb.append(header) hb.append(header)
hdr.PutObj(hb) hdr.PutObj(hb)
ps['headheight'] = 30 ps['headheight'] = 30

View File

@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the CSS->Xylog Style conversion routines I am indebted to esperanc for the CSS->Xylog Style conversion routines
and to Falstaff for pylrs. and to Falstaff for pylrs.
""" """
import os, re, sys, shutil, traceback import os, re, sys, shutil, traceback, copy
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from urllib import urlopen, unquote from urllib import urlopen, unquote
from urlparse import urlparse from urlparse import urlparse
@ -32,7 +32,7 @@ from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \ from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
ImageBlock, JumpButton, CharButton, BlockStyle,\ ImageBlock, JumpButton, CharButton, BlockStyle,\
Page, Bold, Space, Plot, TextStyle, Image, BlockSpace,\ Page, Bold, Space, Plot, TextStyle, Image, BlockSpace,\
TableOfContents RuledLine
from libprs500.lrf.pylrs.pylrs import Span as _Span from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError, option_parser, Book from libprs500.lrf import ConversionError, option_parser, Book
from libprs500 import extract from libprs500 import extract
@ -196,7 +196,7 @@ class Span(_Span):
return t return t
def __init__(self, ns, css, memory, font_delta=0): def __init__(self, ns, css, memory, font_delta=0):
src = ns.string if hasattr(ns, 'string') else str(ns) src = ns.string if hasattr(ns, 'string') else ns
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules: for pat, repl in Span.rules:
src = pat.sub(repl, src) src = pat.sub(repl, src)
@ -216,7 +216,11 @@ class HTMLConverter(object):
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction) IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
justification_styles = dict(head=TextStyle(align='head'), foot=TextStyle(align='foot'), justification_styles = dict(head=TextStyle(align='head'), foot=TextStyle(align='foot'),
center=TextStyle(align='center')) center=TextStyle(align='center'))
blockquote_style = [TextStyle(), BlockStyle(sidemargin=60, topskip=20, footskip=20)] blockquote_style = BlockStyle(sidemargin=60, topskip=20, footskip=20)
unindented_style = TextStyle(parindent=0)
# Fix <a /> elements
markup_massage = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>")]
class Link(object): class Link(object):
def __init__(self, para, tag): def __init__(self, para, tag):
@ -285,14 +289,18 @@ class HTMLConverter(object):
self.in_ol = False #: Flag indicating we're in an <ol> element self.in_ol = False #: Flag indicating we're in an <ol> element
self.book = book #: The Book object representing a BBeB book self.book = book #: The Book object representing a BBeB book
self.is_root = is_root #: Are we converting the root HTML file self.is_root = is_root #: Are we converting the root HTML file
self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
path = os.path.abspath(path) path = os.path.abspath(path)
os.chdir(os.path.dirname(path)) os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path) self.file_name = os.path.basename(path)
print "Processing", self.file_name print "Processing", self.file_name
print '\tParsing HTML...', print '\tParsing HTML...',
sys.stdout.flush() sys.stdout.flush()
self.soup = BeautifulSoup(open(self.file_name, 'r').read(), \ nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
convertEntities=BeautifulSoup.HTML_ENTITIES) nmassage.extend(HTMLConverter.markup_massage)
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
print 'done\n\tConverting to BBeB...', print 'done\n\tConverting to BBeB...',
sys.stdout.flush() sys.stdout.flush()
self.verbose = verbose self.verbose = verbose
@ -380,7 +388,7 @@ class HTMLConverter(object):
self.current_para.append_to(self.current_block) self.current_para.append_to(self.current_block)
if self.current_block and self.current_page: if self.current_block and self.current_page:
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
if self.current_page and self.current_page.get_text().strip(): if self.current_page and self.current_page.has_text():
self.book.append(self.current_page) self.book.append(self.current_page)
if not self.top.parent: if not self.top.parent:
@ -415,6 +423,38 @@ class HTMLConverter(object):
return text return text
def process_links(self): def process_links(self):
def get_target_block(fragment, targets):
'''Return the correct block for the <a name> element'''
bs = targets[fragment]
if not isinstance(bs, BlockSpace):
return bs
ans, found, page = None, False, bs.parent
for item in page.contents:
if found:
if isinstance(item, (TextBlock, ImageBlock)):
ans = item
break
if item == bs:
found = True
continue
if not ans:
for i in range(len(page.contents)-1, -1, -1):
if isinstance(page.contents[i], (TextBlock, ImageBlock)):
ans = page.contents[i]
break
if not ans:
ntb = TextBlock()
ntb.Paragraph(' ')
page.append(ntb)
ans = ntb
if found:
targets[fragment] = ans
page.contents.remove(bs)
return ans
cwd = os.getcwd() cwd = os.getcwd()
for link in self.links: for link in self.links:
purl = urlparse(link.tag['href']) purl = urlparse(link.tag['href'])
@ -424,7 +464,7 @@ class HTMLConverter(object):
para, tag = link.para, link.tag para, tag = link.para, link.tag
if not path or os.path.basename(path) == self.file_name: if not path or os.path.basename(path) == self.file_name:
if fragment in self.targets.keys(): if fragment in self.targets.keys():
tb = self.targets[fragment] tb = get_target_block(fragment, self.targets)
if self.is_root: if self.is_root:
self.book.addTocEntry(self.get_text(tag), tb) self.book.addTocEntry(self.get_text(tag), tb)
sys.stdout.flush() sys.stdout.flush()
@ -458,7 +498,7 @@ class HTMLConverter(object):
self.files[path] = HTMLConverter.processed_files[path] self.files[path] = HTMLConverter.processed_files[path]
conv = self.files[path] conv = self.files[path]
if fragment in conv.targets.keys(): if fragment in conv.targets.keys():
tb = conv.targets[fragment] tb = get_target_block(fragment, conv.targets)
else: else:
tb = conv.top tb = conv.top
if self.is_root: if self.is_root:
@ -479,7 +519,6 @@ class HTMLConverter(object):
self.files[path].process_links() self.files[path].process_links()
finally: finally:
os.chdir(cwd) os.chdir(cwd)
def end_page(self): def end_page(self):
""" """
@ -490,7 +529,7 @@ class HTMLConverter(object):
self.current_para = Paragraph() self.current_para = Paragraph()
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
self.current_block = TextBlock() self.current_block = TextBlock()
if self.current_page.get_text().strip(): if self.current_page.has_text():
self.book.append(self.current_page) self.book.append(self.current_page)
self.current_page = Page() self.current_page = Page()
@ -521,7 +560,10 @@ class HTMLConverter(object):
@param css: @param css:
@type css: @type css:
''' '''
src = tag.string if hasattr(tag, 'string') else str(tag) src = tag.string if hasattr(tag, 'string') else tag
if self.lstrip_toggle:
src = src.lstrip()
self.lstrip_toggle = False
if not src.strip(): if not src.strip():
self.current_para.append(' ') self.current_para.append(' ')
else: else:
@ -536,7 +578,7 @@ class HTMLConverter(object):
if align != self.current_block.textStyle.attrs['align']: if align != self.current_block.textStyle.attrs['align']:
self.current_para.append_to(self.current_block) self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
self.current_block = TextBlock(HTMLConverter.justification_styles[align]) self.current_block = TextBlock(textStyle=HTMLConverter.justification_styles[align])
self.current_para = Paragraph() self.current_para = Paragraph()
try: try:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\ self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
@ -563,7 +605,7 @@ class HTMLConverter(object):
End current paragraph with a paragraph break after it. If the current End current paragraph with a paragraph break after it. If the current
paragraph has no non whitespace text in it do nothing. paragraph has no non whitespace text in it do nothing.
''' '''
if not self.current_para.get_text().strip(): if not self.current_para.has_text():
return return
if self.current_para.contents: if self.current_para.contents:
self.current_block.append(self.current_para) self.current_block.append(self.current_para)
@ -605,8 +647,8 @@ class HTMLConverter(object):
if self.current_block == previous: if self.current_block == previous:
self.current_para.append_to(self.current_block) self.current_para.append_to(self.current_block)
self.current_para = Paragraph() self.current_para = Paragraph()
if self.current_block.get_text().strip(): if self.current_block.has_text():
target = self.current_block target = self.current_block
else: else:
target = BlockSpace() target = BlockSpace()
self.current_page.append(target) self.current_page.append(target)
@ -619,12 +661,21 @@ class HTMLConverter(object):
if found: if found:
target = item target = item
break break
if target and not isinstance(target, (TextBlock, ImageBlock)):
if isinstance(target, RuledLine):
target = TextBlock()
target.Paragraph(' ')
self.current_page.append(target)
else:
target = BlockSpace()
self.current_page.append(target)
if target == None: if target == None:
if self.current_block.get_text().strip(): if self.current_block.has_text():
target = self.current_block target = self.current_block
else: else:
target = BlockSpace() target = BlockSpace()
self.current_page.append(target) self.current_page.append(target)
self.targets[tag['name']] = target self.targets[tag['name']] = target
elif tag.has_key('href'): elif tag.has_key('href'):
purl = urlparse(tag['href']) purl = urlparse(tag['href'])
@ -695,19 +746,24 @@ class HTMLConverter(object):
pass pass
elif tagname == 'pre': elif tagname == 'pre':
self.end_current_para() self.end_current_para()
self.current_block.append_to(self.current_page)
self.current_block = TextBlock(textStyle=HTMLConverter.unindented_style)
src = ''.join([str(i) for i in tag.contents]) src = ''.join([str(i) for i in tag.contents])
lines = src.split('\n') lines = src.split('\n')
for line in lines: for line in lines:
try: try:
self.current_para.append(Span(line, tag_css, self.memory)) self.current_para.append(Span(line, tag_css, self.memory))
self.current_para.CR()
except ConversionError: except ConversionError:
pass pass
self.current_para.CR() self.end_current_para()
self.current_block.append_to(self.current_page)
self.current_block = TextBlock()
elif tagname in ['ul', 'ol']: elif tagname in ['ul', 'ol']:
self.in_ol = 1 if tagname == 'ol' else 0 self.in_ol = 1 if tagname == 'ol' else 0
self.end_current_para() self.end_current_para()
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
self.current_block = TextBlock() self.current_block = TextBlock(textStyle=HTMLConverter.unindented_style)
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
self.in_ol = 0 self.in_ol = 0
self.end_current_para() self.end_current_para()
@ -715,7 +771,7 @@ class HTMLConverter(object):
self.current_block = TextBlock() self.current_block = TextBlock()
elif tagname == 'li': elif tagname == 'li':
prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' ' prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
if self.current_para.get_text().strip(): if self.current_para.has_text():
self.current_para.append(CR()) self.current_para.append(CR())
self.current_block.append(self.current_para) self.current_block.append(self.current_para)
self.current_para = Paragraph() self.current_para = Paragraph()
@ -735,7 +791,8 @@ class HTMLConverter(object):
self.current_para.append_to(self.current_block) self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
self.current_para = Paragraph() self.current_para = Paragraph()
self.current_block = TextBlock(*HTMLConverter.blockquote_style) self.current_block = TextBlock(blockStyle=HTMLConverter.blockquote_style,
textStyle=HTMLConverter.unindented_style)
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
self.current_para.append_to(self.current_block) self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
@ -743,6 +800,7 @@ class HTMLConverter(object):
self.current_block = TextBlock() self.current_block = TextBlock()
elif tagname in ['p', 'div']: elif tagname in ['p', 'div']:
self.end_current_para() self.end_current_para()
self.lstrip_toggle = True
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
self.end_current_para() self.end_current_para()
elif tagname in ['b', 'strong', 'i', 'em', 'span']: elif tagname in ['b', 'strong', 'i', 'em', 'span']:

View File

@ -275,22 +275,23 @@ class LrsContainer(object):
self.validChildren = validChildren self.validChildren = validChildren
def get_text(self): def has_text(self):
''' Return the textual content of this container''' ''' Return True iff this container has non whitespace text '''
txt = ''
if hasattr(self, 'text'): if hasattr(self, 'text'):
txt += self.text if self.text.strip():
if hasattr(self, 'contents'): return True
if hasattr(self, 'contents'):
for child in self.contents: for child in self.contents:
txt += child.get_text() if child.has_text():
return txt return True
return False
def append_to(self, parent): def append_to(self, parent):
''' '''
Append self to C{parent} iff self has non whitespace textual content Append self to C{parent} iff self has non whitespace textual content
@type parent: LrsContainer @type parent: LrsContainer
''' '''
if self.get_text().strip(): if self.has_text():
parent.append(self) parent.append(self)
def appendReferencedObjects(self, parent): def appendReferencedObjects(self, parent):
@ -622,8 +623,9 @@ class TableOfContents(object):
def addTocEntry(self, tocLabel, textBlock): def addTocEntry(self, tocLabel, textBlock):
if not isinstance(textBlock, (TextBlock, ImageBlock, BlockSpace)): if not isinstance(textBlock, (TextBlock, ImageBlock)):
raise LrsError, "TOC destination must be a TextBlock, ImageBlock or BlockSpace" raise LrsError, "TOC destination must be a TextBlock or ImageBlock"+\
" not a " + str(type(textBlock))
if textBlock.parent is None or not isinstance(textBlock.parent, Page): if textBlock.parent is None or not isinstance(textBlock.parent, Page):
raise LrsError, "TOC text block must be already appended to a page" raise LrsError, "TOC text block must be already appended to a page"
@ -1117,7 +1119,7 @@ class TextStyle(LrsStyle):
fontorientation="0", fontweight="400", fontorientation="0", fontweight="400",
fontfacename="Dutch801 Rm BT Roman", fontfacename="Dutch801 Rm BT Roman",
textcolor="0x00000000", wordspace="25", letterspace="0", textcolor="0x00000000", wordspace="25", letterspace="0",
baselineskip="120", linespace="10", parindent="0", parskip="0", baselineskip="120", linespace="12", parindent="80", parskip="0",
textbgcolor="0xFF000000") textbgcolor="0xFF000000")
alsoAllow = ["empdotscode", "empdotsfontname", "refempdotsfont", alsoAllow = ["empdotscode", "empdotsfontname", "refempdotsfont",
@ -1345,10 +1347,6 @@ class Page(LrsObject, LrsContainer):
class TextBlock(LrsObject, LrsContainer): class TextBlock(LrsObject, LrsContainer):
""" """
TextBlocks are added to Pages. They hold Paragraphs or CRs. TextBlocks are added to Pages. They hold Paragraphs or CRs.
TextBlocks can be supplied a TextStyle and a BlockStyle as the first
two arguments to the constructor, but these can be left off
and defaults will be used (since the spec says you have to have
them).
If a TextBlock is used in a header, it should be appended to If a TextBlock is used in a header, it should be appended to
the Book, not to a specific Page. the Book, not to a specific Page.
@ -1356,22 +1354,22 @@ class TextBlock(LrsObject, LrsContainer):
defaultTextStyle = TextStyle() defaultTextStyle = TextStyle()
defaultBlockStyle = BlockStyle() defaultBlockStyle = BlockStyle()
def __init__(self, *args, **settings): def __init__(self, textStyle=defaultTextStyle, \
blockStyle=defaultBlockStyle, \
**settings):
'''
Create TextBlock.
@param textStyle: The L{TextStyle} for this block.
@param blockStyle: The L{BlockStyle} for this block.
@param settings: C{dict} of extra settings to apply to this block.
'''
LrsObject.__init__(self) LrsObject.__init__(self)
LrsContainer.__init__(self, [Paragraph, CR]) LrsContainer.__init__(self, [Paragraph, CR])
textStyle = TextBlock.defaultTextStyle
blockStyle = TextBlock.defaultBlockStyle
if len(args) > 0:
textStyle = args[0]
if len(args) > 1:
blockStyle = args[1]
if len(args) > 2:
raise LrsError, \
"too many style arguments to TextBlock"
self.textSettings = {} self.textSettings = {}
self.blockSettings = {} self.blockSettings = {}
for name, value in settings.items(): for name, value in settings.items():
if name in TextStyle.validSettings: if name in TextStyle.validSettings:
self.textSettings[name] = value self.textSettings[name] = value
@ -1410,8 +1408,9 @@ class TextBlock(LrsObject, LrsContainer):
self.append(CR()) self.append(CR())
return p return p
def toElement(self, sourceEncoding): def toElement(self, sourceEncoding):
tb = self.lrsObjectElement("TextBlock", labelName="Block") tb = self.lrsObjectElement("TextBlock", labelName="Block")
tb.attrib.update(self.textSettings) tb.attrib.update(self.textSettings)
tb.attrib.update(self.blockSettings) tb.attrib.update(self.blockSettings)
@ -1489,8 +1488,8 @@ class Paragraph(LrsContainer):
LrsSimpleChar1, basestring]) LrsSimpleChar1, basestring])
if text is not None: if text is not None:
self.append(text) self.append(text)
def CR(self): def CR(self):
# Okay, here's a single autoappender for this common operation # Okay, here's a single autoappender for this common operation
cr = CR() cr = CR()
@ -1555,6 +1554,7 @@ class LrsTextTag(LrsContainer):
class LrsSimpleChar1(object): class LrsSimpleChar1(object):
pass pass
class DropCaps(LrsTextTag): class DropCaps(LrsTextTag):
@ -1632,7 +1632,6 @@ class Text(LrsContainer):
parent.appendLrfTag(LrfTag("rawtext", self.text)) parent.appendLrfTag(LrfTag("rawtext", self.text))
class CR(LrsSimpleChar1, LrsContainer): class CR(LrsSimpleChar1, LrsContainer):
""" """
A line break (when appended to a Paragraph) or a paragraph break A line break (when appended to a Paragraph) or a paragraph break
@ -1749,6 +1748,7 @@ class Span(LrsSimpleChar1, LrsContainer):
return parent.currentTextStyle return parent.currentTextStyle
def toLrfContainer(self, lrfWriter, container): def toLrfContainer(self, lrfWriter, container):
# set the attributes we want changed # set the attributes we want changed
@ -1791,13 +1791,14 @@ class Bold(Span):
return e return e
class BlockSpace(LrsContainer, LrsObject): class BlockSpace(LrsContainer):
""" Can be appended to a page to move the text point. """ """ Can be appended to a page to move the text point. """
def __init__(self, xspace=0, yspace=0, x=0, y=0): def __init__(self, xspace=0, yspace=0, x=0, y=0):
LrsObject.__init__(self)
LrsContainer.__init__(self, []) LrsContainer.__init__(self, [])
if xspace == 0 and x != 0: xspace = x if xspace == 0 and x != 0:
if yspace == 0 and y != 0: yspace = y xspace = x
if yspace == 0 and y != 0:
yspace = y
self.xspace = xspace self.xspace = xspace
self.yspace = yspace self.yspace = yspace