Bug fixes in html2lrf too numerous to enumerate ;-) Also changed paragraph indentation behavior.

This commit is contained in:
Kovid Goyal 2007-05-03 21:20:52 +00:00
parent f403787fd5
commit 9d407875bd
4 changed files with 116 additions and 57 deletions

View File

@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
suit your distribution.
"""
__version__ = "0.3.19"
__version__ = "0.3.20"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -49,7 +49,7 @@ def Book(font_delta=0, header=None, **settings):
ps = dict(textwidth=575, textheight=747)
if header:
hdr = Header()
hb = TextBlock(TextStyle(align='foot', fontsize=60))
hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
hb.append(header)
hdr.PutObj(hb)
ps['headheight'] = 30

View File

@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the CSS->Xylog Style conversion routines
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback
import os, re, sys, shutil, traceback, copy
from htmlentitydefs import name2codepoint
from urllib import urlopen, unquote
from urlparse import urlparse
@ -32,7 +32,7 @@ from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
ImageBlock, JumpButton, CharButton, BlockStyle,\
Page, Bold, Space, Plot, TextStyle, Image, BlockSpace,\
TableOfContents
RuledLine
from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError, option_parser, Book
from libprs500 import extract
@ -196,7 +196,7 @@ class Span(_Span):
return t
def __init__(self, ns, css, memory, font_delta=0):
src = ns.string if hasattr(ns, 'string') else str(ns)
src = ns.string if hasattr(ns, 'string') else ns
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules:
src = pat.sub(repl, src)
@ -216,7 +216,11 @@ class HTMLConverter(object):
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
justification_styles = dict(head=TextStyle(align='head'), foot=TextStyle(align='foot'),
center=TextStyle(align='center'))
blockquote_style = [TextStyle(), BlockStyle(sidemargin=60, topskip=20, footskip=20)]
blockquote_style = BlockStyle(sidemargin=60, topskip=20, footskip=20)
unindented_style = TextStyle(parindent=0)
# Fix <a /> elements
markup_massage = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>")]
class Link(object):
def __init__(self, para, tag):
@ -285,14 +289,18 @@ class HTMLConverter(object):
self.in_ol = False #: Flag indicating we're in an <ol> element
self.book = book #: The Book object representing a BBeB book
self.is_root = is_root #: Are we converting the root HTML file
self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
path = os.path.abspath(path)
os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path)
print "Processing", self.file_name
print '\tParsing HTML...',
sys.stdout.flush()
self.soup = BeautifulSoup(open(self.file_name, 'r').read(), \
convertEntities=BeautifulSoup.HTML_ENTITIES)
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.markup_massage)
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
print 'done\n\tConverting to BBeB...',
sys.stdout.flush()
self.verbose = verbose
@ -380,7 +388,7 @@ class HTMLConverter(object):
self.current_para.append_to(self.current_block)
if self.current_block and self.current_page:
self.current_block.append_to(self.current_page)
if self.current_page and self.current_page.get_text().strip():
if self.current_page and self.current_page.has_text():
self.book.append(self.current_page)
if not self.top.parent:
@ -415,6 +423,38 @@ class HTMLConverter(object):
return text
def process_links(self):
def get_target_block(fragment, targets):
'''
Return the correct block for the <a name> element.

If the target registered for C{fragment} is a BlockSpace placeholder, a
TextBlock or ImageBlock from the placeholder's page is returned in its
place; any other registered target is returned unchanged.

@param fragment: Key into C{targets}.
@param targets: Mapping from fragment to the registered target object.
It may be updated in place so the fragment points at the replacement.
@return: The registered target if it is not a BlockSpace, otherwise a
TextBlock or ImageBlock taken from (or appended to) the placeholder's page.
'''
bs = targets[fragment]
# Anything other than a BlockSpace placeholder is usable as-is.
if not isinstance(bs, BlockSpace):
return bs
ans, found, page = None, False, bs.parent
# NOTE(review): indentation is not preserved in this view; the nesting
# described by the comments below is presumed -- confirm against the file.
# Walk the page looking for the first TextBlock/ImageBlock that comes
# after the placeholder.
for item in page.contents:
if found:
if isinstance(item, (TextBlock, ImageBlock)):
ans = item
break
if item == bs:
found = True
continue
# Nothing suitable was picked above: scan the page backwards and take the
# last TextBlock/ImageBlock on it.
if not ans:
for i in range(len(page.contents)-1, -1, -1):
if isinstance(page.contents[i], (TextBlock, ImageBlock)):
ans = page.contents[i]
break
# Still nothing: append a new TextBlock holding a single-space paragraph
# to the page and use that.
if not ans:
ntb = TextBlock()
ntb.Paragraph(' ')
page.append(ntb)
ans = ntb
# The placeholder was seen in page.contents: point the fragment at the
# replacement block and remove the placeholder from the page.
if found:
targets[fragment] = ans
page.contents.remove(bs)
return ans
cwd = os.getcwd()
for link in self.links:
purl = urlparse(link.tag['href'])
@ -424,7 +464,7 @@ class HTMLConverter(object):
para, tag = link.para, link.tag
if not path or os.path.basename(path) == self.file_name:
if fragment in self.targets.keys():
tb = self.targets[fragment]
tb = get_target_block(fragment, self.targets)
if self.is_root:
self.book.addTocEntry(self.get_text(tag), tb)
sys.stdout.flush()
@ -458,7 +498,7 @@ class HTMLConverter(object):
self.files[path] = HTMLConverter.processed_files[path]
conv = self.files[path]
if fragment in conv.targets.keys():
tb = conv.targets[fragment]
tb = get_target_block(fragment, conv.targets)
else:
tb = conv.top
if self.is_root:
@ -479,7 +519,6 @@ class HTMLConverter(object):
self.files[path].process_links()
finally:
os.chdir(cwd)
def end_page(self):
"""
@ -490,7 +529,7 @@ class HTMLConverter(object):
self.current_para = Paragraph()
self.current_block.append_to(self.current_page)
self.current_block = TextBlock()
if self.current_page.get_text().strip():
if self.current_page.has_text():
self.book.append(self.current_page)
self.current_page = Page()
@ -521,7 +560,10 @@ class HTMLConverter(object):
@param css:
@type css:
'''
src = tag.string if hasattr(tag, 'string') else str(tag)
src = tag.string if hasattr(tag, 'string') else tag
if self.lstrip_toggle:
src = src.lstrip()
self.lstrip_toggle = False
if not src.strip():
self.current_para.append(' ')
else:
@ -536,7 +578,7 @@ class HTMLConverter(object):
if align != self.current_block.textStyle.attrs['align']:
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_block = TextBlock(HTMLConverter.justification_styles[align])
self.current_block = TextBlock(textStyle=HTMLConverter.justification_styles[align])
self.current_para = Paragraph()
try:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
@ -563,7 +605,7 @@ class HTMLConverter(object):
End current paragraph with a paragraph break after it. If the current
paragraph has no non-whitespace text in it, do nothing.
'''
if not self.current_para.get_text().strip():
if not self.current_para.has_text():
return
if self.current_para.contents:
self.current_block.append(self.current_para)
@ -605,8 +647,8 @@ class HTMLConverter(object):
if self.current_block == previous:
self.current_para.append_to(self.current_block)
self.current_para = Paragraph()
if self.current_block.get_text().strip():
target = self.current_block
if self.current_block.has_text():
target = self.current_block
else:
target = BlockSpace()
self.current_page.append(target)
@ -619,12 +661,21 @@ class HTMLConverter(object):
if found:
target = item
break
if target and not isinstance(target, (TextBlock, ImageBlock)):
if isinstance(target, RuledLine):
target = TextBlock()
target.Paragraph(' ')
self.current_page.append(target)
else:
target = BlockSpace()
self.current_page.append(target)
if target == None:
if self.current_block.get_text().strip():
if self.current_block.has_text():
target = self.current_block
else:
target = BlockSpace()
self.current_page.append(target)
self.targets[tag['name']] = target
elif tag.has_key('href'):
purl = urlparse(tag['href'])
@ -695,19 +746,24 @@ class HTMLConverter(object):
pass
elif tagname == 'pre':
self.end_current_para()
self.current_block.append_to(self.current_page)
self.current_block = TextBlock(textStyle=HTMLConverter.unindented_style)
src = ''.join([str(i) for i in tag.contents])
lines = src.split('\n')
for line in lines:
try:
self.current_para.append(Span(line, tag_css, self.memory))
self.current_para.CR()
except ConversionError:
pass
self.current_para.CR()
self.end_current_para()
self.current_block.append_to(self.current_page)
self.current_block = TextBlock()
elif tagname in ['ul', 'ol']:
self.in_ol = 1 if tagname == 'ol' else 0
self.end_current_para()
self.current_block.append_to(self.current_page)
self.current_block = TextBlock()
self.current_block = TextBlock(textStyle=HTMLConverter.unindented_style)
self.process_children(tag, tag_css)
self.in_ol = 0
self.end_current_para()
@ -715,7 +771,7 @@ class HTMLConverter(object):
self.current_block = TextBlock()
elif tagname == 'li':
prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
if self.current_para.get_text().strip():
if self.current_para.has_text():
self.current_para.append(CR())
self.current_block.append(self.current_para)
self.current_para = Paragraph()
@ -735,7 +791,8 @@ class HTMLConverter(object):
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_para = Paragraph()
self.current_block = TextBlock(*HTMLConverter.blockquote_style)
self.current_block = TextBlock(blockStyle=HTMLConverter.blockquote_style,
textStyle=HTMLConverter.unindented_style)
self.process_children(tag, tag_css)
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
@ -743,6 +800,7 @@ class HTMLConverter(object):
self.current_block = TextBlock()
elif tagname in ['p', 'div']:
self.end_current_para()
self.lstrip_toggle = True
self.process_children(tag, tag_css)
self.end_current_para()
elif tagname in ['b', 'strong', 'i', 'em', 'span']:

View File

@ -275,22 +275,23 @@ class LrsContainer(object):
self.validChildren = validChildren
def get_text(self):
''' Return the textual content of this container'''
txt = ''
def has_text(self):
''' Return True iff this container has non whitespace text '''
if hasattr(self, 'text'):
txt += self.text
if hasattr(self, 'contents'):
if self.text.strip():
return True
if hasattr(self, 'contents'):
for child in self.contents:
txt += child.get_text()
return txt
if child.has_text():
return True
return False
def append_to(self, parent):
'''
Append self to C{parent} iff self has non whitespace textual content
@type parent: LrsContainer
'''
if self.get_text().strip():
if self.has_text():
parent.append(self)
def appendReferencedObjects(self, parent):
@ -622,8 +623,9 @@ class TableOfContents(object):
def addTocEntry(self, tocLabel, textBlock):
if not isinstance(textBlock, (TextBlock, ImageBlock, BlockSpace)):
raise LrsError, "TOC destination must be a TextBlock, ImageBlock or BlockSpace"
if not isinstance(textBlock, (TextBlock, ImageBlock)):
raise LrsError, "TOC destination must be a TextBlock or ImageBlock"+\
" not a " + str(type(textBlock))
if textBlock.parent is None or not isinstance(textBlock.parent, Page):
raise LrsError, "TOC text block must be already appended to a page"
@ -1117,7 +1119,7 @@ class TextStyle(LrsStyle):
fontorientation="0", fontweight="400",
fontfacename="Dutch801 Rm BT Roman",
textcolor="0x00000000", wordspace="25", letterspace="0",
baselineskip="120", linespace="10", parindent="0", parskip="0",
baselineskip="120", linespace="12", parindent="80", parskip="0",
textbgcolor="0xFF000000")
alsoAllow = ["empdotscode", "empdotsfontname", "refempdotsfont",
@ -1345,10 +1347,6 @@ class Page(LrsObject, LrsContainer):
class TextBlock(LrsObject, LrsContainer):
"""
TextBlocks are added to Pages. They hold Paragraphs or CRs.
TextBlocks can be supplied a TextStyle and a BlockStyle as the first
two arguments to the constructor, but these can be left off
and defaults will be used (since the spec says you have to have
them).
If a TextBlock is used in a header, it should be appended to
the Book, not to a specific Page.
@ -1356,22 +1354,22 @@ class TextBlock(LrsObject, LrsContainer):
defaultTextStyle = TextStyle()
defaultBlockStyle = BlockStyle()
def __init__(self, *args, **settings):
def __init__(self, textStyle=defaultTextStyle, \
blockStyle=defaultBlockStyle, \
**settings):
'''
Create TextBlock.
@param textStyle: The L{TextStyle} for this block.
@param blockStyle: The L{BlockStyle} for this block.
@param settings: C{dict} of extra settings to apply to this block.
'''
LrsObject.__init__(self)
LrsContainer.__init__(self, [Paragraph, CR])
textStyle = TextBlock.defaultTextStyle
blockStyle = TextBlock.defaultBlockStyle
if len(args) > 0:
textStyle = args[0]
if len(args) > 1:
blockStyle = args[1]
if len(args) > 2:
raise LrsError, \
"too many style arguments to TextBlock"
self.textSettings = {}
self.blockSettings = {}
for name, value in settings.items():
if name in TextStyle.validSettings:
self.textSettings[name] = value
@ -1410,8 +1408,9 @@ class TextBlock(LrsObject, LrsContainer):
self.append(CR())
return p
def toElement(self, sourceEncoding):
def toElement(self, sourceEncoding):
tb = self.lrsObjectElement("TextBlock", labelName="Block")
tb.attrib.update(self.textSettings)
tb.attrib.update(self.blockSettings)
@ -1489,8 +1488,8 @@ class Paragraph(LrsContainer):
LrsSimpleChar1, basestring])
if text is not None:
self.append(text)
def CR(self):
# Okay, here's a single autoappender for this common operation
cr = CR()
@ -1555,6 +1554,7 @@ class LrsTextTag(LrsContainer):
class LrsSimpleChar1(object):
pass
class DropCaps(LrsTextTag):
@ -1632,7 +1632,6 @@ class Text(LrsContainer):
parent.appendLrfTag(LrfTag("rawtext", self.text))
class CR(LrsSimpleChar1, LrsContainer):
"""
A line break (when appended to a Paragraph) or a paragraph break
@ -1749,6 +1748,7 @@ class Span(LrsSimpleChar1, LrsContainer):
return parent.currentTextStyle
def toLrfContainer(self, lrfWriter, container):
# set the attributes we want changed
@ -1791,13 +1791,14 @@ class Bold(Span):
return e
class BlockSpace(LrsContainer, LrsObject):
class BlockSpace(LrsContainer):
""" Can be appended to a page to move the text point. """
def __init__(self, xspace=0, yspace=0, x=0, y=0):
LrsObject.__init__(self)
LrsContainer.__init__(self, [])
if xspace == 0 and x != 0: xspace = x
if yspace == 0 and y != 0: yspace = y
if xspace == 0 and x != 0:
xspace = x
if yspace == 0 and y != 0:
yspace = y
self.xspace = xspace
self.yspace = yspace