mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Bug fixes in html2lrf too numerous to enumerate ;-) Also changed paragraph indentation behavior.
This commit is contained in:
parent
f403787fd5
commit
9d407875bd
@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
|
||||
suit your distribution.
|
||||
"""
|
||||
|
||||
__version__ = "0.3.19"
|
||||
__version__ = "0.3.20"
|
||||
__docformat__ = "epytext"
|
||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||
|
||||
|
@ -49,7 +49,7 @@ def Book(font_delta=0, header=None, **settings):
|
||||
ps = dict(textwidth=575, textheight=747)
|
||||
if header:
|
||||
hdr = Header()
|
||||
hb = TextBlock(TextStyle(align='foot', fontsize=60))
|
||||
hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
|
||||
hb.append(header)
|
||||
hdr.PutObj(hb)
|
||||
ps['headheight'] = 30
|
||||
|
@ -20,7 +20,7 @@ Code to convert HTML ebooks into LRF ebooks.
|
||||
I am indebted to esperanc for the CSS->Xylog Style conversion routines
|
||||
and to Falstaff for pylrs.
|
||||
"""
|
||||
import os, re, sys, shutil, traceback
|
||||
import os, re, sys, shutil, traceback, copy
|
||||
from htmlentitydefs import name2codepoint
|
||||
from urllib import urlopen, unquote
|
||||
from urlparse import urlparse
|
||||
@ -32,7 +32,7 @@ from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
|
||||
from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
|
||||
ImageBlock, JumpButton, CharButton, BlockStyle,\
|
||||
Page, Bold, Space, Plot, TextStyle, Image, BlockSpace,\
|
||||
TableOfContents
|
||||
RuledLine
|
||||
from libprs500.lrf.pylrs.pylrs import Span as _Span
|
||||
from libprs500.lrf import ConversionError, option_parser, Book
|
||||
from libprs500 import extract
|
||||
@ -196,7 +196,7 @@ class Span(_Span):
|
||||
return t
|
||||
|
||||
def __init__(self, ns, css, memory, font_delta=0):
|
||||
src = ns.string if hasattr(ns, 'string') else str(ns)
|
||||
src = ns.string if hasattr(ns, 'string') else ns
|
||||
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
|
||||
for pat, repl in Span.rules:
|
||||
src = pat.sub(repl, src)
|
||||
@ -216,7 +216,11 @@ class HTMLConverter(object):
|
||||
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
|
||||
justification_styles = dict(head=TextStyle(align='head'), foot=TextStyle(align='foot'),
|
||||
center=TextStyle(align='center'))
|
||||
blockquote_style = [TextStyle(), BlockStyle(sidemargin=60, topskip=20, footskip=20)]
|
||||
blockquote_style = BlockStyle(sidemargin=60, topskip=20, footskip=20)
|
||||
unindented_style = TextStyle(parindent=0)
|
||||
# Fix <a /> elements
|
||||
markup_massage = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
|
||||
lambda match: match.group(1)+"></a>")]
|
||||
|
||||
class Link(object):
|
||||
def __init__(self, para, tag):
|
||||
@ -285,14 +289,18 @@ class HTMLConverter(object):
|
||||
self.in_ol = False #: Flag indicating we're in an <ol> element
|
||||
self.book = book #: The Book object representing a BBeB book
|
||||
self.is_root = is_root #: Are we converting the root HTML file
|
||||
self.lstrip_toggle = False #; If true the next add_text call will do an lstrip
|
||||
path = os.path.abspath(path)
|
||||
os.chdir(os.path.dirname(path))
|
||||
self.file_name = os.path.basename(path)
|
||||
print "Processing", self.file_name
|
||||
print '\tParsing HTML...',
|
||||
sys.stdout.flush()
|
||||
self.soup = BeautifulSoup(open(self.file_name, 'r').read(), \
|
||||
convertEntities=BeautifulSoup.HTML_ENTITIES)
|
||||
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
||||
nmassage.extend(HTMLConverter.markup_massage)
|
||||
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
|
||||
convertEntities=BeautifulSoup.HTML_ENTITIES,
|
||||
markupMassage=nmassage)
|
||||
print 'done\n\tConverting to BBeB...',
|
||||
sys.stdout.flush()
|
||||
self.verbose = verbose
|
||||
@ -380,7 +388,7 @@ class HTMLConverter(object):
|
||||
self.current_para.append_to(self.current_block)
|
||||
if self.current_block and self.current_page:
|
||||
self.current_block.append_to(self.current_page)
|
||||
if self.current_page and self.current_page.get_text().strip():
|
||||
if self.current_page and self.current_page.has_text():
|
||||
self.book.append(self.current_page)
|
||||
|
||||
if not self.top.parent:
|
||||
@ -415,6 +423,38 @@ class HTMLConverter(object):
|
||||
return text
|
||||
|
||||
def process_links(self):
|
||||
def get_target_block(fragment, targets):
|
||||
'''Return the correct block for the <a name> element'''
|
||||
bs = targets[fragment]
|
||||
if not isinstance(bs, BlockSpace):
|
||||
return bs
|
||||
ans, found, page = None, False, bs.parent
|
||||
for item in page.contents:
|
||||
if found:
|
||||
if isinstance(item, (TextBlock, ImageBlock)):
|
||||
ans = item
|
||||
break
|
||||
if item == bs:
|
||||
found = True
|
||||
continue
|
||||
|
||||
if not ans:
|
||||
for i in range(len(page.contents)-1, -1, -1):
|
||||
if isinstance(page.contents[i], (TextBlock, ImageBlock)):
|
||||
ans = page.contents[i]
|
||||
break
|
||||
|
||||
if not ans:
|
||||
ntb = TextBlock()
|
||||
ntb.Paragraph(' ')
|
||||
page.append(ntb)
|
||||
ans = ntb
|
||||
|
||||
if found:
|
||||
targets[fragment] = ans
|
||||
page.contents.remove(bs)
|
||||
return ans
|
||||
|
||||
cwd = os.getcwd()
|
||||
for link in self.links:
|
||||
purl = urlparse(link.tag['href'])
|
||||
@ -424,7 +464,7 @@ class HTMLConverter(object):
|
||||
para, tag = link.para, link.tag
|
||||
if not path or os.path.basename(path) == self.file_name:
|
||||
if fragment in self.targets.keys():
|
||||
tb = self.targets[fragment]
|
||||
tb = get_target_block(fragment, self.targets)
|
||||
if self.is_root:
|
||||
self.book.addTocEntry(self.get_text(tag), tb)
|
||||
sys.stdout.flush()
|
||||
@ -458,7 +498,7 @@ class HTMLConverter(object):
|
||||
self.files[path] = HTMLConverter.processed_files[path]
|
||||
conv = self.files[path]
|
||||
if fragment in conv.targets.keys():
|
||||
tb = conv.targets[fragment]
|
||||
tb = get_target_block(fragment, conv.targets)
|
||||
else:
|
||||
tb = conv.top
|
||||
if self.is_root:
|
||||
@ -479,7 +519,6 @@ class HTMLConverter(object):
|
||||
self.files[path].process_links()
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
def end_page(self):
|
||||
"""
|
||||
@ -490,7 +529,7 @@ class HTMLConverter(object):
|
||||
self.current_para = Paragraph()
|
||||
self.current_block.append_to(self.current_page)
|
||||
self.current_block = TextBlock()
|
||||
if self.current_page.get_text().strip():
|
||||
if self.current_page.has_text():
|
||||
self.book.append(self.current_page)
|
||||
self.current_page = Page()
|
||||
|
||||
@ -521,7 +560,10 @@ class HTMLConverter(object):
|
||||
@param css:
|
||||
@type css:
|
||||
'''
|
||||
src = tag.string if hasattr(tag, 'string') else str(tag)
|
||||
src = tag.string if hasattr(tag, 'string') else tag
|
||||
if self.lstrip_toggle:
|
||||
src = src.lstrip()
|
||||
self.lstrip_toggle = False
|
||||
if not src.strip():
|
||||
self.current_para.append(' ')
|
||||
else:
|
||||
@ -536,7 +578,7 @@ class HTMLConverter(object):
|
||||
if align != self.current_block.textStyle.attrs['align']:
|
||||
self.current_para.append_to(self.current_block)
|
||||
self.current_block.append_to(self.current_page)
|
||||
self.current_block = TextBlock(HTMLConverter.justification_styles[align])
|
||||
self.current_block = TextBlock(textStyle=HTMLConverter.justification_styles[align])
|
||||
self.current_para = Paragraph()
|
||||
try:
|
||||
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
|
||||
@ -563,7 +605,7 @@ class HTMLConverter(object):
|
||||
End current paragraph with a paragraph break after it. If the current
|
||||
paragraph has no non whitespace text in it do nothing.
|
||||
'''
|
||||
if not self.current_para.get_text().strip():
|
||||
if not self.current_para.has_text():
|
||||
return
|
||||
if self.current_para.contents:
|
||||
self.current_block.append(self.current_para)
|
||||
@ -605,8 +647,8 @@ class HTMLConverter(object):
|
||||
if self.current_block == previous:
|
||||
self.current_para.append_to(self.current_block)
|
||||
self.current_para = Paragraph()
|
||||
if self.current_block.get_text().strip():
|
||||
target = self.current_block
|
||||
if self.current_block.has_text():
|
||||
target = self.current_block
|
||||
else:
|
||||
target = BlockSpace()
|
||||
self.current_page.append(target)
|
||||
@ -619,12 +661,21 @@ class HTMLConverter(object):
|
||||
if found:
|
||||
target = item
|
||||
break
|
||||
if target and not isinstance(target, (TextBlock, ImageBlock)):
|
||||
if isinstance(target, RuledLine):
|
||||
target = TextBlock()
|
||||
target.Paragraph(' ')
|
||||
self.current_page.append(target)
|
||||
else:
|
||||
target = BlockSpace()
|
||||
self.current_page.append(target)
|
||||
if target == None:
|
||||
if self.current_block.get_text().strip():
|
||||
if self.current_block.has_text():
|
||||
target = self.current_block
|
||||
else:
|
||||
target = BlockSpace()
|
||||
self.current_page.append(target)
|
||||
|
||||
self.targets[tag['name']] = target
|
||||
elif tag.has_key('href'):
|
||||
purl = urlparse(tag['href'])
|
||||
@ -695,19 +746,24 @@ class HTMLConverter(object):
|
||||
pass
|
||||
elif tagname == 'pre':
|
||||
self.end_current_para()
|
||||
self.current_block.append_to(self.current_page)
|
||||
self.current_block = TextBlock(textStyle=HTMLConverter.unindented_style)
|
||||
src = ''.join([str(i) for i in tag.contents])
|
||||
lines = src.split('\n')
|
||||
for line in lines:
|
||||
try:
|
||||
self.current_para.append(Span(line, tag_css, self.memory))
|
||||
self.current_para.CR()
|
||||
except ConversionError:
|
||||
pass
|
||||
self.current_para.CR()
|
||||
self.end_current_para()
|
||||
self.current_block.append_to(self.current_page)
|
||||
self.current_block = TextBlock()
|
||||
elif tagname in ['ul', 'ol']:
|
||||
self.in_ol = 1 if tagname == 'ol' else 0
|
||||
self.end_current_para()
|
||||
self.current_block.append_to(self.current_page)
|
||||
self.current_block = TextBlock()
|
||||
self.current_block = TextBlock(textStyle=HTMLConverter.unindented_style)
|
||||
self.process_children(tag, tag_css)
|
||||
self.in_ol = 0
|
||||
self.end_current_para()
|
||||
@ -715,7 +771,7 @@ class HTMLConverter(object):
|
||||
self.current_block = TextBlock()
|
||||
elif tagname == 'li':
|
||||
prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
|
||||
if self.current_para.get_text().strip():
|
||||
if self.current_para.has_text():
|
||||
self.current_para.append(CR())
|
||||
self.current_block.append(self.current_para)
|
||||
self.current_para = Paragraph()
|
||||
@ -735,7 +791,8 @@ class HTMLConverter(object):
|
||||
self.current_para.append_to(self.current_block)
|
||||
self.current_block.append_to(self.current_page)
|
||||
self.current_para = Paragraph()
|
||||
self.current_block = TextBlock(*HTMLConverter.blockquote_style)
|
||||
self.current_block = TextBlock(blockStyle=HTMLConverter.blockquote_style,
|
||||
textStyle=HTMLConverter.unindented_style)
|
||||
self.process_children(tag, tag_css)
|
||||
self.current_para.append_to(self.current_block)
|
||||
self.current_block.append_to(self.current_page)
|
||||
@ -743,6 +800,7 @@ class HTMLConverter(object):
|
||||
self.current_block = TextBlock()
|
||||
elif tagname in ['p', 'div']:
|
||||
self.end_current_para()
|
||||
self.lstrip_toggle = True
|
||||
self.process_children(tag, tag_css)
|
||||
self.end_current_para()
|
||||
elif tagname in ['b', 'strong', 'i', 'em', 'span']:
|
||||
|
@ -275,22 +275,23 @@ class LrsContainer(object):
|
||||
self.validChildren = validChildren
|
||||
|
||||
|
||||
def get_text(self):
|
||||
''' Return the textual content of this container'''
|
||||
txt = ''
|
||||
def has_text(self):
|
||||
''' Return True iff this container has non whitespace text '''
|
||||
if hasattr(self, 'text'):
|
||||
txt += self.text
|
||||
if hasattr(self, 'contents'):
|
||||
if self.text.strip():
|
||||
return True
|
||||
if hasattr(self, 'contents'):
|
||||
for child in self.contents:
|
||||
txt += child.get_text()
|
||||
return txt
|
||||
if child.has_text():
|
||||
return True
|
||||
return False
|
||||
|
||||
def append_to(self, parent):
|
||||
'''
|
||||
Append self to C{parent} iff self has non whitespace textual content
|
||||
@type parent: LrsContainer
|
||||
'''
|
||||
if self.get_text().strip():
|
||||
if self.has_text():
|
||||
parent.append(self)
|
||||
|
||||
def appendReferencedObjects(self, parent):
|
||||
@ -622,8 +623,9 @@ class TableOfContents(object):
|
||||
|
||||
|
||||
def addTocEntry(self, tocLabel, textBlock):
|
||||
if not isinstance(textBlock, (TextBlock, ImageBlock, BlockSpace)):
|
||||
raise LrsError, "TOC destination must be a TextBlock, ImageBlock or BlockSpace"
|
||||
if not isinstance(textBlock, (TextBlock, ImageBlock)):
|
||||
raise LrsError, "TOC destination must be a TextBlock or ImageBlock"+\
|
||||
" not a " + str(type(textBlock))
|
||||
|
||||
if textBlock.parent is None or not isinstance(textBlock.parent, Page):
|
||||
raise LrsError, "TOC text block must be already appended to a page"
|
||||
@ -1117,7 +1119,7 @@ class TextStyle(LrsStyle):
|
||||
fontorientation="0", fontweight="400",
|
||||
fontfacename="Dutch801 Rm BT Roman",
|
||||
textcolor="0x00000000", wordspace="25", letterspace="0",
|
||||
baselineskip="120", linespace="10", parindent="0", parskip="0",
|
||||
baselineskip="120", linespace="12", parindent="80", parskip="0",
|
||||
textbgcolor="0xFF000000")
|
||||
|
||||
alsoAllow = ["empdotscode", "empdotsfontname", "refempdotsfont",
|
||||
@ -1345,10 +1347,6 @@ class Page(LrsObject, LrsContainer):
|
||||
class TextBlock(LrsObject, LrsContainer):
|
||||
"""
|
||||
TextBlocks are added to Pages. They hold Paragraphs or CRs.
|
||||
TextBlocks can be supplied a TextStyle and a BlockStyle as the first
|
||||
two arguments to the constructor, but these can be left off
|
||||
and defaults will be used (since the spec says you have to have
|
||||
them).
|
||||
|
||||
If a TextBlock is used in a header, it should be appended to
|
||||
the Book, not to a specific Page.
|
||||
@ -1356,22 +1354,22 @@ class TextBlock(LrsObject, LrsContainer):
|
||||
defaultTextStyle = TextStyle()
|
||||
defaultBlockStyle = BlockStyle()
|
||||
|
||||
def __init__(self, *args, **settings):
|
||||
def __init__(self, textStyle=defaultTextStyle, \
|
||||
blockStyle=defaultBlockStyle, \
|
||||
**settings):
|
||||
'''
|
||||
Create TextBlock.
|
||||
@param textStyle: The L{TextStyle} for this block.
|
||||
@param blockStyle: The L{BlockStyle} for this block.
|
||||
@param settings: C{dict} of extra settings to apply to this block.
|
||||
'''
|
||||
LrsObject.__init__(self)
|
||||
LrsContainer.__init__(self, [Paragraph, CR])
|
||||
|
||||
textStyle = TextBlock.defaultTextStyle
|
||||
blockStyle = TextBlock.defaultBlockStyle
|
||||
if len(args) > 0:
|
||||
textStyle = args[0]
|
||||
if len(args) > 1:
|
||||
blockStyle = args[1]
|
||||
if len(args) > 2:
|
||||
raise LrsError, \
|
||||
"too many style arguments to TextBlock"
|
||||
|
||||
self.textSettings = {}
|
||||
self.blockSettings = {}
|
||||
|
||||
|
||||
for name, value in settings.items():
|
||||
if name in TextStyle.validSettings:
|
||||
self.textSettings[name] = value
|
||||
@ -1410,8 +1408,9 @@ class TextBlock(LrsObject, LrsContainer):
|
||||
self.append(CR())
|
||||
return p
|
||||
|
||||
|
||||
|
||||
def toElement(self, sourceEncoding):
|
||||
def toElement(self, sourceEncoding):
|
||||
tb = self.lrsObjectElement("TextBlock", labelName="Block")
|
||||
tb.attrib.update(self.textSettings)
|
||||
tb.attrib.update(self.blockSettings)
|
||||
@ -1489,8 +1488,8 @@ class Paragraph(LrsContainer):
|
||||
LrsSimpleChar1, basestring])
|
||||
if text is not None:
|
||||
self.append(text)
|
||||
|
||||
|
||||
|
||||
|
||||
def CR(self):
|
||||
# Okay, here's a single autoappender for this common operation
|
||||
cr = CR()
|
||||
@ -1555,6 +1554,7 @@ class LrsTextTag(LrsContainer):
|
||||
|
||||
class LrsSimpleChar1(object):
|
||||
pass
|
||||
|
||||
|
||||
class DropCaps(LrsTextTag):
|
||||
|
||||
@ -1632,7 +1632,6 @@ class Text(LrsContainer):
|
||||
parent.appendLrfTag(LrfTag("rawtext", self.text))
|
||||
|
||||
|
||||
|
||||
class CR(LrsSimpleChar1, LrsContainer):
|
||||
"""
|
||||
A line break (when appended to a Paragraph) or a paragraph break
|
||||
@ -1749,6 +1748,7 @@ class Span(LrsSimpleChar1, LrsContainer):
|
||||
return parent.currentTextStyle
|
||||
|
||||
|
||||
|
||||
def toLrfContainer(self, lrfWriter, container):
|
||||
|
||||
# set the attributes we want changed
|
||||
@ -1791,13 +1791,14 @@ class Bold(Span):
|
||||
return e
|
||||
|
||||
|
||||
class BlockSpace(LrsContainer, LrsObject):
|
||||
class BlockSpace(LrsContainer):
|
||||
""" Can be appended to a page to move the text point. """
|
||||
def __init__(self, xspace=0, yspace=0, x=0, y=0):
|
||||
LrsObject.__init__(self)
|
||||
LrsContainer.__init__(self, [])
|
||||
if xspace == 0 and x != 0: xspace = x
|
||||
if yspace == 0 and y != 0: yspace = y
|
||||
if xspace == 0 and x != 0:
|
||||
xspace = x
|
||||
if yspace == 0 and y != 0:
|
||||
yspace = y
|
||||
self.xspace = xspace
|
||||
self.yspace = yspace
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user