From 7c2aa5b07eafc5361538c7407fbd734ac1ecea90 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 17 Aug 2007 17:59:24 +0000
Subject: [PATCH] Support CSS attribute white-space
---
src/libprs500/ebooks/lrf/html/convert_from.py | 128 ++++++++++--------
src/libprs500/ebooks/lrf/html/demo/demo.html | 6 +-
src/libprs500/ebooks/lrf/pylrs/pylrs.py | 46 -------
3 files changed, 74 insertions(+), 106 deletions(-)
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 42da6ff1f8..82347933e6 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -219,13 +219,12 @@ class Span(_Span):
t['wordspace'] = 50
return t
- def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
+ def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style,
+ normal_font_size=100):
src = ns.string if hasattr(ns, 'string') else ns
- src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules:
src = pat.sub(repl, src)
- if not src:
- raise ConversionError('No point in adding an empty string to a Span')
+ src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeautifulSoup
attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
if 'fontsize' in attrs.keys():
normal_font_size = int(attrs['fontsize'])
@@ -258,6 +257,9 @@ class Span(_Span):
attrs['baselineskip'] = int(attrs['fontsize']) + 20
if attrs['fontfacename'] == fonts['serif']['normal'][1]:
attrs.pop('fontfacename')
+ for key in list(attrs.keys()): # snapshot the keys: popping while iterating the dict itself raises RuntimeError
+ if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
+ attrs.pop(key)
_Span.__init__(self, text=src, **attrs)
class HTMLConverter(object):
@@ -330,7 +332,7 @@ class HTMLConverter(object):
'cite' : {'font-style' : 'italic'},
'em' : {"font-style" : "italic"},
'small' : {'font-size' : 'small'},
- 'pre' : {'font-family' : 'monospace' },
+ 'pre' : {'font-family' : 'monospace', 'white-space': 'pre' },
'code' : {'font-family' : 'monospace' },
'tt' : {'font-family' : 'monospace'},
'center' : {'text-align' : 'center'},
@@ -366,6 +368,7 @@ class HTMLConverter(object):
self.link_level = 0 #: Current link level
self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
self.tops = {} #: element representing the top of each HTML file in the LRF file
+ self.previous_text = '' #: Used to figure out when to lstrip
# Styles
self.blockquote_style = book.create_block_style(sidemargin=60,
topskip=20, footskip=20)
@@ -381,8 +384,7 @@ class HTMLConverter(object):
self.list_indent = 20
self.list_counter = 1
- self.book = book #: The Book object representing a BBeB book
- self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
+ self.book = book #: The Book object representing a BBeB book
self.start_on_file(path, is_root=True)
def start_on_file(self, path, is_root=True, link_level=0):
@@ -415,6 +417,7 @@ class HTMLConverter(object):
self.css = HTMLConverter.CSS.copy()
self.target_prefix = path
self.links[path] = []
+ self.previous_text = '\n'
self.tops[path] = self.parse_file(soup, is_root)
self.processed_files.append(path)
self.process_links(is_root, path, link_level=link_level)
@@ -467,20 +470,21 @@ class HTMLConverter(object):
# however we need to as we don't do alignment at a block level.
# float is removed by the process_alignment function.
if chk.startswith('font') or chk == 'text-align' or \
- chk == 'float':
+ chk == 'float' or chk == 'white-space':
temp[key] = pcss[key]
prop.update(temp)
prop = dict()
+ tagname = tag.name.lower()
if parent_css:
merge_parent_css(prop, parent_css)
if tag.has_key("align"):
prop["text-align"] = tag["align"]
- if self.css.has_key(tag.name):
- prop.update(self.css[tag.name])
+ if self.css.has_key(tagname):
+ prop.update(self.css[tagname])
if tag.has_key("class"):
cls = tag["class"].lower()
- for classname in ["."+cls, tag.name+"."+cls]:
+ for classname in ["."+cls, tagname+"."+cls]:
if self.css.has_key(classname):
prop.update(self.css[classname])
if tag.has_key("style"):
@@ -537,7 +541,12 @@ class HTMLConverter(object):
raise ConversionError, 'Could not parse ' + self.file_name
return top
- def create_link(self, para, tag):
+ def create_link(self, children, tag):
+ para = None
+ for i in range(len(children)-1, -1, -1):
+ if not isinstance(children[i], CR):
+ para = children[i]
+ break
text = self.get_text(tag, 1000)
if not text:
text = 'Link'
@@ -736,30 +745,41 @@ class HTMLConverter(object):
blockStyle=self.current_block.blockStyle,
textStyle=ts)
self.current_para = Paragraph()
+ return True
+ return False
def add_text(self, tag, css):
'''
Add text to the current paragraph taking CSS into account.
@param tag: Either a BeautifulSoup tag or a string
- @param css:
- @type css:
+ @param css: A dict mapping CSS property names to their values
'''
src = tag.string if hasattr(tag, 'string') else tag
- src = re.sub(r'\s{1,}', ' ', src)
- if self.lstrip_toggle:
+ src = src.replace('\r\n', '\n').replace('\r', '\n')
+ collapse_whitespace = not css.has_key('white-space') or css['white-space'] != 'pre'
+ if self.process_alignment(css) and collapse_whitespace:
+ # Don't want leading blanks at the start of a new paragraph
src = src.lstrip()
- self.lstrip_toggle = False
- if not src.strip():
- self.current_para.append(' ')
+ args = self.sanctify_css(css), self.memory, self.profile.dpi, self.fonts,\
+ self.logger, self.font_delta, self.current_block.textStyle.attrs
+ if collapse_whitespace:
+ src = re.sub(r'\s{1,}', ' ', src)
+ if len(self.previous_text) != len(self.previous_text.rstrip()):
+ src = src.lstrip()
+ if len(src):
+ self.previous_text = src
+ self.current_para.append(Span(src, *args))
else:
- self.process_alignment(css)
- try:
- self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
- self.profile.dpi, self.fonts, self.logger,
- font_delta=self.font_delta))
- self.current_para.normalize_spaces()
- except ConversionError:
- self.logger.exception('Bad text')
+ srcs = src.split('\n')
+ for src in srcs:
+ if src:
+ self.current_para.append(Span(src, *args))
+ if len(srcs) > 1:
+ self.line_break()
+
+ def line_break(self):
+ self.current_para.append(CR())
+ self.previous_text = '\n'
def sanctify_css(self, css):
""" Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@@ -770,7 +790,7 @@ class HTMLConverter(object):
'padding' in test or 'border' in test or 'page-break' in test \
or test.startswith('mso') or test.startswith('background')\
or test.startswith('line') or test in ['color', 'display', \
- 'letter-spacing', 'position']:
+ 'letter-spacing', 'position', 'white-space']:
css.pop(key)
return css
@@ -1032,7 +1052,7 @@ class HTMLConverter(object):
if not text.strip():
text = "Link"
self.add_text(text, tag_css)
- self.links[self.target_prefix].append(self.create_link(self.current_para.contents[-1], tag))
+ self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
if tag.has_key('id') or tag.has_key('name'):
key = 'name' if tag.has_key('name') else 'id'
self.targets[self.target_prefix+tag[key]] = self.current_block
@@ -1131,28 +1151,19 @@ class HTMLConverter(object):
if ncss:
update_css(ncss)
elif tagname == 'pre':
- for c in tag.findAll(True):
- c.replaceWith(self.get_text(c))
self.end_current_para()
- self.current_block.append_to(self.current_page)
- attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
- self.logger, self.font_delta, self.memory)
- attrs['fontfacename'] = self.fonts['mono']['normal'][1]
- ts = self.book.create_text_style(**self.unindented_style.attrs)
- ts.attrs.update(attrs)
- self.current_block = self.book.create_text_block(
- blockStyle=self.current_block.blockStyle,
- textStyle=ts)
- src = ''.join([str(i) for i in tag.contents])
- lines = src.split('\n')
- for line in lines:
- try:
- self.current_para.append(line)
- self.current_para.CR()
- except ConversionError:
- pass
self.end_current_block()
- self.current_block = self.book.create_text_block()
+ self.current_block.textStyle = self.current_block.textStyle.copy()
+ self.current_block.textStyle.attrs['parindent'] = '0'
+ if tag.contents:
+ c = tag.contents[0]
+ if isinstance(c, NavigableString):
+ c = str(c).replace('\r\n', '\n').replace('\r', '\n')
+ if c.startswith('\n'):
+ c = c[1:]
+ tag.contents[0] = NavigableString(c)
+ self.process_children(tag, tag_css)
+ self.end_current_block()
elif tagname in ['ul', 'ol', 'dl']:
self.list_level += 1
if tagname == 'ol':
@@ -1189,9 +1200,10 @@ class HTMLConverter(object):
textStyle=self.unindented_style)
if self.current_para.has_text():
- self.current_para.append(CR())
+ self.line_break()
self.current_block.append(self.current_para)
self.current_para = Paragraph()
+ self.previous_text = '\n'
if tagname == 'li':
in_ol, parent = True, tag.parent
while parent:
@@ -1228,6 +1240,7 @@ class HTMLConverter(object):
self.block_styles.append(bs)
self.current_block = self.book.create_text_block(
blockStyle=bs, textStyle=ts)
+ self.previous_text = '\n'
self.process_children(tag, tag_css)
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
@@ -1262,14 +1275,16 @@ class HTMLConverter(object):
self.end_current_para()
if not tag.contents or not src.strip(): # Handle empty elements
self.current_block.append(CR())
+ self.previous_text = '\n'
self.process_children(tag, tag_css)
return
- self.lstrip_toggle = True
+ self.previous_text = '\n'
self.process_block(tag, tag_css, tkey)
self.process_children(tag, tag_css)
self.end_current_para()
if tagname.startswith('h') or self.blank_after_para:
- self.current_block.append(CR())
+ self.current_block.append(CR())
+ self.previous_text = '\n'
elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
self.process_children(tag, tag_css)
elif tagname == 'font':
@@ -1277,16 +1292,19 @@ class HTMLConverter(object):
tag_css['font-family'] = tag['face']
self.process_children(tag, tag_css)
elif tagname in ['br']:
- self.current_para.append(CR())
+ self.line_break()
+ self.previous_text = '\n'
elif tagname in ['hr', 'tr']: # tr needed for nested tables
- self.end_current_para()
- self.current_block.append(CR())
+ self.end_current_para()
+ self.line_break()
self.end_current_block()
if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
+ self.previous_text = '\n'
self.process_children(tag, tag_css)
elif tagname == 'td': # Needed for nested tables
self.current_para.append(' ')
+ self.previous_text = ' '
self.process_children(tag, tag_css)
elif tagname == 'table' and not self.ignore_tables and not self.in_table:
tag_css = self.tag_css(tag) # Table should not inherit CSS
diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html
index f399cfb24b..62fce386ae 100644
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -72,10 +72,6 @@
Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
-
- The table conversion code is very new and likely to be swarming with bugs, so please report them at
On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
@@ -122,7 +118,7 @@
This is blockquoted text. It is rendered in a separate block with margins.
The above text should be distinct from the rest of the paragraph.
- A very indented paragraph
+ A very indented paragraph
An unindented paragraph
A default indented paragraph
diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
index f476c32620..8915e91f6f 100644
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@@ -251,52 +251,6 @@ class LrsContainer(object):
self.validChildren = validChildren
self.must_append = False
-
- def normalize_spaces(self, prior_text=False):
- '''
- Remove multiple spaces and handle
- @param prior_text: True if the paragraph this container is part of
- has non whitespace text before this container.
- '''
- temp = []
- for i in range(len(self.contents)):
- elem = self.contents[i]
- try:
- if isinstance(elem, Text):
- n = self.contents[i+1]
- if isinstance(n, Text):
- elem.text += n.text
- i += 1
- except:
- continue
- finally:
- temp.append(elem)
- self.contents = temp
-
- def has_prior_text(idx):
- for i in range(idx):
- con = self.contents[i]
- if hasattr(con, 'has_text') and con.has_text():
- return True
- return False
-
- for i in range(len(self.contents)):
- elem = self.contents[i]
- if not prior_text and i > 0:
- prior_text = has_prior_text(i)
-
- if isinstance(elem, Text):
- src = elem.text
- if isinstance(src, basestring):
- src = re.sub(r'\s{1,}', ' ', src)
- if isinstance(self.contents[i-1], (CR, DropCaps)) \
- or not prior_text:
- src = src.lstrip()
- src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeatifulSoup
- elem.text = src
- elif hasattr(elem, 'normalize_spaces'):
- elem.normalize_spaces(prior_text)
-
def has_text(self):
''' Return True iff this container has non whitespace text '''
if hasattr(self, 'text'):