Support CSS attribute white-space

This commit is contained in:
Kovid Goyal 2007-08-17 17:59:24 +00:00
parent 05291739db
commit 7c2aa5b07e
3 changed files with 74 additions and 106 deletions

View File

@ -219,13 +219,12 @@ class Span(_Span):
t['wordspace'] = 50 t['wordspace'] = 50
return t return t
def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100): def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta, parent_style,
normal_font_size=100):
src = ns.string if hasattr(ns, 'string') else ns src = ns.string if hasattr(ns, 'string') else ns
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules: for pat, repl in Span.rules:
src = pat.sub(repl, src) src = pat.sub(repl, src)
if not src: src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeautifulSoup
raise ConversionError('No point in adding an empty string to a Span')
attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory) attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
if 'fontsize' in attrs.keys(): if 'fontsize' in attrs.keys():
normal_font_size = int(attrs['fontsize']) normal_font_size = int(attrs['fontsize'])
@ -258,6 +257,9 @@ class Span(_Span):
attrs['baselineskip'] = int(attrs['fontsize']) + 20 attrs['baselineskip'] = int(attrs['fontsize']) + 20
if attrs['fontfacename'] == fonts['serif']['normal'][1]: if attrs['fontfacename'] == fonts['serif']['normal'][1]:
attrs.pop('fontfacename') attrs.pop('fontfacename')
for key in attrs:
if parent_style.has_key(key) and str(parent_style[key]) == str(attrs[key]):
attrs.pop(key)
_Span.__init__(self, text=src, **attrs) _Span.__init__(self, text=src, **attrs)
class HTMLConverter(object): class HTMLConverter(object):
@ -330,7 +332,7 @@ class HTMLConverter(object):
'cite' : {'font-style' : 'italic'}, 'cite' : {'font-style' : 'italic'},
'em' : {"font-style" : "italic"}, 'em' : {"font-style" : "italic"},
'small' : {'font-size' : 'small'}, 'small' : {'font-size' : 'small'},
'pre' : {'font-family' : 'monospace' }, 'pre' : {'font-family' : 'monospace', 'white-space': 'pre' },
'code' : {'font-family' : 'monospace' }, 'code' : {'font-family' : 'monospace' },
'tt' : {'font-family' : 'monospace'}, 'tt' : {'font-family' : 'monospace'},
'center' : {'text-align' : 'center'}, 'center' : {'text-align' : 'center'},
@ -366,6 +368,7 @@ class HTMLConverter(object):
self.link_level = 0 #: Current link level self.link_level = 0 #: Current link level
self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported
self.tops = {} #: element representing the top of each HTML file in the LRF file self.tops = {} #: element representing the top of each HTML file in the LRF file
self.previous_text = '' #: Used to figure out when to lstrip
# Styles # Styles
self.blockquote_style = book.create_block_style(sidemargin=60, self.blockquote_style = book.create_block_style(sidemargin=60,
topskip=20, footskip=20) topskip=20, footskip=20)
@ -381,8 +384,7 @@ class HTMLConverter(object):
self.list_indent = 20 self.list_indent = 20
self.list_counter = 1 self.list_counter = 1
self.book = book #: The Book object representing a BBeB book self.book = book #: The Book object representing a BBeB book
self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
self.start_on_file(path, is_root=True) self.start_on_file(path, is_root=True)
def start_on_file(self, path, is_root=True, link_level=0): def start_on_file(self, path, is_root=True, link_level=0):
@ -415,6 +417,7 @@ class HTMLConverter(object):
self.css = HTMLConverter.CSS.copy() self.css = HTMLConverter.CSS.copy()
self.target_prefix = path self.target_prefix = path
self.links[path] = [] self.links[path] = []
self.previous_text = '\n'
self.tops[path] = self.parse_file(soup, is_root) self.tops[path] = self.parse_file(soup, is_root)
self.processed_files.append(path) self.processed_files.append(path)
self.process_links(is_root, path, link_level=link_level) self.process_links(is_root, path, link_level=link_level)
@ -467,20 +470,21 @@ class HTMLConverter(object):
# however we need to as we don't do alignment at a block level. # however we need to as we don't do alignment at a block level.
# float is removed by the process_alignment function. # float is removed by the process_alignment function.
if chk.startswith('font') or chk == 'text-align' or \ if chk.startswith('font') or chk == 'text-align' or \
chk == 'float': chk == 'float' or chk == 'white-space':
temp[key] = pcss[key] temp[key] = pcss[key]
prop.update(temp) prop.update(temp)
prop = dict() prop = dict()
tagname = tag.name.lower()
if parent_css: if parent_css:
merge_parent_css(prop, parent_css) merge_parent_css(prop, parent_css)
if tag.has_key("align"): if tag.has_key("align"):
prop["text-align"] = tag["align"] prop["text-align"] = tag["align"]
if self.css.has_key(tag.name): if self.css.has_key(tagname):
prop.update(self.css[tag.name]) prop.update(self.css[tagname])
if tag.has_key("class"): if tag.has_key("class"):
cls = tag["class"].lower() cls = tag["class"].lower()
for classname in ["."+cls, tag.name+"."+cls]: for classname in ["."+cls, tagname+"."+cls]:
if self.css.has_key(classname): if self.css.has_key(classname):
prop.update(self.css[classname]) prop.update(self.css[classname])
if tag.has_key("style"): if tag.has_key("style"):
@ -537,7 +541,12 @@ class HTMLConverter(object):
raise ConversionError, 'Could not parse ' + self.file_name raise ConversionError, 'Could not parse ' + self.file_name
return top return top
def create_link(self, para, tag): def create_link(self, children, tag):
para = None
for i in range(len(children)-1, -1, -1):
if not isinstance(children[i], CR):
para = children[i]
break
text = self.get_text(tag, 1000) text = self.get_text(tag, 1000)
if not text: if not text:
text = 'Link' text = 'Link'
@ -736,30 +745,41 @@ class HTMLConverter(object):
blockStyle=self.current_block.blockStyle, blockStyle=self.current_block.blockStyle,
textStyle=ts) textStyle=ts)
self.current_para = Paragraph() self.current_para = Paragraph()
return True
return False
def add_text(self, tag, css): def add_text(self, tag, css):
''' '''
Add text to the current paragraph taking CSS into account. Add text to the current paragraph taking CSS into account.
@param tag: Either a BeautifulSoup tag or a string @param tag: Either a BeautifulSoup tag or a string
@param css: @param css: A dict
@type css:
''' '''
src = tag.string if hasattr(tag, 'string') else tag src = tag.string if hasattr(tag, 'string') else tag
src = re.sub(r'\s{1,}', ' ', src) src = src.replace('\r\n', '\n').replace('\r', '\n')
if self.lstrip_toggle: collapse_whitespace = not css.has_key('white-space') or css['white-space'] != 'pre'
if self.process_alignment(css) and collapse_whitespace:
# Don't want leading blanks in a new paragraph
src = src.lstrip() src = src.lstrip()
self.lstrip_toggle = False args = self.sanctify_css(css), self.memory, self.profile.dpi, self.fonts,\
if not src.strip(): self.logger, self.font_delta, self.current_block.textStyle.attrs
self.current_para.append(' ') if collapse_whitespace:
src = re.sub(r'\s{1,}', ' ', src)
if len(self.previous_text) != len(self.previous_text.rstrip()):
src = src.lstrip()
if len(src):
self.previous_text = src
self.current_para.append(Span(src, *args))
else: else:
self.process_alignment(css) srcs = src.split('\n')
try: for src in srcs:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory, if src:
self.profile.dpi, self.fonts, self.logger, self.current_para.append(Span(src, *args))
font_delta=self.font_delta)) if len(srcs) > 1:
self.current_para.normalize_spaces() self.line_break()
except ConversionError:
self.logger.exception('Bad text') def line_break(self):
self.current_para.append(CR())
self.previous_text = '\n'
def sanctify_css(self, css): def sanctify_css(self, css):
""" Return a copy of C{css} that is safe for use in a SPAM Xylog tag """ """ Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@ -770,7 +790,7 @@ class HTMLConverter(object):
'padding' in test or 'border' in test or 'page-break' in test \ 'padding' in test or 'border' in test or 'page-break' in test \
or test.startswith('mso') or test.startswith('background')\ or test.startswith('mso') or test.startswith('background')\
or test.startswith('line') or test in ['color', 'display', \ or test.startswith('line') or test in ['color', 'display', \
'letter-spacing', 'position']: 'letter-spacing', 'position', 'white-space']:
css.pop(key) css.pop(key)
return css return css
@ -1032,7 +1052,7 @@ class HTMLConverter(object):
if not text.strip(): if not text.strip():
text = "Link" text = "Link"
self.add_text(text, tag_css) self.add_text(text, tag_css)
self.links[self.target_prefix].append(self.create_link(self.current_para.contents[-1], tag)) self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
if tag.has_key('id') or tag.has_key('name'): if tag.has_key('id') or tag.has_key('name'):
key = 'name' if tag.has_key('name') else 'id' key = 'name' if tag.has_key('name') else 'id'
self.targets[self.target_prefix+tag[key]] = self.current_block self.targets[self.target_prefix+tag[key]] = self.current_block
@ -1131,28 +1151,19 @@ class HTMLConverter(object):
if ncss: if ncss:
update_css(ncss) update_css(ncss)
elif tagname == 'pre': elif tagname == 'pre':
for c in tag.findAll(True):
c.replaceWith(self.get_text(c))
self.end_current_para() self.end_current_para()
self.current_block.append_to(self.current_page)
attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
self.logger, self.font_delta, self.memory)
attrs['fontfacename'] = self.fonts['mono']['normal'][1]
ts = self.book.create_text_style(**self.unindented_style.attrs)
ts.attrs.update(attrs)
self.current_block = self.book.create_text_block(
blockStyle=self.current_block.blockStyle,
textStyle=ts)
src = ''.join([str(i) for i in tag.contents])
lines = src.split('\n')
for line in lines:
try:
self.current_para.append(line)
self.current_para.CR()
except ConversionError:
pass
self.end_current_block() self.end_current_block()
self.current_block = self.book.create_text_block() self.current_block.textStyle = self.current_block.textStyle.copy()
self.current_block.textStyle.attrs['parindent'] = '0'
if tag.contents:
c = tag.contents[0]
if isinstance(c, NavigableString):
c = str(c).replace('\r\n', '\n').replace('\r', '\n')
if c.startswith('\n'):
c = c[1:]
tag.contents[0] = NavigableString(c)
self.process_children(tag, tag_css)
self.end_current_block()
elif tagname in ['ul', 'ol', 'dl']: elif tagname in ['ul', 'ol', 'dl']:
self.list_level += 1 self.list_level += 1
if tagname == 'ol': if tagname == 'ol':
@ -1189,9 +1200,10 @@ class HTMLConverter(object):
textStyle=self.unindented_style) textStyle=self.unindented_style)
if self.current_para.has_text(): if self.current_para.has_text():
self.current_para.append(CR()) self.line_break()
self.current_block.append(self.current_para) self.current_block.append(self.current_para)
self.current_para = Paragraph() self.current_para = Paragraph()
self.previous_text = '\n'
if tagname == 'li': if tagname == 'li':
in_ol, parent = True, tag.parent in_ol, parent = True, tag.parent
while parent: while parent:
@ -1228,6 +1240,7 @@ class HTMLConverter(object):
self.block_styles.append(bs) self.block_styles.append(bs)
self.current_block = self.book.create_text_block( self.current_block = self.book.create_text_block(
blockStyle=bs, textStyle=ts) blockStyle=bs, textStyle=ts)
self.previous_text = '\n'
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
self.current_para.append_to(self.current_block) self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page) self.current_block.append_to(self.current_page)
@ -1262,14 +1275,16 @@ class HTMLConverter(object):
self.end_current_para() self.end_current_para()
if not tag.contents or not src.strip(): # Handle empty <p></p> elements if not tag.contents or not src.strip(): # Handle empty <p></p> elements
self.current_block.append(CR()) self.current_block.append(CR())
self.previous_text = '\n'
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
return return
self.lstrip_toggle = True self.previous_text = '\n'
self.process_block(tag, tag_css, tkey) self.process_block(tag, tag_css, tkey)
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
self.end_current_para() self.end_current_para()
if tagname.startswith('h') or self.blank_after_para: if tagname.startswith('h') or self.blank_after_para:
self.current_block.append(CR()) self.current_block.append(CR())
self.previous_text = '\n'
elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']: elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
elif tagname == 'font': elif tagname == 'font':
@ -1277,16 +1292,19 @@ class HTMLConverter(object):
tag_css['font-family'] = tag['face'] tag_css['font-family'] = tag['face']
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
elif tagname in ['br']: elif tagname in ['br']:
self.current_para.append(CR()) self.line_break()
self.previous_text = '\n'
elif tagname in ['hr', 'tr']: # tr needed for nested tables elif tagname in ['hr', 'tr']: # tr needed for nested tables
self.end_current_para() self.end_current_para()
self.current_block.append(CR()) self.line_break()
self.end_current_block() self.end_current_block()
if tagname == 'hr': if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth'])) self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
self.previous_text = '\n'
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
elif tagname == 'td': # Needed for nested tables elif tagname == 'td': # Needed for nested tables
self.current_para.append(' ') self.current_para.append(' ')
self.previous_text = ' '
self.process_children(tag, tag_css) self.process_children(tag, tag_css)
elif tagname == 'table' and not self.ignore_tables and not self.in_table: elif tagname == 'table' and not self.ignore_tables and not self.in_table:
tag_css = self.tag_css(tag) # Table should not inherit CSS tag_css = self.tag_css(tag) # Table should not inherit CSS

View File

@ -72,10 +72,6 @@
Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables. Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
</p> </p>
<br /> <br />
<p>
The table conversion code is very new and likely to be swarming with bugs, so please report them at <br/><font name="monospace>https://libprs500.kovidgoyal.net/newticket</font>
</p>
<br/>
<p style="page-break-after:always"> <p style="page-break-after:always">
On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan. On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
</p> </p>
@ -122,7 +118,7 @@
<blockquote>This is blockquoted text. It is rendered in a separate block with margins.</blockquote>The above text should be distinct from the rest of the paragraph. <blockquote>This is blockquoted text. It is rendered in a separate block with margins.</blockquote>The above text should be distinct from the rest of the paragraph.
</p> </p>
<hr/> <hr/>
<p style='text-indent:30em'>A very indented paragraph</p> <p style='text-indent:10em'>A very indented paragraph</p>
<p style='text-indent:0em'>An unindented paragraph</p> <p style='text-indent:0em'>An unindented paragraph</p>
<p>A default indented paragraph</p><br/> <p>A default indented paragraph</p><br/>
<hr/> <hr/>

View File

@ -251,52 +251,6 @@ class LrsContainer(object):
self.validChildren = validChildren self.validChildren = validChildren
self.must_append = False self.must_append = False
def normalize_spaces(self, prior_text=False):
'''
Remove multiple spaces and handle &nbsp;
@param prior_text: True if the paragraph this container is part of
has non whitespace text before this container.
'''
temp = []
for i in range(len(self.contents)):
elem = self.contents[i]
try:
if isinstance(elem, Text):
n = self.contents[i+1]
if isinstance(n, Text):
elem.text += n.text
i += 1
except:
continue
finally:
temp.append(elem)
self.contents = temp
def has_prior_text(idx):
for i in range(idx):
con = self.contents[i]
if hasattr(con, 'has_text') and con.has_text():
return True
return False
for i in range(len(self.contents)):
elem = self.contents[i]
if not prior_text and i > 0:
prior_text = has_prior_text(i)
if isinstance(elem, Text):
src = elem.text
if isinstance(src, basestring):
src = re.sub(r'\s{1,}', ' ', src)
if isinstance(self.contents[i-1], (CR, DropCaps)) \
or not prior_text:
src = src.lstrip()
src = src.replace(u'\xa0', ' ') # nbsp is replaced with \xa0 by BeautifulSoup
elem.text = src
elif hasattr(elem, 'normalize_spaces'):
elem.normalize_spaces(prior_text)
def has_text(self): def has_text(self):
''' Return True iff this container has non whitespace text ''' ''' Return True iff this container has non whitespace text '''
if hasattr(self, 'text'): if hasattr(self, 'text'):