From 7402d7c4d8ecd8220493c544c7c615ef56526aae Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 20 Aug 2007 04:00:33 +0000
Subject: [PATCH] Fix #167
---
src/libprs500/ebooks/__init__.py | 3 +
src/libprs500/ebooks/lrf/any/convert_from.py | 10 +-
src/libprs500/ebooks/lrf/html/convert_from.py | 106 +++++++++++-------
src/libprs500/ebooks/lrf/html/demo/demo.html | 15 +--
src/libprs500/ebooks/lrf/html/table.py | 4 +-
5 files changed, 84 insertions(+), 54 deletions(-)
diff --git a/src/libprs500/ebooks/__init__.py b/src/libprs500/ebooks/__init__.py
index fb87783e16..9337c5d331 100644
--- a/src/libprs500/ebooks/__init__.py
+++ b/src/libprs500/ebooks/__init__.py
@@ -20,3 +20,6 @@ from various formats.
class ConversionError(Exception):
pass
+
+class UnknownFormatError(Exception):
+    """Raised when no converter to LRF is available for the input file's format."""
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/any/convert_from.py b/src/libprs500/ebooks/lrf/any/convert_from.py
index 23c863afda..a5995309f9 100644
--- a/src/libprs500/ebooks/lrf/any/convert_from.py
+++ b/src/libprs500/ebooks/lrf/any/convert_from.py
@@ -16,6 +16,7 @@
import sys, os, logging, shutil, tempfile, glob
+from libprs500.ebooks import UnknownFormatError
from libprs500.ebooks.lrf import option_parser
from libprs500 import __appname__, setup_cli_handlers, extract
from libprs500.ebooks.lrf.lit.convert_from import process_file as lit2lrf
@@ -104,11 +105,14 @@ def process_file(path, options, logger=None):
convertor = rtf2lrf
elif 'txt' == ext:
convertor = txt2lrf
+        if not convertor:  # no handler matched this file extension
+            raise UnknownFormatError('Converting from %s to LRF is not supported.' % ext)
convertor(path, options, logger)
finally:
os.chdir(cwd)
if tdir and os.path.exists(tdir):
shutil.rmtree(tdir)
+ return 0
def main(args=sys.argv, logger=None):
@@ -126,11 +130,7 @@ ZIP archive.
print 'No file to convert specified.'
return 1
- process_file(args[1], options, logger)
-
-
-
- return 0
+ return process_file(args[1], options, logger)
if __name__ == '__main__':
sys.exit(main())
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index 4ba7bce425..d8cee34e2d 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -212,6 +212,7 @@ class HTMLConverter(object):
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
self.css = HTMLConverter.CSS.copy()
+ self.pseudo_css = {}
self.target_prefix = path
self.links[path] = []
self.previous_text = '\n'
@@ -227,17 +228,27 @@ class HTMLConverter(object):
@return: A dictionary with one entry per selector where the key is the
selector name and the value is a dictionary of properties
"""
- sdict = dict()
+ sdict, pdict = {}, {}
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
for key in sel[0].split(','):
- key = key.strip().lower()
val = self.parse_style_properties(sel[1])
- if key in sdict:
- sdict[key].update(val)
+ key = key.strip().lower()
+ if ':' in key:
+ key, sep, pseudo = key.partition(':')
+ if key in pdict:
+ if pseudo in pdict[key]:
+ pdict[key][pseudo].update(val)
+ else:
+ pdict[key][pseudo] = val
+ else:
+ pdict[key] = {pseudo:val}
else:
- sdict[key] = val
- return sdict
+ if key in sdict:
+ sdict[key].update(val)
+ else:
+ sdict[key] = val
+ return sdict, pdict
def parse_style_properties(self, props):
"""
@@ -271,7 +282,7 @@ class HTMLConverter(object):
temp[key] = pcss[key]
prop.update(temp)
- prop = dict()
+ prop, pprop = {}, {}
tagname = tag.name.lower()
if parent_css:
merge_parent_css(prop, parent_css)
@@ -279,14 +290,18 @@ class HTMLConverter(object):
prop["text-align"] = tag["align"]
if self.css.has_key(tagname):
prop.update(self.css[tagname])
+ if self.pseudo_css.has_key(tagname):
+ pprop.update(self.pseudo_css[tagname])
if tag.has_key("class"):
cls = tag["class"].lower()
for classname in ["."+cls, tagname+"."+cls]:
if self.css.has_key(classname):
prop.update(self.css[classname])
+ if self.pseudo_css.has_key(classname):
+ pprop.update(self.pseudo_css[classname])
if tag.has_key("style"):
- prop.update(self.parse_style_properties(tag["style"]))
- return prop
+ prop.update(self.parse_style_properties(tag["style"]))
+ return prop, pprop
def parse_file(self, soup, is_root):
def get_valid_block(page):
@@ -303,7 +318,7 @@ class HTMLConverter(object):
self.add_image_page(self.cover)
top = self.current_block
- self.process_children(soup, {})
+ self.process_children(soup, {}, {})
if self.current_para and self.current_block:
self.current_para.append_to(self.current_block)
@@ -361,7 +376,7 @@ class HTMLConverter(object):
def get_text(self, tag, limit=None):
- css = self.tag_css(tag)
+ css = self.tag_css(tag)[0]
if (css.has_key('display') and css['display'].lower() == 'none') or \
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
return ''
@@ -499,7 +514,7 @@ class HTMLConverter(object):
page.append(ib)
self.book.append(page)
- def process_children(self, ptag, pcss):
+ def process_children(self, ptag, pcss, ppcss={}):
""" Process the children of ptag """
# Need to make a copy of contents as when
# extract is called on a child, it will
@@ -511,7 +526,7 @@ class HTMLConverter(object):
elif isinstance(c, Tag):
self.parse_tag(c, pcss)
elif isinstance(c, NavigableString):
- self.add_text(c, pcss)
+ self.add_text(c, pcss, ppcss)
if not self.in_table:
ptag.extract()
@@ -551,7 +566,7 @@ class HTMLConverter(object):
return True
return False
- def add_text(self, tag, css, force_span_use=False):
+ def add_text(self, tag, css, pseudo_css, force_span_use=False):
'''
Add text to the current paragraph taking CSS into account.
@param tag: Either a BeautifulSoup tag or a string
@@ -559,6 +574,13 @@ class HTMLConverter(object):
'''
src = tag.string if hasattr(tag, 'string') else tag
src = src.replace('\r\n', '\n').replace('\r', '\n')
+        # Whitespace-only text has no first letter; keep the style pending for later text
+        if pseudo_css.has_key('first-letter') and src.strip():
+            src = src.lstrip()
+            f, src = src[0], src[1:]
+            ncss = css.copy()
+            ncss.update(pseudo_css.pop('first-letter'))
+            self.add_text(f, ncss, {}, force_span_use)
collapse_whitespace = not css.has_key('white-space') or css['white-space'] != 'pre'
if self.process_alignment(css) and collapse_whitespace:
# Dont want leading blanks in a new paragraph
@@ -1042,9 +1064,9 @@ class HTMLConverter(object):
tagname = tag.name.lower()
except AttributeError:
if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
- self.add_text(tag, parent_css)
+ self.add_text(tag, parent_css, {})
return
- tag_css = self.tag_css(tag, parent_css=parent_css)
+ tag_css, tag_pseudo_css = self.tag_css(tag, parent_css=parent_css)
try: # Skip element if its display attribute is set to none
if tag_css['display'].lower() == 'none' or \
tag_css['visibility'].lower() == 'hidden':
@@ -1068,7 +1090,7 @@ class HTMLConverter(object):
text = self.get_text(tag, limit=1000)
if not text.strip():
text = "Link"
- self.add_text(text, tag_css, force_span_use=True)
+ self.add_text(text, tag_css, {}, force_span_use=True)
self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
if tag.has_key('id') or tag.has_key('name'):
key = 'name' if tag.has_key('name') else 'id'
@@ -1077,7 +1099,7 @@ class HTMLConverter(object):
key = 'name' if tag.has_key('name') else 'id'
name = tag[key].replace('#', '')
if self.anchor_to_previous:
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
for c in self.anchor_to_previous.contents:
if isinstance(c, (TextBlock, ImageBlock)):
self.targets[self.target_prefix+tag[key]] = c
@@ -1088,7 +1110,7 @@ class HTMLConverter(object):
self.targets[self.target_prefix+name] = tb
return
previous = self.current_block
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
target = None
if self.current_block == previous:
@@ -1142,17 +1164,19 @@ class HTMLConverter(object):
else:
self.logger.debug("Failed to process: %s", str(tag))
elif tagname in ['style', 'link']:
- def update_css(ncss):
+ def update_css(ncss, ocss):
for key in ncss.keys():
- if self.css.has_key(key):
- self.css[key].update(ncss[key])
+ if ocss.has_key(key):
+ ocss[key].update(ncss[key])
else:
- self.css[key] = ncss[key]
- ncss = {}
+ ocss[key] = ncss[key]
+ ncss, npcss = {}, {}
if tagname == 'style':
for c in tag.contents:
if isinstance(c, NavigableString):
- ncss.update(self.parse_css(str(c)))
+ css, pcss = self.parse_css(str(c))
+ ncss.update(css)
+ npcss.update(pcss)
elif tag.has_key('type') and tag['type'] == "text/css" \
and tag.has_key('href'):
purl = urlparse(tag['href'])
@@ -1164,11 +1188,13 @@ class HTMLConverter(object):
match = self.PAGE_BREAK_PAT.search(src)
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
- ncss = self.parse_css(src)
+ ncss, npcss = self.parse_css(src)
except IOError:
pass
if ncss:
- update_css(ncss)
+ update_css(ncss, self.css)
+            if npcss:  # merge the accumulated pseudo-selector rules, not a stale/undefined local
+                update_css(npcss, self.pseudo_css)
elif tagname == 'pre':
self.end_current_para()
self.end_current_block()
@@ -1181,7 +1207,7 @@ class HTMLConverter(object):
if c.startswith('\n'):
c = c[1:]
tag.contents[0] = NavigableString(c)
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
self.end_current_block()
elif tagname in ['ul', 'ol', 'dl']:
self.list_level += 1
@@ -1197,7 +1223,7 @@ class HTMLConverter(object):
self.current_block = self.book.create_text_block(
blockStyle=bs,
textStyle=self.unindented_style)
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
self.end_current_block()
self.current_block.blockStyle = prev_bs
self.list_level -= 1
@@ -1232,11 +1258,11 @@ class HTMLConverter(object):
parent = parent.parent
prepend = str(self.list_counter)+'. ' if in_ol else u'\u2022' + ' '
self.current_para.append(Span(prepend))
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
if in_ol:
self.list_counter += 1
else:
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'blockquote':
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
@@ -1260,7 +1286,7 @@ class HTMLConverter(object):
self.current_block = self.book.create_text_block(
blockStyle=bs, textStyle=ts)
self.previous_text = '\n'
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_para = Paragraph()
@@ -1304,18 +1330,18 @@ class HTMLConverter(object):
self.previous_text = '\n'
self.current_para = Paragraph()
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_para = Paragraph()
if tagname.startswith('h') or self.blank_after_para:
self.current_block.append(CR())
elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'font':
if tag.has_key('face'):
tag_css['font-family'] = tag['face']
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname in ['br']:
self.line_break()
self.previous_text = '\n'
@@ -1326,13 +1352,13 @@ class HTMLConverter(object):
if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
self.previous_text = '\n'
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'td': # Needed for nested tables
self.current_para.append(' ')
self.previous_text = ' '
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'table' and not self.ignore_tables and not self.in_table:
- tag_css = self.tag_css(tag) # Table should not inherit CSS
+ tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
try:
self.process_table(tag, tag_css)
except Exception, err:
@@ -1340,11 +1366,11 @@ class HTMLConverter(object):
self.logger.debug('', exc_info=True)
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
self.in_table = False
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
finally:
tag.extract()
else:
- self.process_children(tag, tag_css)
+ self.process_children(tag, tag_css, tag_pseudo_css)
if end_page:
self.end_page()
diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html
index 5b25c3d2cf..d0142ce120 100644
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -4,7 +4,9 @@
.toc { page-break-after: always; text-indent: 0em; }
.tocpn {text-align: right; }
.tocchr {text-align: right;}
-.hanging_indent { padding-left:40px; text-indent:-40px }
+.hanging_indent { padding-left:4em; text-indent:-4em }
+.drop { text-indent: 0pt}
+.drop:first-letter { font-size:xx-large}
Demo of html2lrf
@@ -159,19 +161,18 @@
paragraph. Image based dropcaps are specified by adding the class = 'libprs500_dropcaps'
attribute to an <img>
tag.
-
- This is a plain text based dropcaps. It
+
+
This is a plain text based dropcaps. It
is not nearly as dramatic, but easier to code ;-)
-
+
This is an Example of small-caps.
It can also be used to highlight the start of a paragraph very effectively.
-
+
A paragraph with a hanging indent. This is especially
- useful for highly structured text like verse, or dialogue.
- The world is not all prose!
+ useful for highly structured text like verse, or dialogue.
Table of Contents
diff --git a/src/libprs500/ebooks/lrf/html/table.py b/src/libprs500/ebooks/lrf/html/table.py
index a9141459db..c63b1dfeba 100644
--- a/src/libprs500/ebooks/lrf/html/table.py
+++ b/src/libprs500/ebooks/lrf/html/table.py
@@ -220,7 +220,7 @@ class Row(object):
self.colpad = colpad
cells = row.findAll(re.compile('td|th', re.IGNORECASE))
for cell in cells:
- ccss = conv.tag_css(cell, css)
+ ccss = conv.tag_css(cell, css)[0]
self.cells.append(Cell(conv, cell, ccss))
def number_of_cells(self):
@@ -285,7 +285,7 @@ class Table(object):
conv.anchor_to_previous = conv.current_page
conv.in_table = True
for row in rows:
- rcss = conv.tag_css(row, css)
+ rcss = conv.tag_css(row, css)[0]
self.rows.append(Row(conv, row, rcss, colpad))
conv.in_table = False
conv.anchor_to_previous = None