This commit is contained in:
Kovid Goyal 2007-08-20 04:00:33 +00:00
parent a9c75bf9cf
commit 7402d7c4d8
5 changed files with 84 additions and 54 deletions

View File

@ -20,3 +20,6 @@ from various formats.
# Exception type defined alongside UnknownFormatError in libprs500.ebooks.
# NOTE(review): presumably raised when a conversion fails -- no raise site is
# visible in this listing; confirm against callers.
class ConversionError(Exception):
pass
# Raised when an input file's format cannot be handled: process_file raises it
# when no convertor was selected for the file's extension.
class UnknownFormatError(Exception):
pass

View File

@ -16,6 +16,7 @@
import sys, os, logging, shutil, tempfile, glob
from libprs500.ebooks import UnknownFormatError
from libprs500.ebooks.lrf import option_parser
from libprs500 import __appname__, setup_cli_handlers, extract
from libprs500.ebooks.lrf.lit.convert_from import process_file as lit2lrf
@ -104,11 +105,14 @@ def process_file(path, options, logger=None):
convertor = rtf2lrf
elif 'txt' == ext:
convertor = txt2lrf
if not convertor:
    # No branch above matched the extension. Interpolate it into the message:
    # previously the '%s' placeholder was never filled in, so the user saw a
    # literal "%s" instead of the offending format.
    raise UnknownFormatError('Converting from %s to LRF is not supported.' % ext)
convertor(path, options, logger)
finally:
os.chdir(cwd)
if tdir and os.path.exists(tdir):
shutil.rmtree(tdir)
return 0
def main(args=sys.argv, logger=None):
@ -126,11 +130,7 @@ ZIP archive.
print 'No file to convert specified.'
return 1
process_file(args[1], options, logger)
return 0
return process_file(args[1], options, logger)
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == '__main__':
sys.exit(main())

View File

@ -212,6 +212,7 @@ class HTMLConverter(object):
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
self.css = HTMLConverter.CSS.copy()
self.pseudo_css = {}
self.target_prefix = path
self.links[path] = []
self.previous_text = '\n'
@ -227,17 +228,27 @@ class HTMLConverter(object):
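@return: A 2-tuple (sdict, pdict). sdict has one entry per plain selector where the key is the
selector name and the value is a dictionary of properties. pdict has one entry per selector that
was written with a pseudo suffix (e.g. p:first-letter): the key is the selector name and the value
is a dictionary mapping the pseudo name to a dictionary of properties.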
"""
sdict = dict()
sdict, pdict = {}, {}
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
for key in sel[0].split(','):
key = key.strip().lower()
val = self.parse_style_properties(sel[1])
key = key.strip().lower()
if ':' in key:
key, sep, pseudo = key.partition(':')
if key in pdict:
if pseudo in pdict[key]:
pdict[key][pseudo].update(val)
else:
pdict[key][pseudo] = val
else:
pdict[key] = {pseudo:val}
else:
if key in sdict:
sdict[key].update(val)
else:
sdict[key] = val
return sdict
return sdict, pdict
def parse_style_properties(self, props):
"""
@ -271,7 +282,7 @@ class HTMLConverter(object):
temp[key] = pcss[key]
prop.update(temp)
prop = dict()
prop, pprop = {}, {}
tagname = tag.name.lower()
if parent_css:
merge_parent_css(prop, parent_css)
@ -279,14 +290,18 @@ class HTMLConverter(object):
prop["text-align"] = tag["align"]
if self.css.has_key(tagname):
prop.update(self.css[tagname])
if self.pseudo_css.has_key(tagname):
pprop.update(self.pseudo_css[tagname])
if tag.has_key("class"):
cls = tag["class"].lower()
for classname in ["."+cls, tagname+"."+cls]:
if self.css.has_key(classname):
prop.update(self.css[classname])
if self.pseudo_css.has_key(classname):
pprop.update(self.pseudo_css[classname])
if tag.has_key("style"):
prop.update(self.parse_style_properties(tag["style"]))
return prop
return prop, pprop
def parse_file(self, soup, is_root):
def get_valid_block(page):
@ -303,7 +318,7 @@ class HTMLConverter(object):
self.add_image_page(self.cover)
top = self.current_block
self.process_children(soup, {})
self.process_children(soup, {}, {})
if self.current_para and self.current_block:
self.current_para.append_to(self.current_block)
@ -361,7 +376,7 @@ class HTMLConverter(object):
def get_text(self, tag, limit=None):
css = self.tag_css(tag)
css = self.tag_css(tag)[0]
if (css.has_key('display') and css['display'].lower() == 'none') or \
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
return ''
@ -499,7 +514,7 @@ class HTMLConverter(object):
page.append(ib)
self.book.append(page)
def process_children(self, ptag, pcss):
def process_children(self, ptag, pcss, ppcss={}):
""" Process the children of ptag """
# Need to make a copy of contents as when
# extract is called on a child, it will
@ -511,7 +526,7 @@ class HTMLConverter(object):
elif isinstance(c, Tag):
self.parse_tag(c, pcss)
elif isinstance(c, NavigableString):
self.add_text(c, pcss)
self.add_text(c, pcss, ppcss)
if not self.in_table:
ptag.extract()
@ -551,7 +566,7 @@ class HTMLConverter(object):
return True
return False
def add_text(self, tag, css, force_span_use=False):
def add_text(self, tag, css, pseudo_css, force_span_use=False):
'''
Add text to the current paragraph taking CSS into account.
@param tag: Either a BeautifulSoup tag or a string
@ -559,6 +574,13 @@ class HTMLConverter(object):
'''
src = tag.string if hasattr(tag, 'string') else tag
src = src.replace('\r\n', '\n').replace('\r', '\n')
if 'first-letter' in pseudo_css:
    # Split off the first visible character and emit it with the
    # :first-letter properties layered over the normal CSS.
    stripped = src.lstrip()
    # Guard against empty / whitespace-only text: indexing stripped[0] would
    # raise IndexError. In that case the rule is left in pseudo_css and the
    # text is processed normally below.
    if stripped:
        ncss = css.copy()
        # pop() removes the rule so it is applied only once for this dict.
        ncss.update(pseudo_css.pop('first-letter'))
        self.add_text(stripped[0], ncss, {}, force_span_use)
        src = stripped[1:]
collapse_whitespace = not css.has_key('white-space') or css['white-space'] != 'pre'
if self.process_alignment(css) and collapse_whitespace:
# Don't want leading blanks in a new paragraph
@ -1042,9 +1064,9 @@ class HTMLConverter(object):
tagname = tag.name.lower()
except AttributeError:
if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
self.add_text(tag, parent_css)
self.add_text(tag, parent_css, {})
return
tag_css = self.tag_css(tag, parent_css=parent_css)
tag_css, tag_pseudo_css = self.tag_css(tag, parent_css=parent_css)
try: # Skip element if its display attribute is set to none
if tag_css['display'].lower() == 'none' or \
tag_css['visibility'].lower() == 'hidden':
@ -1068,7 +1090,7 @@ class HTMLConverter(object):
text = self.get_text(tag, limit=1000)
if not text.strip():
text = "Link"
self.add_text(text, tag_css, force_span_use=True)
self.add_text(text, tag_css, {}, force_span_use=True)
self.links[self.target_prefix].append(self.create_link(self.current_para.contents, tag))
if tag.has_key('id') or tag.has_key('name'):
key = 'name' if tag.has_key('name') else 'id'
@ -1077,7 +1099,7 @@ class HTMLConverter(object):
key = 'name' if tag.has_key('name') else 'id'
name = tag[key].replace('#', '')
if self.anchor_to_previous:
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
for c in self.anchor_to_previous.contents:
if isinstance(c, (TextBlock, ImageBlock)):
self.targets[self.target_prefix+tag[key]] = c
@ -1088,7 +1110,7 @@ class HTMLConverter(object):
self.targets[self.target_prefix+name] = tb
return
previous = self.current_block
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
target = None
if self.current_block == previous:
@ -1142,17 +1164,19 @@ class HTMLConverter(object):
else:
self.logger.debug("Failed to process: %s", str(tag))
elif tagname in ['style', 'link']:
def update_css(ncss, ocss):
    """
    Merge newly parsed CSS rules into an existing rule table, in place.

    @param ncss: Dictionary of new rules; each value is itself a dictionary
    @param ocss: Dictionary to merge into (called with self.css and
                 self.pseudo_css); existing entries are updated, missing
                 ones are added
    """
    for key, val in ncss.items():
        if key in ocss:
            ocss[key].update(val)
        else:
            ocss[key] = val
ncss, npcss = {}, {}
if tagname == 'style':
for c in tag.contents:
if isinstance(c, NavigableString):
ncss.update(self.parse_css(str(c)))
css, pcss = self.parse_css(str(c))
ncss.update(css)
npcss.update(pcss)
elif tag.has_key('type') and tag['type'] == "text/css" \
and tag.has_key('href'):
purl = urlparse(tag['href'])
@ -1164,11 +1188,13 @@ class HTMLConverter(object):
match = self.PAGE_BREAK_PAT.search(src)
if match and not re.match('avoid', match.group(1), re.IGNORECASE):
self.page_break_found = True
ncss = self.parse_css(src)
ncss, npcss = self.parse_css(src)
except IOError:
pass
if ncss:
update_css(ncss)
update_css(ncss, self.css)
if npcss:
    # Merge the accumulated pseudo-selector rules. `pcss` is only a per-chunk
    # temporary in the <style> branch and is never bound in the <link> branch,
    # so passing it here would lose rules or raise NameError.
    update_css(npcss, self.pseudo_css)
elif tagname == 'pre':
self.end_current_para()
self.end_current_block()
@ -1181,7 +1207,7 @@ class HTMLConverter(object):
if c.startswith('\n'):
c = c[1:]
tag.contents[0] = NavigableString(c)
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
self.end_current_block()
elif tagname in ['ul', 'ol', 'dl']:
self.list_level += 1
@ -1197,7 +1223,7 @@ class HTMLConverter(object):
self.current_block = self.book.create_text_block(
blockStyle=bs,
textStyle=self.unindented_style)
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
self.end_current_block()
self.current_block.blockStyle = prev_bs
self.list_level -= 1
@ -1232,11 +1258,11 @@ class HTMLConverter(object):
parent = parent.parent
prepend = str(self.list_counter)+'. ' if in_ol else u'\u2022' + ' '
self.current_para.append(Span(prepend))
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
if in_ol:
self.list_counter += 1
else:
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'blockquote':
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
@ -1260,7 +1286,7 @@ class HTMLConverter(object):
self.current_block = self.book.create_text_block(
blockStyle=bs, textStyle=ts)
self.previous_text = '\n'
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_para = Paragraph()
@ -1304,18 +1330,18 @@ class HTMLConverter(object):
self.previous_text = '\n'
self.current_para = Paragraph()
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_para = Paragraph()
if tagname.startswith('h') or self.blank_after_para:
self.current_block.append(CR())
elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite']:
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'font':
if tag.has_key('face'):
tag_css['font-family'] = tag['face']
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname in ['br']:
self.line_break()
self.previous_text = '\n'
@ -1326,13 +1352,13 @@ class HTMLConverter(object):
if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
self.previous_text = '\n'
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'td': # Needed for nested tables
self.current_para.append(' ')
self.previous_text = ' '
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'table' and not self.ignore_tables and not self.in_table:
tag_css = self.tag_css(tag) # Table should not inherit CSS
tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
try:
self.process_table(tag, tag_css)
except Exception, err:
@ -1340,11 +1366,11 @@ class HTMLConverter(object):
self.logger.debug('', exc_info=True)
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
self.in_table = False
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
finally:
tag.extract()
else:
self.process_children(tag, tag_css)
self.process_children(tag, tag_css, tag_pseudo_css)
if end_page:
self.end_page()

View File

@ -4,7 +4,9 @@
.toc { page-break-after: always; text-indent: 0em; }
.tocpn {text-align: right; }
.tocchr {text-align: right;}
.hanging_indent { padding-left:40px; text-indent:-40px }
.hanging_indent { padding-left:4em; text-indent:-4em }
.drop { text-indent: 0pt}
.drop:first-letter { font-size:xx-large}
</style>
</head>
<h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1>
@ -159,19 +161,18 @@
paragraph. Image based dropcaps are specified by adding the <code>class = 'libprs500_dropcaps'</code>
attribute to an <code>&lt;img&gt;</code> tag.<p/>
<br/>
<br/>
<p><big class='libprs500_dropcaps'>T</big>his is a plain text based dropcaps. It
<p class="drop">This is a plain text based dropcaps. It
is not nearly as dramatic, but easier to code ;-)
</p>
<br/>
<br/>
<p><span style="font-variant: small-caps">This is an Example</span> of small-caps.
It can also be used to highlight the start of a paragraph very effectively.
<br/>
</p>
<br/>
<p class='hanging_indent'>A paragraph with a hanging indent. This is especially
useful for highly structured text like verse, or dialogue.
The world is not all prose!<br/></p>
useful for highly structured text like verse, or dialogue.<br/></p>
<p class='toc'>
<hr />
<a href='#toc'>Table of Contents</a>

View File

@ -220,7 +220,7 @@ class Row(object):
self.colpad = colpad
cells = row.findAll(re.compile('td|th', re.IGNORECASE))
for cell in cells:
ccss = conv.tag_css(cell, css)
ccss = conv.tag_css(cell, css)[0]
self.cells.append(Cell(conv, cell, ccss))
def number_of_cells(self):
@ -285,7 +285,7 @@ class Table(object):
conv.anchor_to_previous = conv.current_page
conv.in_table = True
for row in rows:
rcss = conv.tag_css(row, css)
rcss = conv.tag_css(row, css)[0]
self.rows.append(Row(conv, row, rcss, colpad))
conv.in_table = False
conv.anchor_to_previous = None