Working initial HTML->LRF converter with CSS support. Next on list is support for <style>, <link> and <img> tags.

This commit is contained in:
Kovid Goyal 2007-04-18 22:51:48 +00:00
parent 15014f74fe
commit d69fad53f4
9 changed files with 191 additions and 431 deletions

View File

@ -5,5 +5,6 @@
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.5</pydev_property>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/libprs500/src</path>
<path>/libprs500/libprs500.lrf.txt</path>
</pydev_pathproperty>
</pydev_project>

View File

@ -33,7 +33,10 @@ if sys.argv[1] == 'py2exe':
f.close()
try:
import py2exe
console = [{'script' : 'src/libprs500/cli/main.py', 'dest_base':'prs500'}]
# One dict per console executable. Repeating the 'script'/'dest_base' keys
# inside a single dict literal keeps only the last pair, which would silently
# drop the prs500 command line tool from the py2exe build.
console = [
    {'script' : 'src/libprs500/cli/main.py', 'dest_base':'prs500'},
    {'script' : 'src/libprs500/lrf/html/convert_from.py', 'dest_base':'html2lrf'},
]
windows = [{'script' : 'src/libprs500/gui/main.py', 'dest_base':'prs500-gui',
'icon_resources':[(1,'icons/library.ico')]}]
excludes = ["Tkconstants", "Tkinter", "tcl", "_imagingtk",
@ -94,9 +97,8 @@ setup(
'prs500 = libprs500.cli.main:main', \
'lrf-meta = libprs500.lrf.meta:main', \
'rtf-meta = libprs500.metadata.rtf:main', \
'makelrf = libprs500.lrf.makelrf:main', \
'txt2lrf = libprs500.lrf.makelrf:txt', \
'html2lrf = libprs500.lrf.makelrf:html',\
'html2lrf = libprs500.lrf.html.convert_from:main',\
],
'gui_scripts' : [ 'prs500-gui = libprs500.gui.main:main']
},

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

View File

@ -19,10 +19,10 @@ Code to convert HTML ebooks into LRF ebooks.
"""
import os, re, sys
from htmlentitydefs import name2codepoint
from optparse import OptionParser
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, NavigableString
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, CR
from libprs500.lrf.pylrs.pylrs import Book, Page, Paragraph, TextBlock, CR, Italic
from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError
@ -40,7 +40,7 @@ class Span(_Span):
(an int) if successful. Otherwise, returns None.
Assumes: 1 pixel is 1/4 mm. One em is 10pts
"""
m = re.match("\s*([0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
if m is not None:
unit = float(m.group(1))
if m.group(2) == '%':
@ -160,6 +160,10 @@ class Span(_Span):
src = pat.sub(repl, src)
if not src:
raise ConversionError('No point in adding an empty string')
if 'font-style' in css.keys():
fs = css.pop('font-style')
if fs.lower() == 'italic':
src = Italic(src)
attrs = Span.translate_attrs(css)
_Span.__init__(self, text=src, **attrs)
@ -227,6 +231,13 @@ class HTMLConvertor(object):
"""
Return a dictionary of style properties applicable to Tag tag.
"""
def merge_parent_css(prop, pcss):
    """
    Copy the font properties of the parent style C{pcss} into C{prop}.
    Only entries whose lower-cased key starts with 'font' are copied, all
    other entries of C{pcss} are ignored. C{prop} is updated in place.
    """
    inherited = dict((name, value) for name, value in pcss.items()
                     if name.lower().startswith('font'))
    prop.update(inherited)
prop = dict()
if tag.has_key("align"):
prop["text-align"] = tag["align"]
@ -238,7 +249,7 @@ class HTMLConvertor(object):
if self.css.has_key(classname):
prop.update(self.css[classname])
if parent_css:
prop.update(parent_css)
merge_parent_css(prop, parent_css)
if tag.has_key("style"):
prop.update(self.parse_style_properties(tag["style"]))
return prop
@ -257,21 +268,51 @@ class HTMLConvertor(object):
if self.current_page:
self.book.append(self.current_page)
def end_page(self):
""" Store the paragraph, text block and page being built in the book and start new ones """
# Put the paragraph built so far into the current text block, then begin an empty paragraph
self.current_block.append(self.current_para)
self.current_para = Paragraph()
# Put the text block into the current page, then begin an empty text block
self.current_page.append(self.current_block)
self.current_block = TextBlock()
# Put the page into the book, then begin an empty page
self.book.append(self.current_page)
self.current_page = Page()
def parse_tag(self, tag, parent_css):
def sanctify_css(css):
    """
    Return a copy of C{css} that is safe for use in a Xylog Span tag.

    Margin, indent, padding and border properties as well as color, display,
    text-decoration and letter-spacing are left out of the result. Keys are
    compared case-insensitively.

    The dictionary passed in is not modified: the same style dictionary is
    handed to the sibling nodes that follow a piece of text, so stripping
    properties from it in place would change how those siblings are styled.

    @param css: Dictionary mapping CSS property names to values
    @return: A new dictionary containing only the properties that were kept
    """
    unsafe = ('color', 'display', 'text-decoration', 'letter-spacing')
    safe = {}
    for key, value in css.items():
        test = key.lower()
        if test.startswith('margin') or 'indent' in test or \
           'padding' in test or 'border' in test or test in unsafe:
            continue
        safe[key] = value
    return safe
def add_text(tag, css):
try:
self.current_para.append(Span(tag, css))
self.current_para.append(Span(tag, sanctify_css(css)))
except ConversionError, err:
if self.verbose:
print >>sys.stderr, err
def process_text_tag(tag, pcss):
    """
    Process the children of C{tag}, honouring the page-break-before and
    page-break-after properties found in C{pcss}.

    Both page break properties are removed from C{pcss} before the children
    are processed, so that they are not passed on to them. A value of
    'avoid' never starts a new page.

    @param tag: The tag whose contents are to be processed
    @param pcss: Dictionary of CSS properties that apply to C{tag}
    """
    before = pcss.pop('page-break-before', None)
    if before is not None and before.lower() != 'avoid':
        self.end_page()
    after = pcss.pop('page-break-after', None)
    end_page = after is not None and after.lower() != 'avoid'
    for c in tag.contents:
        # Test the child, not the enclosing tag: the enclosing tag is never
        # a string, so the text branch could not be reached otherwise.
        if isinstance(c, Comment):
            continue
        elif isinstance(c, NavigableString):
            add_text(c, pcss)
        else:
            self.parse_tag(c, pcss)
    if end_page:
        self.end_page()
try:
tagname = tag.name.lower()
@ -280,8 +321,17 @@ class HTMLConvertor(object):
return
if tagname in ["title", "script", "meta"]:
pass
elif tagname in ['style', 'link']:
# TODO: Append CSS to self.css
pass
elif tagname == 'p':
css = self.tag_css(tag, parent_css=parent_css)
indent = css.pop('text-indent', '')
if indent:
# TODO: If indent is different from current textblock's parindent
# start a new TextBlock
pass
self.current_para.CR() # Put a paragraph end
self.current_block.append(self.current_para)
self.current_para = Paragraph()
process_text_tag(tag, css)
@ -302,13 +352,14 @@ class HTMLConvertor(object):
self.current_para = Paragraph()
self.current_page = Page()
else:
css = self.tag_css(tag, parent_css=parent_css)
for c in tag.contents:
if isinstance(c, Comment):
continue
elif isinstance(c, Tag):
self.parse_tag(c)
self.parse_tag(c, css)
elif isinstance(c, NavigableString):
add_text(c, parent_css)
add_text(c, css)
def writeto(self, path):
if path.lower().endswith('lrs'):
@ -327,8 +378,33 @@ def process_file(path, options):
book = Book(title=options.title, author=options.author, \
sourceencoding='utf8')
conv = HTMLConvertor(book, soup)
name = os.path.splitext(os.path.basename(path))[0]+'.lrs'
name = os.path.splitext(os.path.basename(path))[0]+'.lrf'
os.chdir(cwd)
conv.writeto(name)
finally:
os.chdir(cwd)
def main():
    """
    CLI for html -> lrf conversions.

    Expects exactly one positional argument, the path to the HTML file.
    Otherwise the help text is printed and the process exits with status 1.
    If no title is specified, the base name of the source file (without its
    extension) is used as the title.
    """
    parser = OptionParser(usage=\
"""usage: %prog [options] mybook.html
%prog converts mybook.html to mybook.lrf
"""\
        )
    parser.add_option("-t", "--title", action="store", type="string", \
                      dest="title", help="Set the title")
    parser.add_option("-a", "--author", action="store", type="string", \
                      dest="author", help="Set the author", default='Unknown')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        sys.exit(1)
    src = args[0]
    if options.title is None:
        options.title = os.path.splitext(os.path.basename(src))[0]
    process_file(src, options)
if __name__ == '__main__':
main()

View File

@ -1,266 +0,0 @@
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Thin ctypes based wrapper around libtidy. Example usage:
>>> from libtidy import parseString
>>> print parseString('<h1>fowehfow</h2>', \
output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
<?xml version="1.0" encoding="us-ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title></title>
</head>
<body>
<h1>
fowehfow
</h1>
</body>
</html>
"""
import ctypes
from cStringIO import StringIO
import weakref
class TidyLibError(Exception):
def __init__(self, arg):
self.arg=arg
class InvalidOptionError(TidyLibError):
def __str__(self):
return "%s was not a valid Tidy option." % (self.arg)
__repr__=__str__
class OptionArgError(TidyLibError):
def __init__(self, arg):
self.arg=arg
def __str__(self):
return self.arg
# search the path for libtidy using the known names;
thelib=None
for libname in ('cygtidy-0-99-0', 'libtidy', 'libtidy.so', 'tidylib'):
try:
thelib = getattr(ctypes.cdll, libname)
break
except OSError:
pass
if not thelib:
raise OSError("Couldn't find libtidy, please make sure it is installed.")
class Loader:
"""
I am a trivial wrapper that eliminates the need for tidy.tidyFoo,
so you can just access tidy.Foo
"""
def __init__(self):
self.lib = thelib
def __getattr__(self, name):
try:
return getattr(self.lib, "tidy%s" % name)
# current ctypes uses ValueError, future will use AttributeError
except (ValueError, AttributeError):
return getattr(self.lib, name)
_tidy=Loader()
# define a callback to pass to Tidylib
def _putByte(handle, c):
"""Lookup sink by handle and call its putByte method"""
sinkfactory[handle].putByte(c)
return 0
PUTBYTEFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char)
putByte = PUTBYTEFUNC(_putByte)
class _OutputSink(ctypes.Structure):
_fields_ = [("sinkData", ctypes.c_int),
("putByte", PUTBYTEFUNC),
]
class _Sink:
def __init__(self):
self._data = StringIO()
self.struct = _OutputSink()
self.struct.putByte = putByte
def putByte(self, c):
self._data.write(c)
def __str__(self):
return self._data.getvalue()
class ReportItem:
def __init__(self, err):
self.err = err
if err.startswith('line'):
tokens = err.split(' ',6)
self.severity = tokens[5][0] # W or E
self.line = int(tokens[1])
self.col = int(tokens[3])
self.message = tokens[6]
else:
tokens = err.split(' ',1)
self.severity = tokens[0][0]
self.message = tokens[1]
self.line = None
self.col = None
# TODO - parse emacs mode
def __str__(self):
severities = dict(W='Warning', E='Error', C='Config')
try:
if self.line:
return "line %d col %d - %s: %s" % (self.line, self.col,
severities[self.severity],
self.message)
else:
return "%s: %s" % (severities[self.severity], self.message)
except KeyError:
return self.err
def __repr__(self):
return "%s('%s')" % (self.__class__.__name__,
str(self).replace("'", "\\'"))
class FactoryDict(dict):
"""I am a dict with a create method and no __setitem__. This allows
me to control my own keys.
"""
def create(self):
"""Subclasses should implement me to generate a new item"""
def _setitem(self, name, value):
dict.__setitem__(self, name, value)
def __setitem__(self, name, value):
raise TypeError, "Use create() to get a new object"
class SinkFactory(FactoryDict):
"""Mapping for lookup of sinks by handle"""
def __init__(self):
FactoryDict.__init__(self)
self.lastsink = 0
def create(self):
sink = _Sink()
sink.struct.sinkData = self.lastsink
FactoryDict._setitem(self, self.lastsink, sink)
self.lastsink = self.lastsink+1
return sink
sinkfactory = SinkFactory()
class _Document(object):
def __init__(self):
self.cdoc = _tidy.Create()
self.errsink = sinkfactory.create()
_tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct))
def write(self, stream):
stream.write(str(self))
def get_errors(self):
ret = []
for line in str(self.errsink).split('\n'):
line = line.strip(' \n\r')
if line: ret.append(ReportItem(line))
return ret
errors=property(get_errors)
def __str__(self):
stlen = ctypes.c_int(8192)
st = ctypes.c_buffer(stlen.value)
rc = _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen))
if rc==-12: # buffer too small
st = ctypes.c_buffer(stlen.value)
_tidy.SaveString(self.cdoc, st, ctypes.byref(stlen))
return st.value
errors = {'missing or malformed argument for option: ': OptionArgError,
'unknown option: ': InvalidOptionError,
}
class DocumentFactory(FactoryDict):
def _setOptions(self, doc, **options):
for k in options.keys():
# this will flush out most argument type errors...
if options[k] is None: options[k] = ''
_tidy.OptParseValue(doc.cdoc,
k.replace('_', '-'),
str(options[k]))
if doc.errors:
match=filter(doc.errors[-1].message.startswith, errors.keys())
if match:
raise errors[match[0]](doc.errors[-1].message)
def load(self, doc, arg, loader):
loader(doc.cdoc, arg)
_tidy.CleanAndRepair(doc.cdoc)
def loadFile(self, doc, filename):
self.load(doc, filename, _tidy.ParseFile)
def loadString(self, doc, st):
self.load(doc, st, _tidy.ParseString)
def _create(self, *args, **kwargs):
doc = _Document()
self._setOptions(doc, **kwargs)
ref = weakref.ref(doc, self.releaseDoc)
FactoryDict._setitem(self, ref, doc.cdoc)
return doc
def parse(self, filename, *args, **kwargs):
"""
Open and process filename as an HTML file, returning a
processed document object.
@param kwargs: named options to pass to TidyLib for processing
the input file.
@param filename: the name of a file to process
@return: a document object
"""
doc = self._create(**kwargs)
self.loadFile(doc, filename)
return doc
def parseString(self, st, *args, **kwargs):
"""
Use st as an HTML file, and process it, returning a
document object.
@param kwargs: named options to pass to TidyLib for processing
the input file.
@param st: the string to parse
@return: a document object
"""
doc = self._create(**kwargs)
self.loadString(doc, st)
return doc
def releaseDoc(self, ref):
_tidy.Release(self[ref])
docfactory = DocumentFactory()
parse = docfactory.parse
parseString = docfactory.parseString

View File

@ -17,19 +17,14 @@ import shutil
import sys
import hashlib
import re
import time
import pkg_resources
import subprocess
from tempfile import mkdtemp
from optparse import OptionParser
import xml.dom.minidom as dom
from libprs500.lrf import ConversionError
from libprs500.lrf.meta import LRFException, LRFMetaFile
from libprs500.ptempfile import PersistentTemporaryFile
_bbebook = 'BBeBook-0.2.jar'
def generate_thumbnail(path):
""" Generate a JPEG thumbnail of size ~ 128x128 (aspect ratio preserved)"""
try:
@ -45,30 +40,6 @@ def generate_thumbnail(path):
im.save(thumb.name)
return thumb
def create_xml(cfg):
doc = dom.getDOMImplementation().createDocument(None, None, None)
def add_field(parent, tag, value):
elem = doc.createElement(tag)
elem.appendChild(doc.createTextNode(value))
parent.appendChild(elem)
info = doc.createElement('Info')
info.setAttribute('version', '1.0')
book_info = doc.createElement('BookInfo')
doc_info = doc.createElement('DocInfo')
info.appendChild(book_info)
info.appendChild(doc_info)
add_field(book_info, 'File', cfg['File'])
add_field(doc_info, 'Output', cfg['Output'])
for field in ['Title', 'Author', 'BookID', 'Publisher', 'Label', \
'Category', 'Classification', 'Icon', 'Cover', 'FreeText']:
if cfg.has_key(field):
add_field(book_info, field, cfg[field])
add_field(doc_info, 'Language', 'en')
add_field(doc_info, 'Creator', _bbebook)
add_field(doc_info, 'CreationDate', time.strftime('%Y-%m-%d', time.gmtime()))
doc.appendChild(info)
return doc.toxml()
def makelrf(author=None, title=None, \
thumbnail=None, src=None, odir=".",\
@ -150,127 +121,3 @@ def makelrf(author=None, title=None, \
if dirpath:
shutil.rmtree(dirpath, True)
def txt():
""" CLI for txt -> lrf conversions """
parser = OptionParser(usage=\
"""usage: %prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf
"""\
)
parser.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the title")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown')
defenc = 'cp1252'
enchelp = 'Set the encoding used to decode ' + \
'the text in mybook.txt. Default encoding is ' + defenc
parser.add_option('-e', '--encoding', action='store', type='string', \
dest='encoding', help=enchelp, default=defenc)
options, args = parser.parse_args()
if len(args) != 1:
parser.print_help()
sys.exit(1)
src = args[0]
if options.title == None:
options.title = os.path.splitext(os.path.basename(src))[0]
try:
convert_txt(src, options)
except ConversionError, err:
print >>sys.stderr, err
sys.exit(1)
def convert_txt(path, options):
"""
Convert the text file at C{path} into an lrf file.
@param options: Object with the following attributes:
C{author}, C{title}, C{encoding} (the assumed encoding of
the text in C{path}.)
"""
import fileinput
from libprs500.lrf.pylrs.pylrs import Book
book = Book(title=options.title, author=options.author, \
sourceencoding=options.encoding)
buffer = ''
block = book.Page().TextBlock()
for line in fileinput.input(path):
line = line.strip()
if line:
buffer += line
else:
block.Paragraph(buffer)
buffer = ''
basename = os.path.basename(path)
name = os.path.splitext(basename)[0]+'.lrf'
try:
book.renderLrf(name)
except UnicodeDecodeError:
raise ConversionError(path + ' is not encoded in ' + \
options.encoding +'. Specify the '+ \
'correct encoding with the -e option.')
return os.path.abspath(name)
def html():
""" CLI for html -> lrf conversions """
parser = OptionParser(usage=\
"""usage: %prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf
"""\
)
parser.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the title")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown')
options, args = parser.parse_args()
if len(args) != 1:
parser.print_help()
sys.exit(1)
src = args[0]
if options.title == None:
options.title = os.path.splitext(os.path.basename(src))[0]
from libprs500.lrf.html.convert import process_file
process_file(src, options)
def main(cargs=None):
parser = OptionParser(usage=\
"""usage: %prog [options] mybook.[html|pdf|rar]
%prog converts mybook to mybook.lrf
If you specify a rar file you must have the unrar command line client
installed. makelrf assumes the rar file is an archive containing the
html file you want converted."""\
)
parser.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the book title")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author")
parser.add_option('-r', '--rasterize', action='store_false', \
dest="rasterize",
help="Convert pdfs into image files.")
parser.add_option('-c', '--cover', action='store', dest='cover',\
help="Path to a graphic that will be set as the cover. "\
"If it is specified the thumbnail is automatically "\
"generated from it")
parser.add_option("--thumbnail", action="store", type="string", \
dest="thumbnail", \
help="Path to a graphic that will be set as the thumbnail")
if not cargs:
cargs = sys.argv
options, args = parser.parse_args()
if len(args) != 1:
parser.print_help()
sys.exit(1)
src = args[0]
root, ext = os.path.splitext(src)
if ext not in ['.html', '.pdf', '.rar']:
print >> sys.stderr, "Can only convert files ending in .html|.pdf|.rar"
parser.print_help()
sys.exit(1)
name = makelrf(author=options.author, title=options.title, \
thumbnail=options.thumbnail, src=src, cover=options.cover, \
rasterize=options.rasterize)
print "LRF generated:", name

View File

@ -0,0 +1,14 @@
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

View File

@ -0,0 +1,86 @@
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Convert .txt files to .lrf
"""
import os, sys
from optparse import OptionParser
from libprs500.lrf import ConversionError
def main():
""" CLI for txt -> lrf conversions """
parser = OptionParser(usage=\
"""usage: %prog [options] mybook.txt
%prog converts mybook.txt to mybook.lrf
"""\
)
parser.add_option("-t", "--title", action="store", type="string", \
dest="title", help="Set the title")
parser.add_option("-a", "--author", action="store", type="string", \
dest="author", help="Set the author", default='Unknown')
defenc = 'cp1252'
enchelp = 'Set the encoding used to decode ' + \
'the text in mybook.txt. Default encoding is ' + defenc
parser.add_option('-e', '--encoding', action='store', type='string', \
dest='encoding', help=enchelp, default=defenc)
options, args = parser.parse_args()
if len(args) != 1:
parser.print_help()
sys.exit(1)
src = args[0]
if options.title == None:
options.title = os.path.splitext(os.path.basename(src))[0]
try:
convert_txt(src, options)
except ConversionError, err:
print >>sys.stderr, err
sys.exit(1)
def convert_txt(path, options):
    """
    Convert the text file at C{path} into an lrf file.

    Runs of non-blank lines form one paragraph, blank lines separate
    paragraphs. The lines of a paragraph are joined with a single space so
    that the words at a line break do not run together. The lrf file is
    written to the current directory and is named after C{path} with the
    extension replaced by .lrf.

    @param options: Object with the following attributes:
                    C{author}, C{title}, C{encoding} (the assumed encoding of
                    the text in C{path}.)
    @return: Absolute path to the generated lrf file
    @raise ConversionError: If rendering fails with a C{UnicodeDecodeError},
                            i.e. the text is not encoded in C{options.encoding}
    """
    import fileinput
    from libprs500.lrf.pylrs.pylrs import Book
    book = Book(title=options.title, author=options.author, \
                sourceencoding=options.encoding)
    block = book.Page().TextBlock()
    pending = []
    for line in fileinput.input(path):
        line = line.strip()
        if line:
            pending.append(line)
        elif pending:
            # Several blank lines in a row must not produce empty paragraphs
            block.Paragraph(' '.join(pending))
            pending = []
    if pending:
        # The file did not end with a blank line, do not lose the last paragraph
        block.Paragraph(' '.join(pending))
    basename = os.path.basename(path)
    name = os.path.splitext(basename)[0]+'.lrf'
    try:
        book.renderLrf(name)
    except UnicodeDecodeError:
        raise ConversionError(path + ' is not encoded in ' + \
                              options.encoding +'. Specify the '+ \
                              'correct encoding with the -e option.')
    return os.path.abspath(name)
if __name__ == '__main__':
main()