mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Added wrapper for HTMLTidy so that we can clean up HTML files before conversion to LRF.
This commit is contained in:
parent
51b5017b43
commit
a06bc96269
266
src/libprs500/lrf/libtidy.py
Normal file
266
src/libprs500/lrf/libtidy.py
Normal file
@ -0,0 +1,266 @@
|
||||
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
"""
|
||||
Thin ctypes based wrapper around libtidy. Example usage:
|
||||
>>> from libtidy import parseString
|
||||
>>> print parseString('<h1>fowehfow</h2>', \
|
||||
output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
|
||||
<?xml version="1.0" encoding="us-ascii"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title></title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>
|
||||
fowehfow
|
||||
</h1>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
from cStringIO import StringIO
|
||||
import weakref
|
||||
|
||||
class TidyLibError(Exception):
|
||||
def __init__(self, arg):
|
||||
self.arg=arg
|
||||
|
||||
class InvalidOptionError(TidyLibError):
|
||||
def __str__(self):
|
||||
return "%s was not a valid Tidy option." % (self.arg)
|
||||
__repr__=__str__
|
||||
|
||||
class OptionArgError(TidyLibError):
|
||||
def __init__(self, arg):
|
||||
self.arg=arg
|
||||
def __str__(self):
|
||||
return self.arg
|
||||
|
||||
# search the path for libtidy using the known names;
|
||||
thelib=None
|
||||
for libname in ('cygtidy-0-99-0', 'libtidy', 'libtidy.so', 'tidylib'):
|
||||
try:
|
||||
thelib = getattr(ctypes.cdll, libname)
|
||||
break
|
||||
except OSError:
|
||||
pass
|
||||
if not thelib:
|
||||
raise OSError("Couldn't find libtidy, please make sure it is installed.")
|
||||
|
||||
class Loader:
|
||||
"""
|
||||
I am a trivial wrapper that eliminates the need for tidy.tidyFoo,
|
||||
so you can just access tidy.Foo
|
||||
"""
|
||||
def __init__(self):
|
||||
self.lib = thelib
|
||||
def __getattr__(self, name):
|
||||
try:
|
||||
return getattr(self.lib, "tidy%s" % name)
|
||||
# current ctypes uses ValueError, future will use AttributeError
|
||||
except (ValueError, AttributeError):
|
||||
return getattr(self.lib, name)
|
||||
|
||||
_tidy=Loader()
|
||||
|
||||
# define a callback to pass to Tidylib
|
||||
def _putByte(handle, c):
|
||||
"""Lookup sink by handle and call its putByte method"""
|
||||
sinkfactory[handle].putByte(c)
|
||||
return 0
|
||||
|
||||
PUTBYTEFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char)
|
||||
putByte = PUTBYTEFUNC(_putByte)
|
||||
|
||||
class _OutputSink(ctypes.Structure):
|
||||
_fields_ = [("sinkData", ctypes.c_int),
|
||||
("putByte", PUTBYTEFUNC),
|
||||
]
|
||||
|
||||
class _Sink:
|
||||
def __init__(self):
|
||||
self._data = StringIO()
|
||||
self.struct = _OutputSink()
|
||||
self.struct.putByte = putByte
|
||||
|
||||
def putByte(self, c):
|
||||
self._data.write(c)
|
||||
|
||||
def __str__(self):
|
||||
return self._data.getvalue()
|
||||
|
||||
class ReportItem:
|
||||
def __init__(self, err):
|
||||
self.err = err
|
||||
if err.startswith('line'):
|
||||
tokens = err.split(' ',6)
|
||||
self.severity = tokens[5][0] # W or E
|
||||
self.line = int(tokens[1])
|
||||
self.col = int(tokens[3])
|
||||
self.message = tokens[6]
|
||||
else:
|
||||
tokens = err.split(' ',1)
|
||||
self.severity = tokens[0][0]
|
||||
self.message = tokens[1]
|
||||
self.line = None
|
||||
self.col = None
|
||||
# TODO - parse emacs mode
|
||||
|
||||
def __str__(self):
|
||||
severities = dict(W='Warning', E='Error', C='Config')
|
||||
try:
|
||||
if self.line:
|
||||
return "line %d col %d - %s: %s" % (self.line, self.col,
|
||||
severities[self.severity],
|
||||
self.message)
|
||||
|
||||
else:
|
||||
return "%s: %s" % (severities[self.severity], self.message)
|
||||
except KeyError:
|
||||
return self.err
|
||||
|
||||
def __repr__(self):
|
||||
return "%s('%s')" % (self.__class__.__name__,
|
||||
str(self).replace("'", "\\'"))
|
||||
|
||||
class FactoryDict(dict):
|
||||
"""I am a dict with a create method and no __setitem__. This allows
|
||||
me to control my own keys.
|
||||
"""
|
||||
def create(self):
|
||||
"""Subclasses should implement me to generate a new item"""
|
||||
|
||||
def _setitem(self, name, value):
|
||||
dict.__setitem__(self, name, value)
|
||||
|
||||
def __setitem__(self, name, value):
|
||||
raise TypeError, "Use create() to get a new object"
|
||||
|
||||
|
||||
class SinkFactory(FactoryDict):
|
||||
"""Mapping for lookup of sinks by handle"""
|
||||
def __init__(self):
|
||||
FactoryDict.__init__(self)
|
||||
self.lastsink = 0
|
||||
|
||||
def create(self):
|
||||
sink = _Sink()
|
||||
sink.struct.sinkData = self.lastsink
|
||||
FactoryDict._setitem(self, self.lastsink, sink)
|
||||
self.lastsink = self.lastsink+1
|
||||
return sink
|
||||
|
||||
sinkfactory = SinkFactory()
|
||||
|
||||
class _Document(object):
|
||||
def __init__(self):
|
||||
self.cdoc = _tidy.Create()
|
||||
self.errsink = sinkfactory.create()
|
||||
_tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct))
|
||||
|
||||
def write(self, stream):
|
||||
stream.write(str(self))
|
||||
|
||||
def get_errors(self):
|
||||
ret = []
|
||||
for line in str(self.errsink).split('\n'):
|
||||
line = line.strip(' \n\r')
|
||||
if line: ret.append(ReportItem(line))
|
||||
return ret
|
||||
|
||||
errors=property(get_errors)
|
||||
|
||||
def __str__(self):
|
||||
stlen = ctypes.c_int(8192)
|
||||
st = ctypes.c_buffer(stlen.value)
|
||||
rc = _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen))
|
||||
if rc==-12: # buffer too small
|
||||
st = ctypes.c_buffer(stlen.value)
|
||||
_tidy.SaveString(self.cdoc, st, ctypes.byref(stlen))
|
||||
return st.value
|
||||
|
||||
errors = {'missing or malformed argument for option: ': OptionArgError,
|
||||
'unknown option: ': InvalidOptionError,
|
||||
}
|
||||
|
||||
|
||||
class DocumentFactory(FactoryDict):
|
||||
def _setOptions(self, doc, **options):
|
||||
for k in options.keys():
|
||||
|
||||
# this will flush out most argument type errors...
|
||||
if options[k] is None: options[k] = ''
|
||||
|
||||
_tidy.OptParseValue(doc.cdoc,
|
||||
k.replace('_', '-'),
|
||||
str(options[k]))
|
||||
if doc.errors:
|
||||
match=filter(doc.errors[-1].message.startswith, errors.keys())
|
||||
if match:
|
||||
raise errors[match[0]](doc.errors[-1].message)
|
||||
|
||||
def load(self, doc, arg, loader):
|
||||
loader(doc.cdoc, arg)
|
||||
_tidy.CleanAndRepair(doc.cdoc)
|
||||
|
||||
def loadFile(self, doc, filename):
|
||||
self.load(doc, filename, _tidy.ParseFile)
|
||||
|
||||
def loadString(self, doc, st):
|
||||
self.load(doc, st, _tidy.ParseString)
|
||||
|
||||
def _create(self, *args, **kwargs):
|
||||
doc = _Document()
|
||||
self._setOptions(doc, **kwargs)
|
||||
ref = weakref.ref(doc, self.releaseDoc)
|
||||
FactoryDict._setitem(self, ref, doc.cdoc)
|
||||
return doc
|
||||
|
||||
def parse(self, filename, *args, **kwargs):
|
||||
"""
|
||||
Open and process filename as an HTML file, returning a
|
||||
processed document object.
|
||||
@param kwargs: named options to pass to TidyLib for processing
|
||||
the input file.
|
||||
@param filename: the name of a file to process
|
||||
@return: a document object
|
||||
"""
|
||||
doc = self._create(**kwargs)
|
||||
self.loadFile(doc, filename)
|
||||
return doc
|
||||
|
||||
def parseString(self, st, *args, **kwargs):
|
||||
"""
|
||||
Use st as an HTML file, and process it, returning a
|
||||
document object.
|
||||
@param kwargs: named options to pass to TidyLib for processing
|
||||
the input file.
|
||||
@param st: the string to parse
|
||||
@return: a document object
|
||||
"""
|
||||
doc = self._create(**kwargs)
|
||||
self.loadString(doc, st)
|
||||
return doc
|
||||
|
||||
def releaseDoc(self, ref):
|
||||
_tidy.Release(self[ref])
|
||||
|
||||
docfactory = DocumentFactory()
|
||||
parse = docfactory.parse
|
||||
parseString = docfactory.parseString
|
Loading…
x
Reference in New Issue
Block a user