From a06bc96269800ff0622eb48cf4a57aa04688bcbf Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 25 Feb 2007 16:51:56 +0000 Subject: [PATCH] Added wrapper for HTMLTidy so that we can clean up HTML files before conversion to LRF. --- src/libprs500/lrf/libtidy.py | 266 +++++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 src/libprs500/lrf/libtidy.py diff --git a/src/libprs500/lrf/libtidy.py b/src/libprs500/lrf/libtidy.py new file mode 100644 index 0000000000..e41c80bebc --- /dev/null +++ b/src/libprs500/lrf/libtidy.py @@ -0,0 +1,266 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +""" +Thin ctypes based wrapper around libtidy. Example usage: +>>> from libtidy import parseString +>>> print parseString('

fowehfow

', \ + output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0) + + + + + + + +

+ fowehfow +

+ + +""" + +import ctypes +from cStringIO import StringIO +import weakref + +class TidyLibError(Exception): + def __init__(self, arg): + self.arg=arg + +class InvalidOptionError(TidyLibError): + def __str__(self): + return "%s was not a valid Tidy option." % (self.arg) + __repr__=__str__ + +class OptionArgError(TidyLibError): + def __init__(self, arg): + self.arg=arg + def __str__(self): + return self.arg + +# search the path for libtidy using the known names; +thelib=None +for libname in ('cygtidy-0-99-0', 'libtidy', 'libtidy.so', 'tidylib'): + try: + thelib = getattr(ctypes.cdll, libname) + break + except OSError: + pass +if not thelib: + raise OSError("Couldn't find libtidy, please make sure it is installed.") + +class Loader: + """ + I am a trivial wrapper that eliminates the need for tidy.tidyFoo, + so you can just access tidy.Foo + """ + def __init__(self): + self.lib = thelib + def __getattr__(self, name): + try: + return getattr(self.lib, "tidy%s" % name) + # current ctypes uses ValueError, future will use AttributeError + except (ValueError, AttributeError): + return getattr(self.lib, name) + +_tidy=Loader() + +# define a callback to pass to Tidylib +def _putByte(handle, c): + """Lookup sink by handle and call its putByte method""" + sinkfactory[handle].putByte(c) + return 0 + +PUTBYTEFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char) +putByte = PUTBYTEFUNC(_putByte) + +class _OutputSink(ctypes.Structure): + _fields_ = [("sinkData", ctypes.c_int), + ("putByte", PUTBYTEFUNC), + ] + +class _Sink: + def __init__(self): + self._data = StringIO() + self.struct = _OutputSink() + self.struct.putByte = putByte + + def putByte(self, c): + self._data.write(c) + + def __str__(self): + return self._data.getvalue() + +class ReportItem: + def __init__(self, err): + self.err = err + if err.startswith('line'): + tokens = err.split(' ',6) + self.severity = tokens[5][0] # W or E + self.line = int(tokens[1]) + self.col = int(tokens[3]) + self.message = tokens[6] + else: + tokens = err.split(' ',1) + self.severity = tokens[0][0] + self.message = tokens[1] + self.line = None + self.col = None + # TODO - parse emacs mode + + def __str__(self): + severities = dict(W='Warning', E='Error', C='Config') + try: + if self.line: + return "line %d col %d - %s: %s" % (self.line, self.col, + severities[self.severity], + self.message) + + else: + return "%s: %s" % (severities[self.severity], self.message) + except KeyError: + return self.err + + def __repr__(self): + return "%s('%s')" % (self.__class__.__name__, + str(self).replace("'", "\\'")) + +class FactoryDict(dict): + """I am a dict with a create method and no __setitem__. This allows + me to control my own keys. + """ + def create(self): + """Subclasses should implement me to generate a new item""" + + def _setitem(self, name, value): + dict.__setitem__(self, name, value) + + def __setitem__(self, name, value): + raise TypeError, "Use create() to get a new object" + + +class SinkFactory(FactoryDict): + """Mapping for lookup of sinks by handle""" + def __init__(self): + FactoryDict.__init__(self) + self.lastsink = 0 + + def create(self): + sink = _Sink() + sink.struct.sinkData = self.lastsink + FactoryDict._setitem(self, self.lastsink, sink) + self.lastsink = self.lastsink+1 + return sink + +sinkfactory = SinkFactory() + +class _Document(object): + def __init__(self): + self.cdoc = _tidy.Create() + self.errsink = sinkfactory.create() + _tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct)) + + def write(self, stream): + stream.write(str(self)) + + def get_errors(self): + ret = [] + for line in str(self.errsink).split('\n'): + line = line.strip(' \n\r') + if line: ret.append(ReportItem(line)) + return ret + + errors=property(get_errors) + + def __str__(self): + stlen = ctypes.c_int(8192) + st = ctypes.c_buffer(stlen.value) + rc = _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen)) + if rc==-12: # buffer too small + st = ctypes.c_buffer(stlen.value) + _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen)) + return st.value + +errors = {'missing or malformed argument for option: ': OptionArgError, + 'unknown option: ': InvalidOptionError, + } + + +class DocumentFactory(FactoryDict): + def _setOptions(self, doc, **options): + for k in options.keys(): + + # this will flush out most argument type errors... + if options[k] is None: options[k] = '' + + _tidy.OptParseValue(doc.cdoc, + k.replace('_', '-'), + str(options[k])) + if doc.errors: + match=filter(doc.errors[-1].message.startswith, errors.keys()) + if match: + raise errors[match[0]](doc.errors[-1].message) + + def load(self, doc, arg, loader): + loader(doc.cdoc, arg) + _tidy.CleanAndRepair(doc.cdoc) + + def loadFile(self, doc, filename): + self.load(doc, filename, _tidy.ParseFile) + + def loadString(self, doc, st): + self.load(doc, st, _tidy.ParseString) + + def _create(self, *args, **kwargs): + doc = _Document() + self._setOptions(doc, **kwargs) + ref = weakref.ref(doc, self.releaseDoc) + FactoryDict._setitem(self, ref, doc.cdoc) + return doc + + def parse(self, filename, *args, **kwargs): + """ + Open and process filename as an HTML file, returning a + processed document object. + @param kwargs: named options to pass to TidyLib for processing + the input file. + @param filename: the name of a file to process + @return: a document object + """ + doc = self._create(**kwargs) + self.loadFile(doc, filename) + return doc + + def parseString(self, st, *args, **kwargs): + """ + Use st as an HTML file, and process it, returning a + document object. + @param kwargs: named options to pass to TidyLib for processing + the input file. + @param st: the string to parse + @return: a document object + """ + doc = self._create(**kwargs) + self.loadString(doc, st) + return doc + + def releaseDoc(self, ref): + _tidy.Release(self[ref]) + +docfactory = DocumentFactory() +parse = docfactory.parse +parseString = docfactory.parseString \ No newline at end of file