Ignore DataLossWarnings

This commit is contained in:
Kovid Goyal 2013-10-25 15:35:10 +05:30
parent 62ebb5e3f6
commit 03a39f15d1

View File

@@ -6,14 +6,14 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy, re import copy, re, warnings
from functools import partial from functools import partial
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
from html5lib.constants import namespaces, tableInsertModeElements from html5lib.constants import namespaces, tableInsertModeElements
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
from html5lib.ihatexml import InfosetFilter from html5lib.ihatexml import InfosetFilter, DataLossWarning
from html5lib.html5parser import HTMLParser from html5lib.html5parser import HTMLParser
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
@@ -357,11 +357,12 @@ def parse(raw, decoder=None, log=None):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
# TODO: Replace entities? # TODO: Replace entities?
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
# TODO: ignore warnings
while True: while True:
try: try:
parser = HTMLParser(tree=TreeBuilder) parser = HTMLParser(tree=TreeBuilder)
parser.parse(raw, parseMeta=False, useChardet=False) with warnings.catch_warnings():
warnings.simplefilter('ignore', category=DataLossWarning)
parser.parse(raw, parseMeta=False, useChardet=False)
except NamespacedHTMLPresent as err: except NamespacedHTMLPresent as err:
raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I) raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
continue continue