Remove html5lib from miscellaneous places

2025-07-09 03:04:10 -04:00 · 2017-07-08 18:29:43 +05:30 · 2017-07-08 18:29:43 +05:30 · 5e67ba1369
commit 5e67ba1369
parent 62e4a9900e
5 changed files with 28 additions and 10 deletions
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -100,7 +100,7 @@ def html5_parse(data, max_nesting_depth=100):
        if isinstance(x.tag, basestring) and len(x) is 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
-                raise ValueError('html5lib resulted in a tree with nesting'
+                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                        ' depth > %d'%max_nesting_depth)
    return data
--- a/src/calibre/gui2/store/stores/google_books_plugin.py
+++ b/src/calibre/gui2/store/stores/google_books_plugin.py
@ -13,7 +13,6 @@ from contextlib import closing
 from lxml import html
 from PyQt5.Qt import QUrl
-import html5lib
 from calibre import browser, url_slash_cleaner
 from calibre.gui2 import open_url
 from calibre.gui2.store import StorePlugin
@ -23,7 +22,14 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
 def parse_html(raw):
-    return html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
+    try:
        # Prefer html5_parser whenever it can be imported.
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        # html5_parser could not be imported, so fall back to html5lib with
        # the same keyword arguments the removed line passed.
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    else:
        # Reached only when the html5_parser import above succeeded.
        return parse(raw)
 def search_google(query, max_results=10, timeout=60, write_html_to=None):
--- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py
+++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py
@ -11,7 +11,6 @@ import urllib
 from contextlib import closing
 from PyQt5.Qt import QUrl
-import html5lib
 from calibre import browser, url_slash_cleaner
 from calibre.ebooks.chardet import xml_to_unicode
@ -23,6 +22,17 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
 shop_url = 'http://www.ozon.ru'
 def parse_html(raw):
    '''
    Parse ``raw`` and return whatever the selected parser's ``parse()``
    returns.

    ``html5_parser.parse(raw)`` is used when html5_parser can be imported.
    Otherwise the call falls back to ``html5lib.parse()`` with
    ``treebuilder='lxml'`` and ``namespaceHTMLElements=False``.
    '''
    try:
        from html5_parser import parse as fast_parse
    except ImportError:
        fast_parse = None

    if fast_parse is not None:
        return fast_parse(raw)

    # Old versions of calibre
    import html5lib
    return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
 def search(query, max_results=15, timeout=60):
    url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % urllib.quote_plus(query)
@ -31,7 +41,7 @@ def search(query, max_results=15, timeout=60):
    with closing(br.open(url, timeout=timeout)) as f:
        raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
-        root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+        root = parse_html(raw)
        for tile in root.xpath('//*[@class="bShelfTile inline"]'):
            if counter <= 0:
                break
@ -74,6 +84,7 @@ def format_price_in_RUR(price):
    price = price.replace('\xa0', '').replace(',', '.').strip() + ' py6'
    return price
 if __name__ == '__main__':
    import sys
    for r in search(sys.argv[-1]):
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -23,9 +23,7 @@ from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_lo
 from calibre.gui2.tweak_book.editor.syntax.css import (
    create_formats as create_css_formats, state_map as css_state_map, CSSState, CSSUserData)
-from html5lib.constants import cdataElements, rcdataElements
+cdata_tags = frozenset(['title', 'textarea', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'])
-cdata_tags = cdataElements | rcdataElements
 normal_pat = re.compile(r'[^<>&]+')
 entity_pat = re.compile(r'&#{0,1}[a-zA-Z0-9]{1,8};')
 tag_name_pat = re.compile(r'/{0,1}[a-zA-Z0-9:-]+')
--- a/src/calibre/utils/open_with/osx.py
+++ b/src/calibre/utils/open_with/osx.py
@ -19,10 +19,11 @@ application_locations = ('/Applications', '~/Applications', '~/Desktop')
 def generate_public_uti_map():
    from lxml import etree
-    import html5lib, urllib
+    import urllib
    # The parser module is named html5_parser (with an underscore), the same
    # spelling the parse_html() helpers in this commit import from.
    from html5_parser import parse
    raw = urllib.urlopen(
        'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read()
-    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    root = parse(raw)
    tables = root.xpath('//table')[0::2]
    data = {}
    for table in tables:
@ -44,6 +45,8 @@ def generate_public_uti_map():
        f.seek(0)
        nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw, flags=re.MULTILINE | re.DOTALL)
        f.truncate(), f.write(nraw)
 # Generated by generate_public_uti_map()
 PUBLIC_UTI_MAP = {
    '3g2':          'public.3gpp2',