Remove html5lib from miscellaneous places

This commit is contained in:
Kovid Goyal 2017-07-08 18:29:43 +05:30
parent 62e4a9900e
commit 5e67ba1369
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 28 additions and 10 deletions

View File

@ -100,7 +100,7 @@ def html5_parse(data, max_nesting_depth=100):
if isinstance(x.tag, basestring) and len(x) is 0: # Leaf node if isinstance(x.tag, basestring) and len(x) is 0: # Leaf node
depth = node_depth(x) depth = node_depth(x)
if depth > max_nesting_depth: if depth > max_nesting_depth:
raise ValueError('html5lib resulted in a tree with nesting' raise ValueError('HTML 5 parsing resulted in a tree with nesting'
' depth > %d'%max_nesting_depth) ' depth > %d'%max_nesting_depth)
return data return data

View File

@ -13,7 +13,6 @@ from contextlib import closing
from lxml import html from lxml import html
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
import html5lib
from calibre import browser, url_slash_cleaner from calibre import browser, url_slash_cleaner
from calibre.gui2 import open_url from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin from calibre.gui2.store import StorePlugin
@ -23,7 +22,14 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
def parse_html(raw): def parse_html(raw):
return html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml') try:
from html5_parser import parse
except ImportError:
# Old versions of calibre
import html5lib
return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
else:
return parse(raw)
def search_google(query, max_results=10, timeout=60, write_html_to=None): def search_google(query, max_results=10, timeout=60, write_html_to=None):

View File

@ -11,7 +11,6 @@ import urllib
from contextlib import closing from contextlib import closing
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
import html5lib
from calibre import browser, url_slash_cleaner from calibre import browser, url_slash_cleaner
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
@ -23,6 +22,17 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
shop_url = 'http://www.ozon.ru' shop_url = 'http://www.ozon.ru'
def parse_html(raw):
try:
from html5_parser import parse
except ImportError:
# Old versions of calibre
import html5lib
return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
else:
return parse(raw)
def search(query, max_results=15, timeout=60): def search(query, max_results=15, timeout=60):
url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % urllib.quote_plus(query) url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % urllib.quote_plus(query)
@ -31,7 +41,7 @@ def search(query, max_results=15, timeout=60):
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0] raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) root = parse_html(raw)
for tile in root.xpath('//*[@class="bShelfTile inline"]'): for tile in root.xpath('//*[@class="bShelfTile inline"]'):
if counter <= 0: if counter <= 0:
break break
@ -74,6 +84,7 @@ def format_price_in_RUR(price):
price = price.replace('\xa0', '').replace(',', '.').strip() + ' py6' price = price.replace('\xa0', '').replace(',', '.').strip() + ' py6'
return price return price
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
for r in search(sys.argv[-1]): for r in search(sys.argv[-1]):

View File

@ -23,9 +23,7 @@ from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_lo
from calibre.gui2.tweak_book.editor.syntax.css import ( from calibre.gui2.tweak_book.editor.syntax.css import (
create_formats as create_css_formats, state_map as css_state_map, CSSState, CSSUserData) create_formats as create_css_formats, state_map as css_state_map, CSSState, CSSUserData)
from html5lib.constants import cdataElements, rcdataElements cdata_tags = frozenset(['title', 'textarea', 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'])
cdata_tags = cdataElements | rcdataElements
normal_pat = re.compile(r'[^<>&]+') normal_pat = re.compile(r'[^<>&]+')
entity_pat = re.compile(r'&#{0,1}[a-zA-Z0-9]{1,8};') entity_pat = re.compile(r'&#{0,1}[a-zA-Z0-9]{1,8};')
tag_name_pat = re.compile(r'/{0,1}[a-zA-Z0-9:-]+') tag_name_pat = re.compile(r'/{0,1}[a-zA-Z0-9:-]+')

View File

@ -19,10 +19,11 @@ application_locations = ('/Applications', '~/Applications', '~/Desktop')
def generate_public_uti_map(): def generate_public_uti_map():
from lxml import etree from lxml import etree
import html5lib, urllib import urllib
from html5_parser import parse
raw = urllib.urlopen( raw = urllib.urlopen(
'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read() 'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read()
root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) root = parse(raw)
tables = root.xpath('//table')[0::2] tables = root.xpath('//table')[0::2]
data = {} data = {}
for table in tables: for table in tables:
@ -44,6 +45,8 @@ def generate_public_uti_map():
f.seek(0) f.seek(0)
nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw, flags=re.MULTILINE | re.DOTALL) nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw, flags=re.MULTILINE | re.DOTALL)
f.truncate(), f.write(nraw) f.truncate(), f.write(nraw)
# Generated by generate_public_uti_map() # Generated by generate_public_uti_map()
PUBLIC_UTI_MAP = { PUBLIC_UTI_MAP = {
'3g2': 'public.3gpp2', '3g2': 'public.3gpp2',