Start work on porting to beautifulsoup4

Kovid Goyal 2019-03-22 11:20:58 +05:30
parent 78b7112012
commit 692230147c
4 changed files with 66 additions and 1981 deletions


@@ -267,7 +267,7 @@ to go to https://www.nytimes.com/pages/todayspaper/index.html and fetch the list
 of articles that appear in *todays* paper. While more complex than simply using
 :term:`RSS`, the recipe creates an e-book that corresponds very closely to the
 days paper. ``parse_index`` makes heavy use of `BeautifulSoup
-<https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_ to parse
+<https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ to parse
 the daily paper webpage. You can also use other, more modern parsers if you
 dislike BeautifulSoup. calibre comes with `lxml <https://lxml.de/>`_ and
 `html5lib <https://github.com/html5lib/html5lib-python>`_, which are the
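
For readers following this port, here is a rough sketch (not code from this commit; the URL, the ``headline`` class and the feed layout are invented for illustration) of how a recipe's ``parse_index`` typically drives the bs4-backed ``index_to_soup``::

    def parse_index(self):
        # Fetch the index page and parse it with calibre's BeautifulSoup wrapper
        soup = self.index_to_soup('https://www.nytimes.com/pages/todayspaper/index.html')
        articles = []
        for h3 in soup.findAll('h3', attrs={'class': 'headline'}):  # assumed class name
            a = h3.find('a', href=True)
            if a is None:
                continue
            articles.append({
                'title': self.tag_to_string(a),
                'url': a['href'],
                'description': '',
                'date': '',
            })
        return [("Today's Paper", articles)]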

File diff suppressed because it is too large.


@@ -9,7 +9,6 @@ __docformat__ = "restructuredtext en"
 import os, time, traceback, re, sys, io
 from collections import defaultdict
-from functools import partial
 from contextlib import nested, closing
@@ -17,7 +16,6 @@ from calibre import (browser, __appname__, iswindows, force_unicode,
     strftime, preferred_encoding, as_unicode, random_user_agent)
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre import entity_to_unicode
 from calibre.web import Recipe
 from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata import MetaInformation
@@ -224,14 +222,14 @@ class BasicNewsRecipe(Recipe):
     #:
     #:    {
     #:      name      : 'tag name',   #e.g. 'div'
-    #:      attrs     : a dictionary, #e.g. {class: 'advertisment'}
+    #:      attrs     : a dictionary, #e.g. {'class': 'advertisment'}
     #:    }
     #:
     #: All keys are optional. For a full explanation of the search criteria, see
-    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#Searching%20the%20Parse%20Tree>`_
+    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`_
     #: A common example::
     #:
-    #:     remove_tags = [dict(name='div', attrs={'class':'advert'})]
+    #:     remove_tags = [dict(name='div', class_='advert')]
     #:
     #: This will remove all `<div class="advert">` tags and all
     #: their children from the downloaded :term:`HTML`.
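
For context, a hedged, standalone sketch of what such a criteria dict amounts to under bs4 (the markup and variable names are made up for illustration)::

    from bs4 import BeautifulSoup

    html = '<body><div class="advert">ad</div><p>keep me</p></body>'
    soup = BeautifulSoup(html, 'html.parser')

    # Roughly what a remove_tags entry like dict(name='div', attrs={'class': 'advert'})
    # boils down to: find every matching tag and detach it from the tree.
    for tag in soup.find_all('div', attrs={'class': 'advert'}):
        tag.extract()

    print(soup)  # -> <body><p>keep me</p></body>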
@@ -662,7 +660,7 @@ class BasicNewsRecipe(Recipe):
     def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
         '''
         Convenience method that takes an URL to the index page and returns
-        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`_
         of it.

         `url_or_raw`: Either a URL or the downloaded index page as a string
@@ -701,8 +699,7 @@ class BasicNewsRecipe(Recipe):
             from html5_parser import parse
             return parse(_raw)
         else:
-            from html5_parser.soup import set_soup_module, parse
-            set_soup_module(sys.modules[BeautifulSoup.__module__])
-            return parse(_raw, return_root=False)
+            return BeautifulSoup(_raw)

     def extract_readable_article(self, html, url):
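
The bare ``BeautifulSoup(_raw)`` call goes through calibre's own wrapper in ``calibre.ebooks.BeautifulSoup``, which presumably picks a tree builder itself. With stock bs4 one would normally name the parser explicitly; a minimal sketch (the choice of ``html5lib`` here is an assumption, not necessarily what the wrapper does)::

    from bs4 import BeautifulSoup

    raw = '<p>Some <b>downloaded</b> markup'

    # bs4 warns if no parser is named; html5lib gives the most browser-like
    # error recovery, lxml is fastest, html.parser needs no extra package.
    soup = BeautifulSoup(raw, 'html5lib')
    print(soup.p.get_text())  # -> Some downloaded markup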
@@ -951,7 +948,7 @@ class BasicNewsRecipe(Recipe):
     def _postprocess_html(self, soup, first_fetch, job_info):
         if self.no_stylesheets:
             for link in soup.findAll('link'):
-                if (link.get('type') or 'text/css').lower() == 'text/css' and (link.get('rel') or 'stylesheet').lower() == 'stylesheet':
+                if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)):
                     link.extract()
             for style in soup.findAll('style'):
                 style.extract()
@@ -960,9 +957,10 @@ class BasicNewsRecipe(Recipe):
         head = soup.find('body')
         if not head:
             head = soup.find(True)
-        style = BeautifulSoup(u'<style type="text/css" title="override_css">%s</style>'%(
-            self.template_css +'\n\n'+(self.get_extra_css() or ''))).find('style')
-        head.insert(len(head.contents), style)
+        css = self.template_css + '\n\n' + (self.get_extra_css() or '')
+        style = soup.new_tag('style', type='text/css', title='override_css')
+        style.append(css)
+        head.append(style)
         if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
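
A standalone sketch of the bs4 pattern the new code relies on, ``new_tag`` plus ``append`` (the markup and CSS here are illustrative only)::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<html><head></head><body></body></html>', 'html.parser')

    # new_tag creates a detached tag owned by this soup; append attaches it,
    # and appending a plain string wraps it in a NavigableString automatically.
    style = soup.new_tag('style', type='text/css', title='override_css')
    style.append('body { margin: 0 }')
    soup.head.append(style)

    print(soup.head)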
@@ -1648,14 +1646,14 @@ class BasicNewsRecipe(Recipe):
     def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
         '''
         Convenience method to take a
-        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
         `Tag` and extract the text from it recursively, including any CDATA sections
         and alt tag attributes. Return a possibly empty unicode string.

         `use_alt`: If `True` try to use the alt attribute for tags that don't
         have any textual content

-        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
+        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
         `Tag`
         '''
         if tag is None:
@@ -1686,11 +1684,7 @@ class BasicNewsRecipe(Recipe):
     @classmethod
     def soup(cls, raw):
-        entity_replace = [(re.compile(u'&(\\S+?);'), partial(entity_to_unicode,
-            exceptions=[]))]
-        nmassage = list(BeautifulSoup.MARKUP_MASSAGE)
-        nmassage.extend(entity_replace)
-        return BeautifulSoup(raw, markupMassage=nmassage)
+        return BeautifulSoup(raw)

     @classmethod
     def adeify_images(cls, soup):
@@ -1708,8 +1702,8 @@ class BasicNewsRecipe(Recipe):
             oldParent = item.parent
             myIndex = oldParent.contents.index(item)
             item.extract()
-            divtag = Tag(soup,'div')
-            brtag = Tag(soup,'br')
+            divtag = soup.new_tag('div')
+            brtag = soup.new_tag('br')
             oldParent.insert(myIndex,divtag)
             divtag.append(item)
             divtag.append(brtag)


@@ -20,11 +20,9 @@ import traceback
 from base64 import b64decode
 from httplib import responses
-from html5_parser.soup import parse, set_soup_module

 from calibre import browser, relpath, unicode_path
 from calibre.constants import filesystem_encoding, iswindows
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import OptionParser
 from calibre.utils.filenames import ascii_filename
@@ -82,18 +80,15 @@ def basename(url):

 def save_soup(soup, target):
-    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    nm = ns.find('meta')
-    metas = soup.findAll('meta', content=True)
-    added = False
-    for meta in metas:
-        if 'charset' in meta.get('content', '').lower():
-            meta.replaceWith(nm)
-            added = True
-    if not added:
-        head = soup.find('head')
-        if head is not None:
-            head.insert(0, nm)
+    for meta in soup.findAll('meta', content=True):
+        if 'charset' in meta['content'].lower():
+            meta.extract()
+    for meta in soup.findAll('meta', charset=True):
+        meta.extract()
+    head = soup.find('head')
+    if head is not None:
+        nm = soup.new_tag('meta', charset='utf-8')
+        head.insert(0, nm)

     selfdir = os.path.dirname(target)
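
For reference, a self-contained sketch of the charset normalisation the rewritten ``save_soup`` performs, written against stock bs4 (the markup and output path are illustrative)::

    from bs4 import BeautifulSoup

    raw = ('<html><head><meta http-equiv="Content-Type"'
           ' content="text/html; charset=latin-1"></head><body>x</body></html>')
    soup = BeautifulSoup(raw, 'html.parser')

    # Drop existing charset declarations, old-style and new-style alike...
    for meta in soup.find_all('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.find_all('meta', charset=True):
        meta.extract()

    # ...then declare UTF-8 once, at the top of <head>, and write the file out.
    head = soup.find('head')
    if head is not None:
        head.insert(0, soup.new_tag('meta', charset='utf-8'))

    with open('index.html', 'wb') as f:  # illustrative target path
        f.write(soup.encode('utf-8'))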
@@ -191,18 +186,17 @@ class RecursiveFetcher(object):
                 usrc = self.preprocess_raw_html(usrc, url)
                 for pat, repl in nmassage:
                     usrc = pat.sub(repl, usrc)
-                set_soup_module(sys.modules[BeautifulSoup.__module__])
-                soup = parse(usrc, return_root=False)
+                soup = BeautifulSoup(usrc)

                 replace = self.prepreprocess_html_ext(soup)
                 if replace is not None:
                     replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
                     for pat, repl in nmassage:
                         replace = pat.sub(repl, replace)
-                    soup = parse(replace, return_root=False)
+                    soup = BeautifulSoup(replace)

                 if self.keep_only_tags:
-                    body = Tag(soup, 'body')
+                    body = soup.new_tag('body')
                     try:
                         if isinstance(self.keep_only_tags, dict):
                             self.keep_only_tags = [self.keep_only_tags]
@@ -334,13 +328,19 @@ class RecursiveFetcher(object):
         diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
-        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')): # noqa
-            if tag.has_key('href'): # noqa
+        for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
+            try:
+                mtype = tag['type']
+            except KeyError:
+                mtype = 'text/css' if tag.name.lower() == 'style' else ''
+            if mtype.lower() != 'text/css':
+                continue
+            if tag.has_attr('href'):
                 iurl = tag['href']
                 if not urlsplit(iurl).scheme:
                     iurl = urljoin(baseurl, iurl, False)
                 with self.stylemap_lock:
-                    if self.stylemap.has_key(iurl): # noqa
+                    if iurl in self.stylemap:
                         tag['href'] = self.stylemap[iurl]
                         continue
                 try:
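
A quick standalone sketch of the attribute-access idioms that replace ``has_key()`` under bs4, as used in the hunks above and below (the markup is made up)::

    from bs4 import BeautifulSoup

    tag = BeautifulSoup('<link rel="stylesheet" href="s.css">', 'html.parser').link

    # bs4's Tag drops the dict-like has_key(); these are the supported spellings.
    print(tag.has_attr('href'))         # True
    print('type' in tag.attrs)          # False
    print(tag.get('type', 'text/css'))  # fallback when the attribute is missing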
@@ -363,7 +363,7 @@ class RecursiveFetcher(object):
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
-                        if self.stylemap.has_key(iurl): # noqa
+                        if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                     try:
@@ -387,7 +387,7 @@ class RecursiveFetcher(object):
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): # noqa
+        for tag in soup.findAll('img', src=True):
             iurl = tag['src']
             if iurl.startswith('data:image/'):
                 try:
@@ -401,7 +401,7 @@ class RecursiveFetcher(object):
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
-                if self.imagemap.has_key(iurl): # noqa
+                if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
@@ -479,12 +479,12 @@ class RecursiveFetcher(object):
                             tag[key] = path+suffix

     def process_return_links(self, soup, baseurl):
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): # noqa
+        for tag in soup.findAll('a', href=True):
             iurl = self.absurl(baseurl, tag, 'href')
             if not iurl:
                 continue
             nurl = self.normurl(iurl)
-            if self.filemap.has_key(nurl): # noqa
+            if nurl in self.filemap:
                 self.localize_link(tag, 'href', self.filemap[nurl])

     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
@@ -506,7 +506,7 @@ class RecursiveFetcher(object):
             if not iurl:
                 continue
             nurl = self.normurl(iurl)
-            if self.filemap.has_key(nurl): # noqa
+            if nurl in self.filemap:
                 self.localize_link(tag, 'href', self.filemap[nurl])
                 continue
             if self.files > self.max_files:
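
Finally, a hedged sketch of the ``findAll`` keyword filters that replace the lambda-plus-``has_key()`` matchers in the hunks above (markup invented for illustration)::

    from bs4 import BeautifulSoup

    html = ('<a href="a.html">one</a><a name="x">anchor only</a>'
            '<img src="p.png"><img alt="no src">')
    soup = BeautifulSoup(html, 'html.parser')

    # A keyword filter of True matches tags that merely have the attribute.
    print([t['href'] for t in soup.find_all('a', href=True)])   # ['a.html']
    print([t['src'] for t in soup.find_all('img', src=True)])   # ['p.png']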