Start work on porting to beautifulsoup4

This commit is contained in:
Kovid Goyal 2019-03-22 11:20:58 +05:30
parent 78b7112012
commit 692230147c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 66 additions and 1981 deletions

View File

@ -267,7 +267,7 @@ to go to https://www.nytimes.com/pages/todayspaper/index.html and fetch the list
of articles that appear in *today's* paper. While more complex than simply using
:term:`RSS`, the recipe creates an e-book that corresponds very closely to the
day's paper. ``parse_index`` makes heavy use of `BeautifulSoup
<https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_ to parse
<https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ to parse
the daily paper webpage. You can also use other, more modern parsers if you
dislike BeautifulSoup. calibre comes with `lxml <https://lxml.de/>`_ and
`html5lib <https://github.com/html5lib/html5lib-python>`_, which are the

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,6 @@ __docformat__ = "restructuredtext en"
import os, time, traceback, re, sys, io
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
@ -17,7 +16,6 @@ from calibre import (browser, __appname__, iswindows, force_unicode,
strftime, preferred_encoding, as_unicode, random_user_agent)
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
@ -224,14 +222,14 @@ class BasicNewsRecipe(Recipe):
#:
#: {
#: name : 'tag name', #e.g. 'div'
#: attrs : a dictionary, #e.g. {class: 'advertisment'}
#: attrs : a dictionary, #e.g. {'class': 'advertisment'}
#: }
#:
#: All keys are optional. For a full explanation of the search criteria, see
#: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#Searching%20the%20Parse%20Tree>`_
#: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`_
#: A common example::
#:
#: remove_tags = [dict(name='div', attrs={'class':'advert'})]
#: remove_tags = [dict(name='div', class_='advert')]
#:
#: This will remove all `<div class="advert">` tags and all
#: their children from the downloaded :term:`HTML`.
@ -662,7 +660,7 @@ class BasicNewsRecipe(Recipe):
def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
'''
Convenience method that takes an URL to the index page and returns
a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`_
of it.
`url_or_raw`: Either a URL or the downloaded index page as a string
@ -701,8 +699,7 @@ class BasicNewsRecipe(Recipe):
from html5_parser import parse
return parse(_raw)
else:
from html5_parser.soup import set_soup_module, parse
set_soup_module(sys.modules[BeautifulSoup.__module__])
return BeautifulSoup(_raw)
return parse(_raw, return_root=False)
def extract_readable_article(self, html, url):
@ -951,7 +948,7 @@ class BasicNewsRecipe(Recipe):
def _postprocess_html(self, soup, first_fetch, job_info):
if self.no_stylesheets:
for link in soup.findAll('link'):
if (link.get('type') or 'text/css').lower() == 'text/css' and (link.get('rel') or 'stylesheet').lower() == 'stylesheet':
if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)):
link.extract()
for style in soup.findAll('style'):
style.extract()
@ -960,9 +957,10 @@ class BasicNewsRecipe(Recipe):
head = soup.find('body')
if not head:
head = soup.find(True)
style = BeautifulSoup(u'<style type="text/css" title="override_css">%s</style>'%(
self.template_css +'\n\n'+(self.get_extra_css() or ''))).find('style')
head.insert(len(head.contents), style)
css = self.template_css + '\n\n' + (self.get_extra_css() or '')
style = soup.new_tag('style', type='text/css', title='override_css')
style.append(css)
head.append(style)
if first_fetch and job_info:
url, f, a, feed_len = job_info
body = soup.find('body')
@ -1648,14 +1646,14 @@ class BasicNewsRecipe(Recipe):
def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
'''
Convenience method to take a
`BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
`BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
`Tag` and extract the text from it recursively, including any CDATA sections
and alt tag attributes. Return a possibly empty unicode string.
`use_alt`: If `True` try to use the alt attribute for tags that don't
have any textual content
`tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
`tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
`Tag`
'''
if tag is None:
@ -1686,11 +1684,7 @@ class BasicNewsRecipe(Recipe):
@classmethod
def soup(cls, raw):
entity_replace = [(re.compile(u'&(\\S+?);'), partial(entity_to_unicode,
exceptions=[]))]
nmassage = list(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(entity_replace)
return BeautifulSoup(raw, markupMassage=nmassage)
return BeautifulSoup(raw)
@classmethod
def adeify_images(cls, soup):
@ -1708,8 +1702,8 @@ class BasicNewsRecipe(Recipe):
oldParent = item.parent
myIndex = oldParent.contents.index(item)
item.extract()
divtag = Tag(soup,'div')
brtag = Tag(soup,'br')
divtag = soup.new_tag('div')
brtag = soup.new_tag('br')
oldParent.insert(myIndex,divtag)
divtag.append(item)
divtag.append(brtag)

View File

@ -20,11 +20,9 @@ import traceback
from base64 import b64decode
from httplib import responses
from html5_parser.soup import parse, set_soup_module
from calibre import browser, relpath, unicode_path
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.filenames import ascii_filename
@ -82,18 +80,15 @@ def basename(url):
def save_soup(soup, target):
ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
nm = ns.find('meta')
metas = soup.findAll('meta', content=True)
added = False
for meta in metas:
if 'charset' in meta.get('content', '').lower():
meta.replaceWith(nm)
added = True
if not added:
head = soup.find('head')
if head is not None:
head.insert(0, nm)
for meta in soup.findAll('meta', content=True):
if 'charset' in meta['content'].lower():
meta.extract()
for meta in soup.findAll('meta', charset=True):
meta.extract()
head = soup.find('head')
if head is not None:
nm = soup.new_tag('meta', charset='utf-8')
head.insert(0, nm)
selfdir = os.path.dirname(target)
@ -191,18 +186,17 @@ class RecursiveFetcher(object):
usrc = self.preprocess_raw_html(usrc, url)
for pat, repl in nmassage:
usrc = pat.sub(repl, usrc)
set_soup_module(sys.modules[BeautifulSoup.__module__])
soup = parse(usrc, return_root=False)
soup = BeautifulSoup(usrc)
replace = self.prepreprocess_html_ext(soup)
if replace is not None:
replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
for pat, repl in nmassage:
replace = pat.sub(repl, replace)
soup = parse(replace, return_root=False)
soup = BeautifulSoup(replace)
if self.keep_only_tags:
body = Tag(soup, 'body')
body = soup.new_tag('body')
try:
if isinstance(self.keep_only_tags, dict):
self.keep_only_tags = [self.keep_only_tags]
@ -334,13 +328,19 @@ class RecursiveFetcher(object):
diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
if not os.path.exists(diskpath):
os.mkdir(diskpath)
for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')): # noqa
if tag.has_key('href'): # noqa
for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
try:
mtype = tag['type']
except KeyError:
mtype = 'text/css' if tag.name.lower() == 'style' else ''
if mtype.lower() != 'text/css':
continue
if tag.has_attr('href'):
iurl = tag['href']
if not urlsplit(iurl).scheme:
iurl = urljoin(baseurl, iurl, False)
with self.stylemap_lock:
if self.stylemap.has_key(iurl): # noqa
if iurl in self.stylemap:
tag['href'] = self.stylemap[iurl]
continue
try:
@ -363,7 +363,7 @@ class RecursiveFetcher(object):
if not urlsplit(iurl).scheme:
iurl = urljoin(baseurl, iurl, False)
with self.stylemap_lock:
if self.stylemap.has_key(iurl): # noqa
if iurl in self.stylemap:
ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
continue
try:
@ -387,7 +387,7 @@ class RecursiveFetcher(object):
if not os.path.exists(diskpath):
os.mkdir(diskpath)
c = 0
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): # noqa
for tag in soup.findAll('img', src=True):
iurl = tag['src']
if iurl.startswith('data:image/'):
try:
@ -401,7 +401,7 @@ class RecursiveFetcher(object):
if not urlsplit(iurl).scheme:
iurl = urljoin(baseurl, iurl, False)
with self.imagemap_lock:
if self.imagemap.has_key(iurl): # noqa
if iurl in self.imagemap:
tag['src'] = self.imagemap[iurl]
continue
try:
@ -479,12 +479,12 @@ class RecursiveFetcher(object):
tag[key] = path+suffix
def process_return_links(self, soup, baseurl):
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): # noqa
for tag in soup.findAll('a', href=True):
iurl = self.absurl(baseurl, tag, 'href')
if not iurl:
continue
nurl = self.normurl(iurl)
if self.filemap.has_key(nurl): # noqa
if nurl in self.filemap:
self.localize_link(tag, 'href', self.filemap[nurl])
def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
@ -506,7 +506,7 @@ class RecursiveFetcher(object):
if not iurl:
continue
nurl = self.normurl(iurl)
if self.filemap.has_key(nurl): # noqa
if nurl in self.filemap:
self.localize_link(tag, 'href', self.filemap[nurl])
continue
if self.files > self.max_files: