Start work on porting to beautifulsoup4

This commit is contained in:
Kovid Goyal 2019-03-22 11:20:58 +05:30
parent 78b7112012
commit 692230147c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 66 additions and 1981 deletions

View File

@ -267,7 +267,7 @@ to go to https://www.nytimes.com/pages/todayspaper/index.html and fetch the list
of articles that appear in *today's* paper. While more complex than simply using
:term:`RSS`, the recipe creates an e-book that corresponds very closely to the
day's paper. ``parse_index`` makes heavy use of `BeautifulSoup
<https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_ to parse
<https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ to parse
the daily paper webpage. You can also use other, more modern parsers if you
dislike BeautifulSoup. calibre comes with `lxml <https://lxml.de/>`_ and
`html5lib <https://github.com/html5lib/html5lib-python>`_, which are the

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,6 @@ __docformat__ = "restructuredtext en"
import os, time, traceback, re, sys, io
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
@ -17,7 +16,6 @@ from calibre import (browser, __appname__, iswindows, force_unicode,
strftime, preferred_encoding, as_unicode, random_user_agent)
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
@ -224,14 +222,14 @@ class BasicNewsRecipe(Recipe):
#:
#: {
#: name : 'tag name', #e.g. 'div'
#: attrs : a dictionary, #e.g. {class: 'advertisment'}
#: attrs : a dictionary, #e.g. {'class': 'advertisment'}
#: }
#:
#: All keys are optional. For a full explanation of the search criteria, see
#: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html#Searching%20the%20Parse%20Tree>`_
#: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`_
#: A common example::
#:
#: remove_tags = [dict(name='div', attrs={'class':'advert'})]
#: remove_tags = [dict(name='div', class_='advert')]
#:
#: This will remove all `<div class="advert">` tags and all
#: their children from the downloaded :term:`HTML`.
@ -662,7 +660,7 @@ class BasicNewsRecipe(Recipe):
def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
'''
Convenience method that takes an URL to the index page and returns
a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`_
of it.
`url_or_raw`: Either a URL or the downloaded index page as a string
@ -701,8 +699,7 @@ class BasicNewsRecipe(Recipe):
from html5_parser import parse
return parse(_raw)
else:
from html5_parser.soup import set_soup_module, parse
set_soup_module(sys.modules[BeautifulSoup.__module__])
return BeautifulSoup(_raw)
return parse(_raw, return_root=False)
def extract_readable_article(self, html, url):
@ -951,7 +948,7 @@ class BasicNewsRecipe(Recipe):
def _postprocess_html(self, soup, first_fetch, job_info):
if self.no_stylesheets:
for link in soup.findAll('link'):
if (link.get('type') or 'text/css').lower() == 'text/css' and (link.get('rel') or 'stylesheet').lower() == 'stylesheet':
if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)):
link.extract()
for style in soup.findAll('style'):
style.extract()
@ -960,9 +957,10 @@ class BasicNewsRecipe(Recipe):
head = soup.find('body')
if not head:
head = soup.find(True)
style = BeautifulSoup(u'<style type="text/css" title="override_css">%s</style>'%(
self.template_css +'\n\n'+(self.get_extra_css() or ''))).find('style')
head.insert(len(head.contents), style)
css = self.template_css + '\n\n' + (self.get_extra_css() or '')
style = soup.new_tag('style', type='text/css', title='override_css')
style.append(css)
head.append(style)
if first_fetch and job_info:
url, f, a, feed_len = job_info
body = soup.find('body')
@ -1648,14 +1646,14 @@ class BasicNewsRecipe(Recipe):
def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
'''
Convenience method to take a
`BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
`BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
`Tag` and extract the text from it recursively, including any CDATA sections
and alt tag attributes. Return a possibly empty unicode string.
`use_alt`: If `True` try to use the alt attribute for tags that don't
have any textual content
`tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
`tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
`Tag`
'''
if tag is None:
@ -1686,11 +1684,7 @@ class BasicNewsRecipe(Recipe):
@classmethod
def soup(cls, raw):
entity_replace = [(re.compile(u'&(\\S+?);'), partial(entity_to_unicode,
exceptions=[]))]
nmassage = list(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(entity_replace)
return BeautifulSoup(raw, markupMassage=nmassage)
return BeautifulSoup(raw)
@classmethod
def adeify_images(cls, soup):
@ -1708,8 +1702,8 @@ class BasicNewsRecipe(Recipe):
oldParent = item.parent
myIndex = oldParent.contents.index(item)
item.extract()
divtag = Tag(soup,'div')
brtag = Tag(soup,'br')
divtag = soup.new_tag('div')
brtag = soup.new_tag('br')
oldParent.insert(myIndex,divtag)
divtag.append(item)
divtag.append(brtag)

View File

@ -20,11 +20,9 @@ import traceback
from base64 import b64decode
from httplib import responses
from html5_parser.soup import parse, set_soup_module
from calibre import browser, relpath, unicode_path
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.filenames import ascii_filename
@ -82,18 +80,15 @@ def basename(url):
def save_soup(soup, target):
ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
nm = ns.find('meta')
metas = soup.findAll('meta', content=True)
added = False
for meta in metas:
if 'charset' in meta.get('content', '').lower():
meta.replaceWith(nm)
added = True
if not added:
head = soup.find('head')
if head is not None:
head.insert(0, nm)
for meta in soup.findAll('meta', content=True):
if 'charset' in meta['content'].lower():
meta.extract()
for meta in soup.findAll('meta', charset=True):
meta.extract()
head = soup.find('head')
if head is not None:
nm = soup.new_tag('meta', charset='utf-8')
head.insert(0, nm)
selfdir = os.path.dirname(target)
@ -191,18 +186,17 @@ class RecursiveFetcher(object):
usrc = self.preprocess_raw_html(usrc, url)
for pat, repl in nmassage:
usrc = pat.sub(repl, usrc)
set_soup_module(sys.modules[BeautifulSoup.__module__])
soup = parse(usrc, return_root=False)
soup = BeautifulSoup(usrc)
replace = self.prepreprocess_html_ext(soup)
if replace is not None:
replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
for pat, repl in nmassage:
replace = pat.sub(repl, replace)
soup = parse(replace, return_root=False)
soup = BeautifulSoup(replace)
if self.keep_only_tags:
body = Tag(soup, 'body')
body = soup.new_tag('body')
try:
if isinstance(self.keep_only_tags, dict):
self.keep_only_tags = [self.keep_only_tags]
@ -334,13 +328,19 @@ class RecursiveFetcher(object):
diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
if not os.path.exists(diskpath):
os.mkdir(diskpath)
for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')): # noqa
if tag.has_key('href'): # noqa
for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
try:
mtype = tag['type']
except KeyError:
mtype = 'text/css' if tag.name.lower() == 'style' else ''
if mtype.lower() != 'text/css':
continue
if tag.has_attr('href'):
iurl = tag['href']
if not urlsplit(iurl).scheme:
iurl = urljoin(baseurl, iurl, False)
with self.stylemap_lock:
if self.stylemap.has_key(iurl): # noqa
if iurl in self.stylemap:
tag['href'] = self.stylemap[iurl]
continue
try:
@ -363,7 +363,7 @@ class RecursiveFetcher(object):
if not urlsplit(iurl).scheme:
iurl = urljoin(baseurl, iurl, False)
with self.stylemap_lock:
if self.stylemap.has_key(iurl): # noqa
if iurl in self.stylemap:
ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
continue
try:
@ -387,7 +387,7 @@ class RecursiveFetcher(object):
if not os.path.exists(diskpath):
os.mkdir(diskpath)
c = 0
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): # noqa
for tag in soup.findAll('img', src=True):
iurl = tag['src']
if iurl.startswith('data:image/'):
try:
@ -401,7 +401,7 @@ class RecursiveFetcher(object):
if not urlsplit(iurl).scheme:
iurl = urljoin(baseurl, iurl, False)
with self.imagemap_lock:
if self.imagemap.has_key(iurl): # noqa
if iurl in self.imagemap:
tag['src'] = self.imagemap[iurl]
continue
try:
@ -479,12 +479,12 @@ class RecursiveFetcher(object):
tag[key] = path+suffix
def process_return_links(self, soup, baseurl):
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')): # noqa
for tag in soup.findAll('a', href=True):
iurl = self.absurl(baseurl, tag, 'href')
if not iurl:
continue
nurl = self.normurl(iurl)
if self.filemap.has_key(nurl): # noqa
if nurl in self.filemap:
self.localize_link(tag, 'href', self.filemap[nurl])
def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
@ -506,7 +506,7 @@ class RecursiveFetcher(object):
if not iurl:
continue
nurl = self.normurl(iurl)
if self.filemap.has_key(nurl): # noqa
if nurl in self.filemap:
self.localize_link(tag, 'href', self.filemap[nurl])
continue
if self.files > self.max_files: