Get rid of soupparser as bs4 has no parser of its own

2025-08-30 23:00:21 -04:00 · 2019-03-23 15:32:45 +05:30 · 2019-03-23 15:32:45 +05:30 · c8f94a552b
commit c8f94a552b
parent 256c7563b6
5 changed files with 20 additions and 155 deletions
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -26,7 +26,7 @@ base_url = 'https://search.overdrive.com/'
 class OverDrive(Source):

    name = 'Overdrive'
-    version = (1, 0, 0)
+    version = (1, 0, 1)
    minimum_calibre_version = (2, 80, 0)
    description = _('Downloads metadata and covers from Overdrive\'s Content Reserve')

@@ -401,9 +401,9 @@ class OverDrive(Source):
                    cover_url)

    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
+        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
-        from calibre.utils.soupparser import fromstring
        from calibre.library.comments import sanitize_comments_html

        try:
@@ -415,9 +415,10 @@ class OverDrive(Source):
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
+
        try:
-            root = fromstring(raw)
-        except:
+            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
+        except Exception:
            return False

        pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@@ -11,7 +11,7 @@ import shutil, os, re, struct, textwrap, cStringIO
 from lxml import html, etree

 from calibre import (xml_entity_to_unicode, entity_to_unicode)
-from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.ebooks import DRMError, unit_convert
 from calibre.ebooks.chardet import ENCODING_PATS
 from calibre.ebooks.mobi import MobiError
@@ -184,30 +184,32 @@ class MobiReader(object):
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
-        self.processed_html = clean_ascii_chars(self.processed_html)
+        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
-        except:
+        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)
        if root.xpath('descendant::p/descendant::p'):
-            from calibre.utils.soupparser import fromstring
-            self.log.warning('Malformed markup, parsing using BeautifulSoup')
+            from html5_parser import parse
+            from calibre.ebooks.chardet import strip_encoding_declarations
+            self.log.warning('Malformed markup, parsing using html5-parser')
+            self.processed_html = strip_encoding_declarations(self.processed_html)
            try:
-                root = fromstring(self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
-                root = fromstring(self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
-                root = fromstring(self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -107,12 +107,8 @@ def html5_parse(data, max_nesting_depth=100):
    return data


-def _html4_parse(data, prefer_soup=False):
-    if prefer_soup:
-        from calibre.utils.soupparser import fromstring
-        data = fromstring(data)
-    else:
-        data = html.fromstring(data)
+def _html4_parse(data):
+    data = html.fromstring(data)
    data.attrib.pop('xmlns', None)
    for elem in data.iter(tag=etree.Comment):
        if elem.text:
--- a/src/calibre/gui2/comments_editor.py
+++ b/src/calibre/gui2/comments_editor.py
@@ -24,7 +24,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre import xml_replace_entities, prepare_string_for_xml
 from calibre.gui2 import open_url, error_dialog, choose_files, gprefs, NO_URL_FORMATTING, secure_web_page
 from calibre.gui2.widgets import LineEditECM
-from calibre.utils.soupparser import fromstring
+from html5_parser import parse
 from calibre.utils.config import tweaks
 from calibre.utils.imghdr import what
 from polyglot.builtins import unicode_type
@@ -355,8 +355,8 @@ class EditorWidget(QWebView, LineEditECM):  # {{{

                try:
                    root = html.fromstring(raw)
-                except:
-                    root = fromstring(raw)
+                except Exception:
+                    root = parse(raw, maybe_xhtml=False, sanitize_names=True)

                elems = []
                for body in root.xpath('//body'):
--- a/src/calibre/utils/soupparser.py
+++ b/src/calibre/utils/soupparser.py
@@ -1,134 +0,0 @@
-__doc__ = """External interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["fromstring", "parse", "convert_tree"]
-
-from lxml import etree, html
-from calibre.ebooks.BeautifulSoup import \
-     BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
-from polyglot.builtins import codepoint_to_chr
-
-
-def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
-    """Parse a string of HTML data into an Element tree using the
-    BeautifulSoup parser.
-
-    Returns the root ``<html>`` Element of the tree.
-
-    You can pass a different BeautifulSoup parser through the
-    `beautifulsoup` keyword, and a diffent Element factory function
-    through the `makeelement` keyword.  By default, the standard
-    ``BeautifulSoup`` class and the default factory of `lxml.html` are
-    used.
-    """
-    return _parse(data, beautifulsoup, makeelement, **bsargs)
-
-
-def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
-    """Parse a file into an ElemenTree using the BeautifulSoup parser.
-
-    You can pass a different BeautifulSoup parser through the
-    `beautifulsoup` keyword, and a diffent Element factory function
-    through the `makeelement` keyword.  By default, the standard
-    ``BeautifulSoup`` class and the default factory of `lxml.html` are
-    used.
-    """
-    if not hasattr(file, 'read'):
-        file = open(file)
-    root = _parse(file, beautifulsoup, makeelement, **bsargs)
-    return etree.ElementTree(root)
-
-
-def convert_tree(beautiful_soup_tree, makeelement=None):
-    """Convert a BeautifulSoup tree to a list of Element trees.
-
-    Returns a list instead of a single root Element to support
-    HTML-like soup with more than one root element.
-
-    You can pass a different Element factory through the `makeelement`
-    keyword.
-    """
-    if makeelement is None:
-        makeelement = html.html_parser.makeelement
-    root = _convert_tree(beautiful_soup_tree, makeelement)
-    children = root.getchildren()
-    for child in children:
-        root.remove(child)
-    return children
-
-
-# helpers
-
-def _parse(source, beautifulsoup, makeelement, **bsargs):
-    if beautifulsoup is None:
-        beautifulsoup = BeautifulSoup
-    if makeelement is None:
-        makeelement = html.html_parser.makeelement
-    if 'convertEntities' not in bsargs:
-        bsargs['convertEntities'] = 'xhtml'  # Changed by Kovid, otherwise &apos; is mangled, see https://bugs.launchpad.net/calibre/+bug/1197585
-    tree = beautifulsoup(source, **bsargs)
-    root = _convert_tree(tree, makeelement)
-    # from ET: wrap the document in a html root element, if necessary
-    if len(root) == 1 and root[0].tag == "html":
-        return root[0]
-    root.tag = "html"
-    return root
-
-
-def _convert_tree(beautiful_soup_tree, makeelement):
-    root = makeelement(beautiful_soup_tree.name,
-                       attrib=dict(beautiful_soup_tree.attrs))
-    _convert_children(root, beautiful_soup_tree, makeelement)
-    return root
-
-
-def _convert_children(parent, beautiful_soup_tree, makeelement):
-    SubElement = etree.SubElement
-    et_child = None
-    for child in beautiful_soup_tree:
-        if isinstance(child, Tag):
-            et_child = SubElement(parent, child.name, attrib=dict(
-                [(k, unescape(v)) for (k,v) in child.attrs]))
-            _convert_children(et_child, child, makeelement)
-        elif type(child) is NavigableString:
-            _append_text(parent, et_child, unescape(child))
-        else:
-            if isinstance(child, Comment):
-                parent.append(etree.Comment(child))
-            elif isinstance(child, ProcessingInstruction):
-                parent.append(etree.ProcessingInstruction(
-                    *child.split(' ', 1)))
-            else:  # CData
-                _append_text(parent, et_child, unescape(child))
-
-
-def _append_text(parent, element, text):
-    if element is None:
-        parent.text = (parent.text or '') + text
-    else:
-        element.tail = (element.tail or '') + text
-
-
-# copied from ET's ElementSoup
-
-try:
-    from html.entities import name2codepoint  # Python 3
-    name2codepoint
-except ImportError:
-    from htmlentitydefs import name2codepoint
-import re
-
-handle_entities = re.compile(r"&(\w+);").sub
-
-
-def unescape(string):
-    if not string:
-        return ''
-    # work around oddities in BeautifulSoup's entity handling
-
-    def unescape_entity(m):
-        try:
-            return codepoint_to_chr(name2codepoint[m.group(1)])
-        except KeyError:
-            return m.group(0)  # use as is
-    return handle_entities(unescape_entity, string)