Get rid of soupparser as bs4 has no parser of its own

2025-07-09 03:04:10 -04:00 · 2019-03-23 15:32:45 +05:30 · 2019-03-23 15:32:45 +05:30 · c8f94a552b
commit c8f94a552b
parent 256c7563b6
5 changed files with 20 additions and 155 deletions
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@ -26,7 +26,7 @@ base_url = 'https://search.overdrive.com/'
 class OverDrive(Source):
    name = 'Overdrive'
-    version = (1, 0, 0)
+    version = (1, 0, 1)
    minimum_calibre_version = (2, 80, 0)
    description = _('Downloads metadata and covers from Overdrive\'s Content Reserve')
@ -401,9 +401,9 @@ class OverDrive(Source):
                    cover_url)
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.soupparser import fromstring
        from calibre.library.comments import sanitize_comments_html
        try:
@ -415,9 +415,10 @@ class OverDrive(Source):
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
-            root = fromstring(raw)
+            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
-        except:
+        except Exception:
            return False
        pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@ -11,7 +11,7 @@ import shutil, os, re, struct, textwrap, cStringIO
 from lxml import html, etree
 from calibre import (xml_entity_to_unicode, entity_to_unicode)
-from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.ebooks import DRMError, unit_convert
 from calibre.ebooks.chardet import ENCODING_PATS
 from calibre.ebooks.mobi import MobiError
@ -184,30 +184,32 @@ class MobiReader(object):
        self.cleanup_html()
        self.log.debug('Parsing HTML...')
-        self.processed_html = clean_ascii_chars(self.processed_html)
+        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
-        except:
+        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)
        if root.xpath('descendant::p/descendant::p'):
-            from calibre.utils.soupparser import fromstring
+            from html5_parser import parse
-            self.log.warning('Malformed markup, parsing using BeautifulSoup')
+            from calibre.ebooks.chardet import strip_encoding_declarations
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(self.processed_html)
            try:
-                root = fromstring(self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
-                root = fromstring(self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
-                root = fromstring(self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -107,12 +107,8 @@ def html5_parse(data, max_nesting_depth=100):
    return data
-def _html4_parse(data, prefer_soup=False):
+def _html4_parse(data):
-    if prefer_soup:
+    data = html.fromstring(data)
        from calibre.utils.soupparser import fromstring
        data = fromstring(data)
    else:
        data = html.fromstring(data)
    data.attrib.pop('xmlns', None)
    for elem in data.iter(tag=etree.Comment):
        if elem.text:
--- a/src/calibre/gui2/comments_editor.py
+++ b/src/calibre/gui2/comments_editor.py
@ -24,7 +24,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre import xml_replace_entities, prepare_string_for_xml
 from calibre.gui2 import open_url, error_dialog, choose_files, gprefs, NO_URL_FORMATTING, secure_web_page
 from calibre.gui2.widgets import LineEditECM
-from calibre.utils.soupparser import fromstring
+from html5_parser import parse
 from calibre.utils.config import tweaks
 from calibre.utils.imghdr import what
 from polyglot.builtins import unicode_type
@ -355,8 +355,8 @@ class EditorWidget(QWebView, LineEditECM):  # {{{
                try:
                    root = html.fromstring(raw)
-                except:
+                except Exception:
-                    root = fromstring(raw)
+                    root = parse(raw, maybe_xhtml=False, sanitize_names=True)
                elems = []
                for body in root.xpath('//body'):
--- a/src/calibre/utils/soupparser.py
+++ b/src/calibre/utils/soupparser.py
@ -1,134 +0,0 @@
 __doc__ = """External interface to the BeautifulSoup HTML parser.
 """
 __all__ = ["fromstring", "parse", "convert_tree"]
 from lxml import etree, html
 from calibre.ebooks.BeautifulSoup import \
     BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
 from polyglot.builtins import codepoint_to_chr
 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Turn a string of HTML into an Element tree via BeautifulSoup.

    The result is the root ``<html>`` Element of the converted tree.

    ``beautifulsoup`` selects an alternative BeautifulSoup parser and
    ``makeelement`` an alternative Element factory function.  When either
    is left as ``None`` the standard ``BeautifulSoup`` class and the
    default factory of `lxml.html` are used instead.  Any further keyword
    arguments are handed to the parser unchanged.
    """
    html_root = _parse(data, beautifulsoup, makeelement, **bsargs)
    return html_root
 def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    ``file`` is either an object with a ``read`` attribute or a path.  A
    path is opened here and closed again as soon as parsing has finished;
    a file object supplied by the caller is left open, since the caller
    owns it.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Remaining keyword arguments are forwarded to the parser.

    Returns an ``etree.ElementTree`` wrapping the parsed root element.
    """
    if hasattr(file, 'read'):
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    else:
        # _parse() builds and converts the whole tree before returning, so
        # the handle opened here is no longer needed once it comes back.
        # Closing it avoids leaking a file descriptor per call.
        with open(file) as stream:
            root = _parse(stream, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)
 def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree into a list of Element trees.

    A list is returned, rather than one root Element, so that HTML-like
    soup with several top-level elements can be represented.

    A different Element factory can be supplied through `makeelement`;
    when it is ``None`` the default factory of `lxml.html` is used.
    """
    factory = html.html_parser.makeelement if makeelement is None else makeelement
    container = _convert_tree(beautiful_soup_tree, factory)
    # Snapshot the direct children first, then detach each of them from
    # the temporary container element before handing them back.
    detached = list(container)
    for node in detached:
        container.remove(node)
    return detached
 # helpers
 def _parse(source, beautifulsoup, makeelement, **bsargs):
    """Run the soup parser over ``source`` and return an ``html`` root Element.

    ``beautifulsoup`` and ``makeelement`` fall back to ``BeautifulSoup``
    and the `lxml.html` parser's ``makeelement`` when ``None``.  Unless the
    caller provides ``convertEntities``, it is set to ``'xhtml'``.
    """
    parser = BeautifulSoup if beautifulsoup is None else beautifulsoup
    factory = html.html_parser.makeelement if makeelement is None else makeelement
    # Changed by Kovid, otherwise &apos; is mangled, see https://bugs.launchpad.net/calibre/+bug/1197585
    bsargs.setdefault('convertEntities', 'xhtml')
    soup = parser(source, **bsargs)
    root = _convert_tree(soup, factory)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) != 1 or root[0].tag != "html":
        # No single <html> child: the converted container itself becomes
        # the html element.
        root.tag = "html"
        return root
    return root[0]
 def _convert_tree(beautiful_soup_tree, makeelement):
    """Create an Element for the soup node and populate it with its children.

    The element is built with ``makeelement`` from the node's ``name`` and
    a copy of its ``attrs``; the children are then converted in place by
    ``_convert_children``.  Returns the new element.
    """
    node_attributes = dict(beautiful_soup_tree.attrs)
    converted = makeelement(beautiful_soup_tree.name, attrib=node_attributes)
    _convert_children(converted, beautiful_soup_tree, makeelement)
    return converted
 def _convert_children(parent, beautiful_soup_tree, makeelement):
    """Append converted copies of the soup node's children to ``parent``.

    Tags become sub-elements (converted recursively, with their attribute
    values passed through ``unescape``), comments and processing
    instructions become the matching lxml nodes, and every other kind of
    child is appended as unescaped text.
    """
    # Most recently created sub-element; text that follows it is stored
    # as its tail, text that precedes any sub-element goes to parent.text.
    last_element = None
    for node in beautiful_soup_tree:
        if isinstance(node, Tag):
            attributes = {k: unescape(v) for (k, v) in node.attrs}
            last_element = etree.SubElement(parent, node.name, attrib=attributes)
            _convert_children(last_element, node, makeelement)
        elif isinstance(node, Comment):
            parent.append(etree.Comment(node))
        elif isinstance(node, ProcessingInstruction):
            # Split into target and (optional) content.
            parent.append(etree.ProcessingInstruction(*node.split(' ', 1)))
        else:
            # Plain NavigableString, CData and anything else: treat as text.
            _append_text(parent, last_element, unescape(node))
 def _append_text(parent, element, text):
    if element is None:
        parent.text = (parent.text or '') + text
    else:
        element.tail = (element.tail or '') + text
 # Entity handling, copied from ET's ElementSoup
 import re
 try:
    from html.entities import name2codepoint  # Python 3
    name2codepoint  # bare reference; presumably keeps linters from flagging the import
 except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2 fallback
 # Bound ``sub`` of a pattern matching named references such as ``&amp;``.
 handle_entities = re.compile(r"&(\w+);").sub
 def unescape(string):
    """Replace known named entity references in ``string`` with characters.

    Returns ``''`` for any falsy input.  References whose name is not in
    ``name2codepoint`` are left exactly as written.
    """
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def replace_named(match):
        codepoint = name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)  # unknown entity name: use as is
        return codepoint_to_chr(codepoint)
    return handle_entities(replace_named, string)