Add convenience methods for creating lxml trees to the BasicNewsRecipe class

2025-12-10 23:25:01 -05:00 · 2014-06-09 15:01:45 +05:30 · 2014-06-09 15:01:45 +05:30 · e58cd115e1
commit e58cd115e1
parent 90ff907df3
1 changed file with 30 additions and 20 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -634,7 +634,7 @@ class BasicNewsRecipe(Recipe):
        '''
        pass

-    def index_to_soup(self, url_or_raw, raw=False):
+    def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
        '''
        Convenience method that takes an URL to the index page and returns
        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@@ -662,6 +662,16 @@ class BasicNewsRecipe(Recipe):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
+        if as_tree:
+            import html5lib
+            from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
+            from calibre.utils.cleantext import clean_xml_chars
+            if isinstance(_raw, unicode):
+                _raw = strip_encoding_declarations(_raw)
+            else:
+                _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
+            return html5lib.parse(clean_xml_chars(_raw), treebuilder='lxml', namespaceHTMLElements=False)
+
        massage = list(BeautifulSoup.MARKUP_MASSAGE)
        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
        massage.append((re.compile(r'&(\S+?);'), lambda match:
@@ -1157,8 +1167,6 @@ class BasicNewsRecipe(Recipe):
        if self.ignore_duplicate_articles is not None:
            feeds = self.remove_duplicate_articles(feeds)

-        #feeds = FeedCollection(feeds)
-
        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, _('Generating masthead...'))
@@ -1228,8 +1236,6 @@ class BasicNewsRecipe(Recipe):
            except NoResultsPending:
                break

-        #feeds.restore_duplicates()
-
        for f, feed in enumerate(feeds):
            html = self.feed2index(f,feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
@@ -1622,10 +1628,14 @@ class BasicNewsRecipe(Recipe):
        `tag`: `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
        `Tag`
        '''
-        if not tag:
+        if tag is None:
            return ''
        if isinstance(tag, basestring):
            return tag
+        if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'):  # a lxml tag
+            from lxml.etree import tostring
+            ans = tostring(tag, method='text', encoding=unicode, with_tail=False)
+        else:
            strings = []
            for item in tag.contents:
                if isinstance(item, (NavigableString, CData)):