Add convenience methods for creating lxml trees to the BasicNewsRecipe class

2025-12-12 08:05:05 -05:00 · 2014-06-09 15:01:45 +05:30 · 2014-06-09 15:01:45 +05:30 · e58cd115e1
commit e58cd115e1
parent 90ff907df3
1 changed files with 30 additions and 20 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -634,7 +634,7 @@ class BasicNewsRecipe(Recipe):
        '''
        pass
-    def index_to_soup(self, url_or_raw, raw=False):
+    def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
        '''
        Convenience method that takes an URL to the index page and returns
        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@@ -662,6 +662,16 @@ class BasicNewsRecipe(Recipe):
                _raw = self.encoding(_raw)
            else:
                _raw = _raw.decode(self.encoding, 'replace')
+        if as_tree:
+            import html5lib
+            from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
+            from calibre.utils.cleantext import clean_xml_chars
+            if isinstance(_raw, unicode):
+                _raw = strip_encoding_declarations(_raw)
+            else:
+                _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
+            return html5lib.parse(clean_xml_chars(_raw), treebuilder='lxml', namespaceHTMLElements=False)
        massage = list(BeautifulSoup.MARKUP_MASSAGE)
        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
        massage.append((re.compile(r'&(\S+?);'), lambda match:
@@ -1157,8 +1167,6 @@ class BasicNewsRecipe(Recipe):
        if self.ignore_duplicate_articles is not None:
            feeds = self.remove_duplicate_articles(feeds)
        #feeds = FeedCollection(feeds)
        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, _('Generating masthead...'))
@@ -1228,8 +1236,6 @@ class BasicNewsRecipe(Recipe):
            except NoResultsPending:
                break
        #feeds.restore_duplicates()
        for f, feed in enumerate(feeds):
            html = self.feed2index(f,feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
@@ -1622,24 +1628,28 @@ class BasicNewsRecipe(Recipe):
        `tag`: `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
        `Tag`
        '''
-        if not tag:
+        if tag is None:
            return ''
        if isinstance(tag, basestring):
            return tag
-        strings = []
+        if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'):  # a lxml tag
-        for item in tag.contents:
+            from lxml.etree import tostring
-            if isinstance(item, (NavigableString, CData)):
+            ans = tostring(tag, method='text', encoding=unicode, with_tail=False)
-                strings.append(item.string)
+        else:
-            elif isinstance(item, Tag):
+            strings = []
-                res = self.tag_to_string(item)
+            for item in tag.contents:
-                if res:
+                if isinstance(item, (NavigableString, CData)):
-                    strings.append(res)
+                    strings.append(item.string)
-                elif use_alt:
+                elif isinstance(item, Tag):
-                    try:
+                    res = self.tag_to_string(item)
-                        strings.append(item['alt'])
+                    if res:
-                    except KeyError:
+                        strings.append(res)
-                        pass
+                    elif use_alt:
-        ans = u''.join(strings)
+                        try:
+                            strings.append(item['alt'])
+                        except KeyError:
+                            pass
+            ans = u''.join(strings)
        if normalize_whitespace:
            ans = re.sub(r'\s+', ' ', ans)
        return ans