From e58cd115e1bb1b55d58bfae29bf8cc4a3ce4f295 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 9 Jun 2014 15:01:45 +0530 Subject: [PATCH] Add convenience methods for creating lxml trees to the BasicNewsRecipe class --- src/calibre/web/feeds/news.py | 50 +++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index f642ef130d..8313432b21 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -634,7 +634,7 @@ class BasicNewsRecipe(Recipe): ''' pass - def index_to_soup(self, url_or_raw, raw=False): + def index_to_soup(self, url_or_raw, raw=False, as_tree=False): ''' Convenience method that takes an URL to the index page and returns a `BeautifulSoup `_ @@ -662,6 +662,16 @@ class BasicNewsRecipe(Recipe): _raw = self.encoding(_raw) else: _raw = _raw.decode(self.encoding, 'replace') + if as_tree: + import html5lib + from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode + from calibre.utils.cleantext import clean_xml_chars + if isinstance(_raw, unicode): + _raw = strip_encoding_declarations(_raw) + else: + _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0] + return html5lib.parse(clean_xml_chars(_raw), treebuilder='lxml', namespaceHTMLElements=False) + massage = list(BeautifulSoup.MARKUP_MASSAGE) enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding massage.append((re.compile(r'&(\S+?);'), lambda match: @@ -1157,8 +1167,6 @@ class BasicNewsRecipe(Recipe): if self.ignore_duplicate_articles is not None: feeds = self.remove_duplicate_articles(feeds) - #feeds = FeedCollection(feeds) - self.report_progress(0, _('Trying to download cover...')) self.download_cover() self.report_progress(0, _('Generating masthead...')) @@ -1228,8 +1236,6 @@ class BasicNewsRecipe(Recipe): except NoResultsPending: break - #feeds.restore_duplicates() - for f, feed in enumerate(feeds): html = self.feed2index(f,feeds) feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) @@ -1622,24 +1628,28 @@ class BasicNewsRecipe(Recipe): `tag`: `BeautifulSoup `_ `Tag` ''' - if not tag: + if tag is None: return '' if isinstance(tag, basestring): return tag - strings = [] - for item in tag.contents: - if isinstance(item, (NavigableString, CData)): - strings.append(item.string) - elif isinstance(item, Tag): - res = self.tag_to_string(item) - if res: - strings.append(res) - elif use_alt: - try: - strings.append(item['alt']) - except KeyError: - pass - ans = u''.join(strings) + if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'): # a lxml tag + from lxml.etree import tostring + ans = tostring(tag, method='text', encoding=unicode, with_tail=False) + else: + strings = [] + for item in tag.contents: + if isinstance(item, (NavigableString, CData)): + strings.append(item.string) + elif isinstance(item, Tag): + res = self.tag_to_string(item) + if res: + strings.append(res) + elif use_alt: + try: + strings.append(item['alt']) + except KeyError: + pass + ans = u''.join(strings) if normalize_whitespace: ans = re.sub(r'\s+', ' ', ans) return ans