Add convenience methods for creating lxml trees to the BasicNewsRecipe class

This commit is contained in:
Kovid Goyal 2014-06-09 15:01:45 +05:30
parent 90ff907df3
commit e58cd115e1

View File

@ -634,7 +634,7 @@ class BasicNewsRecipe(Recipe):
'''
pass
def index_to_soup(self, url_or_raw, raw=False):
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
'''
Convenience method that takes an URL to the index page and returns
a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@ -662,6 +662,16 @@ class BasicNewsRecipe(Recipe):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
if as_tree:
import html5lib
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars
if isinstance(_raw, unicode):
_raw = strip_encoding_declarations(_raw)
else:
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
return html5lib.parse(clean_xml_chars(_raw), treebuilder='lxml', namespaceHTMLElements=False)
massage = list(BeautifulSoup.MARKUP_MASSAGE)
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
massage.append((re.compile(r'&(\S+?);'), lambda match:
@ -1157,8 +1167,6 @@ class BasicNewsRecipe(Recipe):
if self.ignore_duplicate_articles is not None:
feeds = self.remove_duplicate_articles(feeds)
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
@ -1228,8 +1236,6 @@ class BasicNewsRecipe(Recipe):
except NoResultsPending:
break
#feeds.restore_duplicates()
for f, feed in enumerate(feeds):
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
@ -1622,10 +1628,14 @@ class BasicNewsRecipe(Recipe):
`tag`: `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
`Tag`
'''
if not tag:
if tag is None:
return ''
if isinstance(tag, basestring):
return tag
if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'): # a lxml tag
from lxml.etree import tostring
ans = tostring(tag, method='text', encoding=unicode, with_tail=False)
else:
strings = []
for item in tag.contents:
if isinstance(item, (NavigableString, CData)):