mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add convenience methods for creating lxml trees to the BasicNewsRecipe class
This commit is contained in:
parent
90ff907df3
commit
e58cd115e1
@ -634,7 +634,7 @@ class BasicNewsRecipe(Recipe):
|
||||
'''
|
||||
pass
|
||||
|
||||
def index_to_soup(self, url_or_raw, raw=False):
|
||||
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
|
||||
'''
|
||||
Convenience method that takes an URL to the index page and returns
|
||||
a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
||||
@ -662,6 +662,16 @@ class BasicNewsRecipe(Recipe):
|
||||
_raw = self.encoding(_raw)
|
||||
else:
|
||||
_raw = _raw.decode(self.encoding, 'replace')
|
||||
if as_tree:
|
||||
import html5lib
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
if isinstance(_raw, unicode):
|
||||
_raw = strip_encoding_declarations(_raw)
|
||||
else:
|
||||
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
|
||||
return html5lib.parse(clean_xml_chars(_raw), treebuilder='lxml', namespaceHTMLElements=False)
|
||||
|
||||
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
||||
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
|
||||
massage.append((re.compile(r'&(\S+?);'), lambda match:
|
||||
@ -1157,8 +1167,6 @@ class BasicNewsRecipe(Recipe):
|
||||
if self.ignore_duplicate_articles is not None:
|
||||
feeds = self.remove_duplicate_articles(feeds)
|
||||
|
||||
#feeds = FeedCollection(feeds)
|
||||
|
||||
self.report_progress(0, _('Trying to download cover...'))
|
||||
self.download_cover()
|
||||
self.report_progress(0, _('Generating masthead...'))
|
||||
@ -1228,8 +1236,6 @@ class BasicNewsRecipe(Recipe):
|
||||
except NoResultsPending:
|
||||
break
|
||||
|
||||
#feeds.restore_duplicates()
|
||||
|
||||
for f, feed in enumerate(feeds):
|
||||
html = self.feed2index(f,feeds)
|
||||
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
|
||||
@ -1622,10 +1628,14 @@ class BasicNewsRecipe(Recipe):
|
||||
`tag`: `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
||||
`Tag`
|
||||
'''
|
||||
if not tag:
|
||||
if tag is None:
|
||||
return ''
|
||||
if isinstance(tag, basestring):
|
||||
return tag
|
||||
if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'): # a lxml tag
|
||||
from lxml.etree import tostring
|
||||
ans = tostring(tag, method='text', encoding=unicode, with_tail=False)
|
||||
else:
|
||||
strings = []
|
||||
for item in tag.contents:
|
||||
if isinstance(item, (NavigableString, CData)):
|
||||
|
Loading…
x
Reference in New Issue
Block a user