mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add convenience methods for creating lxml trees to the BasicNewsRecipe class
This commit is contained in:
parent
90ff907df3
commit
e58cd115e1
@ -634,7 +634,7 @@ class BasicNewsRecipe(Recipe):
|
|||||||
'''
|
'''
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def index_to_soup(self, url_or_raw, raw=False):
|
def index_to_soup(self, url_or_raw, raw=False, as_tree=False):
|
||||||
'''
|
'''
|
||||||
Convenience method that takes an URL to the index page and returns
|
Convenience method that takes an URL to the index page and returns
|
||||||
a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
||||||
@ -662,6 +662,16 @@ class BasicNewsRecipe(Recipe):
|
|||||||
_raw = self.encoding(_raw)
|
_raw = self.encoding(_raw)
|
||||||
else:
|
else:
|
||||||
_raw = _raw.decode(self.encoding, 'replace')
|
_raw = _raw.decode(self.encoding, 'replace')
|
||||||
|
if as_tree:
|
||||||
|
import html5lib
|
||||||
|
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
||||||
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
if isinstance(_raw, unicode):
|
||||||
|
_raw = strip_encoding_declarations(_raw)
|
||||||
|
else:
|
||||||
|
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
|
||||||
|
return html5lib.parse(clean_xml_chars(_raw), treebuilder='lxml', namespaceHTMLElements=False)
|
||||||
|
|
||||||
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
||||||
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
|
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
|
||||||
massage.append((re.compile(r'&(\S+?);'), lambda match:
|
massage.append((re.compile(r'&(\S+?);'), lambda match:
|
||||||
@ -1157,8 +1167,6 @@ class BasicNewsRecipe(Recipe):
|
|||||||
if self.ignore_duplicate_articles is not None:
|
if self.ignore_duplicate_articles is not None:
|
||||||
feeds = self.remove_duplicate_articles(feeds)
|
feeds = self.remove_duplicate_articles(feeds)
|
||||||
|
|
||||||
#feeds = FeedCollection(feeds)
|
|
||||||
|
|
||||||
self.report_progress(0, _('Trying to download cover...'))
|
self.report_progress(0, _('Trying to download cover...'))
|
||||||
self.download_cover()
|
self.download_cover()
|
||||||
self.report_progress(0, _('Generating masthead...'))
|
self.report_progress(0, _('Generating masthead...'))
|
||||||
@ -1228,8 +1236,6 @@ class BasicNewsRecipe(Recipe):
|
|||||||
except NoResultsPending:
|
except NoResultsPending:
|
||||||
break
|
break
|
||||||
|
|
||||||
#feeds.restore_duplicates()
|
|
||||||
|
|
||||||
for f, feed in enumerate(feeds):
|
for f, feed in enumerate(feeds):
|
||||||
html = self.feed2index(f,feeds)
|
html = self.feed2index(f,feeds)
|
||||||
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
|
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
|
||||||
@ -1622,24 +1628,28 @@ class BasicNewsRecipe(Recipe):
|
|||||||
`tag`: `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
`tag`: `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
||||||
`Tag`
|
`Tag`
|
||||||
'''
|
'''
|
||||||
if not tag:
|
if tag is None:
|
||||||
return ''
|
return ''
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
return tag
|
return tag
|
||||||
strings = []
|
if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'): # a lxml tag
|
||||||
for item in tag.contents:
|
from lxml.etree import tostring
|
||||||
if isinstance(item, (NavigableString, CData)):
|
ans = tostring(tag, method='text', encoding=unicode, with_tail=False)
|
||||||
strings.append(item.string)
|
else:
|
||||||
elif isinstance(item, Tag):
|
strings = []
|
||||||
res = self.tag_to_string(item)
|
for item in tag.contents:
|
||||||
if res:
|
if isinstance(item, (NavigableString, CData)):
|
||||||
strings.append(res)
|
strings.append(item.string)
|
||||||
elif use_alt:
|
elif isinstance(item, Tag):
|
||||||
try:
|
res = self.tag_to_string(item)
|
||||||
strings.append(item['alt'])
|
if res:
|
||||||
except KeyError:
|
strings.append(res)
|
||||||
pass
|
elif use_alt:
|
||||||
ans = u''.join(strings)
|
try:
|
||||||
|
strings.append(item['alt'])
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
ans = u''.join(strings)
|
||||||
if normalize_whitespace:
|
if normalize_whitespace:
|
||||||
ans = re.sub(r'\s+', ' ', ans)
|
ans = re.sub(r'\s+', ' ', ans)
|
||||||
return ans
|
return ans
|
||||||
|
Loading…
x
Reference in New Issue
Block a user