mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Get rid of cssselect in a couple of recipes
This commit is contained in:
parent
9248a9ffec
commit
f2d44f286b
@ -8,11 +8,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
def CSSSelect(expr):
|
|
||||||
from cssselect import HTMLTranslator
|
|
||||||
from lxml.etree import XPath
|
|
||||||
return XPath(HTMLTranslator().css_to_xpath(expr))
|
|
||||||
|
|
||||||
class AmericanThinker(BasicNewsRecipe):
|
class AmericanThinker(BasicNewsRecipe):
|
||||||
title = u'American Thinker'
|
title = u'American Thinker'
|
||||||
description = "American Thinker is a daily internet publication devoted to the thoughtful exploration of issues of importance to Americans."
|
description = "American Thinker is a daily internet publication devoted to the thoughtful exploration of issues of importance to Americans."
|
||||||
@ -27,7 +22,6 @@ class AmericanThinker(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
auto_cleanup = True
|
auto_cleanup = True
|
||||||
#remove_tags_before = dict(name='h1')
|
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'comment' : description
|
'comment' : description
|
||||||
@ -41,13 +35,10 @@ class AmericanThinker(BasicNewsRecipe):
|
|||||||
root = html5lib.parse(
|
root = html5lib.parse(
|
||||||
clean_xml_chars(raw), treebuilder='lxml',
|
clean_xml_chars(raw), treebuilder='lxml',
|
||||||
namespaceHTMLElements=False)
|
namespaceHTMLElements=False)
|
||||||
for x in CSSSelect('.article_body.bottom')(root):
|
for x in root.xpath('''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' article_body ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' bottom '))]'''):
|
||||||
x.getparent().remove(x)
|
x.getparent().remove(x)
|
||||||
return etree.tostring(root, encoding=unicode)
|
return etree.tostring(root, encoding=unicode)
|
||||||
|
|
||||||
feeds = [(u'http://feeds.feedburner.com/americanthinker'),
|
feeds = [(u'http://feeds.feedburner.com/americanthinker'),
|
||||||
(u'http://feeds.feedburner.com/AmericanThinkerBlog')
|
(u'http://feeds.feedburner.com/AmericanThinkerBlog')
|
||||||
]
|
]
|
||||||
|
|
||||||
#def print_version(self, url):
|
|
||||||
#return 'http://www.americanthinker.com/assets/3rd_party/printpage/?url=' + url
|
|
||||||
|
@ -1,11 +1,6 @@
|
|||||||
from calibre.web.feeds.jsnews import JavascriptRecipe
|
from calibre.web.feeds.jsnews import JavascriptRecipe
|
||||||
from cssselect import HTMLTranslator
|
|
||||||
from lxml.etree import XPath
|
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
def CSSSelect(expr):
|
|
||||||
return XPath(HTMLTranslator().css_to_xpath(expr))
|
|
||||||
|
|
||||||
BASE = 'http://www.newsweek.com'
|
BASE = 'http://www.newsweek.com'
|
||||||
def href_to_url(a, add_piano=False):
|
def href_to_url(a, add_piano=False):
|
||||||
return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')
|
return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')
|
||||||
@ -40,16 +35,16 @@ class Newsweek(JavascriptRecipe):
|
|||||||
def get_publication_data(self, browser):
|
def get_publication_data(self, browser):
|
||||||
browser.wait_for_element('nav.main-menu a[href]')
|
browser.wait_for_element('nav.main-menu a[href]')
|
||||||
root = self.index_to_soup(browser.html)
|
root = self.index_to_soup(browser.html)
|
||||||
for a in CSSSelect('nav.main-menu a[href]')(root):
|
for a in root.xpath('''descendant-or-self::nav[@class and contains(concat(' ', normalize-space(@class), ' '), ' main-menu ')]/descendant-or-self::*/a[@href]'''):
|
||||||
if a.text and a.text.strip() == 'This Week\'s Edition':
|
if a.text and a.text.strip() == 'This Week\'s Edition':
|
||||||
return self.get_newsweek_publication_data(browser, href_to_url(a, True))
|
return self.get_newsweek_publication_data(browser, href_to_url(a, True))
|
||||||
|
|
||||||
def get_newsweek_publication_data(self, browser, url):
|
def get_newsweek_publication_data(self, browser, url):
|
||||||
root = self.index_to_soup(url)
|
root = self.index_to_soup(url)
|
||||||
sel = lambda expr: CSSSelect(expr)(root)
|
sel = lambda expr: root.xpath(expr)
|
||||||
ans = {}
|
ans = {}
|
||||||
|
|
||||||
for img in sel('div.cover-story div.info img[src]'):
|
for img in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]/descendant-or-self::*/img[@src]'''):
|
||||||
if '_Cover_' in img.get('title', ''):
|
if '_Cover_' in img.get('title', ''):
|
||||||
ans['cover'] = browser.get_resource(img.get('src'))
|
ans['cover'] = browser.get_resource(img.get('src'))
|
||||||
break
|
break
|
||||||
@ -59,7 +54,7 @@ class Newsweek(JavascriptRecipe):
|
|||||||
self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
|
self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
|
||||||
|
|
||||||
sections = []
|
sections = []
|
||||||
for div in sel('div.cover-story div.info'):
|
for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]'''):
|
||||||
url = None
|
url = None
|
||||||
for a in div.xpath('descendant::a[@href]'):
|
for a in div.xpath('descendant::a[@href]'):
|
||||||
url = href_to_url(a)
|
url = href_to_url(a)
|
||||||
@ -68,7 +63,7 @@ class Newsweek(JavascriptRecipe):
|
|||||||
sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
|
sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
|
||||||
break
|
break
|
||||||
features = []
|
features = []
|
||||||
for li in sel('div.features li'):
|
for li in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' features ')]/descendant-or-self::*/li'''):
|
||||||
url = None
|
url = None
|
||||||
for a in li.xpath('descendant::a[@class="article-link"]'):
|
for a in li.xpath('descendant::a[@class="article-link"]'):
|
||||||
url = href_to_url(a)
|
url = href_to_url(a)
|
||||||
@ -77,7 +72,7 @@ class Newsweek(JavascriptRecipe):
|
|||||||
if features:
|
if features:
|
||||||
sections.append(('Features', features))
|
sections.append(('Features', features))
|
||||||
|
|
||||||
for div in sel('div.issue-list-block'):
|
for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' issue-list-block ')]'''):
|
||||||
for d in div.xpath('descendant::div[@class="block-title"]'):
|
for d in div.xpath('descendant::div[@class="block-title"]'):
|
||||||
section_title = self.tag_to_string(d)
|
section_title = self.tag_to_string(d)
|
||||||
articles = []
|
articles = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user