Get rid of cssselect in a couple of recipes

2025-07-09 03:04:10 -04:00 · 2015-02-22 14:25:02 +05:30 · 2015-02-22 14:25:02 +05:30 · f2d44f286b
commit f2d44f286b
parent 9248a9ffec
2 changed files with 7 additions and 21 deletions
--- a/recipes/american_thinker.recipe
+++ b/recipes/american_thinker.recipe
@@ -8,11 +8,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.cleantext import clean_xml_chars
 from lxml import etree
-def CSSSelect(expr):
-    from cssselect import HTMLTranslator
-    from lxml.etree import XPath
-    return XPath(HTMLTranslator().css_to_xpath(expr))
 class AmericanThinker(BasicNewsRecipe):
    title          = u'American Thinker'
    description    = "American Thinker is a daily internet publication devoted to the thoughtful exploration of issues of importance to Americans."
@@ -27,7 +22,6 @@ class AmericanThinker(BasicNewsRecipe):
    remove_javascript     = True
    auto_cleanup = True
    #remove_tags_before = dict(name='h1')
    conversion_options = {
                          'comment'   : description
@@ -41,13 +35,10 @@ class AmericanThinker(BasicNewsRecipe):
        root = html5lib.parse(
            clean_xml_chars(raw), treebuilder='lxml',
            namespaceHTMLElements=False)
-        for x in CSSSelect('.article_body.bottom')(root):
+        for x in root.xpath('''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' article_body ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' bottom '))]'''):
            x.getparent().remove(x)
        return etree.tostring(root, encoding=unicode)
    feeds = [(u'http://feeds.feedburner.com/americanthinker'),
                     (u'http://feeds.feedburner.com/AmericanThinkerBlog')
            ]
    #def print_version(self, url):
        #return 'http://www.americanthinker.com/assets/3rd_party/printpage/?url=' + url
--- a/recipes/newsweek.recipe
+++ b/recipes/newsweek.recipe
@@ -1,11 +1,6 @@
 from calibre.web.feeds.jsnews import JavascriptRecipe
-from cssselect import HTMLTranslator
-from lxml.etree import XPath
 import datetime
-def CSSSelect(expr):
-    return XPath(HTMLTranslator().css_to_xpath(expr))
 BASE = 'http://www.newsweek.com'
 def href_to_url(a, add_piano=False):
    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')
@@ -40,16 +35,16 @@ class Newsweek(JavascriptRecipe):
    def get_publication_data(self, browser):
        browser.wait_for_element('nav.main-menu a[href]')
        root = self.index_to_soup(browser.html)
-        for a in CSSSelect('nav.main-menu a[href]')(root):
+        for a in root.xpath('''descendant-or-self::nav[@class and contains(concat(' ', normalize-space(@class), ' '), ' main-menu ')]/descendant-or-self::*/a[@href]'''):
            if a.text and a.text.strip() == 'This Week\'s Edition':
                return self.get_newsweek_publication_data(browser, href_to_url(a, True))
    def get_newsweek_publication_data(self, browser, url):
        root = self.index_to_soup(url)
-        sel = lambda expr: CSSSelect(expr)(root)
+        sel = lambda expr: root.xpath(expr)
        ans = {}
-        for img in sel('div.cover-story div.info img[src]'):
+        for img in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]/descendant-or-self::*/img[@src]'''):
            if '_Cover_' in img.get('title', ''):
                ans['cover'] = browser.get_resource(img.get('src'))
                break
@@ -59,7 +54,7 @@ class Newsweek(JavascriptRecipe):
                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
        sections = []
-        for div in sel('div.cover-story div.info'):
+        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]'''):
            url = None
            for a in div.xpath('descendant::a[@href]'):
                url = href_to_url(a)
@@ -68,7 +63,7 @@ class Newsweek(JavascriptRecipe):
                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
                break
        features = []
-        for li in sel('div.features li'):
+        for li in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' features ')]/descendant-or-self::*/li'''):
            url = None
            for a in li.xpath('descendant::a[@class="article-link"]'):
                url = href_to_url(a)
@@ -77,7 +72,7 @@ class Newsweek(JavascriptRecipe):
        if features:
            sections.append(('Features', features))
-        for div in sel('div.issue-list-block'):
+        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' issue-list-block ')]'''):
            for d in div.xpath('descendant::div[@class="block-title"]'):
                section_title = self.tag_to_string(d)
                articles = []