Get rid of cssselect from readability

This commit is contained in:
Kovid Goyal 2015-02-22 14:57:19 +05:30
parent 940693042f
commit 97b8daee41

View File

@@ -67,12 +67,19 @@ def shorten_title(doc):
if e.text_content(): if e.text_content():
add_match(candidates, e.text_content(), orig) add_match(candidates, e.text_content(), orig)
from cssselect import HTMLTranslator for item in [
css_to_xpath = HTMLTranslator().css_to_xpath "descendant-or-self::*[@id = 'title']",
for item in ('#title', '#head', '#heading', '.pageTitle', '.news_title', "descendant-or-self::*[@id = 'head']",
'.title', '.head', '.heading', '.contentheading', "descendant-or-self::*[@id = 'heading']",
'.small_header_red'): "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' pageTitle ')]",
for e in doc.xpath(css_to_xpath(item)): "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' news_title ')]",
"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]",
"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' head ')]",
"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' heading ')]",
"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' contentheading ')]",
"descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' small_header_red ')]"
]:
for e in doc.xpath(item):
if e.text: if e.text:
add_match(candidates, e.text, orig) add_match(candidates, e.text, orig)
if e.text_content(): if e.text_content():