Improved Economist

Kovid Goyal 2010-04-07 18:07:12 +05:30
parent 47cee548dc
commit 5dfc08ed4e
3 changed files with 21 additions and 5 deletions

View File

@@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-import mechanize, string, urllib, time
+import mechanize, string, urllib, time, re
 class Economist(BasicNewsRecipe):
@@ -27,6 +27,8 @@ class Economist(BasicNewsRecipe):
                    dict(attrs={'class':['dblClkTrk']})]
     remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
     needs_subscription = True
+    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
+            lambda x:'</html>')]
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -111,11 +113,15 @@ class Economist(BasicNewsRecipe):
                 yield x
     def postprocess_html(self, soup, first):
+        body = soup.find('body')
+        for name, val in body.attrs:
+            del body[name]
         for table in list(self.eco_find_image_tables(soup)):
             caption = table.find('font')
             img = table.find('img')
             div = Tag(soup, 'div')
-            div['style'] = 'text-align:center;font-size:70%'
+            div['style'] = 'text-align:left;font-size:70%'
             ns = NavigableString(self.tag_to_string(caption))
             div.insert(0, ns)
             div.insert(1, Tag(soup, 'br'))
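
The preprocess_regexps pair added here is applied to the downloaded HTML before it is turned into a soup, so anything the page carries after the closing </html> tag never reaches the parser. A minimal standalone sketch of what that (pattern, replacement) pair does, using only the standard re module (the sample HTML string is made up for illustration):

import re

# Same pattern/replacement as the recipe: re.DOTALL lets '.' match newlines,
# so everything after the closing tag collapses to a bare '</html>'.
pattern = re.compile('</html>.*', re.DOTALL)
replacement = lambda match: '</html>'

raw = '<html><body><p>Article text</p></body></html>\n<script>tracker()</script>'
print(pattern.sub(replacement, raw))
# -> <html><body><p>Article text</p></body></html>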

View File

@@ -1,7 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.threadpool import ThreadPool, makeRequests
 from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-import time, string
+import time, string, re
 from datetime import datetime
 from lxml import html
@@ -19,9 +19,13 @@ class Economist(BasicNewsRecipe):
     remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
                    dict(attrs={'class':['dblClkTrk']})]
     remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
+            lambda x:'</html>')]
     def parse_index(self):
         from calibre.web.feeds.feedparser import parse
+        if self.test:
+            self.oldest_article = 14.0
         raw = self.index_to_soup(
                 'http://feeds.feedburner.com/economist/full_print_edition',
                 raw=True)
@@ -44,6 +48,8 @@ class Economist(BasicNewsRecipe):
             author = item.get('author', '')
             requests.append([i, link, title, description, author, published])
+        if self.test:
+            requests = requests[:4]
         requests = makeRequests(self.process_eco_feed_article, requests, self.eco_article_found,
                 self.eco_article_failed)
         for r in requests: pool.putRequest(r)
@@ -114,11 +120,14 @@ class Economist(BasicNewsRecipe):
                 yield x
     def postprocess_html(self, soup, first):
+        body = soup.find('body')
+        for name, val in body.attrs:
+            del body[name]
         for table in list(self.eco_find_image_tables(soup)):
             caption = table.find('font')
             img = table.find('img')
             div = Tag(soup, 'div')
-            div['style'] = 'text-align:center;font-size:70%'
+            div['style'] = 'text-align:left;font-size:70%'
             ns = NavigableString(self.tag_to_string(caption))
             div.insert(0, ns)
             div.insert(1, Tag(soup, 'br'))
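
Both recipes now strip every attribute from the <body> tag in postprocess_html, presumably so that stray styling or event-handler attributes on the body do not leak into the generated e-book. The recipes use calibre's bundled BeautifulSoup, where tag.attrs is a list of (name, value) pairs, which is why the diff iterates over pairs; a rough equivalent with the standalone bs4 package, shown only as an illustration on a made-up snippet of HTML:

from bs4 import BeautifulSoup  # illustration only; the recipe imports calibre.ebooks.BeautifulSoup

html = '<html><body onload="init()" class="js" style="margin:0"><p>Leader</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')

body = soup.find('body')
for name in list(body.attrs):   # copy the keys so deletion is safe while iterating
    del body[name]              # same effect as the recipe's `del body[name]`

print(body)  # -> <body><p>Leader</p></body>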

View File

@@ -65,7 +65,8 @@ def debug(ioreg_to_tmp=False, buf=None):
         ioreg += 'Output from osx_get_usb_drives:\n'+drives+'\n\n'
         ioreg += Device.run_ioreg()
     connected_devices = []
-    for dev in device_plugins():
+    for dev in sorted(device_plugins(), cmp=lambda
+            x,y:cmp(x.__class__.__name__, y.__class__.__name__)):
         out('Looking for', dev.__class__.__name__)
         connected, det = s.is_device_connected(dev, debug=True)
         if connected:
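
The device-debug loop now visits plugins in alphabetical order of their class names, which keeps the 'Looking for ...' log lines stable from run to run. A small sketch of the same ordering written with a key function instead of the Python 2 cmp= argument (the plugin classes and the device_plugins stub below are placeholders, not calibre's real registry):

# Placeholder classes and registry, only to demonstrate the ordering.
class KINDLE2(object): pass
class PRS505(object): pass
class ANDROID(object): pass

def device_plugins():
    return [KINDLE2(), PRS505(), ANDROID()]

for dev in sorted(device_plugins(), key=lambda d: d.__class__.__name__):
    print('Looking for', dev.__class__.__name__)
# prints ANDROID, KINDLE2, PRS505 in that order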