New recipe for The New York Times based on mobile.nytimes.com. Fixes #1281 (New York Times Recipe producing large files)

This commit is contained in:
Kovid Goyal 2008-11-21 09:26:47 -08:00
parent 5a4752fd38
commit bd1d6ca3f3
8 changed files with 223 additions and 94 deletions

View File

@ -142,7 +142,7 @@ def get_proxies():
return proxies
def browser(honor_time=True, max_time=2):
def browser(honor_time=True, max_time=2, mobile_browser=False):
'''
Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses a proxy if available.
@ -153,7 +153,8 @@ def browser(honor_time=True, max_time=2):
opener = mechanize.Browser()
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
http_proxy = get_proxies().get('http', None)
if http_proxy:
opener.set_proxies({'http':http_proxy})
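
A minimal sketch of how the new flag might be used (assuming ``browser`` remains importable from the top-level ``calibre`` package, as in this file; the URL is illustrative):

    from calibre import browser

    # Default: desktop Firefox user agent, behaviour unchanged
    desktop = browser()

    # New: mobile (Minimo/Windows CE) user agent, so sites such as
    # mobile.nytimes.com serve their lightweight pages
    mobile = browser(mobile_browser=True)
    raw = mobile.open('http://mobile.nytimes.com').read()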

View File

@ -93,18 +93,18 @@ class HTMLProcessor(Processor, Rationalizer):
p = QPixmap()
p.load(path)
if not p.isNull():
p.save(path+'.jpg')
p.save(path+'_calibre_converted.jpg')
os.remove(path)
for key, val in self.resource_map.items():
if val == rpath:
self.resource_map[key] = rpath+'.jpg'
img.set('src', rpath+'.jpg')
self.resource_map[key] = rpath+'_calibre_converted.jpg'
img.set('src', rpath+'_calibre_converted.jpg')
def save(self):
for meta in list(self.root.xpath('//meta')):
meta.getparent().remove(meta)
for img in self.root.xpath('//img[@src]'):
self.convert_image(img)
#for img in self.root.xpath('//img[@src]'):
# self.convert_image(img)
Processor.save(self)

View File

@ -837,7 +837,7 @@ class Main(MainWindow, Ui_MainWindow):
self.job_exception(job)
return
mi = get_metadata(open(pt.name, 'rb'), fmt, use_libprs_metadata=False)
mi.tags = ['news', recipe.title]
mi.tags = [_('News'), recipe.title]
paths, formats, metadata = [pt.name], [fmt], [mi]
self.library_view.model().add_books(paths, formats, metadata, add_duplicates=True)
callback(recipe)

View File

@ -142,8 +142,89 @@ Real life example
A reasonably complex real life example that exposes more of the :term:`API` of ``BasicNewsRecipe`` is the :term:`recipe` for *The New York Times*
.. literalinclude:: ../web/feeds/recipes/nytimes.py
:linenos:
.. code-block:: python
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class NYTimes(BasicNewsRecipe):
title = 'The New York Times'
__author__ = 'Kovid Goyal'
description = 'Daily news from the New York Times'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
no_stylesheets = True
extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def parse_index(self):
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
ans = []
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
articles[key] = []
ans.append(key)
elif div['class'] in ['story', 'story headline']:
a = div.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
if not 'podcasts' in url:
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
We see several new features in this :term:`recipe`. First, we have::
@ -164,12 +245,14 @@ The next interesting feature is::
needs_subscription = True
...
def get_growser(self):
def get_browser(self):
...
``needs_subscription = True`` tells |app| that this recipe needs a username and password in order to access the content. This causes |app| to ask for a username and password whenever you try to use this recipe. The code in :meth:`calibre.web.feeds.news.BasicNewsRecipe.get_browser` actually logs into the NYT website. Once logged in, |app| uses the same logged-in browser instance to fetch all content. See `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_ to understand the code in ``get_browser``.
The last new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *today's* paper. While this is more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the day's paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.
The next new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *today's* paper. While this is more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the day's paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.
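For reference, ``parse_index`` must return a list of ``(feed title, list of articles)`` tuples, where each article is a dictionary with ``title``, ``url``, ``date``, ``description`` and ``content`` keys. A minimal sketch of the expected shape (the values are illustrative, not taken from the site)::

    [
        ('The Front Page', [
            {'title'      : 'An example headline',
             'url'        : 'http://www.nytimes.com/2008/11/21/example.html?pagewanted=all',
             'date'       : 'Fri, 21 Nov',
             'description': 'One line summary taken from the summary div.',
             'content'    : ''},
        ]),
    ]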
The final new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.preprocess_html` method. It can be used to perform arbitrary transformations on every downloaded HTML page. Here it is used to bypass the ads that nytimes.com shows you before each article.
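For instance (the value below is made up), ``refresh.get('content').partition('=')[2]`` extracts the redirect target from a typical meta refresh value, and the recipe then fetches that page directly::

    >>> '0; url=/2008/11/21/us/politics/21example.html'.partition('=')[2]
    '/2008/11/21/us/politics/21example.html'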
Tips for developing new recipes
---------------------------------

View File

@ -155,7 +155,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
if not os.path.exists(recipe.output_dir):
os.makedirs(recipe.output_dir)
recipe.download()
recipe.download(for_lrf=True)
return recipe

View File

@ -59,6 +59,9 @@ class BasicNewsRecipe(object, LoggingInterface):
#: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
simultaneous_downloads = 5
#: If False the remote server is contacted by only one thread at a time
multithreaded_fetch = False
#: Timeout for fetching files from server in seconds
timeout = 120.0
@ -108,7 +111,7 @@ class BasicNewsRecipe(object, LoggingInterface):
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files.
#: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overrinding all :term:`CSS` except that which is
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
#: declared using the style attribute on individual :term:`HTML` tags.
#: For example::
#:
@ -272,7 +275,15 @@ class BasicNewsRecipe(object, LoggingInterface):
raise NotImplementedError
@classmethod
def get_browser(self):
def image_url_processor(cls, baseurl, url):
'''
Perform some processing on image URLs (perhaps removing size restrictions for
dynamically generated images, etc.) and return the processed URL.
'''
return url
@classmethod
def get_browser(cls, *args, **kwargs):
'''
Return a browser instance used to fetch documents from the web. By default
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
@ -294,7 +305,7 @@ class BasicNewsRecipe(object, LoggingInterface):
return br
'''
return browser()
return browser(*args, **kwargs)
def get_article_url(self, article):
'''
@ -338,7 +349,7 @@ class BasicNewsRecipe(object, LoggingInterface):
'''
pass
def index_to_soup(self, url_or_raw):
def index_to_soup(self, url_or_raw, raw=False):
'''
Convenience method that takes a URL to the index page and returns
a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@ -354,6 +365,8 @@ class BasicNewsRecipe(object, LoggingInterface):
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
raw = url_or_raw
if raw:
return raw
if not isinstance(raw, unicode) and self.encoding:
raw = raw.decode(self.encoding)
massage = list(BeautifulSoup.MARKUP_MASSAGE)
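
The new ``raw`` parameter lets a recipe skip BeautifulSoup and parse the page with another library; the mobile NYT recipe below uses it with lxml. A hedged sketch of that pattern, inside a recipe method:

    from lxml import html

    # raw=True returns the undecoded page source instead of a soup
    raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
    root = html.fromstring(raw)
    section_links = root.xpath('//a[starts-with(@href, "section")]')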
@ -524,7 +537,7 @@ class BasicNewsRecipe(object, LoggingInterface):
return self.postprocess_html(soup, first_fetch)
def download(self):
def download(self, for_lrf=False):
'''
Download and pre-process all articles from the feeds in this recipe.
This method should be called only once on a particular Recipe instance.
@ -622,11 +635,14 @@ class BasicNewsRecipe(object, LoggingInterface):
return logger, out
def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
self.web2disk_options.browser = self.browser
self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
fetcher.image_url_processor = self.image_url_processor
if self.multithreaded_fetch:
fetcher.browser_lock = fetcher.DUMMY_LOCK
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
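
Putting the new hooks together, a recipe can opt into per-thread browsers and image URL rewriting like this (a hedged sketch; the class name, the regular expression and the use of ``mobile_browser`` are illustrative):

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        # Give every worker thread its own browser instead of sharing one
        multithreaded_fetch = True

        def get_browser(self):
            # Extra arguments are now forwarded to calibre's browser() helper
            return BasicNewsRecipe.get_browser(mobile_browser=True)

        def image_url_processor(self, baseurl, url):
            # Rewrite image URLs before they are fetched, e.g. to drop
            # query-string size restrictions
            return re.sub(r'\?.*$', '', url)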

View File

@ -1,87 +1,105 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
class NYTimes(BasicNewsRecipe):
'''
mobile.nytimes.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html
class NYTimesMobile(BasicNewsRecipe):
title = 'The New York Times'
__author__ = 'Kovid Goyal'
description = 'Daily news from the New York Times'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
timefmt = ' [%a, %d %b, %Y]'
multithreaded_fetch = True
max_articles_per_feed = 15
no_stylesheets = True
extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
extra_css = '''
.h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
.h2 { font-size: large; font-weight: bold }
.credit { font-size: small }
.aut { font-weight: bold }
.bodycontent { font-family: serif }
'''
remove_tags = [
dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
dict(name='a', href='/main')
]
remove_tags_after = [
dict(name='a', attrs={'name': 'bottom'})
]
def image_url_processor(self, baseurl, url):
return re.sub(r'(&|&amp;).*', '', url)
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
return BasicNewsRecipe.get_browser(mobile_browser=True)
def download(self, for_lrf=False):
if for_lrf:
self.max_articles_per_feed = 10
return BasicNewsRecipe.download(self, for_lrf=for_lrf)
def process_section(self, href):
raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
articles = []
while True:
root = html.fromstring(raw)
for art in self.find_articles(root):
append = True
for x in articles:
if x['title'] == art['title']:
append = False
break
if append: articles.append(art)
more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
if not more:
break
href = more[0].get('href')
raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
return articles
def find_articles(self, root):
for a in root.xpath('//a[@accesskey]'):
href = a.get('href')
yield {
'title': a.text.strip(),
'date' : '',
'url' : 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
'description': '',
}
def parse_index(self):
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
ans = []
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
root = html.fromstring(raw)
feeds = [('Latest news', list(self.find_articles(root)))]
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
articles[key] = []
ans.append(key)
for a in root.xpath('//a[starts-with(@href, "section")]'):
title = a.text.replace('&raquo;', '').replace(u'\xbb', '').strip()
print 'Processing section:', title
articles = self.process_section(a.get('href'))
feeds.append((title, articles))
elif div['class'] in ['story', 'story headline']:
a = div.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
if not 'podcasts' in url:
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
return feeds
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def postprocess_html(self, soup, first_fetch):
for img in soup.findAll('img', width=True):
try:
width = int(img['width'].replace('px', ''))
if width < 5:
img.extract()
continue
except:
pass
del img['width']
del img['height']
del img.parent['style']
return soup
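
As an illustration of this recipe's ``image_url_processor`` (the image URL below is made up), the substitution drops everything from the first ``&`` onwards, removing the size parameters:

    >>> import re
    >>> re.sub(r'(&|&amp;).*', '', 'http://graphics8.nytimes.com/images/example.jpg&f=90&q=75')
    'http://graphics8.nytimes.com/images/example.jpg'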

View File

@ -71,6 +71,11 @@ class response(str):
def __init__(self, *args):
str.__init__(self, *args)
self.newurl = None
class DummyLock(object):
def __enter__(self, *args): return self
def __exit__(self, *args): pass
class RecursiveFetcher(object, LoggingInterface):
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
@ -82,6 +87,7 @@ class RecursiveFetcher(object, LoggingInterface):
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
DUMMY_LOCK = DummyLock()
def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
LoggingInterface.__init__(self, logger)
@ -103,6 +109,8 @@ class RecursiveFetcher(object, LoggingInterface):
self.imagemap = image_map
self.imagemap_lock = threading.RLock()
self.stylemap = css_map
self.image_url_processor = None
self.browser_lock = _browser_lock
self.stylemap_lock = threading.RLock()
self.downloaded_paths = []
self.current_dir = self.base_dir
@ -166,7 +174,7 @@ class RecursiveFetcher(object, LoggingInterface):
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)
with _browser_lock:
with self.browser_lock:
try:
with closing(self.browser.open(url)) as f:
data = response(f.read()+f.read())
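
The no-op lock keeps this fetch code identical for both modes: with a shared browser the real lock serializes requests across threads, while with per-thread browsers (``multithreaded_fetch = True``) the dummy lock makes the ``with`` block free. A standalone sketch of the idea (illustrative, not calibre code):

    import threading

    class DummyLock(object):
        def __enter__(self, *args): return self
        def __exit__(self, *args): pass

    shared_lock = threading.RLock()   # one browser shared by every thread
    no_lock = DummyLock()             # each thread owns its browser

    for lock in (shared_lock, no_lock):
        with lock:
            pass  # self.browser.open(url) would go here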
@ -271,8 +279,11 @@ class RecursiveFetcher(object, LoggingInterface):
os.mkdir(diskpath)
c = 0
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
ext = ext[:5]
iurl = tag['src']
if callable(self.image_url_processor):
iurl = self.image_url_processor(baseurl, iurl)
ext = os.path.splitext(iurl)[1]
ext = ext[:5]
#if not ext:
# self.log_debug('Skipping extensionless image %s', iurl)
# continue