mirror of https://github.com/kovidgoyal/calibre.git
New recipe for The New York Times based on mobile.nytimes.com. Fixes #1281 (New York Times Recipe producing large files)
This commit is contained in:
parent 5a4752fd38
commit bd1d6ca3f3
@@ -142,7 +142,7 @@ def get_proxies():
     return proxies


-def browser(honor_time=True, max_time=2):
+def browser(honor_time=True, max_time=2, mobile_browser=False):
     '''
     Create a mechanize browser for web scraping. The browser handles cookies,
     refresh requests and ignores robots.txt. Also uses proxy if avaialable.
@@ -153,7 +153,8 @@ def browser(honor_time=True, max_time=2):
     opener = mechanize.Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
-    opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
+    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
+                          'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     http_proxy = get_proxies().get('http', None)
     if http_proxy:
         opener.set_proxies({'http':http_proxy})
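Not part of the commit, but for orientation: the new ``mobile_browser`` keyword lets a caller ask for a handheld user agent so that sites such as mobile.nytimes.com serve their lightweight pages. A minimal sketch, assuming the function is importable as ``calibre.browser``:

    # Illustrative only -- not from this diff. Assumes browser() is exposed
    # at the package level as calibre.browser.
    from calibre import browser

    desktop = browser()                    # default Firefox user agent
    mobile  = browser(mobile_browser=True) # Minimo (Windows CE) user agent
    page = mobile.open('http://mobile.nytimes.com').read()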
@@ -93,18 +93,18 @@ class HTMLProcessor(Processor, Rationalizer):
             p = QPixmap()
             p.load(path)
             if not p.isNull():
-                p.save(path+'.jpg')
+                p.save(path+'_calibre_converted.jpg')
                 os.remove(path)
                 for key, val in self.resource_map.items():
                     if val == rpath:
-                        self.resource_map[key] = rpath+'.jpg'
-            img.set('src', rpath+'.jpg')
+                        self.resource_map[key] = rpath+'_calibre_converted.jpg'
+            img.set('src', rpath+'_calibre_converted.jpg')

     def save(self):
         for meta in list(self.root.xpath('//meta')):
             meta.getparent().remove(meta)
-        for img in self.root.xpath('//img[@src]'):
-            self.convert_image(img)
+        #for img in self.root.xpath('//img[@src]'):
+        #    self.convert_image(img)
         Processor.save(self)


@@ -837,7 +837,7 @@ class Main(MainWindow, Ui_MainWindow):
             self.job_exception(job)
             return
         mi = get_metadata(open(pt.name, 'rb'), fmt, use_libprs_metadata=False)
-        mi.tags = ['news', recipe.title]
+        mi.tags = [_('News'), recipe.title]
         paths, formats, metadata = [pt.name], [fmt], [mi]
         self.library_view.model().add_books(paths, formats, metadata, add_duplicates=True)
         callback(recipe)
@@ -142,8 +142,89 @@ Real life example

 A reasonably complex real life example that exposes more of the :term:`API` of ``BasicNewsRecipe`` is the :term:`recipe` for *The New York Times*

-.. literalinclude:: ../web/feeds/recipes/nytimes.py
+.. code-block:: python
    :linenos:

+    import string, re
+    from calibre import strftime
+    from calibre.web.feeds.recipes import BasicNewsRecipe
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+    class NYTimes(BasicNewsRecipe):
+
+        title = 'The New York Times'
+        __author__ = 'Kovid Goyal'
+        description = 'Daily news from the New York Times'
+        timefmt = ' [%a, %d %b, %Y]'
+        needs_subscription = True
+        remove_tags_before = dict(id='article')
+        remove_tags_after = dict(id='article')
+        remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
+                       dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+                       dict(name=['script', 'noscript', 'style'])]
+        encoding = 'cp1252'
+        no_stylesheets = True
+        extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+
+        def get_browser(self):
+            br = BasicNewsRecipe.get_browser()
+            if self.username is not None and self.password is not None:
+                br.open('http://www.nytimes.com/auth/login')
+                br.select_form(name='login')
+                br['USERID'] = self.username
+                br['PASSWORD'] = self.password
+                br.submit()
+            return br
+
+        def parse_index(self):
+            soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+            def feed_title(div):
+                return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+            articles = {}
+            key = None
+            ans = []
+            for div in soup.findAll(True,
+                    attrs={'class':['section-headline', 'story', 'story headline']}):
+
+                if div['class'] == 'section-headline':
+                    key = string.capwords(feed_title(div))
+                    articles[key] = []
+                    ans.append(key)
+
+                elif div['class'] in ['story', 'story headline']:
+                    a = div.find('a', href=True)
+                    if not a:
+                        continue
+                    url = re.sub(r'\?.*', '', a['href'])
+                    url += '?pagewanted=all'
+                    title = self.tag_to_string(a, use_alt=True).strip()
+                    description = ''
+                    pubdate = strftime('%a, %d %b')
+                    summary = div.find(True, attrs={'class':'summary'})
+                    if summary:
+                        description = self.tag_to_string(summary, use_alt=False)
+
+                    feed = key if key is not None else 'Uncategorized'
+                    if not articles.has_key(feed):
+                        articles[feed] = []
+                    if not 'podcasts' in url:
+                        articles[feed].append(
+                            dict(title=title, url=url, date=pubdate,
+                                 description=description,
+                                 content=''))
+            ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
+            ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+            return ans
+
+        def preprocess_html(self, soup):
+            refresh = soup.find('meta', {'http-equiv':'refresh'})
+            if refresh is None:
+                return soup
+            content = refresh.get('content').partition('=')[2]
+            raw = self.browser.open('http://www.nytimes.com'+content).read()
+            return BeautifulSoup(raw.decode('cp1252', 'replace'))
+
 We see several new features in this :term:`recipe`. First, we have::

@@ -164,12 +245,14 @@ The next interesting feature is::

     needs_subscription = True
     ...
-    def get_growser(self):
+    def get_browser(self):
     ...

 ``needs_subscription = True`` tells |app| that this recipe needs a username and password in order to access the content. This causes, |app| to ask for a username and password whenever you try to use this recipe. The code in :meth:`calibre.web.feeds.news.BasicNewsRecipe.get_browser` actually does the login into the NYT website. Once logged in, |app| will use the same, logged in, browser instance to fetch all content. See `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_ to understand the code in ``get_browser``.

-The last new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *todays* paper. While more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the days paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.
+The next new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *todays* paper. While more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the days paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.

+The final new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.preprocess_html` method. It can be used to perform arbitrary transformations on every downloaded HTML page. Here it is used to bypass the ads that the nytimes shows you before each article.
+
 Tips for developing new recipes
 ---------------------------------
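For orientation (not part of the commit): both the recipe shown in the documentation above and the new mobile recipe later in this diff return the same structure from ``parse_index`` -- a list of ``(feed title, list of article dictionaries)`` tuples. A minimal sketch with made-up values:

    # Illustrative sketch of the value parse_index() is expected to return,
    # inferred from the recipes in this commit; the data is invented.
    def parse_index(self):
        articles = [{
            'title'      : 'Example headline',
            'url'        : 'http://www.nytimes.com/example?pagewanted=all',
            'date'       : 'Sat, 01 Mar',
            'description': 'One sentence summary of the story',
            'content'    : '',
        }]
        return [('The Front Page', articles)]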
@@ -155,7 +155,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):

     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)
-    recipe.download()
+    recipe.download(for_lrf=True)

     return recipe

@@ -59,6 +59,9 @@ class BasicNewsRecipe(object, LoggingInterface):
     #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
     simultaneous_downloads = 5

+    #: If False the remote server is contacted by only one thread at a time
+    multithreaded_fetch = False
+
     #: Timeout for fetching files from server in seconds
     timeout = 120.0

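A rough sketch (not from the diff) of how a recipe might opt in to the new attribute; the class name and title are invented, only ``multithreaded_fetch`` and ``simultaneous_downloads`` come from ``BasicNewsRecipe``:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleParallelRecipe(BasicNewsRecipe):
        title = 'Example'
        simultaneous_downloads = 5
        # Allow several worker threads to contact the server at the same time;
        # each thread then gets its own browser instance (see _fetch_article below).
        multithreaded_fetch = True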
@@ -108,7 +111,7 @@ class BasicNewsRecipe(object, LoggingInterface):

     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
     #: It will be inserted into `<style>` tags, just before the closing
-    #: `</head>` tag thereby overrinding all :term:`CSS` except that which is
+    #: `</head>` tag thereby overriding all :term:`CSS` except that which is
     #: declared using the style attribute on individual :term:`HTML` tags.
     #: For example::
     #:
@@ -272,7 +275,15 @@ class BasicNewsRecipe(object, LoggingInterface):
         raise NotImplementedError

     @classmethod
-    def get_browser(self):
+    def image_url_processor(cls, baseurl, url):
+        '''
+        Perform some processing on image urls (perhaps removing size restrictions for
+        dynamically generated images, etc.) and return the precessed URL.
+        '''
+        return url
+
+    @classmethod
+    def get_browser(cls, *args, **kwargs):
         '''
         Return a browser instance used to fetch documents from the web. By default
         it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
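Not part of the commit: a sketch of how a recipe could override the new ``image_url_processor`` hook to strip size-limiting query parameters from image URLs. The class, title and regular expression are illustrative; the NYTimesMobile recipe later in this diff does essentially the same thing:

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleImageFixRecipe(BasicNewsRecipe):
        title = 'Example'

        def image_url_processor(self, baseurl, url):
            # Drop everything from the first query separator onwards so the
            # full-size image is fetched instead of a scaled-down one.
            return re.sub(r'[?&].*', '', url)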
@@ -294,7 +305,7 @@ class BasicNewsRecipe(object, LoggingInterface):
             return br

         '''
-        return browser()
+        return browser(*args, **kwargs)

     def get_article_url(self, article):
         '''
@@ -338,7 +349,7 @@ class BasicNewsRecipe(object, LoggingInterface):
         '''
         pass

-    def index_to_soup(self, url_or_raw):
+    def index_to_soup(self, url_or_raw, raw=False):
         '''
         Convenience method that takes an URL to the index page and returns
         a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@@ -354,6 +365,8 @@ class BasicNewsRecipe(object, LoggingInterface):
                 raise RuntimeError('Could not fetch index from %s'%url_or_raw)
         else:
             raw = url_or_raw
+        if raw:
+            return raw
         if not isinstance(raw, unicode) and self.encoding:
             raw = raw.decode(self.encoding)
         massage = list(BeautifulSoup.MARKUP_MASSAGE)
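A sketch (not from the diff) of the pattern the new ``raw`` parameter enables, as used by the mobile recipe further down: fetch the index page and hand it to lxml instead of BeautifulSoup. The class, title and XPath are illustrative:

    from lxml import html
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleLxmlRecipe(BasicNewsRecipe):
        title = 'Example'

        def parse_index(self):
            # raw=True returns the fetched content without building a BeautifulSoup tree
            raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
            root = html.fromstring(raw)
            articles = [{'title': a.text.strip(), 'url': a.get('href'),
                         'date': '', 'description': '', 'content': ''}
                        for a in root.xpath('//a[@accesskey]') if a.text]
            return [('Latest news', articles)]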
@@ -524,7 +537,7 @@ class BasicNewsRecipe(object, LoggingInterface):
         return self.postprocess_html(soup, first_fetch)


-    def download(self):
+    def download(self, for_lrf=False):
         '''
         Download and pre-process all articles from the feeds in this recipe.
         This method should be called only one on a particular Recipe instance.
@@ -622,11 +635,14 @@ class BasicNewsRecipe(object, LoggingInterface):
         return logger, out

     def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
-        self.web2disk_options.browser = self.browser
+        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
         fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
+        fetcher.image_url_processor = self.image_url_processor
+        if self.multithreaded_fetch:
+            fetcher.browser_lock = fetcher.DUMMY_LOCK
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
             raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
@@ -1,87 +1,105 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-nytimes.com
-'''
-import string, re
-from calibre import strftime
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-class NYTimes(BasicNewsRecipe):
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+'''
+mobile.nytimes.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from lxml import html
+
+class NYTimesMobile(BasicNewsRecipe):

     title = 'The New York Times'
     __author__ = 'Kovid Goyal'
     description = 'Daily news from the New York Times'
     timefmt = ' [%a, %d %b, %Y]'
-    needs_subscription = True
-    remove_tags_before = dict(id='article')
-    remove_tags_after = dict(id='article')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
-                   dict(name=['script', 'noscript', 'style'])]
-    encoding = 'cp1252'
+    multithreaded_fetch = True
+    max_articles_per_feed = 15
     no_stylesheets = True
-    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+    extra_css = '''
+        .h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
+        .h2 { font-size: large; font-weight: bold }
+        .credit { font-size: small }
+        .aut { font-weight: bold }
+        .bodycontent { font-family: serif }
+    '''
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
+        dict(name='a', href='/main')
+    ]
+    remove_tags_after = [
+        dict(name='a', attrs={'name': 'bottom'})
+    ]
+
+    def image_url_processor(self, baseurl, url):
+        return re.sub(r'(&amp;|&).*', '', url)

     def get_browser(self):
-        br = BasicNewsRecipe.get_browser()
-        if self.username is not None and self.password is not None:
-            br.open('http://www.nytimes.com/auth/login')
-            br.select_form(name='login')
-            br['USERID'] = self.username
-            br['PASSWORD'] = self.password
-            br.submit()
-        return br
+        return BasicNewsRecipe.get_browser(mobile_browser=True)
+
+    def download(self, for_lrf=False):
+        if for_lrf:
+            self.max_articles_per_feed = 10
+        return BasicNewsRecipe.download(self, for_lrf=for_lrf)
+
+    def process_section(self, href):
+        raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
+        articles = []
+        while True:
+            root = html.fromstring(raw)
+            for art in self.find_articles(root):
+                append = True
+                for x in articles:
+                    if x['title'] == art['title']:
+                        append = False
+                        break
+                if append: articles.append(art)
+            more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
+            if not more:
+                break
+            href = more[0].get('href')
+            raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
+        return articles
+
+    def find_articles(self, root):
+        for a in root.xpath('//a[@accesskey]'):
+            href = a.get('href')
+            yield {
+                'title': a.text.strip(),
+                'date' : '',
+                'url' : 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
+                'description': '',
+            }

     def parse_index(self):
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
-
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        for div in soup.findAll(True,
-                attrs={'class':['section-headline', 'story', 'story headline']}):
-
-            if div['class'] == 'section-headline':
-                key = string.capwords(feed_title(div))
-                articles[key] = []
-                ans.append(key)
-
-            elif div['class'] in ['story', 'story headline']:
-                a = div.find('a', href=True)
-                if not a:
-                    continue
-                url = re.sub(r'\?.*', '', a['href'])
-                url += '?pagewanted=all'
-                title = self.tag_to_string(a, use_alt=True).strip()
-                description = ''
-                pubdate = strftime('%a, %d %b')
-                summary = div.find(True, attrs={'class':'summary'})
-                if summary:
-                    description = self.tag_to_string(summary, use_alt=False)
-
-                feed = key if key is not None else 'Uncategorized'
-                if not articles.has_key(feed):
-                    articles[feed] = []
-                if not 'podcasts' in url:
-                    articles[feed].append(
-                        dict(title=title, url=url, date=pubdate,
-                             description=description,
-                             content=''))
-        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
-
-    def preprocess_html(self, soup):
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
+        root = html.fromstring(raw)
+        feeds = [('Latest news', list(self.find_articles(root)))]
+
+        for a in root.xpath('//a[starts-with(@href, "section")]'):
+            title = a.text.replace('&raquo;', '').replace(u'\xbb', '').strip()
+            print 'Processing section:', title
+            articles = self.process_section(a.get('href'))
+            feeds.append((title, articles))
+
+        return feeds
+
+    def postprocess_html(self, soup, first_fetch):
+        for img in soup.findAll('img', width=True):
+            try:
+                width = int(img['width'].replace('px', ''))
+                if width < 5:
+                    img.extract()
+                    continue
+            except:
+                pass
+            del img['width']
+            del img['height']
+            del img.parent['style']
+        return soup
@@ -72,6 +72,11 @@ class response(str):
         str.__init__(self, *args)
         self.newurl = None

+class DummyLock(object):
+
+    def __enter__(self, *args): return self
+    def __exit__(self, *args): pass
+
 class RecursiveFetcher(object, LoggingInterface):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
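For context (not part of the diff): ``DummyLock`` satisfies the ``with`` protocol but provides no mutual exclusion, so the fetcher can always take ``self.browser_lock``; when each thread owns its own browser instance there is nothing to serialize. A self-contained sketch of that behaviour:

    class DummyLock(object):
        def __enter__(self, *args): return self
        def __exit__(self, *args): pass

    # Behaves like a lock in a with-statement, but never blocks.
    with DummyLock():
        print 'fetching without serializing on a shared browser'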
@@ -82,6 +87,7 @@ class RecursiveFetcher(object, LoggingInterface):
     # )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
+    DUMMY_LOCK = DummyLock()

     def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         LoggingInterface.__init__(self, logger)
@@ -103,6 +109,8 @@ class RecursiveFetcher(object, LoggingInterface):
         self.imagemap = image_map
         self.imagemap_lock = threading.RLock()
         self.stylemap = css_map
+        self.image_url_processor = None
+        self.browser_lock = _browser_lock
         self.stylemap_lock = threading.RLock()
         self.downloaded_paths = []
         self.current_dir = self.base_dir
@@ -166,7 +174,7 @@ class RecursiveFetcher(object, LoggingInterface):
             delta = time.time() - self.last_fetch_at
             if delta < self.delay:
                 time.sleep(delta)
-            with _browser_lock:
+            with self.browser_lock:
                 try:
                     with closing(self.browser.open(url)) as f:
                         data = response(f.read()+f.read())
@@ -271,7 +279,10 @@ class RecursiveFetcher(object, LoggingInterface):
             os.mkdir(diskpath)
         c = 0
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
+            iurl = tag['src']
+            if callable(self.image_url_processor):
+                iurl = self.image_url_processor(baseurl, iurl)
+            ext = os.path.splitext(iurl)[1]
             ext = ext[:5]
             #if not ext:
             #    self.log_debug('Skipping extensionless image %s', iurl)