diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 5c6f43b09d..0e379ce9d1 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -142,7 +142,7 @@ def get_proxies():
return proxies
-def browser(honor_time=True, max_time=2):
+def browser(honor_time=True, max_time=2, mobile_browser=False):
'''
Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses a proxy if available.
@@ -153,7 +153,8 @@ def browser(honor_time=True, max_time=2):
opener = mechanize.Browser()
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False)
- opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
+ opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
+ 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
http_proxy = get_proxies().get('http', None)
if http_proxy:
opener.set_proxies({'http':http_proxy})
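
A minimal usage sketch of the new ``mobile_browser`` flag (hypothetical call site, not part of this change): when the flag is set, the browser presents a Minimo (mobile Firefox) user agent so that servers return their lightweight mobile pages.

.. code-block:: python

    from calibre import browser

    # Identify as a mobile client instead of desktop Firefox.
    br = browser(mobile_browser=True)
    raw = br.open('http://example.com').read()
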
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index b47e8cd06d..a5e5f51004 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -93,18 +93,18 @@ class HTMLProcessor(Processor, Rationalizer):
p = QPixmap()
p.load(path)
if not p.isNull():
- p.save(path+'.jpg')
+ p.save(path+'_calibre_converted.jpg')
os.remove(path)
for key, val in self.resource_map.items():
if val == rpath:
- self.resource_map[key] = rpath+'.jpg'
- img.set('src', rpath+'.jpg')
+ self.resource_map[key] = rpath+'_calibre_converted.jpg'
+ img.set('src', rpath+'_calibre_converted.jpg')
def save(self):
for meta in list(self.root.xpath('//meta')):
meta.getparent().remove(meta)
- for img in self.root.xpath('//img[@src]'):
- self.convert_image(img)
+ #for img in self.root.xpath('//img[@src]'):
+ # self.convert_image(img)
Processor.save(self)
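
A standalone sketch of the renaming scheme used above (illustrative only; the calibre code uses ``QPixmap``, while PIL is used here so the snippet runs without a Qt application). The converted file gets a recognisable ``_calibre_converted.jpg`` suffix and every resource-map entry that pointed at the old file is repointed to the new one.

.. code-block:: python

    import os
    from PIL import Image

    def convert_to_jpeg(path, rpath, resource_map):
        new_path = path + '_calibre_converted.jpg'
        new_rpath = rpath + '_calibre_converted.jpg'
        # Re-encode the image as JPEG under the new name and drop the original.
        Image.open(path).convert('RGB').save(new_path, 'JPEG')
        os.remove(path)
        # Update every reference to the old relative path.
        for key, val in resource_map.items():
            if val == rpath:
                resource_map[key] = new_rpath
        return new_rpath
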
diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py
index 116ec4f957..2dafee08c1 100644
--- a/src/calibre/gui2/main.py
+++ b/src/calibre/gui2/main.py
@@ -837,7 +837,7 @@ class Main(MainWindow, Ui_MainWindow):
self.job_exception(job)
return
mi = get_metadata(open(pt.name, 'rb'), fmt, use_libprs_metadata=False)
- mi.tags = ['news', recipe.title]
+ mi.tags = [_('News'), recipe.title]
paths, formats, metadata = [pt.name], [fmt], [mi]
self.library_view.model().add_books(paths, formats, metadata, add_duplicates=True)
callback(recipe)
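
The switch from the hard-coded ``'news'`` tag to ``_('News')`` makes the tag translatable. A rough illustration of the idea with plain ``gettext`` (not calibre's actual i18n plumbing):

.. code-block:: python

    import gettext

    # Look up the 'calibre' message catalogue; fall back to an identity
    # translation if none is installed for the current locale.
    _ = gettext.translation('calibre', fallback=True).gettext

    # The tag now shows up in the user's language where a translation exists.
    tags = [_('News'), 'The New York Times']
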
diff --git a/src/calibre/manual/news.rst b/src/calibre/manual/news.rst
index 5b25e355b4..871b0beb09 100644
--- a/src/calibre/manual/news.rst
+++ b/src/calibre/manual/news.rst
@@ -142,8 +142,89 @@ Real life example
A reasonably complex real life example that exposes more of the :term:`API` of ``BasicNewsRecipe`` is the :term:`recipe` for *The New York Times*
-.. literalinclude:: ../web/feeds/recipes/nytimes.py
- :linenos:
+.. code-block:: python
+
+ import string, re
+ from calibre import strftime
+ from calibre.web.feeds.recipes import BasicNewsRecipe
+ from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+ class NYTimes(BasicNewsRecipe):
+
+ title = 'The New York Times'
+ __author__ = 'Kovid Goyal'
+ description = 'Daily news from the New York Times'
+ timefmt = ' [%a, %d %b, %Y]'
+ needs_subscription = True
+ remove_tags_before = dict(id='article')
+ remove_tags_after = dict(id='article')
+ remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
+ dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+ dict(name=['script', 'noscript', 'style'])]
+ encoding = 'cp1252'
+ no_stylesheets = True
+ extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser()
+ if self.username is not None and self.password is not None:
+ br.open('http://www.nytimes.com/auth/login')
+ br.select_form(name='login')
+ br['USERID'] = self.username
+ br['PASSWORD'] = self.password
+ br.submit()
+ return br
+
+ def parse_index(self):
+ soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+ def feed_title(div):
+ return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+ articles = {}
+ key = None
+ ans = []
+ for div in soup.findAll(True,
+ attrs={'class':['section-headline', 'story', 'story headline']}):
+
+ if div['class'] == 'section-headline':
+ key = string.capwords(feed_title(div))
+ articles[key] = []
+ ans.append(key)
+
+ elif div['class'] in ['story', 'story headline']:
+ a = div.find('a', href=True)
+ if not a:
+ continue
+ url = re.sub(r'\?.*', '', a['href'])
+ url += '?pagewanted=all'
+ title = self.tag_to_string(a, use_alt=True).strip()
+ description = ''
+ pubdate = strftime('%a, %d %b')
+ summary = div.find(True, attrs={'class':'summary'})
+ if summary:
+ description = self.tag_to_string(summary, use_alt=False)
+
+ feed = key if key is not None else 'Uncategorized'
+ if not articles.has_key(feed):
+ articles[feed] = []
+ if not 'podcasts' in url:
+ articles[feed].append(
+ dict(title=title, url=url, date=pubdate,
+ description=description,
+ content=''))
+ ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
+ ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ return ans
+
+ def preprocess_html(self, soup):
+ refresh = soup.find('meta', {'http-equiv':'refresh'})
+ if refresh is None:
+ return soup
+ content = refresh.get('content').partition('=')[2]
+ raw = self.browser.open('http://www.nytimes.com'+content).read()
+ return BeautifulSoup(raw.decode('cp1252', 'replace'))
+
We see several new features in this :term:`recipe`. First, we have::
@@ -164,12 +245,14 @@ The next interesting feature is::
needs_subscription = True
...
- def get_growser(self):
+ def get_browser(self):
...
``needs_subscription = True`` tells |app| that this recipe needs a username and password in order to access the content. This causes |app| to ask for a username and password whenever you try to use this recipe. The code in :meth:`calibre.web.feeds.news.BasicNewsRecipe.get_browser` actually performs the login to the NYT website. Once logged in, |app| will use the same logged-in browser instance to fetch all content. See `mechanize `_ to understand the code in ``get_browser``.
-The last new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *todays* paper. While more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the days paper. ``parse_index`` makes heavy use of `BeautifulSoup `_ to parse the daily paper webpage.
+The next new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *today's* paper. While more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the day's paper. ``parse_index`` makes heavy use of `BeautifulSoup `_ to parse the daily paper webpage.
+
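To make the contract of ``parse_index`` concrete, here is a rough sketch (not taken from any real recipe) of the structure it must return: a list of ``(feed title, list of articles)`` tuples, where each article is a dictionary with ``title``, ``url``, ``date``, ``description`` and ``content`` keys, exactly as built in the NYT recipe above.

.. code-block:: python

    def parse_index(self):
        articles = [{
            'title'      : 'A sample article',
            'url'        : 'http://example.com/sample-article',
            'date'       : 'Mon, 01 Jan',
            'description': 'One line summary shown in the e-book index',
            'content'    : '',
        }]
        return [('Front Page', articles)]
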
+The final new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.preprocess_html` method. It can be used to perform arbitrary transformations on every downloaded HTML page. Here it is used to bypass the advertisement pages that the NYT website shows before each article, by following the ``<meta http-equiv="refresh">`` redirect to the actual article.
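
As a further, purely illustrative sketch (not part of the NYT recipe), ``preprocess_html`` can just as easily strip unwanted markup, for example advertisement containers, from every downloaded page:

.. code-block:: python

    def preprocess_html(self, soup):
        # Remove any <div class="advertisement"> blocks before conversion.
        for div in soup.findAll('div', attrs={'class': 'advertisement'}):
            div.extract()
        return soup
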
Tips for developing new recipes
---------------------------------
diff --git a/src/calibre/web/feeds/main.py b/src/calibre/web/feeds/main.py
index d56256de08..4ef7d89dd4 100644
--- a/src/calibre/web/feeds/main.py
+++ b/src/calibre/web/feeds/main.py
@@ -155,7 +155,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
if not os.path.exists(recipe.output_dir):
os.makedirs(recipe.output_dir)
- recipe.download()
+ recipe.download(for_lrf=True)
return recipe
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 4fd6438b89..fe621f9bfa 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -59,6 +59,9 @@ class BasicNewsRecipe(object, LoggingInterface):
#: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
simultaneous_downloads = 5
+ #: If False, the remote server is contacted by only one thread at a time
+ multithreaded_fetch = False
+
#: Timeout for fetching files from server in seconds
timeout = 120.0
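
A rough sketch of a recipe opting into the new attribute (hypothetical recipe; assumes the fetcher honours the flag). With ``multithreaded_fetch = True`` several worker threads may contact the remote server concurrently, while the default of ``False`` keeps server contact to one thread at a time:

.. code-block:: python

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example feeds'
        # Allow parallel requests to the server; leave at the default (False)
        # for sites that throttle or ban concurrent clients.
        multithreaded_fetch = True
        simultaneous_downloads = 5
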
@@ -108,7 +111,7 @@ class BasicNewsRecipe(object, LoggingInterface):
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
#: It will be inserted into `