New recipe for The New York Times based on mobile.nytimes.com. Fixes #1281 (New York Times Recipe producing large files)

This commit is contained in:
Kovid Goyal 2008-11-21 09:26:47 -08:00
parent 5a4752fd38
commit bd1d6ca3f3
8 changed files with 223 additions and 94 deletions

View File

@ -142,7 +142,7 @@ def get_proxies():
return proxies
def browser(honor_time=True, max_time=2):
def browser(honor_time=True, max_time=2, mobile_browser=False):
'''
Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses a proxy if available.
@ -153,7 +153,8 @@ def browser(honor_time=True, max_time=2):
opener = mechanize.Browser()
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
http_proxy = get_proxies().get('http', None)
if http_proxy:
opener.set_proxies({'http':http_proxy})
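
A minimal sketch of how the new flag might be used (assuming ``browser`` remains importable from the top-level ``calibre`` package, as in this file; the URL is illustrative):

    from calibre import browser

    # Default: desktop Firefox user agent, behaviour unchanged
    desktop = browser()

    # New: mobile (Minimo/Windows CE) user agent, so sites such as
    # mobile.nytimes.com serve their lightweight pages
    mobile = browser(mobile_browser=True)
    raw = mobile.open('http://mobile.nytimes.com').read()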

View File

@ -93,18 +93,18 @@ class HTMLProcessor(Processor, Rationalizer):
p = QPixmap()
p.load(path)
if not p.isNull():
p.save(path+'.jpg')
p.save(path+'_calibre_converted.jpg')
os.remove(path)
for key, val in self.resource_map.items():
if val == rpath:
self.resource_map[key] = rpath+'.jpg'
img.set('src', rpath+'.jpg')
self.resource_map[key] = rpath+'_calibre_converted.jpg'
img.set('src', rpath+'_calibre_converted.jpg')
def save(self):
for meta in list(self.root.xpath('//meta')):
meta.getparent().remove(meta)
for img in self.root.xpath('//img[@src]'):
self.convert_image(img)
#for img in self.root.xpath('//img[@src]'):
# self.convert_image(img)
Processor.save(self)

View File

@ -837,7 +837,7 @@ class Main(MainWindow, Ui_MainWindow):
self.job_exception(job)
return
mi = get_metadata(open(pt.name, 'rb'), fmt, use_libprs_metadata=False)
mi.tags = ['news', recipe.title]
mi.tags = [_('News'), recipe.title]
paths, formats, metadata = [pt.name], [fmt], [mi]
self.library_view.model().add_books(paths, formats, metadata, add_duplicates=True)
callback(recipe)

View File

@ -142,8 +142,89 @@ Real life example
A reasonably complex real life example that exposes more of the :term:`API` of ``BasicNewsRecipe`` is the :term:`recipe` for *The New York Times*
.. literalinclude:: ../web/feeds/recipes/nytimes.py
:linenos:
.. code-block:: python
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class NYTimes(BasicNewsRecipe):
title = 'The New York Times'
__author__ = 'Kovid Goyal'
description = 'Daily news from the New York Times'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
no_stylesheets = True
extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def parse_index(self):
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
ans = []
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
articles[key] = []
ans.append(key)
elif div['class'] in ['story', 'story headline']:
a = div.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
if not 'podcasts' in url:
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
We see several new features in this :term:`recipe`. First, we have::
@ -164,12 +245,14 @@ The next interesting feature is::
needs_subscription = True
...
def get_growser(self):
def get_browser(self):
...
``needs_subscription = True`` tells |app| that this recipe needs a username and password in order to access the content. This causes |app| to ask for a username and password whenever you try to use this recipe. The code in :meth:`calibre.web.feeds.news.BasicNewsRecipe.get_browser` actually logs into the NYT website. Once logged in, |app| uses the same logged-in browser instance to fetch all content. See `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_ to understand the code in ``get_browser``.
The last new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *today's* paper. While this is more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the day's paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.
The next new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *today's* paper. While this is more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the day's paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.
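For reference, ``parse_index`` must return a list of ``(feed title, list of articles)`` tuples, where each article is a dictionary with ``title``, ``url``, ``date``, ``description`` and ``content`` keys. A minimal sketch of the expected shape (the values are illustrative, not taken from the site)::

    [
        ('The Front Page', [
            {'title'      : 'An example headline',
             'url'        : 'http://www.nytimes.com/2008/11/21/example.html?pagewanted=all',
             'date'       : 'Fri, 21 Nov',
             'description': 'One line summary taken from the summary div.',
             'content'    : ''},
        ]),
    ]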
The final new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.preprocess_html` method. It can be used to perform arbitrary transformations on every downloaded HTML page. Here it is used to bypass the ads that nytimes.com shows you before each article.
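For instance (the value below is made up), ``refresh.get('content').partition('=')[2]`` extracts the redirect target from a typical meta refresh value, and the recipe then fetches that page directly::

    >>> '0; url=/2008/11/21/us/politics/21example.html'.partition('=')[2]
    '/2008/11/21/us/politics/21example.html'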
Tips for developing new recipes
---------------------------------

View File

@ -155,7 +155,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
if not os.path.exists(recipe.output_dir):
os.makedirs(recipe.output_dir)
recipe.download()
recipe.download(for_lrf=True)
return recipe

View File

@ -59,6 +59,9 @@ class BasicNewsRecipe(object, LoggingInterface):
#: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
simultaneous_downloads = 5
#: If False the remote server is contacted by only one thread at a time
multithreaded_fetch = False
#: Timeout for fetching files from server in seconds
timeout = 120.0
@ -108,7 +111,7 @@ class BasicNewsRecipe(object, LoggingInterface):
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files.
#: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overrinding all :term:`CSS` except that which is
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
#: declared using the style attribute on individual :term:`HTML` tags.
#: For example::
#:
@ -272,7 +275,15 @@ class BasicNewsRecipe(object, LoggingInterface):
raise NotImplementedError
@classmethod
def get_browser(self):
def image_url_processor(cls, baseurl, url):
'''
Perform some processing on image URLs (perhaps removing size restrictions for
dynamically generated images, etc.) and return the processed URL.
'''
return url
@classmethod
def get_browser(cls, *args, **kwargs):
'''
Return a browser instance used to fetch documents from the web. By default
it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
@ -294,7 +305,7 @@ class BasicNewsRecipe(object, LoggingInterface):
return br
'''
return browser()
return browser(*args, **kwargs)
def get_article_url(self, article):
'''
@ -338,7 +349,7 @@ class BasicNewsRecipe(object, LoggingInterface):
'''
pass
def index_to_soup(self, url_or_raw):
def index_to_soup(self, url_or_raw, raw=False):
'''
Convenience method that takes a URL to the index page and returns
a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@ -354,6 +365,8 @@ class BasicNewsRecipe(object, LoggingInterface):
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
raw = url_or_raw
if raw:
return raw
if not isinstance(raw, unicode) and self.encoding:
raw = raw.decode(self.encoding)
massage = list(BeautifulSoup.MARKUP_MASSAGE)
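
The new ``raw`` parameter lets a recipe skip BeautifulSoup and parse the page with another library; the mobile NYT recipe below uses it with lxml. A hedged sketch of that pattern, inside a recipe method:

    from lxml import html

    # raw=True returns the undecoded page source instead of a soup
    raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
    root = html.fromstring(raw)
    section_links = root.xpath('//a[starts-with(@href, "section")]')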
@ -524,7 +537,7 @@ class BasicNewsRecipe(object, LoggingInterface):
return self.postprocess_html(soup, first_fetch)
def download(self):
def download(self, for_lrf=False):
'''
Download and pre-process all articles from the feeds in this recipe.
This method should be called only once on a particular Recipe instance.
@ -622,11 +635,14 @@ class BasicNewsRecipe(object, LoggingInterface):
return logger, out
def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
self.web2disk_options.browser = self.browser
self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
fetcher.image_url_processor = self.image_url_processor
if self.multithreaded_fetch:
fetcher.browser_lock = fetcher.DUMMY_LOCK
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
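
Putting the new hooks together, a recipe can opt into per-thread browsers and image URL rewriting like this (a hedged sketch; the class name, the regular expression and the use of ``mobile_browser`` are illustrative):

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        # Give every worker thread its own browser instead of sharing one
        multithreaded_fetch = True

        def get_browser(self):
            # Extra arguments are now forwarded to calibre's browser() helper
            return BasicNewsRecipe.get_browser(mobile_browser=True)

        def image_url_processor(self, baseurl, url):
            # Rewrite image URLs before they are fetched, e.g. to drop
            # query-string size restrictions
            return re.sub(r'\?.*$', '', url)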

View File

@ -1,87 +1,105 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
class NYTimes(BasicNewsRecipe):
'''
mobile.nytimes.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html
class NYTimesMobile(BasicNewsRecipe):
title = 'The New York Times'
__author__ = 'Kovid Goyal'
description = 'Daily news from the New York Times'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'cp1252'
timefmt = ' [%a, %d %b, %Y]'
multithreaded_fetch = True
max_articles_per_feed = 15
no_stylesheets = True
extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
extra_css = '''
.h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
.h2 { font-size: large; font-weight: bold }
.credit { font-size: small }
.aut { font-weight: bold }
.bodycontent { font-family: serif }
'''
remove_tags = [
dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
dict(name='a', href='/main')
]
remove_tags_after = [
dict(name='a', attrs={'name': 'bottom'})
]
def image_url_processor(self, baseurl, url):
return re.sub(r'(&|&amp;).*', '', url)
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
return BasicNewsRecipe.get_browser(mobile_browser=True)
def download(self, for_lrf=False):
if for_lrf:
self.max_articles_per_feed = 10
return BasicNewsRecipe.download(self, for_lrf=for_lrf)
def process_section(self, href):
raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
articles = []
while True:
root = html.fromstring(raw)
for art in self.find_articles(root):
append = True
for x in articles:
if x['title'] == art['title']:
append = False
break
if append: articles.append(art)
more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
if not more:
break
href = more[0].get('href')
raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
return articles
def find_articles(self, root):
for a in root.xpath('//a[@accesskey]'):
href = a.get('href')
yield {
'title': a.text.strip(),
'date' : '',
'url' : 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
'description': '',
}
def parse_index(self):
soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
ans = []
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline']}):
raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
root = html.fromstring(raw)
feeds = [('Latest news', list(self.find_articles(root)))]
if div['class'] == 'section-headline':
key = string.capwords(feed_title(div))
articles[key] = []
ans.append(key)
for a in root.xpath('//a[starts-with(@href, "section")]'):
title = a.text.replace('&raquo;', '').replace(u'\xbb', '').strip()
print 'Processing section:', title
articles = self.process_section(a.get('href'))
feeds.append((title, articles))
elif div['class'] in ['story', 'story headline']:
a = div.find('a', href=True)
if not a:
continue
url = re.sub(r'\?.*', '', a['href'])
url += '?pagewanted=all'
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
articles[feed] = []
if not 'podcasts' in url:
articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description,
content=''))
ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
return feeds
def preprocess_html(self, soup):
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('http://www.nytimes.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def postprocess_html(self, soup, first_fetch):
for img in soup.findAll('img', width=True):
try:
width = int(img['width'].replace('px', ''))
if width < 5:
img.extract()
continue
except:
pass
del img['width']
del img['height']
del img.parent['style']
return soup
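
As an illustration of this recipe's ``image_url_processor`` (the image URL below is made up), the substitution drops everything from the first ``&`` onwards, removing the size parameters:

    >>> import re
    >>> re.sub(r'(&|&amp;).*', '', 'http://graphics8.nytimes.com/images/example.jpg&f=90&q=75')
    'http://graphics8.nytimes.com/images/example.jpg'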

View File

@ -71,6 +71,11 @@ class response(str):
def __init__(self, *args):
str.__init__(self, *args)
self.newurl = None
class DummyLock(object):
def __enter__(self, *args): return self
def __exit__(self, *args): pass
class RecursiveFetcher(object, LoggingInterface):
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
@ -82,6 +87,7 @@ class RecursiveFetcher(object, LoggingInterface):
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
DUMMY_LOCK = DummyLock()
def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
LoggingInterface.__init__(self, logger)
@ -103,6 +109,8 @@ class RecursiveFetcher(object, LoggingInterface):
self.imagemap = image_map
self.imagemap_lock = threading.RLock()
self.stylemap = css_map
self.image_url_processor = None
self.browser_lock = _browser_lock
self.stylemap_lock = threading.RLock()
self.downloaded_paths = []
self.current_dir = self.base_dir
@ -166,7 +174,7 @@ class RecursiveFetcher(object, LoggingInterface):
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)
with _browser_lock:
with self.browser_lock:
try:
with closing(self.browser.open(url)) as f:
data = response(f.read()+f.read())
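
The no-op lock keeps this fetch code identical for both modes: with a shared browser the real lock serializes requests across threads, while with per-thread browsers (``multithreaded_fetch = True``) the dummy lock makes the ``with`` block free. A standalone sketch of the idea (illustrative, not calibre code):

    import threading

    class DummyLock(object):
        def __enter__(self, *args): return self
        def __exit__(self, *args): pass

    shared_lock = threading.RLock()   # one browser shared by every thread
    no_lock = DummyLock()             # each thread owns its browser

    for lock in (shared_lock, no_lock):
        with lock:
            pass  # self.browser.open(url) would go here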
@ -271,8 +279,11 @@ class RecursiveFetcher(object, LoggingInterface):
os.mkdir(diskpath)
c = 0
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
ext = ext[:5]
iurl = tag['src']
if callable(self.image_url_processor):
iurl = self.image_url_processor(baseurl, iurl)
ext = os.path.splitext(iurl)[1]
ext = ext[:5]
#if not ext:
# self.log_debug('Skipping extensionless image %s', iurl)
# continue