mirror of https://github.com/kovidgoyal/calibre.git
New recipe for The New York Times based on mobile.nytimes.com. Fixes #1281 (New York Times Recipe producing large files)
This commit is contained in:
parent 5a4752fd38
commit bd1d6ca3f3
@@ -142,7 +142,7 @@ def get_proxies():
     return proxies


-def browser(honor_time=True, max_time=2):
+def browser(honor_time=True, max_time=2, mobile_browser=False):
     '''
     Create a mechanize browser for web scraping. The browser handles cookies,
     refresh requests and ignores robots.txt. Also uses proxy if avaialable.
@@ -153,7 +153,8 @@ def browser(honor_time=True, max_time=2):
     opener = mechanize.Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
-    opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
+    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
+                          'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     http_proxy = get_proxies().get('http', None)
     if http_proxy:
         opener.set_proxies({'http':http_proxy})
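Not part of the commit, but for orientation: the new ``mobile_browser`` keyword lets a caller ask for a handheld user agent so that sites such as mobile.nytimes.com serve their lightweight pages. A minimal sketch, assuming the function is importable as ``calibre.browser``:

    # Illustrative only -- not from this diff. Assumes browser() is exposed
    # at the package level as calibre.browser.
    from calibre import browser

    desktop = browser()                    # default Firefox user agent
    mobile  = browser(mobile_browser=True) # Minimo (Windows CE) user agent
    page = mobile.open('http://mobile.nytimes.com').read()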
@@ -93,18 +93,18 @@ class HTMLProcessor(Processor, Rationalizer):
             p = QPixmap()
             p.load(path)
             if not p.isNull():
-                p.save(path+'.jpg')
+                p.save(path+'_calibre_converted.jpg')
                 os.remove(path)
                 for key, val in self.resource_map.items():
                     if val == rpath:
-                        self.resource_map[key] = rpath+'.jpg'
-            img.set('src', rpath+'.jpg')
+                        self.resource_map[key] = rpath+'_calibre_converted.jpg'
+            img.set('src', rpath+'_calibre_converted.jpg')

     def save(self):
         for meta in list(self.root.xpath('//meta')):
             meta.getparent().remove(meta)
-        for img in self.root.xpath('//img[@src]'):
-            self.convert_image(img)
+        #for img in self.root.xpath('//img[@src]'):
+        #    self.convert_image(img)
         Processor.save(self)


@@ -837,7 +837,7 @@ class Main(MainWindow, Ui_MainWindow):
             self.job_exception(job)
             return
         mi = get_metadata(open(pt.name, 'rb'), fmt, use_libprs_metadata=False)
-        mi.tags = ['news', recipe.title]
+        mi.tags = [_('News'), recipe.title]
         paths, formats, metadata = [pt.name], [fmt], [mi]
         self.library_view.model().add_books(paths, formats, metadata, add_duplicates=True)
         callback(recipe)
@@ -142,8 +142,89 @@ Real life example

 A reasonably complex real life example that exposes more of the :term:`API` of ``BasicNewsRecipe`` is the :term:`recipe` for *The New York Times*

-.. literalinclude:: ../web/feeds/recipes/nytimes.py
+.. code-block:: python
    :linenos:

+    import string, re
+    from calibre import strftime
+    from calibre.web.feeds.recipes import BasicNewsRecipe
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+    class NYTimes(BasicNewsRecipe):
+
+        title = 'The New York Times'
+        __author__ = 'Kovid Goyal'
+        description = 'Daily news from the New York Times'
+        timefmt = ' [%a, %d %b, %Y]'
+        needs_subscription = True
+        remove_tags_before = dict(id='article')
+        remove_tags_after = dict(id='article')
+        remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
+                       dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+                       dict(name=['script', 'noscript', 'style'])]
+        encoding = 'cp1252'
+        no_stylesheets = True
+        extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+
+        def get_browser(self):
+            br = BasicNewsRecipe.get_browser()
+            if self.username is not None and self.password is not None:
+                br.open('http://www.nytimes.com/auth/login')
+                br.select_form(name='login')
+                br['USERID'] = self.username
+                br['PASSWORD'] = self.password
+                br.submit()
+            return br
+
+        def parse_index(self):
+            soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+            def feed_title(div):
+                return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+            articles = {}
+            key = None
+            ans = []
+            for div in soup.findAll(True,
+                    attrs={'class':['section-headline', 'story', 'story headline']}):
+
+                if div['class'] == 'section-headline':
+                    key = string.capwords(feed_title(div))
+                    articles[key] = []
+                    ans.append(key)
+
+                elif div['class'] in ['story', 'story headline']:
+                    a = div.find('a', href=True)
+                    if not a:
+                        continue
+                    url = re.sub(r'\?.*', '', a['href'])
+                    url += '?pagewanted=all'
+                    title = self.tag_to_string(a, use_alt=True).strip()
+                    description = ''
+                    pubdate = strftime('%a, %d %b')
+                    summary = div.find(True, attrs={'class':'summary'})
+                    if summary:
+                        description = self.tag_to_string(summary, use_alt=False)
+
+                    feed = key if key is not None else 'Uncategorized'
+                    if not articles.has_key(feed):
+                        articles[feed] = []
+                    if not 'podcasts' in url:
+                        articles[feed].append(
+                            dict(title=title, url=url, date=pubdate,
+                                 description=description,
+                                 content=''))
+            ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
+            ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+            return ans
+
+        def preprocess_html(self, soup):
+            refresh = soup.find('meta', {'http-equiv':'refresh'})
+            if refresh is None:
+                return soup
+            content = refresh.get('content').partition('=')[2]
+            raw = self.browser.open('http://www.nytimes.com'+content).read()
+            return BeautifulSoup(raw.decode('cp1252', 'replace'))
+
 We see several new features in this :term:`recipe`. First, we have::

@@ -164,12 +245,14 @@ The next interesting feature is::

     needs_subscription = True
     ...
-    def get_growser(self):
+    def get_browser(self):
     ...

 ``needs_subscription = True`` tells |app| that this recipe needs a username and password in order to access the content. This causes, |app| to ask for a username and password whenever you try to use this recipe. The code in :meth:`calibre.web.feeds.news.BasicNewsRecipe.get_browser` actually does the login into the NYT website. Once logged in, |app| will use the same, logged in, browser instance to fetch all content. See `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_ to understand the code in ``get_browser``.

-The last new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *todays* paper. While more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the days paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.
+The next new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.parse_index` method. Its job is to go to http://www.nytimes.com/pages/todayspaper/index.html and fetch the list of articles that appear in *todays* paper. While more complex than simply using :term:`RSS`, the recipe creates an e-book that corresponds very closely to the days paper. ``parse_index`` makes heavy use of `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ to parse the daily paper webpage.

+The final new feature is the :meth:`calibre.web.feeds.news.BasicNewsRecipe.preprocess_html` method. It can be used to perform arbitrary transformations on every downloaded HTML page. Here it is used to bypass the ads that the nytimes shows you before each article.
+
 Tips for developing new recipes
 ---------------------------------
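For orientation (not part of the commit): both the recipe shown in the documentation above and the new mobile recipe later in this diff return the same structure from ``parse_index`` -- a list of ``(feed title, list of article dictionaries)`` tuples. A minimal sketch with made-up values:

    # Illustrative sketch of the value parse_index() is expected to return,
    # inferred from the recipes in this commit; the data is invented.
    def parse_index(self):
        articles = [{
            'title'      : 'Example headline',
            'url'        : 'http://www.nytimes.com/example?pagewanted=all',
            'date'       : 'Sat, 01 Mar',
            'description': 'One sentence summary of the story',
            'content'    : '',
        }]
        return [('The Front Page', articles)]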
@@ -155,7 +155,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):

     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)
-    recipe.download()
+    recipe.download(for_lrf=True)

     return recipe

@@ -59,6 +59,9 @@ class BasicNewsRecipe(object, LoggingInterface):
     #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
     simultaneous_downloads = 5

+    #: If False the remote server is contacted by only one thread at a time
+    multithreaded_fetch = False
+
     #: Timeout for fetching files from server in seconds
     timeout = 120.0

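A rough sketch (not from the diff) of how a recipe might opt in to the new attribute; the class name and title are invented, only ``multithreaded_fetch`` and ``simultaneous_downloads`` come from ``BasicNewsRecipe``:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleParallelRecipe(BasicNewsRecipe):
        title = 'Example'
        simultaneous_downloads = 5
        # Allow several worker threads to contact the server at the same time;
        # each thread then gets its own browser instance (see _fetch_article below).
        multithreaded_fetch = True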
@@ -108,7 +111,7 @@ class BasicNewsRecipe(object, LoggingInterface):

     #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
     #: It will be inserted into `<style>` tags, just before the closing
-    #: `</head>` tag thereby overrinding all :term:`CSS` except that which is
+    #: `</head>` tag thereby overriding all :term:`CSS` except that which is
     #: declared using the style attribute on individual :term:`HTML` tags.
     #: For example::
     #:
@@ -272,7 +275,15 @@ class BasicNewsRecipe(object, LoggingInterface):
         raise NotImplementedError

     @classmethod
-    def get_browser(self):
+    def image_url_processor(cls, baseurl, url):
+        '''
+        Perform some processing on image urls (perhaps removing size restrictions for
+        dynamically generated images, etc.) and return the precessed URL.
+        '''
+        return url
+
+    @classmethod
+    def get_browser(cls, *args, **kwargs):
         '''
         Return a browser instance used to fetch documents from the web. By default
         it returns a `mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_
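Not part of the commit: a sketch of how a recipe could override the new ``image_url_processor`` hook to strip size-limiting query parameters from image URLs. The class, title and regular expression are illustrative; the NYTimesMobile recipe later in this diff does essentially the same thing:

    import re
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleImageFixRecipe(BasicNewsRecipe):
        title = 'Example'

        def image_url_processor(self, baseurl, url):
            # Drop everything from the first query separator onwards so the
            # full-size image is fetched instead of a scaled-down one.
            return re.sub(r'[?&].*', '', url)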
@@ -294,7 +305,7 @@ class BasicNewsRecipe(object, LoggingInterface):
             return br

         '''
-        return browser()
+        return browser(*args, **kwargs)

     def get_article_url(self, article):
         '''
@@ -338,7 +349,7 @@ class BasicNewsRecipe(object, LoggingInterface):
         '''
         pass

-    def index_to_soup(self, url_or_raw):
+    def index_to_soup(self, url_or_raw, raw=False):
         '''
         Convenience method that takes an URL to the index page and returns
         a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@@ -354,6 +365,8 @@ class BasicNewsRecipe(object, LoggingInterface):
                 raise RuntimeError('Could not fetch index from %s'%url_or_raw)
         else:
             raw = url_or_raw
+        if raw:
+            return raw
         if not isinstance(raw, unicode) and self.encoding:
             raw = raw.decode(self.encoding)
         massage = list(BeautifulSoup.MARKUP_MASSAGE)
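A sketch (not from the diff) of the pattern the new ``raw`` parameter enables, as used by the mobile recipe further down: fetch the index page and hand it to lxml instead of BeautifulSoup. The class, title and XPath are illustrative:

    from lxml import html
    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleLxmlRecipe(BasicNewsRecipe):
        title = 'Example'

        def parse_index(self):
            # raw=True returns the fetched content without building a BeautifulSoup tree
            raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
            root = html.fromstring(raw)
            articles = [{'title': a.text.strip(), 'url': a.get('href'),
                         'date': '', 'description': '', 'content': ''}
                        for a in root.xpath('//a[@accesskey]') if a.text]
            return [('Latest news', articles)]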
@@ -524,7 +537,7 @@ class BasicNewsRecipe(object, LoggingInterface):
         return self.postprocess_html(soup, first_fetch)


-    def download(self):
+    def download(self, for_lrf=False):
         '''
         Download and pre-process all articles from the feeds in this recipe.
         This method should be called only one on a particular Recipe instance.
@@ -622,11 +635,14 @@ class BasicNewsRecipe(object, LoggingInterface):
         return logger, out

     def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
-        self.web2disk_options.browser = self.browser
+        self.web2disk_options.browser = self.get_browser() if self.multithreaded_fetch else self.browser
         fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
+        fetcher.image_url_processor = self.image_url_processor
+        if self.multithreaded_fetch:
+            fetcher.browser_lock = fetcher.DUMMY_LOCK
         res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
         if not res or not os.path.exists(res):
             raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
@@ -1,87 +1,105 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-nytimes.com
-'''
-import string, re
-from calibre import strftime
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-class NYTimes(BasicNewsRecipe):
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+'''
+mobile.nytimes.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from lxml import html
+
+class NYTimesMobile(BasicNewsRecipe):

     title = 'The New York Times'
     __author__ = 'Kovid Goyal'
     description = 'Daily news from the New York Times'
     timefmt = ' [%a, %d %b, %Y]'
-    needs_subscription = True
-    remove_tags_before = dict(id='article')
-    remove_tags_after = dict(id='article')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
-                   dict(name=['script', 'noscript', 'style'])]
-    encoding = 'cp1252'
+    multithreaded_fetch = True
+    max_articles_per_feed = 15
     no_stylesheets = True
-    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+    extra_css = '''
+        .h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
+        .h2 { font-size: large; font-weight: bold }
+        .credit { font-size: small }
+        .aut { font-weight: bold }
+        .bodycontent { font-family: serif }
+    '''
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
+        dict(name='a', href='/main')
+    ]
+    remove_tags_after = [
+        dict(name='a', attrs={'name': 'bottom'})
+    ]
+
+    def image_url_processor(self, baseurl, url):
+        return re.sub(r'(&amp;|&).*', '', url)

     def get_browser(self):
-        br = BasicNewsRecipe.get_browser()
-        if self.username is not None and self.password is not None:
-            br.open('http://www.nytimes.com/auth/login')
-            br.select_form(name='login')
-            br['USERID'] = self.username
-            br['PASSWORD'] = self.password
-            br.submit()
-        return br
+        return BasicNewsRecipe.get_browser(mobile_browser=True)
+
+    def download(self, for_lrf=False):
+        if for_lrf:
+            self.max_articles_per_feed = 10
+        return BasicNewsRecipe.download(self, for_lrf=for_lrf)
+
+    def process_section(self, href):
+        raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
+        articles = []
+        while True:
+            root = html.fromstring(raw)
+            for art in self.find_articles(root):
+                append = True
+                for x in articles:
+                    if x['title'] == art['title']:
+                        append = False
+                        break
+                if append: articles.append(art)
+            more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
+            if not more:
+                break
+            href = more[0].get('href')
+            raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
+        return articles
+
+    def find_articles(self, root):
+        for a in root.xpath('//a[@accesskey]'):
+            href = a.get('href')
+            yield {
+                'title': a.text.strip(),
+                'date' : '',
+                'url' : 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
+                'description': '',
+            }

     def parse_index(self):
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
-
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        for div in soup.findAll(True,
-                attrs={'class':['section-headline', 'story', 'story headline']}):
-
-            if div['class'] == 'section-headline':
-                key = string.capwords(feed_title(div))
-                articles[key] = []
-                ans.append(key)
-
-            elif div['class'] in ['story', 'story headline']:
-                a = div.find('a', href=True)
-                if not a:
-                    continue
-                url = re.sub(r'\?.*', '', a['href'])
-                url += '?pagewanted=all'
-                title = self.tag_to_string(a, use_alt=True).strip()
-                description = ''
-                pubdate = strftime('%a, %d %b')
-                summary = div.find(True, attrs={'class':'summary'})
-                if summary:
-                    description = self.tag_to_string(summary, use_alt=False)
-
-                feed = key if key is not None else 'Uncategorized'
-                if not articles.has_key(feed):
-                    articles[feed] = []
-                if not 'podcasts' in url:
-                    articles[feed].append(
-                        dict(title=title, url=url, date=pubdate,
-                             description=description,
-                             content=''))
-        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans
-
-    def preprocess_html(self, soup):
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
+        root = html.fromstring(raw)
+        feeds = [('Latest news', list(self.find_articles(root)))]
+
+        for a in root.xpath('//a[starts-with(@href, "section")]'):
+            title = a.text.replace('&raquo;', '').replace(u'\xbb', '').strip()
+            print 'Processing section:', title
+            articles = self.process_section(a.get('href'))
+            feeds.append((title, articles))
+
+        return feeds
+
+    def postprocess_html(self, soup, first_fetch):
+        for img in soup.findAll('img', width=True):
+            try:
+                width = int(img['width'].replace('px', ''))
+                if width < 5:
+                    img.extract()
+                    continue
+            except:
+                pass
+            del img['width']
+            del img['height']
+            del img.parent['style']
+        return soup
@@ -72,6 +72,11 @@ class response(str):
         str.__init__(self, *args)
         self.newurl = None

+class DummyLock(object):
+
+    def __enter__(self, *args): return self
+    def __exit__(self, *args): pass
+
 class RecursiveFetcher(object, LoggingInterface):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
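For context (not part of the diff): ``DummyLock`` satisfies the ``with`` protocol but provides no mutual exclusion, so the fetcher can always take ``self.browser_lock``; when each thread owns its own browser instance there is nothing to serialize. A self-contained sketch of that behaviour:

    class DummyLock(object):
        def __enter__(self, *args): return self
        def __exit__(self, *args): pass

    # Behaves like a lock in a with-statement, but never blocks.
    with DummyLock():
        print 'fetching without serializing on a shared browser'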
@@ -82,6 +87,7 @@ class RecursiveFetcher(object, LoggingInterface):
     # )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
+    DUMMY_LOCK = DummyLock()

     def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         LoggingInterface.__init__(self, logger)
@@ -103,6 +109,8 @@ class RecursiveFetcher(object, LoggingInterface):
         self.imagemap = image_map
         self.imagemap_lock = threading.RLock()
         self.stylemap = css_map
+        self.image_url_processor = None
+        self.browser_lock = _browser_lock
         self.stylemap_lock = threading.RLock()
         self.downloaded_paths = []
         self.current_dir = self.base_dir
@@ -166,7 +174,7 @@ class RecursiveFetcher(object, LoggingInterface):
             delta = time.time() - self.last_fetch_at
             if delta < self.delay:
                 time.sleep(delta)
-            with _browser_lock:
+            with self.browser_lock:
                 try:
                     with closing(self.browser.open(url)) as f:
                         data = response(f.read()+f.read())
@@ -271,7 +279,10 @@ class RecursiveFetcher(object, LoggingInterface):
             os.mkdir(diskpath)
         c = 0
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
+            iurl = tag['src']
+            if callable(self.image_url_processor):
+                iurl = self.image_url_processor(baseurl, iurl)
+            ext = os.path.splitext(iurl)[1]
             ext = ext[:5]
             #if not ext:
             #    self.log_debug('Skipping extensionless image %s', iurl)