From 352ee20933bb0f3c5ea37187247ff466bd203755 Mon Sep 17 00:00:00 2001
From: Charles Haley
Date: Sat, 11 Jan 2014 08:30:52 +0100
Subject: [PATCH 1/7] Allow searching for tags in the device view. Works for
 books on the device that came from calibre or have matched a calibre book in
 the past.

---
 src/calibre/gui2/library/models.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index 45fcd0ab36..16c7060d5b 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -1104,7 +1104,8 @@ class OnDeviceSearch(SearchQueryParser): # {{{
         'format',
         'formats',
         'title',
-        'inlibrary'
+        'inlibrary',
+        'tags'
     ]
 
     def __init__(self, model):
@@ -1142,7 +1143,8 @@ class OnDeviceSearch(SearchQueryParser): # {{{
             'author': lambda x: ' & '.join(getattr(x, 'authors')).lower(),
             'collections':lambda x: ','.join(getattr(x, 'device_collections')).lower(),
             'format':lambda x: os.path.splitext(x.path)[1].lower(),
-            'inlibrary':lambda x : getattr(x, 'in_library')
+            'inlibrary':lambda x : getattr(x, 'in_library'),
+            'tags':lambda x : getattr(x, 'tags')
         }
         for x in ('author', 'format'):
             q[x+'s'] = q[x]
@@ -1169,10 +1171,12 @@ class OnDeviceSearch(SearchQueryParser): # {{{
                     else:
                         m = matchkind
 
-                    if locvalue == 'collections':
-                        vals = accessor(row).split(',')
-                    else:
-                        vals = [accessor(row)]
+                    vals = accessor(row)
+                    if not isinstance(vals, list):
+                        if locvalue == 'collections':
+                            vals = accessor(row).split(',')
+                        else:
+                            vals = [accessor(row)]
                     if _match(query, vals, m, use_primary_find_in_search=upf):
                         matches.add(index)
                         break

From 6001342b32c4f08ca940f27e2de938d89a151b97 Mon Sep 17 00:00:00 2001
From: Charles Haley
Date: Sat, 11 Jan 2014 09:23:25 +0100
Subject: [PATCH 2/7] Don't search tags when the expression is unprefixed. Some
 performance improvements and defensive code.
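
A minimal sketch of the value handling these two patches settle on
(illustrative only; Python 3 str stands in for the original basestring, and
normalize() is a made-up name for logic that is inlined in the patch):

    def normalize(vals, locvalue):
        # Device books may expose a field as a list, a comma-separated
        # string, or not at all (None).
        if vals is None:
            vals = ''
        if isinstance(vals, str):
            vals = vals.split(',') if locvalue == 'collections' else [vals]
        return vals

    assert normalize(None, 'tags') == ['']
    assert normalize('Fiction,Space Opera', 'collections') == ['Fiction', 'Space Opera']
    assert normalize(['scifi', 'space'], 'tags') == ['scifi', 'space']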
---
 src/calibre/gui2/library/models.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index 16c7060d5b..ad3fc5d0b2 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -1136,7 +1136,7 @@ class OnDeviceSearch(SearchQueryParser): # {{{
         if location not in self.USABLE_LOCATIONS:
             return set([])
         matches = set([])
-        all_locs = set(self.USABLE_LOCATIONS) - set(['all'])
+        all_locs = set(self.USABLE_LOCATIONS) - set(['all', 'tags'])
         locations = all_locs if location == 'all' else [location]
         q = {
             'title' : lambda x : getattr(x, 'title').lower(),
@@ -1144,7 +1144,7 @@ class OnDeviceSearch(SearchQueryParser): # {{{
             'collections':lambda x: ','.join(getattr(x, 'device_collections')).lower(),
             'format':lambda x: os.path.splitext(x.path)[1].lower(),
             'inlibrary':lambda x : getattr(x, 'in_library'),
-            'tags':lambda x : getattr(x, 'tags')
+            'tags':lambda x : getattr(x, 'tags', [])
         }
         for x in ('author', 'format'):
             q[x+'s'] = q[x]
@@ -1172,11 +1172,10 @@ class OnDeviceSearch(SearchQueryParser): # {{{
                         m = matchkind
 
                     vals = accessor(row)
-                    if not isinstance(vals, list):
-                        if locvalue == 'collections':
-                            vals = accessor(row).split(',')
-                        else:
-                            vals = [accessor(row)]
+                    if vals is None:
+                        vals = ''
+                    if isinstance(vals, basestring):
+                        vals = vals.split(',') if locvalue == 'collections' else [vals]
                     if _match(query, vals, m, use_primary_find_in_search=upf):
                         matches.add(index)
                         break

From 901f54e124ab4cd1c554ffc741f0a4fc79e43d43 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 12 Jan 2014 09:25:07 +0530
Subject: [PATCH 3/7] EPUB Output: Fix splitting of large HTML files removing
 all child tags from inside <pre>
 tags. Fixes #1267327 [Private
 bug](https://bugs.launchpad.net/calibre/+bug/1267327)

---
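A standalone illustration of the new guard (not the calibre transform itself;
assumes lxml is installed): only a <pre> with no element children is a
candidate for splitting, so markup inside <pre> is no longer destroyed.

    from lxml import etree

    def splittable(pre):
        # Comments and processing instructions are ignored; only real
        # child elements block the split.
        return len(tuple(pre.iterchildren(etree.Element))) == 0

    assert splittable(etree.fromstring('<pre>plain text only</pre>'))
    assert not splittable(etree.fromstring('<pre>text <b>bold</b> text</pre>'))
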
 src/calibre/ebooks/oeb/transforms/split.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py
index 01e4348b34..02215c5121 100644
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -317,13 +317,11 @@ class FlowSplitter(object):
     def split_to_size(self, tree):
         self.log.debug('\t\tSplitting...')
         root = tree.getroot()
-        # Split large <pre> tags
-        for pre in list(XPath('//h:pre')(root)):
-            text = u''.join(pre.xpath('descendant::text()'))
-            pre.text = text
-            for child in list(pre.iterchildren()):
-                pre.remove(child)
-            if len(pre.text) > self.max_flow_size*0.5:
+        # Split large <pre> tags if they contain only text
+        for pre in XPath('//h:pre')(root):
+            if len(tuple(pre.iterchildren(etree.Element))) > 0:
+                continue
+            if pre.text and len(pre.text) > self.max_flow_size*0.5:
                 self.log.debug('\t\tSplitting large <pre> tag')
                 frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
                 new_pres = []

From 25f96bd1986d376baeee835d50bbd0f2d28ab576 Mon Sep 17 00:00:00 2001
From: Kovid Goyal 
Date: Sun, 12 Jan 2014 11:11:00 +0530
Subject: [PATCH 4/7] ...

---
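The class matchers added below split the class attribute instead of testing a
substring, so only whole class names match. A hedged illustration (not recipe
code; matcher() is a made-up name):

    def matcher(x):
        return bool(x) and 'related-coverage-marginalia' in x.split()

    assert matcher('related-coverage-marginalia theme-comments')
    assert not matcher(None)                # tag with no class attribute
    assert not matcher('related-coverage')  # different class name, no match
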
 recipes/nytimes.recipe     | 5 ++++-
 recipes/nytimes_sub.recipe | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 051d40b7b4..cfdfc35236 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -222,6 +222,7 @@ class NYTimes(BasicNewsRecipe):
                                         re.compile('commentCount'),
                                         'credit'
                                         ]}),
+                    dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
                     dict(name='div', attrs={'class':re.compile('toolsList')}),  # bits
                     dict(name='div', attrs={'class':re.compile('postNavigation')}),  # bits
                     dict(name='div', attrs={'class':'tweet'}),
@@ -235,6 +236,8 @@ class NYTimes(BasicNewsRecipe):
                     dict(id=[
                             'adxLeaderboard',
                             'adxSponLink',
+                            'anchoredAd_module',
+                            'anchoredAd_spot',
                             'archive',
                             'articleExtras',
                             'articleInline',
@@ -268,7 +271,7 @@ class NYTimes(BasicNewsRecipe):
                             'related-content', #added for DealBook
                             'whats-next',
                             ]),
-                    dict(name=['script', 'noscript', 'style','form','hr', 'button'])]
+                    dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta'])]
     no_stylesheets = True
     extra_css = '''
                 .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 4527fb544e..94c03bd98b 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -222,6 +222,7 @@ class NYTimes(BasicNewsRecipe):
                                         re.compile('commentCount'),
                                         'credit'
                                         ]}),
+                    dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
                     dict(name='div', attrs={'class':re.compile('toolsList')}),  # bits
                     dict(name='div', attrs={'class':re.compile('postNavigation')}),  # bits
                     dict(name='div', attrs={'class':'tweet'}),
@@ -235,6 +236,8 @@ class NYTimes(BasicNewsRecipe):
                     dict(id=[
                             'adxLeaderboard',
                             'adxSponLink',
+                            'anchoredAd_module',
+                            'anchoredAd_spot',
                             'archive',
                             'articleExtras',
                             'articleInline',
@@ -267,7 +270,7 @@ class NYTimes(BasicNewsRecipe):
                             'related-content', #added for DealBook
                             'whats-next',
                             ]),
-                    dict(name=['script', 'noscript', 'style','form','hr', 'button'])]
+                    dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta'])]
     no_stylesheets = True
     extra_css = '''
                 .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }

From a60a80d12526da5d43af4d94fc0af3f7a825c7f7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal 
Date: Sun, 12 Jan 2014 16:15:21 +0530
Subject: [PATCH 5/7] HTML Input: Fix UTF-16/32 encoded files that are linked
 to from the parent file not being properly processed. Fixes #1268262 [convert
 html file encoded as utf-16 fail to include image files and have wrong
 order](https://bugs.launchpad.net/calibre/+bug/1268262)

---
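An illustration of the failure mode being fixed (not the calibre source; the
pattern below is a simplified stand-in for HTML_PAT). In UTF-16 every other
byte is NUL, so a bytes regex never sees '<html' and linked files were
misclassified as binary; decoding the sniffed header first, as this patch
does, makes the check work:

    import re

    SIMPLE_HTML_PAT = re.compile(br'<\s*html', re.IGNORECASE)
    raw = '<html><body>linked file</body></html>'.encode('utf-16')

    assert not SIMPLE_HTML_PAT.search(raw)   # raw UTF-16 bytes: no match
    header = raw.decode('utf-16')            # decode first, like the patch
    assert re.search(r'<\s*html', header, re.IGNORECASE)
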
 src/calibre/ebooks/html/input.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 9683837ad6..df6793b107 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -20,6 +20,7 @@ from calibre.constants import iswindows
 from calibre import unicode_path, as_unicode, replace_entities
 
 class Link(object):
+
     '''
     Represents a link in a HTML file.
     '''
@@ -73,6 +74,7 @@ class IgnoreFile(Exception):
         self.errno = errno
 
 class HTMLFile(object):
+
     '''
     Contains basic information about an HTML file. This
     includes a list of links to other files as well as
@@ -103,8 +105,14 @@ class HTMLFile(object):
 
         try:
             with open(self.path, 'rb') as f:
-                src = f.read(4096)
-                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
+                src = header = f.read(4096)
+                encoding = detect_xml_encoding(src)[1]
+                if encoding:
+                    try:
+                        header = header.decode(encoding)
+                    except ValueError:
+                        pass
+                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                 if not self.is_binary:
                     src += f.read()
         except IOError as err:
@@ -139,7 +147,6 @@ class HTMLFile(object):
     def __repr__(self):
         return str(self)
 
-
     def find_links(self, src):
         for match in self.LINK_PAT.finditer(src):
             url = None
@@ -167,7 +174,7 @@ def depth_first(root, flat, visited=set([])):
         if link.path is not None and link not in visited:
             try:
                 index = flat.index(link)
-            except ValueError: # Can happen if max_levels is used
+            except ValueError:  # Can happen if max_levels is used
                 continue
             hf = flat[index]
             if hf not in visited:
@@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
     log.info('Building file list...')
     filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                         verbose=opts.verbose,
-                        encoding=opts.input_encoding)\
-                [0 if opts.breadth_first else 1]
+                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist:

From 2d8dfc3a284bcce867b156eee9d643ad62292de2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal 
Date: Sun, 12 Jan 2014 19:29:09 +0530
Subject: [PATCH 6/7] Update NY Times some more

---
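The main structural change below replaces remove_tags_before/remove_tags_after
with a keep_only_tags whitelist. A rough sketch of the idea (not
BasicNewsRecipe code; pick_containers() is a made-up name): NYT pages now use
several different ids for the article body, and a whitelist keeps whichever
one is present:

    def pick_containers(ids_on_page):
        wanted = ('article', 'story', 'content')  # mirrors keep_only_tags below
        return [i for i in ids_on_page if i in wanted]

    assert pick_containers(['masthead', 'story', 'page-footer']) == ['story']
    assert pick_containers(['content']) == ['content']
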
 recipes/nytimes.recipe     | 185 ++++++++++++++++++------------------
 recipes/nytimes_sub.recipe | 186 ++++++++++++++++++-------------------
 2 files changed, 181 insertions(+), 190 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index cfdfc35236..cca16e891a 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -14,8 +14,8 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 
 class NYTimes(BasicNewsRecipe):
 
-    recursions=1 # set this to zero to omit Related articles lists
-    match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed
+    recursions=1  # set this to zero to omit Related articles lists
+    match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/']  # speeds up processing by preventing index page links from being followed
 
     # set getTechBlogs to True to include the technology blogs
     # set tech_oldest_article to control article age
@@ -28,12 +28,11 @@ class NYTimes(BasicNewsRecipe):
     # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
     # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
     getPopularArticles = True
-    popularPeriod = '1' # set this to the number of days to include in the measurement
+    popularPeriod = '1'  # set this to the number of days to include in the measurement
                         # e.g. 7 will get the most popular measured over the last 7 days
                         # and 30 will get the most popular measured over 30 days.
                         # you still only get up to 20 articles in each category
 
-
     # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = True
 
@@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):
 
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 100
+    use_embedded_content = False
 
     # Whether to omit duplicates of articles (typically arsing when articles are indexed in
     # more than one section). If True, only the first occurance will be downloaded.
@@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
                (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
                    ]
 
-
     if headlinesOnly:
         title='New York Times Headlines'
         description = 'Headlines from the New York Times'
@@ -155,7 +154,7 @@ class NYTimes(BasicNewsRecipe):
         earliest_date = date.today()
     else:
         earliest_date = date.today() - timedelta(days=oldest_web_article)
-    oldest_article = 365 # by default, a long time ago
+    oldest_article = 365  # by default, a long time ago
 
     __author__  = 'GRiker/Kovid Goyal/Nick Redding'
     language = 'en'
@@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):
 
     timefmt = ''
 
-    #simultaneous_downloads = 1 # no longer required to deal with ads
+    # simultaneous_downloads = 1 # no longer required to deal with ads
 
     cover_margins = (18,18,'grey99')
 
-    remove_tags_before = dict(id='article')
-    remove_tags_after  = dict(id='article')
+    keep_only_tags = dict(id=['article', 'story', 'content'])
     remove_tags = [
                     dict(attrs={'class':[
                                         'articleFooter',
@@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
                                         'entry-response module',
                                         'leftNavTabs',
                                         'metaFootnote',
+                                        'inside-story',
                                         'module box nav',
                                         'nextArticleLink',
                                         'nextArticleLink clearfix',
@@ -192,28 +191,28 @@ class NYTimes(BasicNewsRecipe):
                                         'side_tool',
                                         'singleAd',
                                         'postCategory column',
-                                        'refer tagRefer', # added for bits blog post
-                                        'entry entry-utility', #added for DealBook
-                                        'entry-tags', #added for DealBook
-                                        'footer promos clearfix', #added for DealBook
-                                        'footer links clearfix', #added for DealBook
-                                        'tabsContainer', #added for other blog downloads
-                                        'column lastColumn', #added for other blog downloads
-                                        'pageHeaderWithLabel', #added for other gadgetwise downloads
-                                        'column two', #added for other blog downloads
-                                        'column two last', #added for other blog downloads
-                                        'column three', #added for other blog downloads
-                                        'column three last', #added for other blog downloads
-                                        'column four',#added for other blog downloads
-                                        'column four last',#added for other blog downloads
-                                        'column last', #added for other blog downloads
+                                        'refer tagRefer',  # added for bits blog post
+                                        'entry entry-utility',  # added for DealBook
+                                        'entry-tags',  # added for DealBook
+                                        'footer promos clearfix',  # added for DealBook
+                                        'footer links clearfix',  # added for DealBook
+                                        'tabsContainer',  # added for other blog downloads
+                                        'column lastColumn',  # added for other blog downloads
+                                        'pageHeaderWithLabel',  # added for other gadgetwise downloads
+                                        'column two',  # added for other blog downloads
+                                        'column two last',  # added for other blog downloads
+                                        'column three',  # added for other blog downloads
+                                        'column three last',  # added for other blog downloads
+                                        'column four',  # added for other blog downloads
+                                        'column four last',  # added for other blog downloads
+                                        'column last',  # added for other blog downloads
                                         'entry entry-related',
-                                        'subNavigation tabContent active', #caucus blog navigation
+                                        'subNavigation tabContent active',  # caucus blog navigation
                                         'mediaOverlay slideshow',
                                         'wideThumb',
-                                        'video', #added 02-11-2011
-                                        'videoHeader',#added 02-11-2011
-                                        'articleInlineVideoHolder', #added 02-11-2011
+                                        'video',  # added 02-11-2011
+                                        'videoHeader',  # added 02-11-2011
+                                        'articleInlineVideoHolder',  # added 02-11-2011
                                         'assetCompanionAd',
                                         'nytint-sectionHeader',
                                         re.compile('^subNavigation'),
@@ -223,6 +222,7 @@ class NYTimes(BasicNewsRecipe):
                                         'credit'
                                         ]}),
                     dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
+                    dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
                     dict(name='div', attrs={'class':re.compile('toolsList')}),  # bits
                     dict(name='div', attrs={'class':re.compile('postNavigation')}),  # bits
                     dict(name='div', attrs={'class':'tweet'}),
@@ -231,8 +231,8 @@ class NYTimes(BasicNewsRecipe):
                     dict(name='div', attrs={'id':re.compile('commentsContainer')}),  # bits, pogue, gadgetwise, open
                     dict(name='ul', attrs={'class':re.compile('entry-tools')}),  # pogue, gadgetwise
                     dict(name='div', attrs={'class':re.compile('nocontent')}),  # pogue, gadgetwise
-                    dict(name='div', attrs={'id':re.compile('respond')}), # open
-                    dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
+                    dict(name='div', attrs={'id':re.compile('respond')}),  # open
+                    dict(name='div', attrs={'class':re.compile('entry-tags')}),  # pogue
                     dict(id=[
                             'adxLeaderboard',
                             'adxSponLink',
@@ -266,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
                             'side_index',
                             'side_tool',
                             'toolsRight',
-                            'skybox', #added for DealBook
-                            'TopAd', #added for DealBook
-                            'related-content', #added for DealBook
+                            'skybox',  # added for DealBook
+                            'TopAd',  # added for DealBook
+                            'related-content',  # added for DealBook
                             'whats-next',
                             ]),
-                    dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta'])]
+                    dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]
     no_stylesheets = True
     extra_css = '''
                 .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
-                .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .credit { font-weight: normal; text-align: right; font-size:
+                    50%; line-height:1em; margin-top:5px; margin-left:0;
+                    margin-right:0; margin-bottom: 0; }
                 .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                 .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                 .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@@ -291,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
                 .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
                 .source {text-align: left; font-size: x-small; }'''
 
-
     articles = {}
     key = None
     ans = []
@@ -313,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
                 del ans[idx]
                 idx_max = idx_max-1
                 continue
-            if True: #self.verbose
-                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            if True:  # self.verbose
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
             for article in ans[idx][1]:
                 total_article_count += 1
-                if True: #self.verbose
+                if True:  # self.verbose
                     self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                               article['url'].encode('cp1252','replace')))
             idx = idx+1
 
-        self.log( "Queued %d articles" % total_article_count )
+        self.log("Queued %d articles" % total_article_count)
         return ans
 
     def exclude_url(self,url):
         if not url.startswith("http"):
             return True
-        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url:  # added for DealBook
             return True
         if 'nytimes.com' not in url:
             return True
@@ -412,7 +413,6 @@ class NYTimes(BasicNewsRecipe):
     def short_title(self):
         return self.title
 
-
     def article_to_soup(self, url_or_raw, raw=False):
         from contextlib import closing
         import copy
@@ -446,7 +446,6 @@ class NYTimes(BasicNewsRecipe):
         usrc = self.preprocess_raw_html(usrc, url_or_raw)
         return BeautifulSoup(usrc, markupMassage=nmassage)
 
-
     def massageNCXText(self, description):
         # Kindle TOC descriptions won't render certain characters
         if description:
@@ -478,7 +477,7 @@ class NYTimes(BasicNewsRecipe):
         if self.webEdition:
             date_tag = self.decode_url_date(url)
             if date_tag is not None:
-                if  self.oldest_web_article is not None:
+                if self.oldest_web_article is not None:
                     if date_tag < self.earliest_date:
                         self.log("Skipping article %s" % url)
                         return
@@ -501,7 +500,7 @@ class NYTimes(BasicNewsRecipe):
             if authorAttribution:
                 author = self.tag_to_string(authorAttribution, use_alt=False)
         feed = self.key if self.key is not None else 'Uncategorized'
-        if not self.articles.has_key(feed):
+        if feed not in self.articles:
             self.ans.append(feed)
             self.articles[feed] = []
         self.articles[feed].append(
@@ -536,7 +535,6 @@ class NYTimes(BasicNewsRecipe):
                     desc = ''
                 return(title,url,author,desc)
 
-
             have_emailed = False
             emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
             for h3tag in emailed_soup.findAll('h3'):
@@ -565,7 +563,7 @@ class NYTimes(BasicNewsRecipe):
                     dict(title=title, url=url, date=strftime('%a, %d %b'),
                         description=desc, author=author,
                         content=''))
-            viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+            viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
             for x in viewed_ans:
                 ans.append(x)
         return ans
@@ -588,10 +586,10 @@ class NYTimes(BasicNewsRecipe):
                 tech_articles[f.title] = []
                 for a in f.articles:
                     tech_articles[f.title].append(
-                        dict(title=a.title, url=a.url, date=a.date,
+                        dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
                             description=a.summary, author=a.author,
                             content=a.content))
-            tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
+            tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
             for x in tech_ans:
                 ans.append(x)
         return ans
@@ -630,10 +628,9 @@ class NYTimes(BasicNewsRecipe):
                     for lidiv in div.findAll('li'):
                         self.handle_article(lidiv)
 
-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
-
     def parse_todays_index(self):
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@@ -663,7 +660,7 @@ class NYTimes(BasicNewsRecipe):
                     if not skipping:
                         self.handle_article(lidiv)
 
-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
@@ -709,13 +706,13 @@ class NYTimes(BasicNewsRecipe):
                     description = self.tag_to_string(desc,use_alt=False)
                 else:
                     description = ''
-                if not self.articles.has_key(section_name):
+                if section_name not in self.articles:
                     self.ans.append(section_name)
                     self.articles[section_name] = []
                 print('Title '+title+' author '+author)
                 self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
-        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
         return self.filter_ans(self.ans)
 
     def parse_index(self):
@@ -735,7 +732,7 @@ class NYTimes(BasicNewsRecipe):
                     if kill_all or (self.recursions==0):
                         a.replaceWith(self.tag_to_string(a,False))
                     else:
-                        if a.has_key('href'):
+                        if 'href' in a:
                             if a['href'].startswith('http://www.nytimes'):
                                 if not a['href'].endswith('pagewanted=all'):
                                     url = re.sub(r'\?.*', '', a['href'])
@@ -743,13 +740,13 @@ class NYTimes(BasicNewsRecipe):
                                         a.replaceWith(self.tag_to_string(a,False))
                                     else:
                                         a['href'] = url+'?pagewanted=all'
-                            elif not (a['href'].startswith('http://pogue') or \
-                                      a['href'].startswith('http://bits') or \
-                                      a['href'].startswith('http://travel') or \
-                                      a['href'].startswith('http://business') or \
-                                      a['href'].startswith('http://tech') or \
-                                      a['href'].startswith('http://health') or \
-                                      a['href'].startswith('http://dealbook') or \
+                            elif not (a['href'].startswith('http://pogue') or
+                                      a['href'].startswith('http://bits') or
+                                      a['href'].startswith('http://travel') or
+                                      a['href'].startswith('http://business') or
+                                      a['href'].startswith('http://tech') or
+                                      a['href'].startswith('http://health') or
+                                      a['href'].startswith('http://dealbook') or
                                       a['href'].startswith('http://open')):
                                 a.replaceWith(self.tag_to_string(a,False))
         return soup
@@ -764,7 +761,7 @@ class NYTimes(BasicNewsRecipe):
             return None
 
 ##        print("HANDLING AD FORWARD:")
-##        print(soup)
+# print(soup)
         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
@@ -774,7 +771,7 @@ class NYTimes(BasicNewsRecipe):
                     for tag in soup.find('body').findAll(**spec):
                         body.insert(len(body.contents), tag)
                 soup.find('body').replaceWith(body)
-            except AttributeError: # soup has no body element
+            except AttributeError:  # soup has no body element
                 pass
 
         def remove_beyond(tag, next):
@@ -802,7 +799,6 @@ class NYTimes(BasicNewsRecipe):
 
         return soup
 
-
     def preprocess_html(self, soup):
         #print(strftime("%H:%M:%S")+" --  PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
@@ -821,7 +817,7 @@ class NYTimes(BasicNewsRecipe):
             old_body = soup.find('body')
             new_body=Tag(soup,'body')
             new_body.append(soup.find('div',attrs={'id':'content'}))
-            new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
+            new_body.find('div',attrs={'id':'content'})['id']='blogcontent'  # identify for postprocess_html
             old_body.replaceWith(new_body)
             for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
                 if divr.find(text=re.compile('Sign up')):
@@ -864,9 +860,9 @@ class NYTimes(BasicNewsRecipe):
                 img = atag.find('img')
                 if img is not None:
                     atag.replaceWith(img)
-                elif not atag.has_key('href'):
+                elif 'href' not in atag:
                     atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
-                elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
+                elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
                               atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                     atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
             hdr = soup.find('address')
@@ -879,11 +875,11 @@ class NYTimes(BasicNewsRecipe):
                 sp.append(span_credit)
                 sp.append(Tag(soup,'br'))
 
-        else: # nytimes article
+        else:  # nytimes article
 
-            related = [] # these will be the related articles
-            first_outer = None # first related outer tag
-            first_related = None # first related tag
+            related = []  # these will be the related articles
+            first_outer = None  # first related outer tag
+            first_related = None  # first related tag
             for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                 for rdiv in soup.findAll('div','columnGroup doubleRule'):
                     if rdiv.find('h3') is not None:
@@ -916,19 +912,19 @@ class NYTimes(BasicNewsRecipe):
                                         h6tag.extract()
             if related != []:
                 for r in related:
-                    if r.h6: # don't want the anchor inside a h6 tag
+                    if r.h6:  # don't want the anchor inside a h6 tag
                         r.h6.replaceWith(r.h6.a)
                     first_related.ul.append(r)
                 first_related.insert(0,Tag(soup,'hr'))
                 first_related.append(Tag(soup,'hr'))
                 first_related['class'] = 'aside'
-                first_outer.replaceWith(first_related) # replace the outer tag with the related tag
+                first_outer.replaceWith(first_related)  # replace the outer tag with the related tag
 
             for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                 rdiv.extract()
 
             kicker_tag = soup.find(attrs={'class':'kicker'})
-            if kicker_tag: # remove Op_Ed author head shots
+            if kicker_tag:  # remove Op_Ed author head shots
                 tagline = self.tag_to_string(kicker_tag)
                 if tagline=='Op-Ed Columnist':
                     img_div = soup.find('div','inlineImage module')
@@ -937,7 +933,7 @@ class NYTimes(BasicNewsRecipe):
 
             if self.useHighResImages:
                 try:
-                    #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+                    # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                     enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                     if enlargeThisList:
                         for popupref in enlargeThisList:
@@ -956,8 +952,10 @@ class NYTimes(BasicNewsRecipe):
                                     year = str(st.tm_year)
                                     month = "%.2d" % st.tm_mon
                                     day = "%.2d" % st.tm_mday
-                                    imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/')
-                                    highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+                                    imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/') + \
+                                                                 len('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/')
+                                    highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
+                                        month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                                     popupSoup = BeautifulSoup(popuphtml)
                                     highResTag = popupSoup.find('img', {'src':highResImageLink})
                                     if highResTag:
@@ -979,7 +977,7 @@ class NYTimes(BasicNewsRecipe):
                     self.log("Error pulling high resolution images")
 
                 try:
-                    #in case pulling images failed, delete the enlarge this text
+                    # in case pulling images failed, delete the enlarge this text
                     enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                     if enlargeThisList:
                         for popupref in enlargeThisList:
@@ -987,11 +985,10 @@ class NYTimes(BasicNewsRecipe):
                 except:
                     self.log("Error removing Enlarge this text")
 
-
         return self.strip_anchors(soup,False)
 
     def postprocess_html(self,soup,first_fetch):
-        if not first_fetch: # remove Related links
+        if not first_fetch:  # remove Related links
             for aside in soup.findAll('div','aside'):
                 aside.extract()
             soup = self.strip_anchors(soup,True)
@@ -1000,7 +997,7 @@ class NYTimes(BasicNewsRecipe):
         if soup.find('div',attrs={'id':'blogcontent'}) is None:
             if first_fetch:
                 aside = soup.find('div','aside')
-                if aside is not None: # move the related list to the end of the article
+                if aside is not None:  # move the related list to the end of the article
                     art = soup.find('div',attrs={'id':'article'})
                     if art is None:
                         art = soup.find('div',attrs={'class':'article'})
@@ -1061,7 +1058,7 @@ class NYTimes(BasicNewsRecipe):
             try:
                 # Change  to 

h1 = soup.find('h1') - blogheadline = str(h1) #added for dealbook + blogheadline = str(h1) # added for dealbook if h1: headline = h1.find("nyt_headline") if headline: @@ -1069,11 +1066,11 @@ class NYTimes(BasicNewsRecipe): tag['class'] = "headline" tag.insert(0, self.fixChars(headline.contents[0])) h1.replaceWith(tag) - elif blogheadline.find('entry-title'):#added for dealbook - tag = Tag(soup, "h2")#added for dealbook - tag['class'] = "headline"#added for dealbook - tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook - h1.replaceWith(tag)#added for dealbook + elif blogheadline.find('entry-title'): # added for dealbook + tag = Tag(soup, "h2") # added for dealbook + tag['class'] = "headline" # added for dealbook + tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook + h1.replaceWith(tag) # added for dealbook else: # Blog entry - replace headline, remove
tags - BCC I think this is no longer functional 1-18-2011 @@ -1090,7 +1087,7 @@ class NYTimes(BasicNewsRecipe): self.log("ERROR: Problem in Change to

") try: - #if this is from a blog (dealbook, fix the byline format + # if this is from a blog (dealbook, fix the byline format bylineauthor = soup.find('address',attrs={'class':'byline author vcard'}) if bylineauthor: tag = Tag(soup, "h6") @@ -1101,7 +1098,7 @@ class NYTimes(BasicNewsRecipe): self.log("ERROR: fixing byline author format") try: - #if this is a blog (dealbook) fix the credit style for the pictures + # if this is a blog (dealbook) fix the credit style for the pictures blogcredit = soup.find('div',attrs={'class':'credit'}) if blogcredit: tag = Tag(soup, "h6") @@ -1111,7 +1108,6 @@ class NYTimes(BasicNewsRecipe): except: self.log("ERROR: fixing credit format") - try: # Change

to

- used in editorial blogs masthead = soup.find("h1") @@ -1135,7 +1131,7 @@ class NYTimes(BasicNewsRecipe): except: self.log("ERROR: Problem in Change

to

- used in editorial blogs") try: - #remove the update tag + # remove the update tag blogupdated = soup.find('span', {'class':'update'}) if blogupdated: blogupdated.replaceWith("") @@ -1184,9 +1180,9 @@ class NYTimes(BasicNewsRecipe): paras = articlebody.findAll('p') for p in paras: refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip() - #account for blank paragraphs and short paragraphs by appending them to longer ones + # account for blank paragraphs and short paragraphs by appending them to longer ones if len(refparagraph) > 0: - if len(refparagraph) > 70: #approximately one line of text + if len(refparagraph) > 70: # approximately one line of text newpara = shortparagraph + refparagraph newparaDateline,newparaEm,newparaDesc = newpara.partition('—') if newparaEm == '': @@ -1205,4 +1201,3 @@ class NYTimes(BasicNewsRecipe): self.log("Error creating article descriptions") return - diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 94c03bd98b..e66ccef315 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -14,8 +14,8 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup class NYTimes(BasicNewsRecipe): - recursions=1 # set this to zero to omit Related articles lists - match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed + recursions=1 # set this to zero to omit Related articles lists + match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed # set getTechBlogs to True to include the technology blogs # set tech_oldest_article to control article age @@ -28,12 +28,11 @@ class NYTimes(BasicNewsRecipe): # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category) getPopularArticles = True - popularPeriod = '1' # set this to the number of days to include in the measurement + popularPeriod = '1' # set this to the number of days to include in the measurement # e.g. 7 will get the most popular measured over the last 7 days # and 30 will get the most popular measured over 30 days. # you still only get up to 20 articles in each category - # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored. headlinesOnly = False @@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe): # The maximum number of articles that will be downloaded max_articles_per_feed = 100 + use_embedded_content = False # Whether to omit duplicates of articles (typically arsing when articles are indexed in # more than one section). If True, only the first occurance will be downloaded. 
@@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe): (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/') ] - if headlinesOnly: title='New York Times Headlines' description = 'Headlines from the New York Times' @@ -155,7 +154,7 @@ class NYTimes(BasicNewsRecipe): earliest_date = date.today() else: earliest_date = date.today() - timedelta(days=oldest_web_article) - oldest_article = 365 # by default, a long time ago + oldest_article = 365 # by default, a long time ago __author__ = 'GRiker/Kovid Goyal/Nick Redding' language = 'en' @@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe): timefmt = '' - #simultaneous_downloads = 1 # no longer required to deal with ads + # simultaneous_downloads = 1 # no longer required to deal with ads cover_margins = (18,18,'grey99') - remove_tags_before = dict(id='article') - remove_tags_after = dict(id='article') + keep_only_tags = dict(id=['article', 'story', 'content']) remove_tags = [ dict(attrs={'class':[ 'articleFooter', @@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe): 'entry-response module', 'leftNavTabs', 'metaFootnote', + 'inside-story', 'module box nav', 'nextArticleLink', 'nextArticleLink clearfix', @@ -192,28 +191,28 @@ class NYTimes(BasicNewsRecipe): 'side_tool', 'singleAd', 'postCategory column', - 'refer tagRefer', # added for bits blog post - 'entry entry-utility', #added for DealBook - 'entry-tags', #added for DealBook - 'footer promos clearfix', #added for DealBook - 'footer links clearfix', #added for DealBook - 'tabsContainer', #added for other blog downloads - 'column lastColumn', #added for other blog downloads - 'pageHeaderWithLabel', #added for other gadgetwise downloads - 'column two', #added for other blog downloads - 'column two last', #added for other blog downloads - 'column three', #added for other blog downloads - 'column three last', #added for other blog downloads - 'column four',#added for other blog downloads - 'column four last',#added for other blog downloads - 'column last', #added for other blog downloads + 'refer tagRefer', # added for bits blog post + 'entry entry-utility', # added for DealBook + 'entry-tags', # added for DealBook + 'footer promos clearfix', # added for DealBook + 'footer links clearfix', # added for DealBook + 'tabsContainer', # added for other blog downloads + 'column lastColumn', # added for other blog downloads + 'pageHeaderWithLabel', # added for other gadgetwise downloads + 'column two', # added for other blog downloads + 'column two last', # added for other blog downloads + 'column three', # added for other blog downloads + 'column three last', # added for other blog downloads + 'column four', # added for other blog downloads + 'column four last', # added for other blog downloads + 'column last', # added for other blog downloads 'entry entry-related', - 'subNavigation tabContent active', #caucus blog navigation + 'subNavigation tabContent active', # caucus blog navigation 'mediaOverlay slideshow', 'wideThumb', - 'video', #added 02-11-2011 - 'videoHeader',#added 02-11-2011 - 'articleInlineVideoHolder', #added 02-11-2011 + 'video', # added 02-11-2011 + 'videoHeader', # added 02-11-2011 + 'articleInlineVideoHolder', # added 02-11-2011 'assetCompanionAd', 'nytint-sectionHeader', re.compile('^subNavigation'), @@ -223,6 +222,7 @@ class NYTimes(BasicNewsRecipe): 'credit' ]}), dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}), + dict(attrs={'class':lambda x: x and 'interactive' in x.split()}), dict(name='div', attrs={'class':re.compile('toolsList')}), # bits 
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits dict(name='div', attrs={'class':'tweet'}), @@ -231,8 +231,8 @@ class NYTimes(BasicNewsRecipe): dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise - dict(name='div', attrs={'id':re.compile('respond')}), # open - dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue + dict(name='div', attrs={'id':re.compile('respond')}), # open + dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue dict(id=[ 'adxLeaderboard', 'adxSponLink', @@ -254,6 +254,7 @@ class NYTimes(BasicNewsRecipe): 'masthead-nav', 'memberTools', 'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge', + 'page-footer', 'portfolioInline', 'readerReviews', 'readerReviewsCount', @@ -265,16 +266,18 @@ class NYTimes(BasicNewsRecipe): 'side_index', 'side_tool', 'toolsRight', - 'skybox', #added for DealBook - 'TopAd', #added for DealBook - 'related-content', #added for DealBook + 'skybox', # added for DealBook + 'TopAd', # added for DealBook + 'related-content', # added for DealBook 'whats-next', ]), - dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta'])] + dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])] no_stylesheets = True extra_css = ''' .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } - .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .credit { font-weight: normal; text-align: right; font-size: + 50%; line-height:1em; margin-top:5px; margin-left:0; + margin-right:0; margin-bottom: 0; } .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } @@ -290,7 +293,6 @@ class NYTimes(BasicNewsRecipe): .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;} .source {text-align: left; font-size: x-small; }''' - articles = {} key = None ans = [] @@ -312,22 +314,22 @@ class NYTimes(BasicNewsRecipe): del ans[idx] idx_max = idx_max-1 continue - if True: #self.verbose - self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) + if True: # self.verbose + self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1]))) for article in ans[idx][1]: total_article_count += 1 - if True: #self.verbose + if True: # self.verbose self.log("\t%-40.40s... \t%-60.60s..." 
% (article['title'].encode('cp1252','replace'), article['url'].encode('cp1252','replace'))) idx = idx+1 - self.log( "Queued %d articles" % total_article_count ) + self.log("Queued %d articles" % total_article_count) return ans def exclude_url(self,url): if not url.startswith("http"): return True - if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook + if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook return True if 'nytimes.com' not in url: return True @@ -419,7 +421,6 @@ class NYTimes(BasicNewsRecipe): def short_title(self): return self.title - def article_to_soup(self, url_or_raw, raw=False): from contextlib import closing import copy @@ -453,7 +454,6 @@ class NYTimes(BasicNewsRecipe): usrc = self.preprocess_raw_html(usrc, url_or_raw) return BeautifulSoup(usrc, markupMassage=nmassage) - def massageNCXText(self, description): # Kindle TOC descriptions won't render certain characters if description: @@ -485,7 +485,7 @@ class NYTimes(BasicNewsRecipe): if self.webEdition: date_tag = self.decode_url_date(url) if date_tag is not None: - if self.oldest_web_article is not None: + if self.oldest_web_article is not None: if date_tag < self.earliest_date: self.log("Skipping article %s" % url) return @@ -508,7 +508,7 @@ class NYTimes(BasicNewsRecipe): if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) feed = self.key if self.key is not None else 'Uncategorized' - if not self.articles.has_key(feed): + if feed not in self.articles: self.ans.append(feed) self.articles[feed] = [] self.articles[feed].append( @@ -543,7 +543,6 @@ class NYTimes(BasicNewsRecipe): desc = '' return(title,url,author,desc) - have_emailed = False emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod) for h3tag in emailed_soup.findAll('h3'): @@ -572,7 +571,7 @@ class NYTimes(BasicNewsRecipe): dict(title=title, url=url, date=strftime('%a, %d %b'), description=desc, author=author, content='')) - viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)] + viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles] for x in viewed_ans: ans.append(x) return ans @@ -595,10 +594,10 @@ class NYTimes(BasicNewsRecipe): tech_articles[f.title] = [] for a in f.articles: tech_articles[f.title].append( - dict(title=a.title, url=a.url, date=a.date, + dict(title=a.title, url=a.url.partition('?')[0], date=a.date, description=a.summary, author=a.author, content=a.content)) - tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)] + tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles] for x in tech_ans: ans.append(x) return ans @@ -637,10 +636,9 @@ class NYTimes(BasicNewsRecipe): for lidiv in div.findAll('li'): self.handle_article(lidiv) - self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles] return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) - def parse_todays_index(self): soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') @@ -670,7 +668,7 @@ class NYTimes(BasicNewsRecipe): if not skipping: self.handle_article(lidiv) - self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles] return 
self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_headline_index(self): @@ -716,13 +714,13 @@ class NYTimes(BasicNewsRecipe): description = self.tag_to_string(desc,use_alt=False) else: description = '' - if not self.articles.has_key(section_name): + if section_name not in self.articles: self.ans.append(section_name) self.articles[section_name] = [] print('Title '+title+' author '+author) self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) - self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)] + self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles] return self.filter_ans(self.ans) def parse_index(self): @@ -742,7 +740,7 @@ class NYTimes(BasicNewsRecipe): if kill_all or (self.recursions==0): a.replaceWith(self.tag_to_string(a,False)) else: - if a.has_key('href'): + if 'href' in a: if a['href'].startswith('http://www.nytimes'): if not a['href'].endswith('pagewanted=all'): url = re.sub(r'\?.*', '', a['href']) @@ -750,13 +748,13 @@ class NYTimes(BasicNewsRecipe): a.replaceWith(self.tag_to_string(a,False)) else: a['href'] = url+'?pagewanted=all' - elif not (a['href'].startswith('http://pogue') or \ - a['href'].startswith('http://bits') or \ - a['href'].startswith('http://travel') or \ - a['href'].startswith('http://business') or \ - a['href'].startswith('http://tech') or \ - a['href'].startswith('http://health') or \ - a['href'].startswith('http://dealbook') or \ + elif not (a['href'].startswith('http://pogue') or + a['href'].startswith('http://bits') or + a['href'].startswith('http://travel') or + a['href'].startswith('http://business') or + a['href'].startswith('http://tech') or + a['href'].startswith('http://health') or + a['href'].startswith('http://dealbook') or a['href'].startswith('http://open')): a.replaceWith(self.tag_to_string(a,False)) return soup @@ -771,7 +769,7 @@ class NYTimes(BasicNewsRecipe): return None ## print("HANDLING AD FORWARD:") -## print(soup) +# print(soup) if self.keep_only_tags: body = Tag(soup, 'body') try: @@ -781,7 +779,7 @@ class NYTimes(BasicNewsRecipe): for tag in soup.find('body').findAll(**spec): body.insert(len(body.contents), tag) soup.find('body').replaceWith(body) - except AttributeError: # soup has no body element + except AttributeError: # soup has no body element pass def remove_beyond(tag, next): @@ -809,7 +807,6 @@ class NYTimes(BasicNewsRecipe): return soup - def preprocess_html(self, soup): #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title)) skip_tag = soup.find(True, {'name':'skip'}) @@ -828,7 +825,7 @@ class NYTimes(BasicNewsRecipe): old_body = soup.find('body') new_body=Tag(soup,'body') new_body.append(soup.find('div',attrs={'id':'content'})) - new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html + new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html old_body.replaceWith(new_body) for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}): if divr.find(text=re.compile('Sign up')): @@ -871,9 +868,9 @@ class NYTimes(BasicNewsRecipe): img = atag.find('img') if img is not None: atag.replaceWith(img) - elif not atag.has_key('href'): + elif 'href' not in atag: atag.replaceWith(atag.renderContents().decode('cp1252','replace')) - elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \ + elif not 
                           atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
                     atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
         hdr = soup.find('address')
@@ -886,11 +883,11 @@ class NYTimes(BasicNewsRecipe):
                     sp.append(span_credit)
                     sp.append(Tag(soup,'br'))

-        else: # nytimes article
+        else:  # nytimes article

-            related = [] # these will be the related articles
-            first_outer = None # first related outer tag
-            first_related = None # first related tag
+            related = []  # these will be the related articles
+            first_outer = None  # first related outer tag
+            first_related = None  # first related tag
             for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                 for rdiv in soup.findAll('div','columnGroup doubleRule'):
                     if rdiv.find('h3') is not None:
@@ -923,19 +920,19 @@ class NYTimes(BasicNewsRecipe):
                         h6tag.extract()
             if related != []:
                 for r in related:
-                    if r.h6: # don't want the anchor inside a h6 tag
+                    if r.h6:  # don't want the anchor inside a h6 tag
                         r.h6.replaceWith(r.h6.a)
                     first_related.ul.append(r)
                 first_related.insert(0,Tag(soup,'hr'))
                 first_related.append(Tag(soup,'hr'))
                 first_related['class'] = 'aside'
-                first_outer.replaceWith(first_related) # replace the outer tag with the related tag
+                first_outer.replaceWith(first_related)  # replace the outer tag with the related tag
                 for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
                     rdiv.extract()

             kicker_tag = soup.find(attrs={'class':'kicker'})
-            if kicker_tag: # remove Op_Ed author head shots
+            if kicker_tag:  # remove Op_Ed author head shots
                 tagline = self.tag_to_string(kicker_tag)
                 if tagline=='Op-Ed Columnist':
                     img_div = soup.find('div','inlineImage module')
@@ -944,7 +941,7 @@ class NYTimes(BasicNewsRecipe):

             if self.useHighResImages:
                 try:
-                    #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+                    # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                     enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                     if enlargeThisList:
                         for popupref in enlargeThisList:
@@ -963,8 +960,10 @@ class NYTimes(BasicNewsRecipe):
                             year = str(st.tm_year)
                             month = "%.2d" % st.tm_mon
                             day = "%.2d" % st.tm_mday
-                            imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
-                            highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+                            imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
+                                len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
+                            highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
+                                month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                             popupSoup = BeautifulSoup(popuphtml)
                             highResTag = popupSoup.find('img', {'src':highResImageLink})
                             if highResTag:
@@ -986,7 +985,7 @@ class NYTimes(BasicNewsRecipe):
                     self.log("Error pulling high resolution images")

                 try:
-                    #in case pulling images failed, delete the enlarge this text
+                    # in case pulling images failed, delete the enlarge this text
                     enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                     if enlargeThisList:
                         for popupref in enlargeThisList:
                             popupref.extract()
                 except:
                     self.log("Error removing Enlarge this text")

-
         return self.strip_anchors(soup,False)

     def postprocess_html(self,soup,first_fetch):
-        if not first_fetch: # remove Related links
+        if not first_fetch:  # remove Related links
             for aside in soup.findAll('div','aside'):
                 aside.extract()
             soup = self.strip_anchors(soup,True)
@@ -1007,7 +1005,7 @@ class NYTimes(BasicNewsRecipe):
         if soup.find('div',attrs={'id':'blogcontent'}) is None:
             if first_fetch:
                 aside = soup.find('div','aside')
-                if aside is not None: # move the related list to the end of the article
+                if aside is not None:  # move the related list to the end of the article
                     art = soup.find('div',attrs={'id':'article'})
                     if art is None:
                         art = soup.find('div',attrs={'class':'article'})
@@ -1068,7 +1066,7 @@ class NYTimes(BasicNewsRecipe):

         try:
             # Change to <h1>
             h1 = soup.find('h1')
-            blogheadline = str(h1) #added for dealbook
+            blogheadline = str(h1)  # added for dealbook
             if h1:
                 headline = h1.find("nyt_headline")
                 if headline:
@@ -1076,11 +1074,11 @@ class NYTimes(BasicNewsRecipe):
                     tag['class'] = "headline"
                     tag.insert(0, self.fixChars(headline.contents[0]))
                     h1.replaceWith(tag)
-                elif blogheadline.find('entry-title'):#added for dealbook
-                    tag = Tag(soup, "h2")#added for dealbook
-                    tag['class'] = "headline"#added for dealbook
-                    tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
-                    h1.replaceWith(tag)#added for dealbook
+                elif blogheadline.find('entry-title'):  # added for dealbook
+                    tag = Tag(soup, "h2")  # added for dealbook
+                    tag['class'] = "headline"  # added for dealbook
+                    tag.insert(0, self.fixChars(h1.contents[0]))  # added for dealbook
+                    h1.replaceWith(tag)  # added for dealbook
             else: # Blog entry - replace headline, remove <p> tags - BCC I think this is no longer functional 1-18-2011
@@ -1097,7 +1095,7 @@ class NYTimes(BasicNewsRecipe):
             self.log("ERROR: Problem in Change to <h1>")

         try:
-            #if this is from a blog (dealbook, fix the byline format
+            # if this is from a blog (dealbook, fix the byline format
             bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
             if bylineauthor:
                 tag = Tag(soup, "h6")
@@ -1108,7 +1106,7 @@ class NYTimes(BasicNewsRecipe):
             self.log("ERROR: fixing byline author format")

         try:
-            #if this is a blog (dealbook) fix the credit style for the pictures
+            # if this is a blog (dealbook) fix the credit style for the pictures
             blogcredit = soup.find('div',attrs={'class':'credit'})
             if blogcredit:
                 tag = Tag(soup, "h6")
@@ -1118,7 +1116,6 @@ class NYTimes(BasicNewsRecipe):
         except:
             self.log("ERROR: fixing credit format")

-
         try:
             # Change <h1> to <h3> - used in editorial blogs
             masthead = soup.find("h1")
@@ -1142,7 +1139,7 @@ class NYTimes(BasicNewsRecipe):
         except:
             self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")

         try:
-            #remove the update tag
+            # remove the update tag
             blogupdated = soup.find('span', {'class':'update'})
             if blogupdated:
                 blogupdated.replaceWith("")
@@ -1191,9 +1188,9 @@ class NYTimes(BasicNewsRecipe):
                     paras = articlebody.findAll('p')
                     for p in paras:
                         refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
-                        #account for blank paragraphs and short paragraphs by appending them to longer ones
+                        # account for blank paragraphs and short paragraphs by appending them to longer ones
                         if len(refparagraph) > 0:
-                            if len(refparagraph) > 70: #approximately one line of text
+                            if len(refparagraph) > 70:  # approximately one line of text
                                 newpara = shortparagraph + refparagraph
                                 newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
                                 if newparaEm == '':
@@ -1212,4 +1209,3 @@ class NYTimes(BasicNewsRecipe):
             self.log("Error creating article descriptions")
         return
-
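Aside from the dict.has_key() to `in` conversions (identical behavior on Python 2 dicts, and the only spelling Python 3 accepts), the one functional change in the recipe hunks above is that get_tech_feeds() now queues a.url.partition('?')[0] rather than a.url, so feed URLs are queued without their query strings. A minimal standalone sketch of that idiom follows; strip_query and the sample URL are invented for illustration and are not part of the patch:

    def strip_query(url):
        # str.partition always returns a 3-tuple (head, sep, tail), so
        # taking element [0] drops everything from the first '?' onward
        # and is a harmless no-op when there is no query string.
        return url.partition('?')[0]

    print(strip_query('http://www.nytimes.com/pages/technology/index.html?src=me'))
    # -> http://www.nytimes.com/pages/technology/index.html
    print(strip_query('http://www.nytimes.com/pages/technology/index.html'))
    # -> http://www.nytimes.com/pages/technology/index.html

Compared with the re.sub(r'\?.*', '', a['href']) form used elsewhere in this recipe, partition() needs no regular expression and no match test, which makes it the cheaper choice inside the feed loop.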
From 2c8ac926b1968d835d1992787ef408d08d1005a9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 13 Jan 2014 11:24:41 +0530
Subject: [PATCH 7/7] Refactor dialog base class into its own module for
 easier re-use

---
 src/calibre/gui2/tweak_book/char_select.py    |  2 +-
 .../gui2/tweak_book/editor/insert_resource.py | 44 +++--------------
 src/calibre/gui2/tweak_book/widgets.py        | 48 +++++++++++++++++++
 3 files changed, 55 insertions(+), 39 deletions(-)
 create mode 100644 src/calibre/gui2/tweak_book/widgets.py

diff --git a/src/calibre/gui2/tweak_book/char_select.py b/src/calibre/gui2/tweak_book/char_select.py
index 4ce9890c02..5a973efa3e 100644
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@@ -21,7 +21,7 @@ from calibre.constants import ispy3, plugins, cache_dir
 from calibre.gui2 import NONE
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
-from calibre.gui2.tweak_book.editor.insert_resource import Dialog
+from calibre.gui2.tweak_book.widgets import Dialog

 if not ispy3:
     if sys.maxunicode >= 0x10FFFF:
diff --git a/src/calibre/gui2/tweak_book/editor/insert_resource.py b/src/calibre/gui2/tweak_book/editor/insert_resource.py
index 855e5ce184..4157183793 100644
--- a/src/calibre/gui2/tweak_book/editor/insert_resource.py
+++ b/src/calibre/gui2/tweak_book/editor/insert_resource.py
@@ -10,11 +10,11 @@ import sys, os
 from functools import partial

 from PyQt4.Qt import (
-    QDialog, QGridLayout, QDialogButtonBox, QSize, QListView, QStyledItemDelegate,
-    QLabel, QPixmap, QApplication, QSizePolicy, QAbstractListModel, QVariant,
-    Qt, QRect, QPainter, QModelIndex, QSortFilterProxyModel, QLineEdit,
-    QToolButton, QIcon, QFormLayout, pyqtSignal, QTreeWidget, QTreeWidgetItem,
-    QVBoxLayout, QMenu, QInputDialog)
+    QGridLayout, QSize, QListView, QStyledItemDelegate, QLabel, QPixmap,
+    QApplication, QSizePolicy, QAbstractListModel, QVariant, Qt, QRect,
+    QPainter, QModelIndex, QSortFilterProxyModel, QLineEdit, QToolButton,
+    QIcon, QFormLayout, pyqtSignal, QTreeWidget, QTreeWidgetItem, QVBoxLayout,
+    QMenu, QInputDialog)

 from calibre import fit_image
 from calibre.constants import plugins
@@ -23,43 +23,11 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.gui2 import NONE, choose_files, error_dialog
 from calibre.gui2.languages import LanguagesEdit
 from calibre.gui2.tweak_book import current_container, tprefs
+from calibre.gui2.tweak_book.widgets import Dialog
 from calibre.gui2.tweak_book.file_list import name_is_ok
 from calibre.utils.localization import get_lang, canonicalize_lang
 from calibre.utils.icu import sort_key

-class Dialog(QDialog):
-
-    def __init__(self, title, name, parent=None):
-        QDialog.__init__(self, parent)
-        self.setWindowTitle(title)
-        self.name = name
-        self.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
-        self.bb.accepted.connect(self.accept)
-        self.bb.rejected.connect(self.reject)
-
-        self.setup_ui()
-
-        self.resize(self.sizeHint())
-        geom = tprefs.get(name + '-geometry', None)
-        if geom is not None:
-            self.restoreGeometry(geom)
-        if hasattr(self, 'splitter'):
-            state = tprefs.get(name + '-splitter-state', None)
-            if state is not None:
-                self.splitter.restoreState(state)
-
-    def accept(self):
-        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
-        if hasattr(self, 'splitter'):
-            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
-        QDialog.accept(self)
-
-    def reject(self):
-        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
-        if hasattr(self, 'splitter'):
-            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
-        QDialog.reject(self)
-
 class ChooseName(Dialog): # {{{

     ''' Chooses the filename for a newly imported file, with error checking '''
diff --git a/src/calibre/gui2/tweak_book/widgets.py b/src/calibre/gui2/tweak_book/widgets.py
new file mode 100644
index 0000000000..606e699bd7
--- /dev/null
+++ b/src/calibre/gui2/tweak_book/widgets.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from PyQt4.Qt import (QDialog, QDialogButtonBox)
+
+from calibre.gui2.tweak_book import tprefs
+
+class Dialog(QDialog):
+
+    def __init__(self, title, name, parent=None):
+        QDialog.__init__(self, parent)
+        self.setWindowTitle(title)
+        self.name = name
+        self.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
+        self.bb.accepted.connect(self.accept)
+        self.bb.rejected.connect(self.reject)
+
+        self.setup_ui()
+
+        self.resize(self.sizeHint())
+        geom = tprefs.get(name + '-geometry', None)
+        if geom is not None:
+            self.restoreGeometry(geom)
+        if hasattr(self, 'splitter'):
+            state = tprefs.get(name + '-splitter-state', None)
+            if state is not None:
+                self.splitter.restoreState(state)
+
+    def accept(self):
+        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
+        if hasattr(self, 'splitter'):
+            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
+        QDialog.accept(self)
+
+    def reject(self):
+        tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
+        if hasattr(self, 'splitter'):
+            tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
+        QDialog.reject(self)
+
+    def setup_ui(self):
+        raise NotImplementedError('You must implement this method in Dialog subclasses')
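The Dialog base class that this patch moves into widgets.py calls setup_ui() from its __init__ (hence the NotImplementedError stub for subclasses that forget to define it) and persists window geometry, plus splitter state when a self.splitter attribute exists, in tprefs under the supplied name. A minimal sketch of a consumer once the patch is applied; ExampleDialog and the 'example-dialog' preference key are invented for illustration and are not part of calibre:

    from PyQt4.Qt import QLineEdit, QVBoxLayout

    from calibre.gui2.tweak_book.widgets import Dialog

    class ExampleDialog(Dialog):

        def __init__(self, parent=None):
            # 'example-dialog' is the tprefs key under which the base
            # class persists this dialog's geometry across sessions
            Dialog.__init__(self, 'Example dialog', 'example-dialog', parent)

        def setup_ui(self):
            # Invoked by Dialog.__init__ before the saved geometry is
            # restored; build the layout and reuse the ready-made
            # OK/Cancel button box the base class exposes as self.bb
            self.l = QVBoxLayout(self)
            self.edit = QLineEdit(self)
            self.l.addWidget(self.edit)
            self.l.addWidget(self.bb)

Calling ExampleDialog().exec_() then behaves like any QDialog: OK and Cancel are routed through the overridden accept()/reject(), so the geometry is recorded either way and the dialog reopens at its last size.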