mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

commit 3e81bee473
Merge branch 'master' of https://github.com/kovidgoyal/calibre
@ -14,8 +14,8 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

recursions=1 # set this to zero to omit Related articles lists
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed
recursions=1 # set this to zero to omit Related articles lists
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed

# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
@ -28,12 +28,11 @@ class NYTimes(BasicNewsRecipe):
# set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
# otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
getPopularArticles = True
popularPeriod = '1' # set this to the number of days to include in the measurement
popularPeriod = '1' # set this to the number of days to include in the measurement
# e.g. 7 will get the most popular measured over the last 7 days
# and 30 will get the most popular measured over 30 days.
# you still only get up to 20 articles in each category


# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = True

@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):

# The maximum number of articles that will be downloaded
max_articles_per_feed = 100
use_embedded_content = False

# Whether to omit duplicates of articles (typically arising when articles are indexed in
# more than one section). If True, only the first occurrence will be downloaded.
@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]


if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
@ -155,7 +154,7 @@ class NYTimes(BasicNewsRecipe):
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago
oldest_article = 365 # by default, a long time ago

__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):

timefmt = ''

#simultaneous_downloads = 1 # no longer required to deal with ads
# simultaneous_downloads = 1 # no longer required to deal with ads

cover_margins = (18,18,'grey99')

remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
keep_only_tags = dict(id=['article', 'story', 'content'])
remove_tags = [
dict(attrs={'class':[
'articleFooter',
@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
'entry-response module',
'leftNavTabs',
'metaFootnote',
'inside-story',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
@ -192,28 +191,28 @@ class NYTimes(BasicNewsRecipe):
'side_tool',
'singleAd',
'postCategory column',
'refer tagRefer', # added for bits blog post
'entry entry-utility', #added for DealBook
'entry-tags', #added for DealBook
'footer promos clearfix', #added for DealBook
'footer links clearfix', #added for DealBook
'tabsContainer', #added for other blog downloads
'column lastColumn', #added for other blog downloads
'pageHeaderWithLabel', #added for other gadgetwise downloads
'column two', #added for other blog downloads
'column two last', #added for other blog downloads
'column three', #added for other blog downloads
'column three last', #added for other blog downloads
'column four',#added for other blog downloads
'column four last',#added for other blog downloads
'column last', #added for other blog downloads
'refer tagRefer', # added for bits blog post
'entry entry-utility', # added for DealBook
'entry-tags', # added for DealBook
'footer promos clearfix', # added for DealBook
'footer links clearfix', # added for DealBook
'tabsContainer', # added for other blog downloads
'column lastColumn', # added for other blog downloads
'pageHeaderWithLabel', # added for other gadgetwise downloads
'column two', # added for other blog downloads
'column two last', # added for other blog downloads
'column three', # added for other blog downloads
'column three last', # added for other blog downloads
'column four', # added for other blog downloads
'column four last', # added for other blog downloads
'column last', # added for other blog downloads
'entry entry-related',
'subNavigation tabContent active', #caucus blog navigation
'subNavigation tabContent active', # caucus blog navigation
'mediaOverlay slideshow',
'wideThumb',
'video', #added 02-11-2011
'videoHeader',#added 02-11-2011
'articleInlineVideoHolder', #added 02-11-2011
'video', # added 02-11-2011
'videoHeader', # added 02-11-2011
'articleInlineVideoHolder', # added 02-11-2011
'assetCompanionAd',
'nytint-sectionHeader',
re.compile('^subNavigation'),
@ -222,6 +221,8 @@ class NYTimes(BasicNewsRecipe):
re.compile('commentCount'),
'credit'
]}),
dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
@ -230,11 +231,13 @@ class NYTimes(BasicNewsRecipe):
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[
'adxLeaderboard',
'adxSponLink',
'anchoredAd_module',
'anchoredAd_spot',
'archive',
'articleExtras',
'articleInline',
@ -263,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
'side_index',
'side_tool',
'toolsRight',
'skybox', #added for DealBook
'TopAd', #added for DealBook
'related-content', #added for DealBook
'skybox', # added for DealBook
'TopAd', # added for DealBook
'related-content', # added for DealBook
'whats-next',
]),
dict(name=['script', 'noscript', 'style','form','hr', 'button'])]
dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.credit { font-weight: normal; text-align: right; font-size:
50%; line-height:1em; margin-top:5px; margin-left:0;
margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@ -288,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''


articles = {}
key = None
ans = []
@ -310,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
del ans[idx]
idx_max = idx_max-1
continue
if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
if True: # self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
for article in ans[idx][1]:
total_article_count += 1
if True: #self.verbose
if True: # self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1

self.log( "Queued %d articles" % total_article_count )
self.log("Queued %d articles" % total_article_count)
return ans

def exclude_url(self,url):
if not url.startswith("http"):
return True
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
return True
if 'nytimes.com' not in url:
return True
@ -409,7 +413,6 @@ class NYTimes(BasicNewsRecipe):
def short_title(self):
return self.title


def article_to_soup(self, url_or_raw, raw=False):
from contextlib import closing
import copy
@ -443,7 +446,6 @@ class NYTimes(BasicNewsRecipe):
usrc = self.preprocess_raw_html(usrc, url_or_raw)
return BeautifulSoup(usrc, markupMassage=nmassage)


def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
@ -475,7 +477,7 @@ class NYTimes(BasicNewsRecipe):
if self.webEdition:
date_tag = self.decode_url_date(url)
if date_tag is not None:
if self.oldest_web_article is not None:
if self.oldest_web_article is not None:
if date_tag < self.earliest_date:
self.log("Skipping article %s" % url)
return
@ -498,7 +500,7 @@ class NYTimes(BasicNewsRecipe):
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = self.key if self.key is not None else 'Uncategorized'
if not self.articles.has_key(feed):
if feed not in self.articles:
self.ans.append(feed)
self.articles[feed] = []
self.articles[feed].append(
@ -533,7 +535,6 @@ class NYTimes(BasicNewsRecipe):
desc = ''
return(title,url,author,desc)


have_emailed = False
emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
for h3tag in emailed_soup.findAll('h3'):
@ -562,7 +563,7 @@ class NYTimes(BasicNewsRecipe):
dict(title=title, url=url, date=strftime('%a, %d %b'),
description=desc, author=author,
content=''))
viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
for x in viewed_ans:
ans.append(x)
return ans
@ -585,10 +586,10 @@ class NYTimes(BasicNewsRecipe):
tech_articles[f.title] = []
for a in f.articles:
tech_articles[f.title].append(
dict(title=a.title, url=a.url, date=a.date,
dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
description=a.summary, author=a.author,
content=a.content))
tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
for x in tech_ans:
ans.append(x)
return ans
@ -627,10 +628,9 @@ class NYTimes(BasicNewsRecipe):
for lidiv in div.findAll('li'):
self.handle_article(lidiv)

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))


def parse_todays_index(self):

soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@ -660,7 +660,7 @@ class NYTimes(BasicNewsRecipe):
if not skipping:
self.handle_article(lidiv)

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

def parse_headline_index(self):
@ -706,13 +706,13 @@ class NYTimes(BasicNewsRecipe):
description = self.tag_to_string(desc,use_alt=False)
else:
description = ''
if not self.articles.has_key(section_name):
if section_name not in self.articles:
self.ans.append(section_name)
self.articles[section_name] = []
print('Title '+title+' author '+author)
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.ans)

def parse_index(self):
@ -732,7 +732,7 @@ class NYTimes(BasicNewsRecipe):
if kill_all or (self.recursions==0):
a.replaceWith(self.tag_to_string(a,False))
else:
if a.has_key('href'):
if 'href' in a:
if a['href'].startswith('http://www.nytimes'):
if not a['href'].endswith('pagewanted=all'):
url = re.sub(r'\?.*', '', a['href'])
@ -740,13 +740,13 @@ class NYTimes(BasicNewsRecipe):
a.replaceWith(self.tag_to_string(a,False))
else:
a['href'] = url+'?pagewanted=all'
elif not (a['href'].startswith('http://pogue') or \
a['href'].startswith('http://bits') or \
a['href'].startswith('http://travel') or \
a['href'].startswith('http://business') or \
a['href'].startswith('http://tech') or \
a['href'].startswith('http://health') or \
a['href'].startswith('http://dealbook') or \
elif not (a['href'].startswith('http://pogue') or
a['href'].startswith('http://bits') or
a['href'].startswith('http://travel') or
a['href'].startswith('http://business') or
a['href'].startswith('http://tech') or
a['href'].startswith('http://health') or
a['href'].startswith('http://dealbook') or
a['href'].startswith('http://open')):
a.replaceWith(self.tag_to_string(a,False))
return soup
@ -761,7 +761,7 @@ class NYTimes(BasicNewsRecipe):
return None

## print("HANDLING AD FORWARD:")
## print(soup)
# print(soup)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
@ -771,7 +771,7 @@ class NYTimes(BasicNewsRecipe):
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
except AttributeError: # soup has no body element
pass

def remove_beyond(tag, next):
@ -799,7 +799,6 @@ class NYTimes(BasicNewsRecipe):

return soup


def preprocess_html(self, soup):
#print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'})
@ -818,7 +817,7 @@ class NYTimes(BasicNewsRecipe):
old_body = soup.find('body')
new_body=Tag(soup,'body')
new_body.append(soup.find('div',attrs={'id':'content'}))
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
old_body.replaceWith(new_body)
for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
if divr.find(text=re.compile('Sign up')):
@ -861,9 +860,9 @@ class NYTimes(BasicNewsRecipe):
img = atag.find('img')
if img is not None:
atag.replaceWith(img)
elif not atag.has_key('href'):
elif 'href' not in atag:
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
hdr = soup.find('address')
@ -876,11 +875,11 @@ class NYTimes(BasicNewsRecipe):
sp.append(span_credit)
sp.append(Tag(soup,'br'))

else: # nytimes article
else: # nytimes article

related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
for rdiv in soup.findAll('div','columnGroup doubleRule'):
if rdiv.find('h3') is not None:
@ -913,19 +912,19 @@ class NYTimes(BasicNewsRecipe):
h6tag.extract()
if related != []:
for r in related:
if r.h6: # don't want the anchor inside a h6 tag
if r.h6: # don't want the anchor inside a h6 tag
r.h6.replaceWith(r.h6.a)
first_related.ul.append(r)
first_related.insert(0,Tag(soup,'hr'))
first_related.append(Tag(soup,'hr'))
first_related['class'] = 'aside'
first_outer.replaceWith(first_related) # replace the outer tag with the related tag
first_outer.replaceWith(first_related) # replace the outer tag with the related tag

for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
rdiv.extract()

kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
@ -934,7 +933,7 @@ class NYTimes(BasicNewsRecipe):

if self.useHighResImages:
try:
#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
# open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
@ -953,8 +952,10 @@ class NYTimes(BasicNewsRecipe):
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
popupSoup = BeautifulSoup(popuphtml)
highResTag = popupSoup.find('img', {'src':highResImageLink})
if highResTag:
@ -976,7 +977,7 @@ class NYTimes(BasicNewsRecipe):
self.log("Error pulling high resolution images")

try:
#in case pulling images failed, delete the enlarge this text
# in case pulling images failed, delete the enlarge this text
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
@ -984,11 +985,10 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("Error removing Enlarge this text")


return self.strip_anchors(soup,False)

def postprocess_html(self,soup,first_fetch):
if not first_fetch: # remove Related links
if not first_fetch: # remove Related links
for aside in soup.findAll('div','aside'):
aside.extract()
soup = self.strip_anchors(soup,True)
@ -997,7 +997,7 @@ class NYTimes(BasicNewsRecipe):
if soup.find('div',attrs={'id':'blogcontent'}) is None:
if first_fetch:
aside = soup.find('div','aside')
if aside is not None: # move the related list to the end of the article
if aside is not None: # move the related list to the end of the article
art = soup.find('div',attrs={'id':'article'})
if art is None:
art = soup.find('div',attrs={'class':'article'})
@ -1058,7 +1058,7 @@ class NYTimes(BasicNewsRecipe):
try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook
blogheadline = str(h1) # added for dealbook
if h1:
headline = h1.find("nyt_headline")
if headline:
@ -1066,11 +1066,11 @@ class NYTimes(BasicNewsRecipe):
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook
elif blogheadline.find('entry-title'): # added for dealbook
tag = Tag(soup, "h2") # added for dealbook
tag['class'] = "headline" # added for dealbook
tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook
h1.replaceWith(tag) # added for dealbook

else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
@ -1087,7 +1087,7 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

try:
#if this is from a blog (dealbook, fix the byline format
# if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
@ -1098,7 +1098,7 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: fixing byline author format")

try:
#if this is a blog (dealbook) fix the credit style for the pictures
# if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
@ -1108,7 +1108,6 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: fixing credit format")


try:
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
@ -1132,7 +1131,7 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
#remove the <strong> update tag
# remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'})
if blogupdated:
blogupdated.replaceWith("")
@ -1181,9 +1180,9 @@ class NYTimes(BasicNewsRecipe):
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
# account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
if len(refparagraph) > 70: # approximately one line of text
newpara = shortparagraph + refparagraph
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
if newparaEm == '':
@ -1202,4 +1201,3 @@ class NYTimes(BasicNewsRecipe):
self.log("Error creating article descriptions")
return
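A note on the recurring pattern in the recipe diff above: every dict.has_key(k) call is rewritten as k in d, and a.has_key('href') becomes 'href' in a. dict.has_key() exists only in Python 2 and was removed in Python 3, so the membership operator is the forward-compatible spelling. A minimal sketch of the pattern (the dict contents are illustrative; the old spelling only runs on Python 2):

articles = {'Business': [], 'Sports': []}

# Old spelling, Python 2 only:
if not articles.has_key('Technology'):
    articles['Technology'] = []

# New spelling used throughout this commit, works on Python 2 and 3:
if 'Technology' not in articles:
    articles['Technology'] = []

The same changes are applied below to the second, nearly identical NYTimes recipe (note headlinesOnly = False there).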
@ -14,8 +14,8 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup

class NYTimes(BasicNewsRecipe):

recursions=1 # set this to zero to omit Related articles lists
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed
recursions=1 # set this to zero to omit Related articles lists
match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed

# set getTechBlogs to True to include the technology blogs
# set tech_oldest_article to control article age
@ -28,12 +28,11 @@ class NYTimes(BasicNewsRecipe):
# set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
# otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
getPopularArticles = True
popularPeriod = '1' # set this to the number of days to include in the measurement
popularPeriod = '1' # set this to the number of days to include in the measurement
# e.g. 7 will get the most popular measured over the last 7 days
# and 30 will get the most popular measured over 30 days.
# you still only get up to 20 articles in each category


# set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
headlinesOnly = False

@ -82,6 +81,7 @@ class NYTimes(BasicNewsRecipe):

# The maximum number of articles that will be downloaded
max_articles_per_feed = 100
use_embedded_content = False

# Whether to omit duplicates of articles (typically arising when articles are indexed in
# more than one section). If True, only the first occurrence will be downloaded.
@ -122,7 +122,6 @@ class NYTimes(BasicNewsRecipe):
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
]


if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
@ -155,7 +154,7 @@ class NYTimes(BasicNewsRecipe):
earliest_date = date.today()
else:
earliest_date = date.today() - timedelta(days=oldest_web_article)
oldest_article = 365 # by default, a long time ago
oldest_article = 365 # by default, a long time ago

__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
@ -164,12 +163,11 @@ class NYTimes(BasicNewsRecipe):

timefmt = ''

#simultaneous_downloads = 1 # no longer required to deal with ads
# simultaneous_downloads = 1 # no longer required to deal with ads

cover_margins = (18,18,'grey99')

remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
keep_only_tags = dict(id=['article', 'story', 'content'])
remove_tags = [
dict(attrs={'class':[
'articleFooter',
@ -184,6 +182,7 @@ class NYTimes(BasicNewsRecipe):
'entry-response module',
'leftNavTabs',
'metaFootnote',
'inside-story',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
@ -192,28 +191,28 @@ class NYTimes(BasicNewsRecipe):
'side_tool',
'singleAd',
'postCategory column',
'refer tagRefer', # added for bits blog post
'entry entry-utility', #added for DealBook
'entry-tags', #added for DealBook
'footer promos clearfix', #added for DealBook
'footer links clearfix', #added for DealBook
'tabsContainer', #added for other blog downloads
'column lastColumn', #added for other blog downloads
'pageHeaderWithLabel', #added for other gadgetwise downloads
'column two', #added for other blog downloads
'column two last', #added for other blog downloads
'column three', #added for other blog downloads
'column three last', #added for other blog downloads
'column four',#added for other blog downloads
'column four last',#added for other blog downloads
'column last', #added for other blog downloads
'refer tagRefer', # added for bits blog post
'entry entry-utility', # added for DealBook
'entry-tags', # added for DealBook
'footer promos clearfix', # added for DealBook
'footer links clearfix', # added for DealBook
'tabsContainer', # added for other blog downloads
'column lastColumn', # added for other blog downloads
'pageHeaderWithLabel', # added for other gadgetwise downloads
'column two', # added for other blog downloads
'column two last', # added for other blog downloads
'column three', # added for other blog downloads
'column three last', # added for other blog downloads
'column four', # added for other blog downloads
'column four last', # added for other blog downloads
'column last', # added for other blog downloads
'entry entry-related',
'subNavigation tabContent active', #caucus blog navigation
'subNavigation tabContent active', # caucus blog navigation
'mediaOverlay slideshow',
'wideThumb',
'video', #added 02-11-2011
'videoHeader',#added 02-11-2011
'articleInlineVideoHolder', #added 02-11-2011
'video', # added 02-11-2011
'videoHeader', # added 02-11-2011
'articleInlineVideoHolder', # added 02-11-2011
'assetCompanionAd',
'nytint-sectionHeader',
re.compile('^subNavigation'),
@ -222,6 +221,8 @@ class NYTimes(BasicNewsRecipe):
re.compile('commentCount'),
'credit'
]}),
dict(attrs={'class':lambda x: x and 'related-coverage-marginalia' in x.split()}),
dict(attrs={'class':lambda x: x and 'interactive' in x.split()}),
dict(name='div', attrs={'class':re.compile('toolsList')}), # bits
dict(name='div', attrs={'class':re.compile('postNavigation')}), # bits
dict(name='div', attrs={'class':'tweet'}),
@ -230,11 +231,13 @@ class NYTimes(BasicNewsRecipe):
dict(name='div', attrs={'id':re.compile('commentsContainer')}), # bits, pogue, gadgetwise, open
dict(name='ul', attrs={'class':re.compile('entry-tools')}), # pogue, gadgetwise
dict(name='div', attrs={'class':re.compile('nocontent')}), # pogue, gadgetwise
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(name='div', attrs={'id':re.compile('respond')}), # open
dict(name='div', attrs={'class':re.compile('entry-tags')}), # pogue
dict(id=[
'adxLeaderboard',
'adxSponLink',
'anchoredAd_module',
'anchoredAd_spot',
'archive',
'articleExtras',
'articleInline',
@ -251,6 +254,7 @@ class NYTimes(BasicNewsRecipe):
'masthead-nav',
'memberTools',
'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge',
'page-footer',
'portfolioInline',
'readerReviews',
'readerReviewsCount',
@ -262,16 +266,18 @@ class NYTimes(BasicNewsRecipe):
'side_index',
'side_tool',
'toolsRight',
'skybox', #added for DealBook
'TopAd', #added for DealBook
'related-content', #added for DealBook
'skybox', # added for DealBook
'TopAd', # added for DealBook
'related-content', # added for DealBook
'whats-next',
]),
dict(name=['script', 'noscript', 'style','form','hr', 'button'])]
dict(name=['script', 'noscript', 'style','form','hr', 'button', 'meta', 'footer'])]
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.credit { font-weight: normal; text-align: right; font-size:
50%; line-height:1em; margin-top:5px; margin-left:0;
margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
@ -287,7 +293,6 @@ class NYTimes(BasicNewsRecipe):
.asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
.source {text-align: left; font-size: x-small; }'''


articles = {}
key = None
ans = []
@ -309,22 +314,22 @@ class NYTimes(BasicNewsRecipe):
del ans[idx]
idx_max = idx_max-1
continue
if True: #self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
if True: # self.verbose
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
for article in ans[idx][1]:
total_article_count += 1
if True: #self.verbose
if True: # self.verbose
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1

self.log( "Queued %d articles" % total_article_count )
self.log("Queued %d articles" % total_article_count)
return ans

def exclude_url(self,url):
if not url.startswith("http"):
return True
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: # added for DealBook
return True
if 'nytimes.com' not in url:
return True
@ -416,7 +421,6 @@ class NYTimes(BasicNewsRecipe):
def short_title(self):
return self.title


def article_to_soup(self, url_or_raw, raw=False):
from contextlib import closing
import copy
@ -450,7 +454,6 @@ class NYTimes(BasicNewsRecipe):
usrc = self.preprocess_raw_html(usrc, url_or_raw)
return BeautifulSoup(usrc, markupMassage=nmassage)


def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
@ -482,7 +485,7 @@ class NYTimes(BasicNewsRecipe):
if self.webEdition:
date_tag = self.decode_url_date(url)
if date_tag is not None:
if self.oldest_web_article is not None:
if self.oldest_web_article is not None:
if date_tag < self.earliest_date:
self.log("Skipping article %s" % url)
return
@ -505,7 +508,7 @@ class NYTimes(BasicNewsRecipe):
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = self.key if self.key is not None else 'Uncategorized'
if not self.articles.has_key(feed):
if feed not in self.articles:
self.ans.append(feed)
self.articles[feed] = []
self.articles[feed].append(
@ -540,7 +543,6 @@ class NYTimes(BasicNewsRecipe):
desc = ''
return(title,url,author,desc)


have_emailed = False
emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
for h3tag in emailed_soup.findAll('h3'):
@ -569,7 +571,7 @@ class NYTimes(BasicNewsRecipe):
dict(title=title, url=url, date=strftime('%a, %d %b'),
description=desc, author=author,
content=''))
viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
viewed_ans = [(k, popular_articles[k]) for k in key_list if k in popular_articles]
for x in viewed_ans:
ans.append(x)
return ans
@ -592,10 +594,10 @@ class NYTimes(BasicNewsRecipe):
tech_articles[f.title] = []
for a in f.articles:
tech_articles[f.title].append(
dict(title=a.title, url=a.url, date=a.date,
dict(title=a.title, url=a.url.partition('?')[0], date=a.date,
description=a.summary, author=a.author,
content=a.content))
tech_ans = [(k, tech_articles[k]) for k in key_list if tech_articles.has_key(k)]
tech_ans = [(k, tech_articles[k]) for k in key_list if k in tech_articles]
for x in tech_ans:
ans.append(x)
return ans
@ -634,10 +636,9 @@ class NYTimes(BasicNewsRecipe):
for lidiv in div.findAll('li'):
self.handle_article(lidiv)

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))


def parse_todays_index(self):

soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
@ -667,7 +668,7 @@ class NYTimes(BasicNewsRecipe):
if not skipping:
self.handle_article(lidiv)

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))

def parse_headline_index(self):
@ -713,13 +714,13 @@ class NYTimes(BasicNewsRecipe):
description = self.tag_to_string(desc,use_alt=False)
else:
description = ''
if not self.articles.has_key(section_name):
if section_name not in self.articles:
self.ans.append(section_name)
self.articles[section_name] = []
print('Title '+title+' author '+author)
self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
self.ans = [(k, self.articles[k]) for k in self.ans if k in self.articles]
return self.filter_ans(self.ans)

def parse_index(self):
@ -739,7 +740,7 @@ class NYTimes(BasicNewsRecipe):
if kill_all or (self.recursions==0):
a.replaceWith(self.tag_to_string(a,False))
else:
if a.has_key('href'):
if 'href' in a:
if a['href'].startswith('http://www.nytimes'):
if not a['href'].endswith('pagewanted=all'):
url = re.sub(r'\?.*', '', a['href'])
@ -747,13 +748,13 @@ class NYTimes(BasicNewsRecipe):
a.replaceWith(self.tag_to_string(a,False))
else:
a['href'] = url+'?pagewanted=all'
elif not (a['href'].startswith('http://pogue') or \
a['href'].startswith('http://bits') or \
a['href'].startswith('http://travel') or \
a['href'].startswith('http://business') or \
a['href'].startswith('http://tech') or \
a['href'].startswith('http://health') or \
a['href'].startswith('http://dealbook') or \
elif not (a['href'].startswith('http://pogue') or
a['href'].startswith('http://bits') or
a['href'].startswith('http://travel') or
a['href'].startswith('http://business') or
a['href'].startswith('http://tech') or
a['href'].startswith('http://health') or
a['href'].startswith('http://dealbook') or
a['href'].startswith('http://open')):
a.replaceWith(self.tag_to_string(a,False))
return soup
@ -768,7 +769,7 @@ class NYTimes(BasicNewsRecipe):
return None

## print("HANDLING AD FORWARD:")
## print(soup)
# print(soup)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
@ -778,7 +779,7 @@ class NYTimes(BasicNewsRecipe):
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
except AttributeError: # soup has no body element
pass

def remove_beyond(tag, next):
@ -806,7 +807,6 @@ class NYTimes(BasicNewsRecipe):

return soup


def preprocess_html(self, soup):
#print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
skip_tag = soup.find(True, {'name':'skip'})
@ -825,7 +825,7 @@ class NYTimes(BasicNewsRecipe):
old_body = soup.find('body')
new_body=Tag(soup,'body')
new_body.append(soup.find('div',attrs={'id':'content'}))
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
new_body.find('div',attrs={'id':'content'})['id']='blogcontent' # identify for postprocess_html
old_body.replaceWith(new_body)
for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
if divr.find(text=re.compile('Sign up')):
@ -868,9 +868,9 @@ class NYTimes(BasicNewsRecipe):
img = atag.find('img')
if img is not None:
atag.replaceWith(img)
elif not atag.has_key('href'):
elif 'href' not in atag:
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or \
elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or
atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')):
atag.replaceWith(atag.renderContents().decode('cp1252','replace'))
hdr = soup.find('address')
@ -883,11 +883,11 @@ class NYTimes(BasicNewsRecipe):
sp.append(span_credit)
sp.append(Tag(soup,'br'))

else: # nytimes article
else: # nytimes article

related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
related = [] # these will be the related articles
first_outer = None # first related outer tag
first_related = None # first related tag
for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
for rdiv in soup.findAll('div','columnGroup doubleRule'):
if rdiv.find('h3') is not None:
@ -920,19 +920,19 @@ class NYTimes(BasicNewsRecipe):
h6tag.extract()
if related != []:
for r in related:
if r.h6: # don't want the anchor inside a h6 tag
if r.h6: # don't want the anchor inside a h6 tag
r.h6.replaceWith(r.h6.a)
first_related.ul.append(r)
first_related.insert(0,Tag(soup,'hr'))
first_related.append(Tag(soup,'hr'))
first_related['class'] = 'aside'
first_outer.replaceWith(first_related) # replace the outer tag with the related tag
first_outer.replaceWith(first_related) # replace the outer tag with the related tag

for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}):
rdiv.extract()

kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
@ -941,7 +941,7 @@ class NYTimes(BasicNewsRecipe):

if self.useHighResImages:
try:
#open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
# open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
@ -960,8 +960,10 @@ class NYTimes(BasicNewsRecipe):
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + \
len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \
month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
popupSoup = BeautifulSoup(popuphtml)
highResTag = popupSoup.find('img', {'src':highResImageLink})
if highResTag:
@ -983,7 +985,7 @@ class NYTimes(BasicNewsRecipe):
self.log("Error pulling high resolution images")

try:
#in case pulling images failed, delete the enlarge this text
# in case pulling images failed, delete the enlarge this text
enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
if enlargeThisList:
for popupref in enlargeThisList:
@ -991,11 +993,10 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("Error removing Enlarge this text")


return self.strip_anchors(soup,False)

def postprocess_html(self,soup,first_fetch):
if not first_fetch: # remove Related links
if not first_fetch: # remove Related links
for aside in soup.findAll('div','aside'):
aside.extract()
soup = self.strip_anchors(soup,True)
@ -1004,7 +1005,7 @@ class NYTimes(BasicNewsRecipe):
if soup.find('div',attrs={'id':'blogcontent'}) is None:
if first_fetch:
aside = soup.find('div','aside')
if aside is not None: # move the related list to the end of the article
if aside is not None: # move the related list to the end of the article
art = soup.find('div',attrs={'id':'article'})
if art is None:
art = soup.find('div',attrs={'class':'article'})
@ -1065,7 +1066,7 @@ class NYTimes(BasicNewsRecipe):
try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
blogheadline = str(h1) #added for dealbook
blogheadline = str(h1) # added for dealbook
if h1:
headline = h1.find("nyt_headline")
if headline:
@ -1073,11 +1074,11 @@ class NYTimes(BasicNewsRecipe):
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
elif blogheadline.find('entry-title'):#added for dealbook
tag = Tag(soup, "h2")#added for dealbook
tag['class'] = "headline"#added for dealbook
tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
h1.replaceWith(tag)#added for dealbook
elif blogheadline.find('entry-title'): # added for dealbook
tag = Tag(soup, "h2") # added for dealbook
tag['class'] = "headline" # added for dealbook
tag.insert(0, self.fixChars(h1.contents[0])) # added for dealbook
h1.replaceWith(tag) # added for dealbook

else:
# Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
@ -1094,7 +1095,7 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

try:
#if this is from a blog (dealbook, fix the byline format
# if this is from a blog (dealbook, fix the byline format
bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
if bylineauthor:
tag = Tag(soup, "h6")
@ -1105,7 +1106,7 @@ class NYTimes(BasicNewsRecipe):
self.log("ERROR: fixing byline author format")

try:
#if this is a blog (dealbook) fix the credit style for the pictures
# if this is a blog (dealbook) fix the credit style for the pictures
blogcredit = soup.find('div',attrs={'class':'credit'})
if blogcredit:
tag = Tag(soup, "h6")
@ -1115,7 +1116,6 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: fixing credit format")


try:
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
@ -1139,7 +1139,7 @@ class NYTimes(BasicNewsRecipe):
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
#remove the <strong> update tag
# remove the <strong> update tag
blogupdated = soup.find('span', {'class':'update'})
if blogupdated:
blogupdated.replaceWith("")
@ -1188,9 +1188,9 @@ class NYTimes(BasicNewsRecipe):
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
# account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
if len(refparagraph) > 70: # approximately one line of text
newpara = shortparagraph + refparagraph
newparaDateline,newparaEm,newparaDesc = newpara.partition('—')
if newparaEm == '':
@ -1209,4 +1209,3 @@ class NYTimes(BasicNewsRecipe):
self.log("Error creating article descriptions")
return
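Besides the comment-style cleanups, the one behavioral change in get_tech_feeds above is dict(title=a.title, url=a.url.partition('?')[0], ...): the query string is stripped from each blog feed URL before the article is queued. str.partition splits on the first occurrence of the separator and always returns a 3-tuple, so taking element [0] is safe even when there is no '?'. A small sketch (the URL is illustrative):

url = 'http://bits.blogs.nytimes.com/2014/01/02/post/?partner=rss&emc=rss'
print(url.partition('?')[0])  # http://bits.blogs.nytimes.com/2014/01/02/post/
print('no-query'.partition('?')[0])  # unchanged: no-query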
@ -20,6 +20,7 @@ from calibre.constants import iswindows
from calibre import unicode_path, as_unicode, replace_entities

class Link(object):

'''
Represents a link in a HTML file.
'''
@ -73,6 +74,7 @@ class IgnoreFile(Exception):
self.errno = errno

class HTMLFile(object):

'''
Contains basic information about an HTML file. This
includes a list of links to other files as well as
@ -103,8 +105,14 @@ class HTMLFile(object):

try:
with open(self.path, 'rb') as f:
src = f.read(4096)
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src))
src = header = f.read(4096)
encoding = detect_xml_encoding(src)[1]
if encoding:
try:
header = header.decode(encoding)
except ValueError:
pass
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
if not self.is_binary:
src += f.read()
except IOError as err:
@ -139,7 +147,6 @@ class HTMLFile(object):
def __repr__(self):
return str(self)


def find_links(self, src):
for match in self.LINK_PAT.finditer(src):
url = None
@ -167,7 +174,7 @@ def depth_first(root, flat, visited=set([])):
if link.path is not None and link not in visited:
try:
index = flat.index(link)
except ValueError: # Can happen if max_levels is used
except ValueError: # Can happen if max_levels is used
continue
hf = flat[index]
if hf not in visited:
@ -232,8 +239,7 @@ def get_filelist(htmlfile, dir, opts, log):
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)\
[0 if opts.breadth_first else 1]
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')
for f in filelist:
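The HTMLFile hunk above changes binary sniffing: instead of running HTML_PAT over the raw first 4 KB, the header is first decoded with the encoding reported by detect_xml_encoding, so files in wide encodings such as UTF-16 are no longer misclassified as binary (the raw bytes of '<html' in UTF-16 contain NUL bytes and would not match a byte pattern). A standalone sketch of the idea; the patterns below are assumptions modeled on HTML_PAT, not copied from calibre:

import re

BYTE_PAT = re.compile(br'<\s*html')  # matches raw bytes
TEXT_PAT = re.compile(r'<\s*html')   # matches decoded text

def looks_like_html(header, encoding=None):
    # Try the decoded header first, as the new code does; fall back to bytes.
    if encoding:
        try:
            return bool(TEXT_PAT.search(header.decode(encoding)))
        except ValueError:  # UnicodeDecodeError is a ValueError subclass
            pass
    return bool(BYTE_PAT.search(header))

raw = '<html><head/></html>'.encode('utf-16')
print(looks_like_html(raw))            # False: NULs break the byte-level match
print(looks_like_html(raw, 'utf-16'))  # True after decoding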
@ -317,13 +317,11 @@ class FlowSplitter(object):
def split_to_size(self, tree):
self.log.debug('\t\tSplitting...')
root = tree.getroot()
# Split large <pre> tags
for pre in list(XPath('//h:pre')(root)):
text = u''.join(pre.xpath('descendant::text()'))
pre.text = text
for child in list(pre.iterchildren()):
pre.remove(child)
if len(pre.text) > self.max_flow_size*0.5:
# Split large <pre> tags if they contain only text
for pre in XPath('//h:pre')(root):
if len(tuple(pre.iterchildren(etree.Element))) > 0:
continue
if pre.text and len(pre.text) > self.max_flow_size*0.5:
self.log.debug('\t\tSplitting large <pre> tag')
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
new_pres = []
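The split_to_size change above is a behavior fix: the old code flattened every <pre> to its text (dropping child elements) before deciding whether to split it; the new code leaves <pre> tags that contain element children alone and only splits ones that are pure text longer than half of max_flow_size. A minimal lxml sketch of the new guard (the document and size threshold are illustrative; the real threshold comes from the splitter configuration):

from lxml import etree

XHTML = 'http://www.w3.org/1999/xhtml'
html = '<html xmlns="%s"><body><pre>%s</pre></body></html>' % (XHTML, 'x' * 200)
root = etree.fromstring(html)

max_flow_size = 300
for pre in root.iter('{%s}pre' % XHTML):
    # Skip <pre> tags that have element children, as the new code does
    if len(tuple(pre.iterchildren(etree.Element))) > 0:
        continue
    if pre.text and len(pre.text) > max_flow_size * 0.5:
        print('would split into fragments of about %d chars' % int(0.2 * max_flow_size))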
@ -1104,7 +1104,8 @@ class OnDeviceSearch(SearchQueryParser): # {{{
'format',
'formats',
'title',
'inlibrary'
'inlibrary',
'tags'
]

def __init__(self, model):
@ -1135,14 +1136,15 @@ class OnDeviceSearch(SearchQueryParser): # {{{
if location not in self.USABLE_LOCATIONS:
return set([])
matches = set([])
all_locs = set(self.USABLE_LOCATIONS) - set(['all'])
all_locs = set(self.USABLE_LOCATIONS) - set(['all', 'tags'])
locations = all_locs if location == 'all' else [location]
q = {
'title' : lambda x : getattr(x, 'title').lower(),
'author': lambda x: ' & '.join(getattr(x, 'authors')).lower(),
'collections':lambda x: ','.join(getattr(x, 'device_collections')).lower(),
'format':lambda x: os.path.splitext(x.path)[1].lower(),
'inlibrary':lambda x : getattr(x, 'in_library')
'inlibrary':lambda x : getattr(x, 'in_library'),
'tags':lambda x : getattr(x, 'tags', [])
}
for x in ('author', 'format'):
q[x+'s'] = q[x]
@ -1169,10 +1171,11 @@ class OnDeviceSearch(SearchQueryParser): # {{{
else:
m = matchkind

if locvalue == 'collections':
vals = accessor(row).split(',')
else:
vals = [accessor(row)]
vals = accessor(row)
if vals is None:
vals = ''
if isinstance(vals, basestring):
vals = vals.split(',') if locvalue == 'collections' else [vals]
if _match(query, vals, m, use_primary_find_in_search=upf):
matches.add(index)
break
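The OnDeviceSearch hunks above add 'tags' as a searchable location (its accessor returns a list, or [] when the attribute is missing) and replace the collections-only branch with a general normalization: whatever the accessor returns is coerced to a list of strings before matching. A sketch of that normalization as a standalone function (the original uses basestring because calibre still ran on Python 2):

def normalize_vals(vals, locvalue):
    if vals is None:
        vals = ''
    if isinstance(vals, str):  # basestring in the original
        vals = vals.split(',') if locvalue == 'collections' else [vals]
    return vals

print(normalize_vals('a,b', 'collections'))       # ['a', 'b']
print(normalize_vals(['fiction', 'sf'], 'tags'))  # lists pass through unchanged
print(normalize_vals(None, 'title'))              # ['']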
@ -21,7 +21,7 @@ from calibre.constants import ispy3, plugins, cache_dir
from calibre.gui2 import NONE
from calibre.gui2.widgets2 import HistoryLineEdit2
from calibre.gui2.tweak_book import tprefs
from calibre.gui2.tweak_book.editor.insert_resource import Dialog
from calibre.gui2.tweak_book.widgets import Dialog

if not ispy3:
if sys.maxunicode >= 0x10FFFF:

@ -10,11 +10,11 @@ import sys, os
from functools import partial

from PyQt4.Qt import (
QDialog, QGridLayout, QDialogButtonBox, QSize, QListView, QStyledItemDelegate,
QLabel, QPixmap, QApplication, QSizePolicy, QAbstractListModel, QVariant,
Qt, QRect, QPainter, QModelIndex, QSortFilterProxyModel, QLineEdit,
QToolButton, QIcon, QFormLayout, pyqtSignal, QTreeWidget, QTreeWidgetItem,
QVBoxLayout, QMenu, QInputDialog)
QGridLayout, QSize, QListView, QStyledItemDelegate, QLabel, QPixmap,
QApplication, QSizePolicy, QAbstractListModel, QVariant, Qt, QRect,
QPainter, QModelIndex, QSortFilterProxyModel, QLineEdit, QToolButton,
QIcon, QFormLayout, pyqtSignal, QTreeWidget, QTreeWidgetItem, QVBoxLayout,
QMenu, QInputDialog)

from calibre import fit_image
from calibre.constants import plugins
@ -23,43 +23,11 @@ from calibre.ebooks.metadata.book.base import Metadata
from calibre.gui2 import NONE, choose_files, error_dialog
from calibre.gui2.languages import LanguagesEdit
from calibre.gui2.tweak_book import current_container, tprefs
from calibre.gui2.tweak_book.widgets import Dialog
from calibre.gui2.tweak_book.file_list import name_is_ok
from calibre.utils.localization import get_lang, canonicalize_lang
from calibre.utils.icu import sort_key

class Dialog(QDialog):

def __init__(self, title, name, parent=None):
QDialog.__init__(self, parent)
self.setWindowTitle(title)
self.name = name
self.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
self.bb.accepted.connect(self.accept)
self.bb.rejected.connect(self.reject)

self.setup_ui()

self.resize(self.sizeHint())
geom = tprefs.get(name + '-geometry', None)
if geom is not None:
self.restoreGeometry(geom)
if hasattr(self, 'splitter'):
state = tprefs.get(name + '-splitter-state', None)
if state is not None:
self.splitter.restoreState(state)

def accept(self):
tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
if hasattr(self, 'splitter'):
tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
QDialog.accept(self)

def reject(self):
tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
if hasattr(self, 'splitter'):
tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
QDialog.reject(self)

class ChooseName(Dialog): # {{{

''' Chooses the filename for a newly imported file, with error checking '''
src/calibre/gui2/tweak_book/widgets.py (new file, 48 lines)
@ -0,0 +1,48 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)

__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

from PyQt4.Qt import (QDialog, QDialogButtonBox)

from calibre.gui2.tweak_book import tprefs

class Dialog(QDialog):

def __init__(self, title, name, parent=None):
QDialog.__init__(self, parent)
self.setWindowTitle(title)
self.name = name
self.bb = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
self.bb.accepted.connect(self.accept)
self.bb.rejected.connect(self.reject)

self.setup_ui()

self.resize(self.sizeHint())
geom = tprefs.get(name + '-geometry', None)
if geom is not None:
self.restoreGeometry(geom)
if hasattr(self, 'splitter'):
state = tprefs.get(name + '-splitter-state', None)
if state is not None:
self.splitter.restoreState(state)

def accept(self):
tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
if hasattr(self, 'splitter'):
tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
QDialog.accept(self)

def reject(self):
tprefs.set(self.name + '-geometry', bytearray(self.saveGeometry()))
if hasattr(self, 'splitter'):
tprefs.set(self.name + '-splitter-state', bytearray(self.splitter.saveState()))
QDialog.reject(self)

def setup_ui(self):
raise NotImplementedError('You must implement this method in Dialog subclasses')
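The new widgets.py module extracts the Dialog base class (previously defined inline in insert_resource.py, as the earlier hunk shows) so any tweak_book dialog can inherit the geometry and splitter-state persistence. A minimal sketch of a subclass; the class name and layout are illustrative, only the Dialog(title, name, parent) signature and the setup_ui hook come from the file above:

from PyQt4.Qt import QLabel, QVBoxLayout

from calibre.gui2.tweak_book.widgets import Dialog

class ExampleDialog(Dialog):  # hypothetical subclass, not part of this commit

    def __init__(self, parent=None):
        # 'example-dialog' becomes the tprefs key prefix for saved geometry
        Dialog.__init__(self, 'Example', 'example-dialog', parent)

    def setup_ui(self):
        # Called by Dialog.__init__ before the saved geometry is restored
        self.l = QVBoxLayout(self)
        self.l.addWidget(QLabel('Hello from a Dialog subclass'))
        self.l.addWidget(self.bb)  # OK/Cancel box created by the base class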