diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe index 32e5a4825e..3b9d2858e6 100644 --- a/resources/recipes/nytimes.recipe +++ b/resources/recipes/nytimes.recipe @@ -5,7 +5,8 @@ __copyright__ = '2008, Kovid Goyal ' ''' nytimes.com ''' -import re, time +import re +import time from calibre import entity_to_unicode from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment @@ -14,7 +15,7 @@ class NYTimes(BasicNewsRecipe): title = 'New York Times Top Stories' __author__ = 'GRiker' - language = 'en' + language = _('English') description = 'Top Stories from the New York Times' # List of sections typically included in Top Stories. Use a keyword from the @@ -39,9 +40,6 @@ class NYTimes(BasicNewsRecipe): 'world' : 'World' } - # By default, no sections are skipped. - excludeSectionKeywords = [] - # Add section keywords from the right column above to skip that section # For example, to skip sections containing the word 'Sports' or 'Dining', use: # excludeSectionKeywords = ['Sports', 'Dining'] @@ -49,36 +47,138 @@ class NYTimes(BasicNewsRecipe): # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World'] # Fetch only Top Stories # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World'] + # By default, no sections are skipped. + excludeSectionKeywords = [] + + # one_picture_per_article specifies that calibre should only use the first image + # from an article (if one exists). If one_picture_per_article = True, the image + # will be moved to a location between the headline and the byline. + # If one_picture_per_article = False, all images from the article will be included + # and shown in their original location. 
+ one_picture_per_article = True # The maximum number of articles that will be downloaded max_articles_per_feed = 40 timefmt = '' needs_subscription = True - keep_only_tags = [ dict(attrs={ 'id':['article']}), - dict(attrs={'class':['blog wrap']}) ] + masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix', - 'inlineVideo left brightcove', 'entry-meta']}), - dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles', - 'portfolioInline','articleInline','readerscomment', - 'nytRating']}) ] + remove_tags_before = dict(id='article') + remove_tags_after = dict(id='article') + remove_tags = [dict(attrs={'class':[ + 'articleFooter', + 'articleTools', + 'columnGroup doubleRule', + 'columnGroup singleRule', + 'columnGroup last', + 'columnGroup last', + 'doubleRule', + 'dottedLine', + 'entry-meta', + 'icon enlargeThis', + 'leftNavTabs', + 'module box nav', + 'nextArticleLink', + 'nextArticleLink clearfix', + 'post-tools', + 'relatedSearchesModule', + 'side_tool', + 'singleAd', + 'subNavigation tabContent active clearfix', + ]}), + dict(id=[ + 'adxLeaderboard', + 'archive', + 'articleExtras', + 'articleInline', + 'blog_sidebar', + 'cCol', + 'entertainmentSearchBar', + 'footer', + 'header', + 'header_search', + 'login', + 'masthead', + 'memberTools', + 'navigation', + 'portfolioInline', + 'relatedArticles', + 'side_search', + 'side_index', + 'side_tool', + 'toolsRight', + ]), + dict(name=['script', 'noscript', 'style'])] - encoding = 'cp1252' no_stylesheets = True extra_css = '.headline {text-align: left;}\n \ .byline {font-family: monospace; \ text-align: left; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .dateline {font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ + .timestamp {font-size: small; \ + margin-top: 0px; \ margin-bottom: 0px;}\n \ - .timestamp {font-size: smaller;}\n \ .source {text-align: left;}\n \ .image {text-align: center;}\n \ 
.credit {text-align: right; \ - font-size: smaller;}\n \ + font-size: small; \ + margin-top: 0px; \ + margin-bottom: 0px;}\n \ .articleBody {text-align: left;}\n \ .authorId {text-align: left; \ font-style: italic;}\n ' + def dump_ans(self, ans) : + total_article_count = 0 + for section in ans : + if self.verbose: + self.log("section %s: %d articles" % (section[0], len(section[1])) ) + for article in section[1]: + total_article_count += 1 + if self.verbose: + self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), + article['url'].encode('cp1252','replace'))) + self.log( "Queued %d articles" % total_article_count ) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + + return fixed + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + try: + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + br.submit() + except: + self.log("\nFailed to login") + return br + def get_cover_url(self): cover = None st = time.localtime() @@ -94,26 +194,6 @@ class NYTimes(BasicNewsRecipe): cover = None return cover - def get_masthead_url(self): - masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - br = BasicNewsRecipe.get_browser() - try: - br.open(masthead) - except: - self.log("\nCover unavailable") - masthead = None - return masthead - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open('http://www.nytimes.com/auth/login') - 
br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - br.submit() - return br - def index_to_soup(self, url_or_raw, raw=False): ''' OVERRIDE of class method @@ -138,6 +218,7 @@ class NYTimes(BasicNewsRecipe): return BeautifulSoup(_raw, markupMassage=massage) # Entry point + print "index_to_soup()" soup = get_the_soup( self.encoding, url_or_raw ) contentType = soup.find(True,attrs={'http-equiv':'Content-Type'}) docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')] @@ -151,6 +232,16 @@ class NYTimes(BasicNewsRecipe): return soup + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + def parse_index(self): articles = {} ans = [] @@ -158,12 +249,14 @@ class NYTimes(BasicNewsRecipe): feed = key = 'All Top Stories' articles[key] = [] ans.append(key) + self.log("Scanning 1 section ...") soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/') # Fetch the outer table table = soup.find('table') previousTable = table + contentTable = None # Find the deepest table containing the stories while True : @@ -191,8 +284,9 @@ class NYTimes(BasicNewsRecipe): continue skipThisSection = False - + todays_article_count = 0 # Within this table are entries + self.log("Fetching feed Top Stories") for tr in storyblock.findAllNext('tr'): if tr.find('span') is not None : @@ -244,6 +338,7 @@ class NYTimes(BasicNewsRecipe): # Fetch the article titles and URLs articleCount = len(sectionblock.findAll('span')) + todays_article_count += articleCount for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) : a = span.find('a', href=True) url = re.sub(r'\?.*', '', 
a['href']) @@ -277,6 +372,7 @@ class NYTimes(BasicNewsRecipe): if duplicateFound: # Continue fetching, don't add this article + todays_article_count -= 1 continue if not articles.has_key(feed): @@ -284,11 +380,138 @@ class NYTimes(BasicNewsRecipe): articles[feed].append( dict(title=title, url=url, date=pubdate, description=description, author=author, content='')) +# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories")) ans = self.sort_index_by(ans, {'Top Stories':-1}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + self.dump_ans(ans) return ans + def preprocess_html(self, soup): + return self.strip_anchors(soup) + + def postprocess_html(self,soup, True): + + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg after headline + cgFirst = soup.find(True, {'class':'columnGroup first'}) + if cgFirst: + # Strip all sibling NavigableStrings: noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + # Change class="kicker" to
<h3></h3>
+ kicker = soup.find(True, {'class':'kicker'}) + if kicker and kicker.contents[0]: + h3Tag = Tag(soup, "h3") + h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker, + use_alt=False))) + kicker.replaceWith(h3Tag) + + # Change captions to italic -1 + for caption in soup.findAll(True, {'class':'caption'}) : + if caption and caption.contents[0]: + emTag = Tag(soup, "em") + c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() + mp_off = c.find("More Photos") + if mp_off >= 0: + c = c[:mp_off] + emTag.insert(0, c) + hrTag = Tag(soup, 'hr') + #hrTag['style'] = "margin-top:0em;margin-bottom:0em" + emTag.insert(1, hrTag) + caption.replaceWith(emTag) + + # Change to

+ h1 = soup.find('h1') + if h1: + headline = h1.find("nyt_headline") + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + h1.replaceWith(tag) + else: + # Blog entry - replace headline, remove
tags + headline = soup.find('title') + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + soup.insert(0, tag) + hrs = soup.findAll('hr') + for hr in hrs: + hr.extract() + + # Change
<h1> to <h3>
- used in editorial blogs + masthead = soup.find("h1") + if masthead: + # Nuke the href + if masthead.a: + del(masthead.a['href']) + tag = Tag(soup, "h3") + tag.insert(0, self.fixChars(masthead.contents[0])) + masthead.replaceWith(tag) + + # Change to + for subhead in soup.findAll(True, {'class':'bold'}) : + if subhead.contents: + bTag = Tag(soup, "b") + bTag.insert(0, subhead.contents[0]) + subhead.replaceWith(bTag) + + # Synthesize a section header + dsk = soup.find('meta', attrs={'name':'dsk'}) + if dsk and dsk.has_key('content'): + hTag = Tag(soup,'h3') + hTag['class'] = 'section' + hTag.insert(0,NavigableString(dsk['content'])) + articleTag = soup.find(True, attrs={'id':'article'}) + if articleTag: + articleTag.insert(0,hTag) + + # Add class="articleBody" to
so we can format with CSS + divTag = soup.find('div',attrs={'id':'articleBody'}) + if divTag: + divTag['class'] = divTag['id'] + + # Add class="authorId" to
so we can format with CSS + divTag = soup.find('div',attrs={'id':'authorId'}) + if divTag and divTag.contents[0]: + tag = Tag(soup, "p") + tag['class'] = "authorId" + tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], + use_alt=False))) + divTag.replaceWith(tag) + + return soup + def strip_anchors(self,soup): paras = soup.findAll(True) for para in paras: @@ -297,94 +520,3 @@ class NYTimes(BasicNewsRecipe): if a.img is None: a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - - def preprocess_html(self, soup): -# refresh = soup.find('meta', {'http-equiv':'refresh'}) -# if refresh is None: -# return self.strip_anchors(soup) -# -# content = refresh.get('content').partition('=')[2] -# raw = self.browser.open('http://www.nytimes.com'+content).read() -# soup = BeautifulSoup(raw.decode('cp1252', 'replace')) - return self.strip_anchors(soup) - refresh = soup.find('meta', {'http-equiv':'refresh'}) - if refresh is not None: - content = refresh.get('content').partition('=')[2] - raw = self.browser.open('http://www.nytimes.com'+content).read() - soup = BeautifulSoup(raw.decode('cp1252', 'replace')) - - soup = self.strip_anchors(soup) - - # Test for empty content - body = soup.find('body') - tagCount = len(body.findAll(True)) - if tagCount: -# print "%d tags in article" % tagCount - return soup - else: - print "no allowed content found, removing article" - raise Exception - - def postprocess_html(self,soup, True): - - # Change class="kicker" to
<h3></h3>
- kicker = soup.find(True, {'class':'kicker'}) - if kicker is not None : - h3Tag = Tag(soup, "h3") - h3Tag.insert(0, kicker.contents[0]) - kicker.replaceWith(h3Tag) - - # Change captions to italic -1 - for caption in soup.findAll(True, {'class':'caption'}) : - if caption is not None: - emTag = Tag(soup, "em") - emTag.insert(0, caption.contents[0]) - hrTag = Tag(soup, 'hr') - emTag.insert(1, hrTag) - caption.replaceWith(emTag) - - # Change to

- headline = soup.find("nyt_headline") - if headline is not None : - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, headline.contents[0]) - soup.h1.replaceWith(tag) - - # Change
<h1> to <h3>
- used in editorial blogs - masthead = soup.find("h1") - if masthead is not None : - # Nuke the href - if masthead.a is not None : - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, masthead.contents[0]) - soup.h1.replaceWith(tag) - - # Change to - for subhead in soup.findAll(True, {'class':'bold'}) : - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - - # Synthesize a section header - dsk = soup.find('meta', attrs={'name':'dsk'}) - if dsk is not None and dsk.has_key('content'): - hTag = Tag(soup,'h3') - hTag['class'] = 'section' - hTag.insert(0,NavigableString(dsk['content'])) - articleTag = soup.find(True, attrs={'id':'article'}) - articleTag.insert(0,hTag) - - # Add class="articleBody" to
so we can format with CSS - divTag = soup.find('div',attrs={'id':'articleBody'}) - if divTag is not None : - divTag['class'] = divTag['id'] - - # Add class="authorId" to
so we can format with CSS - divTag = soup.find('div',attrs={'id':'authorId'}) - if divTag is not None : - divTag['class'] = divTag['id'] - - return soup - diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 08d658d9bc..9f2efa9f9b 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -20,7 +20,7 @@ class ANDROID(USBMS): VENDOR_ID = { 0x0bb4 : { 0x0c02 : [0x100], 0x0c01 : [0x100]}, 0x22b8 : { 0x41d9 : [0x216]}, - 0x18d1 : { 0x4e11 : [0x0100], 0x4e12: [0x0100]}, + 0x18d1 : { 0x4e11 : [0x0100, 0x226], 0x4e12: [0x0100, 0x226]}, 0x04e8 : { 0x681d : [0x0222]}, } EBOOK_DIR_MAIN = ['wordplayer/calibretransfer', 'eBooks/import', 'Books']