diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 8d51f4c6af..d044be24b6 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -766,8 +766,8 @@ class Manifest(object):
             data = self.oeb.html_preprocessor(data)
 
         # Remove DOCTYPE declaration as it messes up parsing
-        # Inparticular it causes a tostring to insert xmlns
-        # declarations, which messes up the coesrcing logic
+        # In particular it causes tostring to insert xmlns
+        # declarations, which messes up the coercing logic
         idx = data.find('<html')
         if idx > -1:
             pre = data[:idx]
diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index 51f0000605..78d22fef00 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -15,7 +15,7 @@ recipe_modules = ['recipe_' + r for r in (
     'demorgen_be', 'de_standaard', 'ap', 'barrons', 'chr_mon', 'cnn', 'faznet',
     'jpost', 'jutarnji', 'nasa', 'reuters', 'spiegelde', 'wash_post', 'zeitde',
     'blic', 'novosti', 'danas', 'vreme', 'times_online', 'the_scotsman',
-    'nytimes_sub', 'security_watch', 'cyberpresse', 'st_petersburg_times',
+    'nytimes_sub', 'nytimes', 'security_watch', 'cyberpresse', 'st_petersburg_times',
     'clarin', 'financial_times', 'heise', 'le_monde', 'harpers', 'science_aas',
     'science_news', 'the_nation', 'lrb', 'harpers_full', 'liberation',
     'linux_magazine', 'telegraph_uk', 'utne', 'sciencedaily', 'forbes',
diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py
index 9276ad667a..bd150bffcf 100644
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -1,110 +1,241 @@
 #!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 '''
-mobile.nytimes.com
+nytimes.com
 '''
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from lxml import html
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
 
-class NYTimesMobile(BasicNewsRecipe):
-
-    title = 'The New York Times'
-    __author__ = 'Kovid Goyal'
+class NYTimes(BasicNewsRecipe):
+
+    title       = 'NYTimes Top Stories'
+    __author__  = 'Greg Riker'
     language = _('English')
-    description = 'Daily news from the New York Times (mobile version)'
-    timefmt = ' [%a, %d %b, %Y]'
-    multithreaded_fetch = True
-    max_articles_per_feed = 15
+    description = 'Top Stories from the New York Times'
+    #max_articles_per_feed = 3
+    timefmt = ''
+    needs_subscription = False
+    remove_tags_before = dict(id='article')
+    remove_tags_after  = dict(id='article')
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink', 'clearfix']}),
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
+                   dict(name=['script', 'noscript', 'style'])]
+    encoding = 'cp1252'
     no_stylesheets = True
-    extra_css = '''
-        .h1 { font-size: x-large; font-weight: bold; font-family: sans-serif; text-align: left }
-        .h2 { font-size: large; font-weight: bold }
-        .credit { font-size: small }
-        .aut { font-weight: bold }
-        .bodycontent { font-family: serif }
-    '''
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['banner center', 'greyBackBlackTop', 'c bB']}),
-        dict(name='a', href='/main')
-    ]
-    remove_tags_after = [
-        dict(name='a', attrs={'name': 'bottom'})
-    ]
-
-    def image_url_processor(self, baseurl, url):
-        return re.sub(r'(&|&amp;).*', '', url)
-
-    def get_browser(self):
-        return BasicNewsRecipe.get_browser(mobile_browser=True)
-
-    def download(self, for_lrf=False):
-        if for_lrf:
-            self.max_articles_per_feed = 10
-        return BasicNewsRecipe.download(self, for_lrf=for_lrf)
-
-    def process_section(self, href):
-        raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-        articles = []
-        while True:
-            root = html.fromstring(raw)
-            for art in self.find_articles(root):
-                append = True
-                for x in articles:
-                    if x['title'] == art['title']:
-                        append = False
-                        break
-                if append: articles.append(art)
-            more = root.xpath('//a[starts-with(@href, "section") and contains(text(), "MORE")]')
-            if not more:
-                break
-            href = more[0].get('href')
-            raw = self.index_to_soup('http://mobile.nytimes.com/section'+href[href.find('?'):], raw=True)
-        return articles
-
-
-    def find_articles(self, root):
-        for a in root.xpath('//a[@accesskey]'):
-            href = a.get('href')
-            if href.startswith('http://'):
-                url = href
-            else:
-                url = 'http://mobile.nytimes.com/article' + href[href.find('?'):]+'&single=1',
-            yield {
-                'title': a.text.strip(),
-                'date' : '',
-                'url'  : url,
-                'description': '',
-            }
-
-
+    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
+    extra_css = '.headline {text-align:left;}\n\
+                 .byline {font:monospace; margin-bottom:0px;}\n\
+                 .source {align:left;}\n\
+                 .credit {align:right;}\n'
+
+
+    # If True, all articles are collected into a single 'All Top Stories' feed;
+    # otherwise a separate feed is created for each section
+    flatPeriodical = True
+
     def parse_index(self):
-        raw = self.index_to_soup('http://mobile.nytimes.com', raw=True)
-        root = html.fromstring(raw)
-        feeds = [('Latest news', list(self.find_articles(root)))]
-
-        for a in root.xpath('//a[starts-with(@href, "section")]'):
-            title = a.text.replace('&raquo;', '').replace(u'\xbb', '').strip()
-            print 'Processing section:', title
-            articles = self.process_section(a.get('href'))
-            feeds.append((title, articles))
-
-        return feeds
-
-    def postprocess_html(self, soup, first_fetch):
-        for img in soup.findAll('img', width=True):
-            try:
-                width = int(img['width'].replace('px', ''))
-                if width < 5:
-                    img.extract()
-                    continue
-            except:
-                pass
-            del img['width']
-            del img['height']
-            del img.parent['style']
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+        articles = {}
+
+        ans = []
+        if self.flatPeriodical :
+            feed = key = 'All Top Stories'
+            articles[key] = []
+            ans.append(key)
+        else :
+            key = None
+
+        sections = { 'topstories'   : 'Top Stories',
+                     'world'        : 'World',
+                     'us'           : 'U.S.',
+                     'politics'     : 'Politics',
+                     'business'     : 'Business',
+                     'technology'   : 'Technology',
+                     'sports'       : 'Sports',
+                     'arts'         : 'Arts',
+                     'newyorkregion': 'New York/Region',
+                     'travel'       : 'Travel',
+                     'editorials'   : 'Editorials',
+                     'oped'         : 'Op-Ed'
+                   }
+
+        #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
+        excludeSectionKeywords = []
+
+        # Fetch the outer table
+        table = soup.find('table')
+        previousTable = table
+        contentTable = None
+
+        # Find the deepest table containing the stories
+        while True :
+            table = table.find('table')
+            if table.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "*********** dropping one level deeper **************")
+                previousTable = table
+                continue
+            else :
+                if self.verbose > 2 : self.log( "found table with top stories")
+                table = previousTable
+                if self.verbose > 2 : self.log( "lowest table containing 'top stories start:\n%s" % table)
+                break
+
+        # There are multiple subtables, find the one containing the stories
+        for block in table.findAll('table') :
+            if block.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "found subtable with top stories")
+                table = block
+                if self.verbose > 2 : self.log( "lowest subtable containing 'top stories start:\n%s" % table)
+                break
+            else :
+                if self.verbose > 2 : self.log( "trying next subtable")
+                continue
+
+        # Again there are multiple subtables, find the one containing the stories
+        for storyblock in table.findAll('table') :
+            if storyblock.find(text=re.compile('top stories start')) :
+                if self.verbose > 2 : self.log( "found subsubtable with top stories\n" )
+                # table = storyblock
+                if self.verbose > 2 : self.log( "\nlowest subsubtable containing 'top stories start:\n%s" % storyblock)
+                break
+            else :
+                if self.verbose > 2 : self.log( "trying next subsubtable")
+                continue
+
+        skipThisSection = False
+
+        # Within this table are <font face=...> entries
+        for tr in storyblock.findAllNext('tr'):
+            if tr.find('span') is not None :
+
+                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
+                                                            'times new roman,times, sans serif',
+                                                            'times new roman, times, sans serif']})
+                if self.verbose > 2 : self.log( "----------- new tr ----------------")
+                section = None
+                bylines = []
+                descriptions = []
+                pubdate = None
+
+                # Get the Section title
+                for (x,i) in enumerate(sectionblock.contents) :
+                    skipThisSection = False
+                    # Extract the section title
+                    if ('Comment' in str(i.__class__)) :
+                        if 'start(name=' in i :
+                            section = i[i.find('=')+1:-2]
+                            if self.verbose > 2 : self.log( "sectionTitle: %s" % sections[section])
+
+                            # Check for excluded section
+                            if len(excludeSectionKeywords):
+                                key = sections[section]
+                                excluded = re.compile('|'.join(excludeSectionKeywords))
+                                if excluded.search(key) or articles.has_key(key):
+                                    if self.verbose > 2 : self.log("Skipping section %s" % key)
+                                    skipThisSection = True
+                                    break
+
+                            if not self.flatPeriodical :
+                                articles[key] = []
+                                ans.append(key)
+
+                # Get the bylines and descriptions
+                if not skipThisSection :
+                    for (x,i) in enumerate(sectionblock.contents) :
+
+                        # Extract the bylines and descriptions
+                        if (i.string is not None) and \
+                           (i.string.strip() > "") and \
+                           not ('Comment' in str(i.__class__)) :
+
+                            contentString = i.strip().encode('utf-8')
+                            if contentString[0:3] == 'By ' :
+                                bylines.append(contentString)
+                            else :
+                                descriptions.append(contentString)
+
+                    # Fetch the article titles and URLs
+                    articleCount = len(sectionblock.findAll('span'))
+                    for (i,span) in enumerate(sectionblock.findAll('span')) :
+                        a = span.find('a', href=True)
+                        #if not a:
+                            #continue
+                        url = re.sub(r'\?.*', '', a['href'])
+                        url += '?pagewanted=all'
+                        title = self.tag_to_string(a, use_alt=True)
+                        if self.flatPeriodical :
+                            # prepend the section name
+                            title = sections[section] + " : " + title
+                        if not isinstance(title, unicode):
+                            title = title.decode('utf-8', 'replace')
+                        description = descriptions[i]
+                        if len(bylines) == articleCount :
+                            author = bylines[i]
+                        else :
+                            author = None
+
+                        if self.verbose > 2 : self.log( "      title: %s" % title)
+                        if self.verbose > 2 : self.log( "        url: %s" % url)
+                        if self.verbose > 2 : self.log( "     author: %s" % author)
+                        if self.verbose > 2 : self.log( "description: %s" % description)
+
+                        if not self.flatPeriodical :
+                            feed = key
+
+                        if not articles.has_key(feed):
+                            if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
+                            articles[feed] = []
+                        if self.verbose > 2 : self.log( "     adding: %s to articles[%s]\n" % (title, feed))
+                        articles[feed].append(
+                            dict(title=title, url=url, date=pubdate,
+                                 description=description, author=author, content=''))
+
+        ans = self.sort_index_by(ans, {'Top Stories':-1})
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        #sys.exit(1)
+
+        return ans
+
+    def postprocess_html(self, soup, first_fetch):
+        if self.verbose > 2 : self.log(" ********** recipe.postprocess_html ********** ")
+
+        # Change captions to italic -1
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            emTag = Tag(soup, "em")
+            #emTag['class'] = "caption"
+            #emTag['font-size-adjust'] = "-1"
+            emTag.insert(0, caption.contents[0])
+            hrTag = Tag(soup, 'hr')
+            emTag.insert(1, hrTag)
+            caption.replaceWith(emTag)
+
+        # Change <h1> to <h2>
+        headline = soup.div.div.div.div.div.h1.nyt_headline
+        tag = Tag(soup, "h2")
+        tag['class'] = "headline"
+        tag.insert(0, headline.contents[0])
+        soup.h1.replaceWith(tag)
+        return soup
+
+    def postprocess_book(self, oeb, opts, log) :
+        log( " ********** recipe.postprocess_book ********** ")
+        log( list(oeb.toc) )
+        log( "oeb: %s" % oeb.toc)
+        log( "opts: %s" % opts.verbose)
+        for section in oeb.toc :
+            log( "section:")
+            for articleTOC in section:
+                log( "      title: %s" % articleTOC.title)
+                log( "     author: %s" % articleTOC.author)
+                log( "description: %s" % articleTOC.description)
+                log( "       href: %s" % articleTOC.href)
+                log( "    content: %s" % oeb.manifest.hrefs[articleTOC.href])
+        return
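
Note for reviewers (not part of the patch): the dicts appended in parse_index() above must conform to the index structure BasicNewsRecipe expects back from parse_index(), a list of (feed title, article list) tuples. A minimal sketch of that shape, with purely illustrative field values:

    # Illustrative only: the shape of the value returned by parse_index().
    # Each feed is a (title, articles) tuple; each article is a dict.
    feeds = [
        ('All Top Stories', [
            {
                'title'      : 'World : Example headline',  # section name prefixed when flatPeriodical
                'url'        : 'http://www.nytimes.com/2008/12/01/world/example.html?pagewanted=all',
                'date'       : None,           # pubdate is never populated by this recipe
                'description': 'Teaser text scraped from the headlines page',
                'author'     : 'By JANE DOE',  # set only when byline count matches article count
                'content'    : '',
            },
        ]),
    ]

The {'Top Stories':-1} weighting passed to sort_index_by() only has an effect when flatPeriodical is False, since with a flat periodical the single 'All Top Stories' feed is returned as-is.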