diff --git a/Changelog.yaml b/Changelog.yaml
index d0c3478ba3..41ef499723 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -4,6 +4,137 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.
+- version: 0.7.25
+  date: 2010-10-29
+
+  new features:
+    - title: "Add support for the SONY periodical format."
+      description: "This means that news downloaded by calibre and sent to a newer SONY device (350/650/900) should appear in the Periodicals section and have the special periodicals navigation user interface"
+      type: major
+
+    - title: "Content server: Make the new browsing interface the default. The old interface can be accessed at /old"
+
+    - title: "Content server: Allow running of the content server as a WSGI application within another server. Add a tutorial for this to the User Manual."
+
+    - title: "Support for the Pico Life reader, Kobo Wifi and HTC Aria"
+
+    - title: "Content server: Add a new --url-prefix command line option to ease the use of the server with a reverse proxy"
+
+    - title: "New social metadata plugin for Amazon that does not rely on AWS. Since Amazon broke AWS, it is recommended you upgrade to this version if you use metadata from Amazon"
+
+    - title: "Add a tweak to specify the fonts used when generating the default cover"
+
+    - title: "Add an output profile for generic Tablet devices"
+      tickets: [7289]
+
+    - title: "SONY driver: Allow sorting of collections by arbitrary field via a new tweak."
+
+    - title: "Content server: Make /mobile a little prettier"
+
+    - title: "Add a button to 'Library Check' to automatically delete spurious files and folders"
+
+  bug fixes:
+    - title: "FB2 Input: Lots of love. Handle stylesheets and style attributes. Make parsing of malformed FB2 files more robust."
+      tickets: [7219, 7230]
+
+    - title: "Fix auto send of news to device with multiple calibre libraries. The fix means that if you have any pending news to be sent, it will be ignored after the update. Future news downloads will once again be automatically sent to the device."
+
+    - title: "MOBI Output: Conversion of super/sub scripts now handles nested tags."
+      tickets: [7264]
+
+    - title: "Conversion pipeline: Fix parsing of XML encoding declarations."
+      tickets: [7328]
+
+    - title: "Pandigital (Kobo): Upload thumbnails to the correct location"
+      tickets: [7165]
+
+    - title: "Fix auto emailed news with non-ASCII characters in the title not being delivered to Kindle"
+      tickets: [7322]
+
+    - title: "Read metadata only after on-import plugins have run when adding books to the GUI"
+      tickets: [7245]
+
+    - title: "Various fixes for bugs caused by non-ASCII temporary paths on Windows with non-UTF-8 filesystem encodings"
+      tickets: [7288]
+
+    - title: "Various fixes/enhancements to SNB Output"
+
+    - title: "Allow the Tag editor in the edit metadata dialog to be used even if tags have been changed"
+      tickets: [7298]
+
+    - title: "Fix crash on some OS X machines when Preferences->Conversion->Output is clicked"
+
+    - title: "MOBI indexing: Fix last entry sometimes missing"
+      tickets: [6595]
+
+    - title: "Fix regression causing books to be deselected after sending to device"
+      tickets: [7271]
+
+    - title: "Conversion pipeline: Fix rescaling of GIF images not working"
+      tickets: [7306]
+
+    - title: "Update PDF metadata/conversion libraries in the Windows build"
+
+    - title: "Fix timezone bug when searching on date fields"
+      tickets: [7300]
+
+    - title: "Fix regression that caused the viewer to crash if the main application is closed"
+      tickets: [7276]
+
+    - title: "Fix bug causing a spurious metadata.opf file to be written at the root of the calibre library when adding books"
+
+    - title: "Use the same title casing algorithm in all places"
+
+    - title: "Fix bulk edit of dual-state boolean custom columns"
+
+    - title: "Increase image size for comics in the Kindle DX profile for better conversion of comics to PDF"
+
+    - title: "Fix database restore so that it does not die when conflicting custom columns are encountered, and report conflicting column errors. Fix exceptions when referencing invalid _index fields."
+
+    - title: "Fix auto merge of books not respecting the article sort tweak"
+      tickets: [7147]
+
+    - title: "Linux device drivers: Fix udisks-based ejecting for devices with multiple nodes"
+
+    - title: "Linux device mounting: Mount the drive with the lowest kernel name as main memory"
+
+    - title: "Fix use of numeric fields in templates"
+
+    - title: "EPUB Input: Handle EPUB files with multiple OPF files."
+      tickets: [7229]
+
+    - title: "Setting EPUB metadata: Fix date format. Fix language being overwritten by und when unspecified.
+      Fix empty ISBN identifier being created"
+
+    - title: "Fix inability to delete a Series listing from the List view. Also dismiss the fetch metadata dialog when no metadata is found automatically"
+      tickets: [7221, 7220]
+
+    - title: "Content server: Handle switching libraries in the GUI gracefully"
+
+    - title: "calibre-server: Use the cherrypy implementation of --pidfile and --daemonize"
+
+  new recipes:
+    - title: "Ming Pao"
+      author: "Eddie Lau"
+
+    - title: "lenta.ru"
+      author: "Nikolai Kotchetkov"
+
+    - title: "frazpc.pl"
+      author: "Tomasz Dlugosz"
+
+    - title: "Perfil and The Economic Collapse Blog"
+      author: "Darko Miletic"
+
+    - title: "STNN"
+      author: "Larry Chan"
+
+  improved recipes:
+    - CubaDebate
+    - El Pais
+    - Fox News
+    - New Scientist
+    - The Economic Times of India
+
 - version: 0.7.24
   date: 2010-10-17

diff --git a/resources/content_server/browse/browse.html b/resources/content_server/browse/browse.html
index 4acc15f3ea..ef312334d9 100644
--- a/resources/content_server/browse/browse.html
+++ b/resources/content_server/browse/browse.html
@@ -8,24 +8,25 @@
-
-
-
+
+
+
-
-
+
+
+        src="{prefix}/static/jquery_ui/js/jquery-ui-1.8.5.custom.min.js">
+        src="{prefix}/static/jquery.multiselect.min.js">
-
+
-
-
+
+
+
+
+
- Show first set of books Show previous set of books              Show next set of books Show last set of books + Show first set of books Show previous set of books              Show next set of books Show last set of books
@@ -38,7 +39,7 @@
- Loading... Loading… + Loading... Loading…
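The browse.html changes above thread a {prefix} placeholder through the static resource URLs, which is what lets the new --url-prefix option and the WSGI embedding announced in the changelog serve the interface from a non-root path. A minimal sketch of the idea in Python; the template snippet and render_template() helper are illustrative assumptions, not the content server's actual implementation:

    # Hypothetical sketch: fill the {prefix} placeholder so static links
    # resolve behind a reverse proxy mounted at, e.g., /calibre.
    TEMPLATE = '<script type="text/javascript" src="{prefix}/static/jquery.js"></script>'

    def render_template(template, url_prefix=''):
        # With --url-prefix /calibre every static link gains the prefix;
        # with no prefix the placeholder simply disappears.
        return template.replace('{prefix}', url_prefix)

    print(render_template(TEMPLATE, '/calibre'))
    # <script type="text/javascript" src="/calibre/static/jquery.js"></script>

Substituting at render time keeps a single set of templates working whether the server is reached directly or through a proxy.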
diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 270b7e0b06..0f570bab40 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -203,3 +203,11 @@ content_server_wont_display = [''] # level sorts, and if you are seeing a slowdown, reduce the value of this tweak. maximum_resort_levels = 5 +# Absolute path to a TTF font file to use as the font for the title and author +# when generating a default cover. Useful if the default font (Liberation +# Serif) does not contain glyphs for the language of the books in your library. +generate_cover_title_font = None + +# Absolute path to a TTF font file to use as the font for the footer in the +# default cover +generate_cover_foot_font = None diff --git a/resources/images/news/perfil.png b/resources/images/news/perfil.png new file mode 100644 index 0000000000..54c8159e48 Binary files /dev/null and b/resources/images/news/perfil.png differ diff --git a/resources/recipes/cubadebate.recipe b/resources/recipes/cubadebate.recipe index 88d06d412d..f8887b2672 100644 --- a/resources/recipes/cubadebate.recipe +++ b/resources/recipes/cubadebate.recipe @@ -1,9 +1,7 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2010, Darko Miletic ' ''' -newyorker.com +cubadebate.cu ''' from calibre.web.feeds.news import BasicNewsRecipe @@ -13,32 +11,44 @@ class CubaDebate(BasicNewsRecipe): __author__ = 'Darko Miletic' description = 'Contra el Terorismo Mediatico' oldest_article = 15 - language = 'es' - + language = 'es' max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False publisher = 'Cubadebate' category = 'news, politics, Cuba' encoding = 'utf-8' - extra_css = ' #BlogTitle{font-size: x-large; font-weight: bold} ' + masthead_url = 'http://www.cubadebate.cu/wp-content/themes/cubadebate/images/logo.gif' + publication_type = 'newsportal' + extra_css = """ + #BlogTitle{font-size: xx-large; font-weight: bold} + body{font-family: Verdana, Arial, Tahoma, sans-serif} + """ conversion_options = { 'comments' : description ,'tags' : category - ,'language' : 'es' + ,'language' : language ,'publisher' : publisher - ,'pretty_print': True } keep_only_tags = [dict(name='div', attrs={'id':'Outline'})] remove_tags_after = dict(name='div',attrs={'id':'BlogContent'}) - remove_tags = [dict(name='link')] + remove_tags = [ + dict(name=['link','base','embed','object','meta','iframe']) + ,dict(attrs={'id':'addthis_container'}) + ] feeds = [(u'Articulos', u'http://www.cubadebate.cu/feed/')] - + remove_attributes=['width','height','lang'] + def print_version(self, url): return url + 'print/' def preprocess_html(self, soup): - return self.adeify_images(soup) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('img'): + if not item.has_key('alt'): + item['alt'] = 'image' + return soup diff --git a/resources/recipes/lenta_ru.recipe b/resources/recipes/lenta_ru.recipe new file mode 100644 index 0000000000..d400bc5886 --- /dev/null +++ b/resources/recipes/lenta_ru.recipe @@ -0,0 +1,177 @@ +#!/usr/bin/env python + +''' +Lenta.ru +''' + +from calibre.web.feeds.feedparser import parse +from calibre.ebooks.BeautifulSoup import Tag +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class LentaRURecipe(BasicNewsRecipe): + title = u'Lenta.ru: \u041d\u043e\u0432\u043e\u0441\u0442\u0438' + __author__ = 'Nikolai Kotchetkov' + publisher = 'lenta.ru' + category = 'news, Russia' + description = 
u'Ежедневная интернет-газета. Новости со всего мира на русском языке'
+    oldest_article = 3
+    max_articles_per_feed = 100
+
+    masthead_url = u'http://img.lenta.ru/i/logowrambler.gif'
+    cover_url = u'http://img.lenta.ru/i/logowrambler.gif'
+
+    # Add feed names if you want them to be sorted (feeds of this list appear first)
+    sortOrder = [u'_default', u'В России', u'б.СССР', u'В мире']
+
+    encoding = 'cp1251'
+    language = 'ru'
+    no_stylesheets = True
+    remove_javascript = True
+    recursions = 0
+
+    conversion_options = {
+          'comment'  : description
+        , 'tags'     : category
+        , 'publisher': publisher
+        , 'language' : language
+    }
+
+    keep_only_tags = [dict(name='td', attrs={'class':['statya','content']})]
+
+    remove_tags_after = [dict(name='p', attrs={'class':'links'}), dict(name='div', attrs={'id':'readers-block'})]
+
+    remove_tags = [dict(name='table', attrs={'class':['vrezka','content']}), dict(name='div', attrs={'class':'b240'}), dict(name='div', attrs={'id':'readers-block'}), dict(name='p', attrs={'class':'links'})]
+
+    feeds = [u'http://lenta.ru/rss/']
+
+    extra_css = 'h1 {font-size: 1.2em; margin: 0em 0em 0em 0em;} h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;} h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'
+
+    def parse_index(self):
+        try:
+            feedData = parse(self.feeds[0])
+            if not feedData:
+                raise NotImplementedError
+            self.log("parse_index: Feed loaded successfully.")
+            if feedData.feed.has_key('title'):
+                self.title = feedData.feed.title
+                self.log("parse_index: Title updated to: ", self.title)
+            if feedData.feed.has_key('image'):
+                self.log("parse_index: Feed has an image.")
+
+            def get_virtual_feed_articles(feed):
+                if feeds.has_key(feed):
+                    return feeds[feed][1]
+                self.log("Adding new feed: ", feed)
+                articles = []
+                feeds[feed] = (feed, articles)
+                return articles
+
+            feeds = {}
+
+            # Iterate feed items and distribute articles using tags
+            for item in feedData.entries:
+                link = item.get('link', '')
+                title = item.get('title', '')
+                if '' == link or '' == title:
+                    continue
+                article = {'title':title, 'url':link, 'description':item.get('description', ''), 'date':item.get('date', ''), 'content':''}
+                if not item.has_key('tags'):
+                    get_virtual_feed_articles('_default').append(article)
+                    continue
+                # Add to the default feed at most once when a tag has no term
+                addedToDefault = False
+                for tag in item.tags:
+                    term = tag.get('term', '')
+                    if '' == term:
+                        if not addedToDefault:
+                            get_virtual_feed_articles('_default').append(article)
+                            addedToDefault = True
+                        continue
+                    get_virtual_feed_articles(term).append(article)
+
+            # Get feed list
+            # Select sorted feeds first of all
+            result = []
+            for feedName in self.sortOrder:
+                if not feeds.has_key(feedName):
+                    continue
+                result.append(feeds[feedName])
+                del feeds[feedName]
+            result = result + feeds.values()
+
+            return result
+
+        except Exception, err:
+            self.log(err)
+            raise NotImplementedError
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)
+
+    def postprocess_html(self, soup, first_fetch):
+        #self.log('Original: ', soup.prettify())
+
+        contents = Tag(soup, 'div')
+
+        # Extract tags with given attributes
+        extractElements = {'div' : [{'id' : 'readers-block'}]}
+
+        # Remove all elements that were not extracted before
+        for tag, attrs in 
extractElements.iteritems(): + for attr in attrs: + garbage = soup.findAll(tag, attr) + if garbage: + for pieceOfGarbage in garbage: + pieceOfGarbage.extract() + + #Find article text using header + #and add all elements to contents + element = soup.find({'h1' : True, 'h2' : True}) + if (element): + element.name = 'h1' + while element: + nextElement = element.nextSibling + element.extract() + contents.insert(len(contents.contents), element) + element = nextElement + + #Place article date after header + dates = soup.findAll(text=re.compile('\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}')) + if dates: + for date in dates: + for string in date: + parent = date.parent + if (parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == parent['class']): + #Date div found + parent.extract() + parent['style'] = 'font-size: 0.5em; color: gray; font-family: monospace;' + contents.insert(1, parent) + break + + #Place article picture after date + pic = soup.find('img') + if pic: + picDiv = Tag(soup, 'div') + picDiv['style'] = 'width: 100%; text-align: center;' + pic.extract() + picDiv.insert(0, pic) + title = pic.get('title', None) + if title: + titleDiv = Tag(soup, 'div') + titleDiv['style'] = 'font-size: 0.5em;' + titleDiv.insert(0, title) + picDiv.insert(1, titleDiv) + contents.insert(2, picDiv) + + body = soup.find('td', {'class':['statya','content']}) + if body: + body.replaceWith(contents) + + #self.log('Result: ', soup.prettify()) + return soup + diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index 1814132667..5452ae1c6e 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -4,149 +4,79 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' nytimes.com -V5 - One picture per article, moved to top: -Headline -Image -Byline -Story ''' -import re, string, time +import string, re, time from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +def decode(self, src): + enc = 'utf-8' + if 'iso-8859-1' in src: + enc = 'cp1252' + return src.decode(enc, 'ignore') class NYTimes(BasicNewsRecipe): - title = 'The New York Times' - __author__ = 'GRiker' + title = u'New York Times' + __author__ = 'Kovid Goyal/Nick Redding' language = 'en' - requires_version = (0, 7, 5) + requires_version = (0, 6, 36) description = 'Daily news from the New York Times (subscription version)' - allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials', - 'New York','Business Day','Science Times','Sports','Dining','Arts', - 'Home','Styles','Sunday Business','Week In Review','Travel','Magazine', - 'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion", - "T Women's Fashion"] - - # List of sections to exclude - # To add a section, copy the section name from the allSectionKeywords list above - # For example, to exclude 'Dining' and 'Weddings': - #excludeSectionKeywords = ['Dining','Weddings'] - excludeSectionKeywords = [] - - # List of sections to include (test and debug only) - # By default, any sections in today's paper that are not listed in excludeSectionKeywords - # are downloaded. fetch_only specifies that only certain sections are to be downloaded. - # This should only be used for testing and debugging. 
- # For example, to download only 'The Front Page' section: - # fetch_only = set(['The Front Page']) - fetch_only = set([]) - if fetch_only: - excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only) - - # one_picture_per_article specifies that calibre should only use the first image - # from an article (if one exists). If one_picture_per_article = True, the image - # will be moved to a location between the headline and the byline. - # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = True - - timefmt = '' + timefmt = ' [%b %d]' needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':[ - 'articleFooter', - 'articleTools', - 'columnGroup doubleRule', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - 'icon enlargeThis', - 'leftNavTabs', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - 'subNavigation clearfix', - 'subNavigation tabContent active', - 'subNavigation tabContent active clearfix', - ]}), - dict(id=[ - 'adxLeaderboard', - 'archive', - 'articleExtras', - 'articleInline', - 'blog_sidebar', - 'businessSearchBar', - 'cCol', - 'entertainmentSearchBar', - 'footer', - 'header', - 'header_search', - 'login', - 'masthead', - 'masthead-nav', - 'memberTools', - 'navigation', - 'portfolioInline', - 'relatedArticles', - 'respond', - 'side_search', - 'side_index', - 'side_tool', - 'toolsRight', - ]), - dict(name=['script', 'noscript', 'style'])] - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - cover_margins = (18,18,'grey99') + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink', + 'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta', + 'icon enlargeThis','columnGroup last','relatedSearchesModule']}), + dict({'class':re.compile('^subNavigation')}), + dict({'class':re.compile('^leaderboard')}), + dict({'class':re.compile('^module')}), + dict({'class':'metaFootnote'}), + dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead', + 'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline', + 'side_tool', 'side_index','header','readerReviewsCount','readerReviews', + 'relatedArticles', 'relatedTopics', 'adxSponLink']), + dict(name=['script', 'noscript', 'style','form','hr'])] + encoding = decode no_stylesheets = True - extra_css = '.headline {text-align: left;}\n \ - .byline {font-family: monospace; \ - text-align: left; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .dateline {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .timestamp {font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .source {text-align: left;}\n \ - .image {text-align: center;}\n \ - .credit {text-align: right; \ - font-size: small; \ - margin-top: 0px; \ - margin-bottom: 0px;}\n \ - .articleBody {text-align: left;}\n \ - .authorId {text-align: left; \ - font-style: italic;}\n ' + extra_css = ''' + .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; } + .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .byline { font-size: small; font-style:italic; line-height:1em; 
margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } + .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + .timestamp { font-size: small; } + .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } + a:link {text-decoration: none; }''' def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - try: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - raw = br.submit().read() - if 'Sorry, we could not find the combination you entered. Please try again.' in raw: - raise Exception('Your username and password are incorrect') - #open('/t/log.html', 'wb').write(raw) - except: - self.log("\nFailed to login") - + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + raw = br.submit().read() + if 'Sorry, we could not find the combination you entered. Please try again.' in raw: + raise Exception('Your username and password are incorrect') + #open('/t/log.html', 'wb').write(raw) return br + def get_masthead_url(self): + masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' + #masthead = 'http://members.cox.net/nickredding/nytlogo.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nMasthead unavailable") + masthead = None + return masthead + + def get_cover_url(self): cover = None st = time.localtime() @@ -162,316 +92,101 @@ class NYTimes(BasicNewsRecipe): cover = None return cover - def get_masthead_title(self): - return self.title - - def dump_ans(self, ans): - total_article_count = 0 - for section in ans : - if self.verbose: - self.log("section %s: %d articles" % (section[0], len(section[1])) ) - for article in section[1]: - total_article_count += 1 - if self.verbose: - self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'), - article['url'].encode('mac-roman','replace'))) - self.log( "Queued %d articles" % total_article_count ) - - def dump_hex(self, src, length=16): - ''' Diagnostic ''' - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' 
for x in range(256)]) - N=0; result='' - while src: - s,src = src[:length],src[length:] - hexa = ' '.join(["%02X"%ord(x) for x in s]) - s = s.translate(FILTER) - result += "%04X %-*s %s\n" % (N, length*3, hexa, s) - N+=length - print result - - def fixChars(self,string): - # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) - - # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) - - # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) - - # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) - - # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) - - # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) - - return fixed - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&","&", massaged) - return self.fixChars(massaged) - else: - return description + def short_title(self): + return 'New York Times' def parse_index(self): + self.encoding = 'cp1252' soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') + self.encoding = decode def feed_title(div): - return ''.join(div.findAll(text=True, recursive=False)).strip() + return ''.join(div.findAll(text=True, recursive=True)).strip() articles = {} key = None ans = [] - # Find each instance of class="section-headline", class="story", class="story headline" - for div in soup.findAll(True, - attrs={'class':['section-headline', 'story', 'story headline']}): + url_list = [] - if div['class'] == 'section-headline': - key = string.capwords(feed_title(div)) - if self.excludeSectionKeywords: - excluded = re.compile('|'.join(self.excludeSectionKeywords)) - if excluded.search(key): - self.log("Skipping section %s" % key) - continue - articles[key] = [] - ans.append(key) - - elif div['class'] in ['story', 'story headline'] : - a = div.find('a', href=True) - if not a: - continue - url = re.sub(r'\?.*', '', a['href']) - url += '?pagewanted=all' - - title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip()) - - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class':'summary'}) - if summary: - description = self.massageNCXText(self.tag_to_string(summary, use_alt=False)) - - author = '' - authorAttribution = div.find(True, attrs={'class':'storyheadline-author'}) + def handle_article(div): + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + return + if not url.endswith(".html"): + return + if 'podcast' in url: + return + url += '?pagewanted=all' + if url in url_list: + return + url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + #self.log("Title: %s" % title) + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: + authorAttribution = div.find(True, attrs={'class':'byline'}) if authorAttribution: author = self.tag_to_string(authorAttribution, use_alt=False) - else: - authorAttribution = div.find(True, attrs={'class':'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - # Kill commas - Kindle switches to '&' 
- author = re.sub(',','',author) + feed = key if key is not None else 'Uncategorized' + if not articles.has_key(feed): + articles[feed] = [] + articles[feed].append( + dict(title=title, url=url, date=pubdate, + description=description, author=author, + content='')) - feed = key if key is not None else 'Uncategorized' - if not articles.has_key(feed): - articles[feed] = [] - if not 'podcasts' in url: - articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) - ans = self.sort_index_by(ans, {'The Front Page':-1, - 'Dining In, Dining Out':1, - 'Obituaries':2}) + + + # Find each instance of class="section-headline", class="story", class="story headline" + for div in soup.findAll(True, + attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}): + + if div['class'] in ['section-headline','sectionHeader']: + key = string.capwords(feed_title(div)) + articles[key] = [] + ans.append(key) + #self.log('Section: %s' % key) + + elif div['class'] in ['story', 'story headline'] : + handle_article(div) + elif div['class'] == 'headlinesOnly multiline flush': + for lidiv in div.findAll('li'): + handle_article(lidiv) + +# ans = self.sort_index_by(ans, {'The Front Page':-1, +# 'Dining In, Dining Out':1, +# 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] - self.dump_ans(ans) + return ans - def skip_ad_pages(self, soup): - # Skip ad pages served before actual article - skip_tag = soup.find(True, {'name':'skip'}) - if skip_tag is not None: - self.log.warn("Found forwarding link: %s" % skip_tag.parent['href']) - url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href']) - url += '?pagewanted=all' - self.log.warn("Skipping ad to article at '%s'" % url) - return self.index_to_soup(url, raw=True) - def preprocess_html(self, soup): - return self.strip_anchors(soup) + kicker_tag = soup.find(attrs={'class':'kicker'}) + if kicker_tag: + tagline = self.tag_to_string(kicker_tag) + #self.log("FOUND KICKER %s" % tagline) + if tagline=='Op-Ed Columnist': + img_div = soup.find('div','inlineImage module') + #self.log("Searching for photo") + if img_div: + img_div.extract() + #self.log("Photo deleted") + refresh = soup.find('meta', {'http-equiv':'refresh'}) + if refresh is None: + return soup + content = refresh.get('content').partition('=')[2] + raw = self.browser.open_novisit('http://www.nytimes.com'+content).read() + return BeautifulSoup(raw.decode('cp1252', 'replace')) - def postprocess_html(self,soup, True): - print "\npostprocess_html()\n" - - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class':'articleSpanImage'}) - inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg after headline - cgFirst = soup.find(True, {'class':'columnGroup first'}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll(text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag,'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - 
cgFirst.insert(insertLoc,firstImg)
-                else:
-                    self.log(">>> No class:'columnGroup first' found <<<")
-
-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker and kicker.contents and kicker.contents[0]:
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
-                         use_alt=False)))
-            kicker.replaceWith(h3Tag)
-
-        # Change captions to italic -1
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption and caption.contents[0]:
-                emTag = Tag(soup, "em")
-                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-                mp_off = c.find("More Photos")
-                if mp_off >= 0:
-                    c = c[:mp_off]
-                emTag.insert(0, c)
-                #hrTag = Tag(soup, 'hr')
-                #hrTag['class'] = 'caption_divider'
-                hrTag = Tag(soup, 'div')
-                hrTag['class'] = 'divider'
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
-
-        # Change <nyt_headline> to <h2>
-        h1 = soup.find('h1')
-        if h1:
-            headline = h1.find("nyt_headline")
-            if headline:
-                tag = Tag(soup, "h2")
-                tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.contents[0]))
-                h1.replaceWith(tag)
-            else:
-                # Blog entry - replace headline, remove <hr> tags
-                headline = soup.find('title')
-                if headline:
-                    tag = Tag(soup, "h2")
-                    tag['class'] = "headline"
-                    tag.insert(0, self.fixChars(headline.contents[0]))
-                    soup.insert(0, tag)
-                    hrs = soup.findAll('hr')
-                    for hr in hrs:
-                        hr.extract()
-
-        # Change <h1> to <h3> - used in editorial blogs
-        masthead = soup.find("h1")
-        if masthead:
-            # Nuke the href
-            if masthead.a:
-                del(masthead.a['href'])
-            tag = Tag(soup, "h3")
-            tag.insert(0, self.fixChars(masthead.contents[0]))
-            masthead.replaceWith(tag)
-
-        # Change <span class="bold"> to <b>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            if subhead.contents:
-                bTag = Tag(soup, "b")
-                bTag.insert(0, subhead.contents[0])
-                subhead.replaceWith(bTag)
-
-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            if articleTag:
-                articleTag.insert(0,hTag)
-
-        # Add class="articleBody" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag:
-            divTag['class'] = divTag['id']
-
-        # Add class="authorId" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'authorId'})
-        if divTag and divTag.contents[0]:
-            tag = Tag(soup, "p")
-            tag['class'] = "authorId"
-            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-                use_alt=False)))
-            divTag.replaceWith(tag)
-
-        return soup
-
-    def populate_article_metadata(self,article,soup,first):
-        '''
-        Extract author and description from article, add to article metadata
-        '''
-        def extract_author(soup):
-            byline = soup.find('meta',attrs={'name':['byl','CLMST']})
-            if byline :
-                author = byline['content']
-            else :
-                # Try for
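The recipe's populate_article_metadata reads the author out of the page's meta tags, with a fallback to visible markup. A minimal self-contained sketch of that lookup pattern, assuming calibre's bundled BeautifulSoup; only the 'byl'/'CLMST' meta names come from the recipe above, and the class-based fallback is an illustrative assumption:

    # Hedged sketch of the meta-tag-first author lookup pattern.
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def extract_author(soup):
        # Prefer the <meta name="byl"> / <meta name="CLMST"> tags
        byline = soup.find('meta', attrs={'name': ['byl', 'CLMST']})
        if byline:
            return byline['content']
        # Fallback (assumption): any element carrying a visible byline class
        tag = soup.find(True, attrs={'class': 'byline'})
        if tag:
            return ''.join(tag.findAll(text=True)).strip()
        return None

    html = '<html><head><meta name="byl" content="By JANE DOE"/></head></html>'
    print(extract_author(BeautifulSoup(html)))  # By JANE DOE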