From 138d503aba2de23bdbfbe1a0c488c84dbe2b2754 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 24 Oct 2010 14:57:43 -0700
Subject: [PATCH 01/27] Fix #7276 (Viewer crash on main app close. (v0.7.24))

---
 src/calibre/gui2/viewer/main.py | 3 +++
 src/calibre/utils/ipc/worker.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/gui2/viewer/main.py b/src/calibre/gui2/viewer/main.py
index e113ef0611..09019af18b 100644
--- a/src/calibre/gui2/viewer/main.py
+++ b/src/calibre/gui2/viewer/main.py
@@ -716,6 +716,9 @@ View an ebook.
 
 def main(args=sys.argv):
+    # Ensure viewer can continue to function if GUI is closed
+    os.environ.pop('CALIBRE_WORKER_TEMP_DIR', None)
+
     parser = option_parser()
     opts, args = parser.parse_args(args)
     pid = os.fork() if False and (islinux or isfreebsd) else -1
diff --git a/src/calibre/utils/ipc/worker.py b/src/calibre/utils/ipc/worker.py
index e3584380a1..d8ffad7c53 100644
--- a/src/calibre/utils/ipc/worker.py
+++ b/src/calibre/utils/ipc/worker.py
@@ -105,7 +105,7 @@ def main():
     notifier.start()
 
     result = func(*args, **kwargs)
-    if result is not None:
+    if result is not None and os.path.exists(os.path.dirname(resultf)):
         cPickle.dump(result, open(resultf, 'wb'), -1)
 
     notifier.queue.put(None)

From 7c25c4d149f70c3abc281a52c328a0efada46e36 Mon Sep 17 00:00:00 2001
From: Timothy Legge
Date: Mon, 25 Oct 2010 21:18:06 -0300
Subject: [PATCH 02/27] Changes for kobo WIFI version 1.7

---
 src/calibre/devices/kobo/driver.py | 63 ++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py
index 418bfe5e0d..3562da55d2 100644
--- a/src/calibre/devices/kobo/driver.py
+++ b/src/calibre/devices/kobo/driver.py
@@ -22,7 +22,9 @@ class KOBO(USBMS):
     gui_name = 'Kobo Reader'
     description = _('Communicate with the Kobo Reader')
     author = 'Timothy Legge and Kovid Goyal'
-    version = (1, 0, 6)
+    version = (1, 0, 7)
+
+    dbversion = 0
 
     supported_platforms = ['windows', 'osx', 'linux']
@@ -92,7 +94,7 @@ class KOBO(USBMS):
                 if lpath.startswith(os.sep):
                     lpath = lpath[len(os.sep):]
                 lpath = lpath.replace('\\', '/')
-#                print "LPATH: " + lpath
+                # debug_print("LPATH: ", lpath, " - Title: " , title)
 
                 playlist_map = {}
@@ -112,7 +114,7 @@ class KOBO(USBMS):
                     #print "Image name Normalized: " + imagename
                     if imagename is not None:
                         bl[idx].thumbnail = ImageWrapper(imagename)
-                    if ContentType != '6':
+                    if (ContentType != '6' and self.dbversion < 8) or (self.dbversion >= 8):
                         if self.update_metadata_item(bl[idx]):
                             # print 'update_metadata_item returned true'
                             changed = True
                     if lpath in playlist_map and \
                         playlist_map[lpath] not in bl[idx].device_collections:
                         bl[idx].device_collections.append(playlist_map[lpath])
                 else:
-                    if ContentType == '6':
+                    if ContentType == '6' and self.dbversion < 8:
                         book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576)
                     else:
-                        book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
+                        try:
+                            book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID)
+                        except:
+                            debug_print("prefix: ", prefix, "lpath: ", lpath, "title: ", title, "authors: ", authors, \
+                                        "mime: ", mime, "date: ", date, "ContentType: ", ContentType, "ImageID: ", ImageID)
+                            raise
+
                     # print 'Update booklist'
                     book.device_collections = [playlist_map[lpath]] if lpath in playlist_map else []
 
@@ -143,6 +151,13 @@ class KOBO(USBMS):
                 # numrows = row[0]
             #cursor.close()
 
+            # Determine the database version
+            # 4 - Bluetooth Kobo Rev 2 (1.4)
+            # 8 - WIFI KOBO Rev 1
+            cursor.execute('select version from dbversion')
+            result = cursor.fetchone()
+            self.dbversion = result[0]
+
             query= 'select Title, Attribution, DateCreated, ContentID, MimeType, ContentType, ' \
                 'ImageID, ReadStatus from content where BookID is Null'
 
                 # self.report_progress((i+1) / float(numrows), _('Getting list of books on device...'))
                 path = self.path_from_contentid(row[3], row[5], oncard)
-                mime = mime_type_ext(path_to_ext(row[3]))
+                mime = mime_type_ext(path_to_ext(path)) if path.find('kepub') == -1 else 'application/epub+zip'
+                # debug_print("mime:", mime)
 
                 if oncard != 'carda' and oncard != 'cardb' and not row[3].startswith("file:///mnt/sd/"):
                     changed = update_booklist(self._main_prefix, path, row[0], row[1], mime, row[2], row[5], row[6], row[7])
 
@@ -206,7 +222,7 @@ class KOBO(USBMS):
             cursor.close()
 
             cursor = connection.cursor()
-            if ContentType == 6:
+            if ContentType == 6 and self.dbversion < 8:
                 # Delete the shortcover_pages first
                 cursor.execute('delete from shortcover_page where shortcoverid in (select ContentID from content where BookID = ?)', t)
 
@@ -249,7 +265,7 @@ class KOBO(USBMS):
             path = self.normalize_path(path)
             # print "Delete file normalized path: " + path
             extension = os.path.splitext(path)[1]
-            ContentType = self.get_content_type_from_extension(extension)
+            ContentType = self.get_content_type_from_extension(extension) if extension != '' else self.get_content_type_from_path(path)
 
             ContentID = self.contentid_from_path(path, ContentType)
 
@@ -332,9 +348,14 @@ class KOBO(USBMS):
 
     def contentid_from_path(self, path, ContentType):
         if ContentType == 6:
-            ContentID = os.path.splitext(path)[0]
-            # Remove the prefix on the file. it could be either
-            ContentID = ContentID.replace(self._main_prefix, '')
+            if self.dbversion < 8:
+                ContentID = os.path.splitext(path)[0]
+                # Remove the prefix on the file. it could be either
+                ContentID = ContentID.replace(self._main_prefix, '')
+            else:
+                ContentID = path
+                ContentID = ContentID.replace(self._main_prefix + '.kobo/kepub/', '')
+
             if self._card_a_prefix is not None:
                 ContentID = ContentID.replace(self._card_a_prefix, '')
         elif ContentType == 999: # HTML Files
@@ -350,6 +371,13 @@ class KOBO(USBMS):
         ContentID = ContentID.replace("\\", '/')
         return ContentID
 
+    def get_content_type_from_path(self, path):
+        # Strictly speaking the ContentType could be 6 or 10
+        # however newspapers have the same storage format
+        if path.find('kepub') >= 0:
+            ContentType = 6
+        return ContentType
+
     def get_content_type_from_extension(self, extension):
         if extension == '.kobo': # Kobo books do not have book files.  They do have some images though
@@ -369,19 +397,22 @@ class KOBO(USBMS):
             print 'path from_contentid cardb'
         elif oncard == 'carda':
             path = path.replace("file:///mnt/sd/", self._card_a_prefix)
-            # print "SD Card: " + filename
+            # print "SD Card: " + path
         else:
-            if ContentType == "6":
+            if ContentType == "6" and self.dbversion < 8:
                 # This is a hack as the kobo files do not exist
                 # but the path is required to make a unique id
                 # for calibre's reference
                 path = self._main_prefix + path + '.kobo'
                 # print "Path: " + path
+            elif (ContentType == "6" or ContentType == "10") and self.dbversion >= 8:
+                path = self._main_prefix + '.kobo/kepub/' + path
+                # print "Internal: " + path
             else:
                 # if path.startswith("file:///mnt/onboard/"):
                 path = path.replace("file:///mnt/onboard/", self._main_prefix)
                 path = path.replace("/mnt/onboard/", self._main_prefix)
-                # print "Internal: " + filename
+                # print "Internal: " + path
 
         return path
 
@@ -469,7 +500,7 @@ class KOBO(USBMS):
                             book.device_collections = ['Im_Reading']
 
                             extension = os.path.splitext(book.path)[1]
-                            ContentType = self.get_content_type_from_extension(extension)
+                            ContentType = self.get_content_type_from_extension(extension) if extension != '' else self.get_content_type_from_path(book.path)
 
                             ContentID = self.contentid_from_path(book.path, ContentType)
                             datelastread = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())
@@ -505,7 +536,7 @@ class KOBO(USBMS):
                             book.device_collections = ['Read']
 
                             extension = os.path.splitext(book.path)[1]
-                            ContentType = self.get_content_type_from_extension(extension)
+                            ContentType = self.get_content_type_from_extension(extension) if extension != '' else self.get_content_type_from_path(book.path)
 
                             ContentID = self.contentid_from_path(book.path, ContentType)
                             # datelastread = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())

From d7f649ce985334da8a52be56351f1807b2e643f7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 25 Oct 2010 18:14:52 -0700
Subject: [PATCH 03/27] Ming Pao by Eddie Lau

---
 resources/recipes/ming_pao.recipe | 64 +++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 resources/recipes/ming_pao.recipe

diff --git a/resources/recipes/ming_pao.recipe b/resources/recipes/ming_pao.recipe
new file mode 100644
index 0000000000..6a61405698
--- /dev/null
+++ b/resources/recipes/ming_pao.recipe
@@ -0,0 +1,64 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Eddie Lau'
+'''
+modified from Singtao Toronto calibre recipe by rty
+'''
+
+import datetime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class AdvancedUserRecipe1278063072(BasicNewsRecipe):
+    title = 'Ming Pao - Hong Kong'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    __author__ = 'Eddie Lau'
+    description = 'Hong Kong Chinese Newspaper'
+    publisher = 'news.mingpao.com'
+    category = 'Chinese, News, Hong Kong'
+    remove_javascript = True
+    use_embedded_content = False
+    no_stylesheets = True
+    language = 'zh'
+    encoding = 'Big5-HKSCS'
+    recursions = 0
+    conversion_options = {'linearize_tables':True}
+    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
+
+    keep_only_tags = [dict(name='h1'),
+                      dict(attrs={'id':['newscontent01','newscontent02']})]
+
+    def get_fetchdate(self):
+        dt_utc = datetime.datetime.utcnow()
+        # convert UTC to local hk time
+        dt_local = dt_utc - datetime.timedelta(-8.0/24)
+        return dt_local.strftime("%Y%m%d")
+
+    def parse_index(self):
+        feeds = []
+        dateStr = self.get_fetchdate()
+        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
+                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
+                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
+                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
+                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
+                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
+                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
+                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
+                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
+                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),]:
+            articles = self.parse_section(url)
+            if articles:
+                feeds.append((title, articles))
+        return feeds
+
+    def parse_section(self, url):
+        dateStr = self.get_fetchdate()
+        soup = self.index_to_soup(url)
+        divs = soup.findAll(attrs={'class': ['bullet']})
+        current_articles = []
+        for i in divs:
+            a = i.find('a', href = True)
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            url = 'http://news.mingpao.com/' + dateStr + '/' + url
+            current_articles.append({'title': title, 'url': url, 'description':''})
+        return current_articles
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll(width=True):
+            del item['width']
+        return soup
+

From e86d52d4f0c60a299a3de9d71e92a5bbe36c69b9 Mon Sep 17 00:00:00 2001
From: Charles Haley <>
Date: Tue, 26 Oct 2010 09:21:13 +0100
Subject: [PATCH 04/27] Fix #7300 - date comparisons wrong

---
 src/calibre/library/caches.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py
index 300ddbac0b..03383ee7dd 100644
--- a/src/calibre/library/caches.py
+++ b/src/calibre/library/caches.py
@@ -380,7 +380,7 @@ class ResultCache(SearchQueryParser): # {{{
                 field_count = 3
         else:
             try:
-                qd = parse_date(query)
+                qd = parse_date(query, as_utc=False)
             except:
                 raise ParseException(query, len(query), 'Date conversion error', self)
             if '-' in query:

From 284ca8f7a15a6d653761694e125a721a2d68a1bc Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 26 Oct 2010 08:08:57 -0600
Subject: [PATCH 05/27] ...
---
 src/calibre/web/feeds/news.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index cb6bf30bcf..869799f6bb 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -842,6 +842,9 @@ class BasicNewsRecipe(Recipe):
         except NotImplementedError:
             feeds = self.parse_feeds()
 
+        if not feeds:
+            raise ValueError('No articles found, aborting')
+
         #feeds = FeedCollection(feeds)
 
         self.report_progress(0, _('Trying to download cover...'))

From 9a93d3fd2dccbf0f1fb9212dc00134ea41a8310a Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 26 Oct 2010 08:17:02 -0600
Subject: [PATCH 06/27] /browse: Fix handling of non-ascii saved searches

---
 src/calibre/library/server/browse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/library/server/browse.py b/src/calibre/library/server/browse.py
index 463fcd6fde..142f40efab 100644
--- a/src/calibre/library/server/browse.py
+++ b/src/calibre/library/server/browse.py
@@ -509,7 +509,7 @@ class BrowseServer(object):
         hide_sort = 'true' if dt == 'series' else 'false'
         if category == 'search':
-            which = unhexlify(cid)
+            which = unhexlify(cid).decode('utf-8')
             try:
                 ids = self.search_cache('search:"%s"'%which)
             except:

From 330a7c989b163e27ca36424c1b680a95c8a47e40 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 26 Oct 2010 08:18:42 -0600
Subject: [PATCH 07/27] Content server: Make /browse the default

---
 src/calibre/library/server/content.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py
index d95cd1818c..52a08e6175 100644
--- a/src/calibre/library/server/content.py
+++ b/src/calibre/library/server/content.py
@@ -124,7 +124,7 @@ class ContentServer(object):
         if want_mobile:
             return self.mobile()
 
-        return self.static('index.html')
+        return self.browse_toplevel()
 
     def old(self, **kwargs):
         return self.static('index.html')

From 7988560d75a681089eabc2a0929bcdc1e5a3ae2f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 26 Oct 2010 09:55:35 -0600
Subject: [PATCH 08/27] SONY driver: Fix bug when adding records to empty cacheExt.xml

---
 src/calibre/devices/prs505/sony_cache.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/devices/prs505/sony_cache.py b/src/calibre/devices/prs505/sony_cache.py
index 15245d3cd5..17eea3a27c 100644
--- a/src/calibre/devices/prs505/sony_cache.py
+++ b/src/calibre/devices/prs505/sony_cache.py
@@ -573,7 +573,10 @@ class XMLCache(object):
             ans = root.makeelement('{%s}text'%namespace, attrib=attrib,
                     nsmap=root.nsmap)
             ans.tail = '\n'
-            root[-1].tail = '\n' + '\t'
+            if len(root) > 0:
+                root[-1].tail = '\n\t'
+            else:
+                root.text = '\n\t'
             root.append(ans)
             if thumbnail and thumbnail[-1]:
                 ans.text = '\n' + '\t\t'

From 3fdde535027acca84c4423ac21a2584c71b7a3c2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 26 Oct 2010 09:59:57 -0600
Subject: [PATCH 09/27] Fix #7304 (New York Times Conversion Error)

---
 resources/recipes/nytimes_sub.recipe | 551 +++++++--------------------
 1 file changed, 133 insertions(+), 418 deletions(-)

diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 1814132667..5452ae1c6e 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -4,149 +4,79 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal '
 '''
 nytimes.com
-V5 - One picture per article, moved to top:
-Headline
-Image
-Byline
-Story
 '''
-import re, string, time
+import string, re, time
 from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+def decode(self, src):
+    enc = 'utf-8'
+    if 'iso-8859-1' in src:
+        enc = 'cp1252'
+    return src.decode(enc, 'ignore')
 
 class NYTimes(BasicNewsRecipe):
 
-    title = 'The New York Times'
-    __author__ = 'GRiker'
+    title = u'New York Times'
+    __author__ = 'Kovid Goyal/Nick Redding'
     language = 'en'
-    requires_version = (0, 7, 5)
+    requires_version = (0, 6, 36)
 
     description = 'Daily news from the New York Times (subscription version)'
-    allSectionKeywords = ['The Front Page', 'International','National','Obituaries','Editorials',
-                          'New York','Business Day','Science Times','Sports','Dining','Arts',
-                          'Home','Styles','Sunday Business','Week In Review','Travel','Magazine',
-                          'Book Review','Weddings','Real Estate','Automobiles',"T Men's Fashion",
-                          "T Women's Fashion"]
-
-    # List of sections to exclude
-    # To add a section, copy the section name from the allSectionKeywords list above
-    # For example, to exclude 'Dining' and 'Weddings':
-    #excludeSectionKeywords = ['Dining','Weddings']
-    excludeSectionKeywords = []
-
-    # List of sections to include (test and debug only)
-    # By default, any sections in today's paper that are not listed in excludeSectionKeywords
-    # are downloaded. fetch_only specifies that only certain sections are to be downloaded.
-    # This should only be used for testing and debugging.
-    # For example, to download only 'The Front Page' section:
-    # fetch_only = set(['The Front Page'])
-    fetch_only = set([])
-    if fetch_only:
-        excludeSectionKeywords = list(set(allSectionKeywords) ^ fetch_only)
-
-    # one_picture_per_article specifies that calibre should only use the first image
-    # from an article (if one exists). If one_picture_per_article = True, the image
-    # will be moved to a location between the headline and the byline.
-    # If one_picture_per_article = False, all images from the article will be included
-    # and shown in their original location.
-    one_picture_per_article = True
-
-    timefmt = ''
+    timefmt = ' [%b %d]'
     needs_subscription = True
     remove_tags_before = dict(id='article')
     remove_tags_after = dict(id='article')
-    remove_tags = [dict(attrs={'class':[
-                            'articleFooter',
-                            'articleTools',
-                            'columnGroup doubleRule',
-                            'columnGroup singleRule',
-                            'columnGroup last',
-                            'columnGroup last',
-                            'doubleRule',
-                            'dottedLine',
-                            'entry-meta',
-                            'entry-response module',
-                            'icon enlargeThis',
-                            'leftNavTabs',
-                            'module box nav',
-                            'nextArticleLink',
-                            'nextArticleLink clearfix',
-                            'post-tools',
-                            'relatedSearchesModule',
-                            'side_tool',
-                            'singleAd',
-                            'subNavigation clearfix',
-                            'subNavigation tabContent active',
-                            'subNavigation tabContent active clearfix',
-                            ]}),
-                   dict(id=[
-                            'adxLeaderboard',
-                            'archive',
-                            'articleExtras',
-                            'articleInline',
-                            'blog_sidebar',
-                            'businessSearchBar',
-                            'cCol',
-                            'entertainmentSearchBar',
-                            'footer',
-                            'header',
-                            'header_search',
-                            'login',
-                            'masthead',
-                            'masthead-nav',
-                            'memberTools',
-                            'navigation',
-                            'portfolioInline',
-                            'relatedArticles',
-                            'respond',
-                            'side_search',
-                            'side_index',
-                            'side_tool',
-                            'toolsRight',
-                            ]),
-                   dict(name=['script', 'noscript', 'style'])]
-    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-    cover_margins = (18,18,'grey99')
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
+            'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
+            'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
+        dict({'class':re.compile('^subNavigation')}),
+        dict({'class':re.compile('^leaderboard')}),
+        dict({'class':re.compile('^module')}),
+        dict({'class':'metaFootnote'}),
+        dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
+            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
+            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
+            'relatedArticles', 'relatedTopics', 'adxSponLink']),
+        dict(name=['script', 'noscript', 'style','form','hr'])]
+    encoding = decode
     no_stylesheets = True
-    extra_css = '.headline {text-align: left;}\n \
-                 .byline {font-family: monospace; \
-                          text-align: left; \
-                          margin-top: 0px; \
-                          margin-bottom: 0px;}\n \
-                 .dateline {font-size: small; \
-                            margin-top: 0px; \
-                            margin-bottom: 0px;}\n \
-                 .timestamp {font-size: small; \
-                             margin-top: 0px; \
-                             margin-bottom: 0px;}\n \
-                 .source {text-align: left;}\n \
-                 .image {text-align: center;}\n \
-                 .credit {text-align: right; \
-                          font-size: small; \
-                          margin-top: 0px; \
-                          margin-bottom: 0px;}\n \
-                 .articleBody {text-align: left;}\n \
-                 .authorId {text-align: left; \
-                            font-style: italic;}\n '
+    extra_css = '''
+                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
+                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .timestamp { font-size: small; }
+                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                a:link {text-decoration: none; }'''
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
-            try:
-                br.open('http://www.nytimes.com/auth/login')
-                br.select_form(name='login')
-                br['USERID'] = self.username
-                br['PASSWORD'] = self.password
-                raw = br.submit().read()
-                if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
-                    raise Exception('Your username and password are incorrect')
-                #open('/t/log.html', 'wb').write(raw)
-            except:
-                self.log("\nFailed to login")
-
+            br.open('http://www.nytimes.com/auth/login')
+            br.select_form(name='login')
+            br['USERID'] = self.username
+            br['PASSWORD'] = self.password
+            raw = br.submit().read()
+            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
+                raise Exception('Your username and password are incorrect')
+            #open('/t/log.html', 'wb').write(raw)
         return br
 
+    def get_masthead_url(self):
+        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+        #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(masthead)
+        except:
+            self.log("\nMasthead unavailable")
+            masthead = None
+        return masthead
+
+
     def get_cover_url(self):
         cover = None
         st = time.localtime()
@@ -162,316 +92,101 @@ class NYTimes(BasicNewsRecipe):
             cover = None
         return cover
 
-    def get_masthead_title(self):
-        return self.title
-
-    def dump_ans(self, ans):
-        total_article_count = 0
-        for section in ans :
-            if self.verbose:
-                self.log("section %s: %d articles" % (section[0], len(section[1])) )
-            for article in section[1]:
-                total_article_count += 1
-                if self.verbose:
-                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('mac-roman','replace'),
-                              article['url'].encode('mac-roman','replace')))
-        self.log( "Queued %d articles" % total_article_count )
-
-    def dump_hex(self, src, length=16):
-        ''' Diagnostic '''
-        FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
-        N=0; result=''
-        while src:
-            s,src = src[:length],src[length:]
-            hexa = ' '.join(["%02X"%ord(x) for x in s])
-            s = s.translate(FILTER)
-            result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
-            N+=length
-        print result
-
-    def fixChars(self,string):
-        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","&#8216;",string)
-
-        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","&#8217;",fixed)
-
-        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","&#8220;",fixed)
-
-        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","&#8221;",fixed)
-
-        # Replace ndash (\x96)
-        fixed = re.sub("\x96","&#8211;",fixed)
-
-        # Replace mdash (\x97)
-        fixed = re.sub("\x97","&#8212;",fixed)
-
-        return fixed
-
-    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&amp;' with '&'
-            massaged = re.sub("&amp;","&", massaged)
-            return self.fixChars(massaged)
-        else:
-            return description
+    def short_title(self):
+        return 'New York Times'
 
     def parse_index(self):
+        self.encoding = 'cp1252'
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+        self.encoding = decode
 
         def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
+            return ''.join(div.findAll(text=True, recursive=True)).strip()
 
         articles = {}
         key = None
         ans = []
-        # Find each instance of class="section-headline", class="story", class="story headline"
-        for div in soup.findAll(True,
-            attrs={'class':['section-headline', 'story', 'story headline']}):
+        url_list = []
 
-            if div['class'] == 'section-headline':
-                key = string.capwords(feed_title(div))
-                if self.excludeSectionKeywords:
-                    excluded = re.compile('|'.join(self.excludeSectionKeywords))
-                    if excluded.search(key):
-                        self.log("Skipping section %s" % key)
-                        continue
-                articles[key] = []
-                ans.append(key)
-
-            elif div['class'] in ['story', 'story headline'] :
-                a = div.find('a', href=True)
-                if not a:
-                    continue
-                url = re.sub(r'\?.*', '', a['href'])
-                url += '?pagewanted=all'
-
-                title = self.massageNCXText(self.tag_to_string(a, use_alt=True).strip())
-
-                description = ''
-                pubdate = strftime('%a, %d %b')
-                summary = div.find(True, attrs={'class':'summary'})
-                if summary:
-                    description = self.massageNCXText(self.tag_to_string(summary, use_alt=False))
-
-                author = ''
-                authorAttribution = div.find(True, attrs={'class':'storyheadline-author'})
+        def handle_article(div):
+            a = div.find('a', href=True)
+            if not a:
+                return
+            url = re.sub(r'\?.*', '', a['href'])
+            if not url.startswith("http"):
+                return
+            if not url.endswith(".html"):
+                return
+            if 'podcast' in url:
+                return
+            url += '?pagewanted=all'
+            if url in url_list:
+                return
+            url_list.append(url)
+            title = self.tag_to_string(a, use_alt=True).strip()
+            #self.log("Title: %s" % title)
+            description = ''
+            pubdate = strftime('%a, %d %b')
+            summary = div.find(True, attrs={'class':'summary'})
+            if summary:
+                description = self.tag_to_string(summary, use_alt=False)
+            author = ''
+            authorAttribution = div.find(True, attrs={'class':'byline'})
+            if authorAttribution:
+                author = self.tag_to_string(authorAttribution, use_alt=False)
+            else:
+                authorAttribution = div.find(True, attrs={'class':'byline'})
                 if authorAttribution:
                     author = self.tag_to_string(authorAttribution, use_alt=False)
-                else:
-                    authorAttribution = div.find(True, attrs={'class':'byline'})
-                    if authorAttribution:
-                        author = self.tag_to_string(authorAttribution, use_alt=False)
-                # Kill commas - Kindle switches to '&amp;'
-                author = re.sub(',','',author)
+            feed = key if key is not None else 'Uncategorized'
+            if not articles.has_key(feed):
+                articles[feed] = []
+            articles[feed].append(
+                dict(title=title, url=url, date=pubdate,
+                    description=description, author=author,
+                    content=''))
 
-                feed = key if key is not None else 'Uncategorized'
-                if not articles.has_key(feed):
-                    articles[feed] = []
-                if not 'podcasts' in url:
-                    articles[feed].append(
-                        dict(title=title, url=url, date=pubdate,
-                            description=description, author=author,
-                            content=''))
-        ans = self.sort_index_by(ans, {'The Front Page':-1,
-                                       'Dining In, Dining Out':1,
-                                       'Obituaries':2})
+
+
+        # Find each instance of class="section-headline", class="story", class="story headline"
+        for div in soup.findAll(True,
+            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+
+            if div['class'] in ['section-headline','sectionHeader']:
+                key = string.capwords(feed_title(div))
+                articles[key] = []
+                ans.append(key)
+                #self.log('Section: %s' % key)
+
+            elif div['class'] in ['story', 'story headline'] :
+                handle_article(div)
+            elif div['class'] == 'headlinesOnly multiline flush':
+                for lidiv in div.findAll('li'):
+                    handle_article(lidiv)
+
+#        ans = self.sort_index_by(ans, {'The Front Page':-1,
+#                                       'Dining In, Dining Out':1,
+#                                       'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        self.dump_ans(ans)
+
         return ans
 
-    def skip_ad_pages(self, soup):
-        # Skip ad pages served before actual article
-        skip_tag = soup.find(True, {'name':'skip'})
-        if skip_tag is not None:
-            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
-            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
-            url += '?pagewanted=all'
-            self.log.warn("Skipping ad to article at '%s'" % url)
-            return self.index_to_soup(url, raw=True)
-
     def preprocess_html(self, soup):
-        return self.strip_anchors(soup)
+        kicker_tag = soup.find(attrs={'class':'kicker'})
+        if kicker_tag:
+            tagline = self.tag_to_string(kicker_tag)
+            #self.log("FOUND KICKER %s" % tagline)
+            if tagline=='Op-Ed Columnist':
+                img_div = soup.find('div','inlineImage module')
+                #self.log("Searching for photo")
+                if img_div:
+                    img_div.extract()
+                    #self.log("Photo deleted")
+        refresh = soup.find('meta', {'http-equiv':'refresh'})
+        if refresh is None:
+            return soup
+        content = refresh.get('content').partition('=')[2]
+        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
+        return BeautifulSoup(raw.decode('cp1252', 'replace'))
 
-    def postprocess_html(self,soup, True):
-        print "\npostprocess_html()\n"
-
-        if self.one_picture_per_article:
-            # Remove all images after first
-            largeImg = soup.find(True, {'class':'articleSpanImage'})
-            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
-            if largeImg:
-                for inlineImg in inlineImgs:
-                    inlineImg.extract()
-            else:
-                if inlineImgs:
-                    firstImg = inlineImgs[0]
-                    for inlineImg in inlineImgs[1:]:
-                        inlineImg.extract()
-                    # Move firstImg after headline
-                    cgFirst = soup.find(True, {'class':'columnGroup first'})
-                    if cgFirst:
-                        # Strip all sibling NavigableStrings: noise
-                        navstrings = cgFirst.findAll(text=True, recursive=False)
-                        [ns.extract() for ns in navstrings]
-                        headline_found = False
-                        tag = cgFirst.find(True)
-                        insertLoc = 0
-                        while True:
-                            insertLoc += 1
-                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
-                                headline_found = True
-                                break
-                            tag = tag.nextSibling
-                            if not tag:
-                                headline_found = False
-                                break
-                        if headline_found:
-                            cgFirst.insert(insertLoc,firstImg)
-                    else:
-                        self.log(">>> No class:'columnGroup first' found <<<")
-
-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker and kicker.contents and kicker.contents[0]:
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
-                         use_alt=False)))
-            kicker.replaceWith(h3Tag)
-
-        # Change captions to italic -1
-        for caption in soup.findAll(True, {'class':'caption'}) :
-            if caption and caption.contents[0]:
-                emTag = Tag(soup, "em")
-                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-                mp_off = c.find("More Photos")
-                if mp_off >= 0:
-                    c = c[:mp_off]
-                emTag.insert(0, c)
-                #hrTag = Tag(soup, 'hr')
-                #hrTag['class'] = 'caption_divider'
-                hrTag = Tag(soup, 'div')
-                hrTag['class'] = 'divider'
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
-
-        # Change <nyt_headline> to <h2>
-        h1 = soup.find('h1')
-        if h1:
-            headline = h1.find("nyt_headline")
-            if headline:
-                tag = Tag(soup, "h2")
-                tag['class'] = "headline"
-                tag.insert(0, self.fixChars(headline.contents[0]))
-                h1.replaceWith(tag)
-            else:
-                # Blog entry - replace headline, remove <hr> tags
-                headline = soup.find('title')
-                if headline:
-                    tag = Tag(soup, "h2")
-                    tag['class'] = "headline"
-                    tag.insert(0, self.fixChars(headline.contents[0]))
-                    soup.insert(0, tag)
-                    hrs = soup.findAll('hr')
-                    for hr in hrs:
-                        hr.extract()
-
-        # Change <h1> to <h3> - used in editorial blogs
-        masthead = soup.find("h1")
-        if masthead:
-            # Nuke the href
-            if masthead.a:
-                del(masthead.a['href'])
-            tag = Tag(soup, "h3")
-            tag.insert(0, self.fixChars(masthead.contents[0]))
-            masthead.replaceWith(tag)
-
-        # Change <span class="bold"> to <b>
-        for subhead in soup.findAll(True, {'class':'bold'}) :
-            if subhead.contents:
-                bTag = Tag(soup, "b")
-                bTag.insert(0, subhead.contents[0])
-                subhead.replaceWith(bTag)
-
-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            if articleTag:
-                articleTag.insert(0,hTag)
-
-        # Add class="articleBody" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'articleBody'})
-        if divTag:
-            divTag['class'] = divTag['id']
-
-        # Add class="authorId" to <div> so we can format with CSS
-        divTag = soup.find('div',attrs={'id':'authorId'})
-        if divTag and divTag.contents[0]:
-            tag = Tag(soup, "p")
-            tag['class'] = "authorId"
-            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-                use_alt=False)))
-            divTag.replaceWith(tag)
-
-        return soup
-
-    def populate_article_metadata(self,article,soup,first):
-        '''
-        Extract author and description from article, add to article metadata
-        '''
-        def extract_author(soup):
-            byline = soup.find('meta',attrs={'name':['byl','CLMST']})
-            if byline :
-                author = byline['content']
-            else :
-                # Try for