From 6af7b6a43e4946609ee4d131af567cb232283f1b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Jan 2010 14:25:54 -0700 Subject: [PATCH] NY Times Sunday Book review by Krittika Goyal --- Changelog.yaml | 95 ++++++++++++++++++++ resources/recipes/nytimesbook.recipe | 56 ++++++++++++ src/calibre/ebooks/oeb/transforms/rescale.py | 5 +- 3 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 resources/recipes/nytimesbook.recipe diff --git a/Changelog.yaml b/Changelog.yaml index c2124aadd9..7c5644fd63 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -4,6 +4,101 @@ # for important features/bug fixes. # Also, each release can have new and improved recipes. +- version: 0.6.37 + date: 2010-01-31 + + new features: + - title: "E-book viewer: Add support for viewing SVG images" + type: major + + - title: "Add category of Recently added books when generating catalog in e-book format" + + - title: "OS X: Allow adding of books to calibre via drag and drop on the calibre dock icon" + + - title: "Add support for masthead images when downloading news for the Kindle" + + - title: "MOBI metadata: Allow setting of metadata in old PRC files without EXTH headers as well" + + bug fixes: + - title: Changing the date in Dutch + tickets: [4732] + + - title: "Fix regression that broke sending files to unupdated PRS 500s" + + - title: "MOBI Input: Ignore width and height percentage measures for <img> tags." + tickets: [4726] + + - title: "EPUB Output: Remove <img> tags that point to the internet for their images as this causes the ever delicate ADE to crash." 
+ tickets: [4692] + + - title: "Comic Input: Handle UTF-8 BOM when converting a cbc file" + tickets: [4683] + + - title: "Allow rating to be cleared via the Bulk metadata edit dialog" + tickets: [4693] + + - title: "Add workaround for broken linux systems with multiply encoded file names" + tickets: [4721] + + - title: Fix bug preventing the use of indices when setting save to disk templates + tickets: [4710] + + - title: "Linux device mounting. Use filetype of auto to allow non vfat filesystems to be mounted" + tickets: [4707] + + - title: "Catalog generation: Make sorting of numbers in title as text optional" + + - title: "Fix error while sending book with non-ascii character in title/author to device on linux" + tickets: [4690] + + new recipes: + - title: Kamera Bild + author: Darko Miletic + + - title: The Online Photographer + author: Darko Miletic + + - title: The Luminous Landscape + author: Darko Miletic + + - title: Slovo + author: Abelturd + + - title: Various Danish newspapers + author: Darko Miletic + + - title: Heraldo de Aragon + author: Lorenzo Vigentini + + - title: Orange County Register + author: Lorenzo Vigentini + + - title: Open Left + author: Xanthan Gum + + - title: Michelle Malkin + author: Walt Anthony + + - title: The Metro Montreal + author: Jerry Clapperton + + - title: The Gazette + author: Jerry Clapperton + + - title: Macleans Magazine + author: Nick Redding + + - title: NY Times Sunday Book Review + author: Krittika Goyal + + + improved recipes: + - The Irish Times + - Washington Post + - NIN + - The Discover Magazine + - Pagina 12 + - version: 0.6.36 date: 2010-01-25 diff --git a/resources/recipes/nytimesbook.recipe b/resources/recipes/nytimesbook.recipe new file mode 100644 index 0000000000..686f30b69a --- /dev/null +++ b/resources/recipes/nytimesbook.recipe @@ -0,0 +1,56 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class 
NewYorkTimesBookReview(BasicNewsRecipe): + title = u'New York Times Book Review' + language = 'en' + __author__ = 'Krittika Goyal' + oldest_article = 8 #days + max_articles_per_feed = 1000 + recursions = 2 + #encoding = 'latin1' + + remove_stylesheets = True + #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) + remove_tags_after = dict(name='div', attrs={'id':'authorId'}) + remove_tags = [ + dict(name='iframe'), + dict(name=['div', 'a'], attrs={'class':['enlargeThis', 'jumpLink']}), + dict(name='div', attrs={'id':['sidebarArticles', 'toolsRight']}), + #dict(name='ul', attrs={'class':'article-tools'}), + #dict(name='ul', attrs={'class':'articleTools'}), + ] + match_regexps = [ + r'http://www.nytimes.com/.+pagewanted=[2-9]+' + ] + + feeds = [ +('New York Times Sunday Book Review', + 'http://feeds.nytimes.com/nyt/rss/SundayBookReview'), +] + + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'article'}) + #td = heading.findParent(name='td') + #td.extract() + soup = BeautifulSoup('t') + body = soup.find(name='body') + body.insert(0, story) + #for x in soup.findAll(name='p', text=lambda x:x and '-->' in x): + #p = x.findParent('p') + #if p is not None: + #p.extract() + return soup + + def postprocess_html(self, soup, first): + for div in soup.findAll(id='pageLinks'): + div.extract() + if not first: + h1 = soup.find('h1') + if h1 is not None: + h1.extract() + t = soup.find(attrs={'class':'timestamp'}) + if t is not None: + t.extract() + return soup diff --git a/src/calibre/ebooks/oeb/transforms/rescale.py b/src/calibre/ebooks/oeb/transforms/rescale.py index 7ce3b5a588..fbf0e9bc4f 100644 --- a/src/calibre/ebooks/oeb/transforms/rescale.py +++ b/src/calibre/ebooks/oeb/transforms/rescale.py @@ -35,7 +35,10 @@ class RescaleImages(object): if not raw: continue if qt: img = QImage(10, 10, QImage.Format_ARGB32_Premultiplied) - if not img.loadFromData(raw): continue + try: + if not img.loadFromData(raw): continue + except: + continue 
width, height = img.width(), img.height() else: f = cStringIO.StringIO(raw)