NY Times Sunday Book review by Krittika Goyal

2026-04-04 16:21:57 -04:00 · 2010-01-31 14:25:54 -07:00 · 2010-01-31 14:25:54 -07:00 · 6af7b6a43e
commit 6af7b6a43e
parent 50a8fe4a50
3 changed files with 155 additions and 1 deletions
--- a/Changelog.yaml
+++ b/Changelog.yaml
@ -4,6 +4,101 @@
 # for important features/bug fixes.
 # Also, each release can have new and improved recipes.

+- version: 0.6.37
+  date: 2010-01-31
+
+  new features:
+    - title: "E-book viewer: Add support for viewing SVG images"
+      type: major
+
+    - title: "Add category of Recently added books when generating catalog in e-book format"
+
+    - title: "OS X: Allow adding of books to calibre via drag and drop on the calibre dock icon"
+
+    - title: "Add support for masthead images when downloading news for the Kindle"
+
+    - title: "MOBI metadata: Allow setting of metadata in old PRC files without EXTH headers as well"
+
+  bug fixes:
+    - title: Changing the date in Dutch
+      tickets: [4732]
+
+    - title: "Fix regression that broke sending files to unupdated PRS 500s"
+
+    - title: "MOBI Input: Ignore width and height percentage measures for <img> tags."
+      tickets: [4726]
+
+    - title: "EPUB Output: Remove <img> tags that point to the internet for their images as this causes the ever delicate ADE to crash."
+      tickets: [4692]
+
+    - title: "Comic Input: Handle UTF-8 BOM when converting a cbc file"
+      tickets: [4683]
+
+    - title: "Allow rating to be cleared via the Bulk metadata edit dialog"
+      tickets: [4693]
+
+    - title: "Add workaround for broken linux systems with multiply encoded file names"
+      tickets: [4721]
+
+    - title: Fix bug preventing the use of indices when setting save to disk templates
+      tickets: [4710]
+
+    - title: "Linux device mounting. Use filetype of auto to allow non vfat filesystems to be mounted"
+      tickets: [4707]
+
+    - title: "Catalog generation: Make sorting of numbers in title as text optional"
+
+    - title: "Fix error while sending book with non-ascii character in title/author to device on linux"
+      tickets: [4690]
+
+  new recipes:
+    - title: Kamera Bild
+      author: Darko Miletic
+
+    - title: The Online Photographer
+      author: Darko Miletic
+
+    - title: The Luminous Landscape
+      author: Darko Miletic
+
+    - title: Slovo
+      author: Abelturd
+
+    - title: Various Danish newspapers
+      author: Darko Miletic
+
+    - title: Heraldo de Aragon
+      author: Lorenzo Vigentini
+
+    - title: Orange County Register
+      author: Lorenzo Vigentini
+
+    - title: Open Left
+      author: Xanthan Gum
+
+    - title: Michelle Malkin
+      author: Walt Anthony
+
+    - title: The Metro Montreal
+      author: Jerry Clapperton
+
+    - title: The Gazette
+      author: Jerry Clapperton
+
+    - title: Macleans Magazine
+      author:  Nick Redding
+
+    - title: NY Times Sunday Book Review
+      author: Krittika Goyal
+
+
+  improved recipes:
+    - The Irish Times
+    - Washington Post
+    - NIN
+    - The Discover Magazine
+    - Pagina 12
+
 - version: 0.6.36
  date: 2010-01-25

--- a/resources/recipes/nytimesbook.recipe
+++ b/resources/recipes/nytimesbook.recipe
@ -0,0 +1,56 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class NewYorkTimesBookReview(BasicNewsRecipe):
+    title          = u'New York Times Book Review'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 8 #days
+    max_articles_per_feed = 1000
+    recursions = 2
+    #encoding = 'latin1'
+
+    remove_stylesheets = True
+    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    remove_tags_after  = dict(name='div', attrs={'id':'authorId'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name=['div', 'a'], attrs={'class':['enlargeThis', 'jumpLink']}),
+       dict(name='div', attrs={'id':['sidebarArticles', 'toolsRight']}),
+       #dict(name='ul', attrs={'class':'article-tools'}),
+       #dict(name='ul', attrs={'class':'articleTools'}),
+    ]
+    match_regexps = [
+            r'http://www.nytimes.com/.+pagewanted=[2-9]+'
+            ]
+
+    feeds          = [
+('New York Times Sunday Book Review',
+ 'http://feeds.nytimes.com/nyt/rss/SundayBookReview'),
+]
+
+
+    def preprocess_html(self, soup):
+        story = soup.find(name='div', attrs={'id':'article'})
+        #td = heading.findParent(name='td')
+        #td.extract()
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        #for x in soup.findAll(name='p', text=lambda x:x and '--&gt;' in x):
+             #p = x.findParent('p')
+             #if p is not None:
+                  #p.extract()
+        return soup
+
+    def postprocess_html(self, soup, first):
+        for div in soup.findAll(id='pageLinks'):
+            div.extract()
+        if not first:
+            h1 = soup.find('h1')
+            if h1 is not None:
+                h1.extract()
+            t = soup.find(attrs={'class':'timestamp'})
+            if t is not None:
+                t.extract()
+        return soup
--- a/src/calibre/ebooks/oeb/transforms/rescale.py
+++ b/src/calibre/ebooks/oeb/transforms/rescale.py
@ -35,7 +35,10 @@ class RescaleImages(object):
                if not raw: continue
                if qt:
                    img = QImage(10, 10, QImage.Format_ARGB32_Premultiplied)
-                    if not img.loadFromData(raw): continue
+                    try:
+                        if not img.loadFromData(raw): continue
+                    except:
+                        continue
                    width, height = img.width(), img.height()
                else:
                    f = cStringIO.StringIO(raw)