From a83de9ce2d6ad5cf60b250d4d92afc5e2bcfdd46 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 18 Jan 2009 16:07:15 -0800
Subject: [PATCH] Fix #1639 (Calibre can not handle properly URL's with
 non-ascii characters). New recipe for Sueddeutsche by Oliver Niesner

---
 src/calibre/ebooks/epub/from_feeds.py                | 1 +
 src/calibre/web/feeds/recipes/__init__.py            | 2 +-
 src/calibre/web/feeds/recipes/recipe_sueddeutsche.py | 4 ++++
 src/calibre/web/fetch/simple.py                      | 6 +++++-
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/epub/from_feeds.py b/src/calibre/ebooks/epub/from_feeds.py
index bbadbc54de..fd1759712d 100644
--- a/src/calibre/ebooks/epub/from_feeds.py
+++ b/src/calibre/ebooks/epub/from_feeds.py
@@ -40,6 +40,7 @@ def convert(opts, recipe_arg, notification=None):
         c.smart_update(recipe_opts, opts)
         opts = recipe_opts
         opts.chapter_mark = 'none'
+        opts.dont_split_on_page_breaks = True
         opf = glob.glob(os.path.join(tdir, '*.opf'))
         if not opf:
             raise Exception('Downloading of recipe: %s failed'%recipe_arg)
diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index e6bcdaed9c..f0687ece28 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -22,7 +22,7 @@ recipe_modules = ['recipe_' + r for r in (
         'time_magazine', 'endgadget', 'fudzilla', 'nspm_int', 'nspm', 'pescanik',
         'spiegel_int', 'themarketticker', 'tomshardware', 'xkcd', 'ftd', 'zdnet',
         'joelonsoftware', 'telepolis', 'common_dreams', 'nin', 'tomshardware_de',
-        'pagina12', 'infobae', 'ambito', 'elargentino',
+        'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche',
         )]

 import re, imp, inspect, time, os
diff --git a/src/calibre/web/feeds/recipes/recipe_sueddeutsche.py b/src/calibre/web/feeds/recipes/recipe_sueddeutsche.py
index 606fc35320..fa97b73c80 100644
--- a/src/calibre/web/feeds/recipes/recipe_sueddeutsche.py
+++ b/src/calibre/web/feeds/recipes/recipe_sueddeutsche.py
@@ -56,3 +56,7 @@ class Sueddeutsche(BasicNewsRecipe):

     feeds = [ (u'Sueddeutsche', u'http://www.sueddeutsche.de/app/service/rss/alles/rss.xml') ]

+    def postprocess_html(self, soup, first_fetch):
+        for t in soup.findAll(['table', 'tr', 'td']):
+            t.name = 'div'
+        return soup
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 2a8b03a545..43f7aa0626 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -395,7 +395,11 @@ class RecursiveFetcher(object, LoggingInterface):
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, newbaseurl)

-                    res = os.path.join(linkdiskpath, basename(iurl))
+                    _fname = basename(iurl)
+                    if not isinstance(_fname, unicode):
+                        _fname = _fname.decode('latin1', 'replace')
+                    _fname = _fname.encode('ascii', 'replace').replace('%', '')
+                    res = os.path.join(linkdiskpath, _fname)
                     self.downloaded_paths.append(res)
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
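
A rough standalone sketch (Python 2, matching calibre of this era) of the filename sanitization the simple.py hunk introduces: the basename of a downloaded URL is decoded if it arrives as a bytestring, then forced to an ASCII-only name with any '%' left over from URL quoting stripped, so it can safely be joined into a local path. The sanitize_fname helper and the posixpath.basename import are illustrative assumptions, not part of the patch; RecursiveFetcher performs the equivalent steps inline.

# Standalone Python 2 sketch, not part of the patch: mirrors the filename
# sanitization added to RecursiveFetcher in simple.py. The helper name and the
# posixpath import are assumptions made for illustration only.
from posixpath import basename

def sanitize_fname(iurl):
    fname = basename(iurl)
    if not isinstance(fname, unicode):
        # Decode bytestrings with latin1/'replace' so no byte sequence can raise
        fname = fname.decode('latin1', 'replace')
    # Force an ASCII-only name and drop '%' characters left by URL quoting
    return fname.encode('ascii', 'replace').replace('%', '')

if __name__ == '__main__':
    # A URL whose last component carries percent-encoded non-ASCII characters
    print(sanitize_fname(u'http://www.sueddeutsche.de/kultur/M%C3%BCnchen.html'))
    # -> MC3BCnchen.html (ASCII-safe, usable as an on-disk filename)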