From ded912be3f6dc21d1b6373c8f9365db068063afa Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Sep 2010 09:59:03 -0600 Subject: [PATCH 1/3] ... --- src/calibre/manual/faq.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index b93444f4c3..781048666b 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like:: then the problem is probably a corrupted font cache. You can clear the cache by following these `instructions `_. If that doesn't -solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. +solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to +check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File +menu, choose "Validate fonts". My antivirus program claims |app| is a virus/trojan? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From ffe8fe5fd23a721af0fe1d07df109d78a39c743c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Sep 2010 10:42:11 -0600 Subject: [PATCH 2/3] Fix use of UTF-8 raw string --- src/calibre/ebooks/conversion/preprocess.py | 1 - src/calibre/ebooks/conversion/utils.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3e5de26766..03a0047927 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -144,7 +144,6 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ # Fix umlauts - # ¨ (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'), diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 8588ff65ad..37fd169cb1 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -176,8 +176,8 @@ class PreProcessor(object): self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") # # Unwrap and/or delete soft-hyphens, hyphens - html = re.sub(u'­\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) - html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + html = re.sub(u'\xad\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) # Unwrap lines using punctation and line length unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) @@ -195,6 +195,6 @@ class PreProcessor(object): html = doubleheading.sub('\g'+'\n'+'', html) # put back non-breaking spaces in empty paragraphs to preserve original formatting - html = blankreg.sub('\n'+'\g'+' '+'\g', html) + html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html From 2c11080dc7780deae4ebfa4e3582a7ffd2885e5a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Sep 2010 10:46:49 -0600 Subject: [PATCH 3/3] taz.de RSS by Alexander Schremmer --- resources/recipes/taz_rss.recipe | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 resources/recipes/taz_rss.recipe diff --git a/resources/recipes/taz_rss.recipe b/resources/recipes/taz_rss.recipe new file mode 100644 index 0000000000..6520a23b63 --- /dev/null +++ b/resources/recipes/taz_rss.recipe @@ -0,0 +1,24 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Alexander Schremmer ' + +from calibre.resources.recipes import BasicNewsRecipe + +class TazRSSRecipe(BasicNewsRecipe): + title = u'Taz.de (die tageszeitung) RSS Feed - German' + __author__ = 'Alexander Schremmer' + language = 'de' + lang = 'de-DE' + oldest_article = 7 + max_articles_per_feed = 100 + publisher = 'taz Entwicklungs GmbH & Co. Medien KG' + + conversion_options = {'publisher': publisher, + 'language': lang, + } + + feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')] + keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})] + remove_tags_after = dict(name='div', attrs={'class': 'rack'}) + remove_tags = [dict(name=['div'], attrs={'class': 'rack'}), + dict(name=['div'], attrs={'class': 'artikelwerbung'}), + dict(name=['ul'], attrs={'class': 'toolbar'}),]