diff --git a/resources/recipes/taz_rss.recipe b/resources/recipes/taz_rss.recipe
new file mode 100644
index 0000000000..6520a23b63
--- /dev/null
+++ b/resources/recipes/taz_rss.recipe
@@ -0,0 +1,24 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Alexander Schremmer '
+
+from calibre.resources.recipes import BasicNewsRecipe
+
+class TazRSSRecipe(BasicNewsRecipe):  # Recipe for the taz.de RSS feed; configured entirely through the class attributes below.
+ title = u'Taz.de (die tageszeitung) RSS Feed - German'
+ __author__ = 'Alexander Schremmer'
+ language = 'de'
+ lang = 'de-DE'  # Reused below as the 'language' conversion option.
+ oldest_article = 7  # presumably a maximum article age in days -- confirm against BasicNewsRecipe
+ max_articles_per_feed = 100
+ publisher = 'taz Entwicklungs GmbH & Co. Medien KG'  # Reused below as the 'publisher' conversion option.
+
+ conversion_options = {'publisher': publisher,  # Passes on the publisher and lang values defined above.
+ 'language': lang,
+ }
+
+ feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')]  # A single (title, URL) pair.
+ keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})]  # Matches <div class="sect sect_article">.
+ remove_tags_after = dict(name='div', attrs={'class': 'rack'})  # Same div.rack selector that remove_tags lists first.
+ remove_tags = [dict(name=['div'], attrs={'class': 'rack'}),  # Selectors: div.rack, div.artikelwerbung, ul.toolbar.
+ dict(name=['div'], attrs={'class': 'artikelwerbung'}),
+ dict(name=['ul'], attrs={'class': 'toolbar'}),]
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index 72c067747d..8c8ce8c686 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
- def preprocess_html(self, html):
+ def preprocess_html(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 3e5de26766..03a0047927 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup
PDFTOHTML = [
# Fix umlauts
- # ¨
(re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'),
(re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 91de2dc259..37fd169cb1 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -96,13 +96,13 @@ class PreProcessor(object):
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
- # TODO - find out if there are cases where there are more than one tag or
+ # TODO - find out if there are cases where there are more than one tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?)(\n)')
html = add_markup.sub('
\n', html)
-
+
###### Mark Indents/Cleanup ######
- #
+ #
# Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'
[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html)
@@ -176,8 +176,8 @@ class PreProcessor(object):
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
#
# Unwrap and/or delete soft-hyphens, hyphens
- html = re.sub(u'\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
- html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+ html = re.sub(u'\xad\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+ html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
# Unwrap lines using punctation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
@@ -193,8 +193,8 @@ class PreProcessor(object):
# headings and titles, images, etc
doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
html = doubleheading.sub('\g'+'\n'+'
', html)
-
+
# put back non-breaking spaces in empty paragraphs to preserve original formatting
- html = blankreg.sub('\n'+'\g'+' '+'\g', html)
-
+ html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
+
return html
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index e29ebd4554..603adadb53 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -490,7 +490,7 @@ class HTMLInput(InputFormatPlugin):
return (None, None)
return (None, raw)
- def preprocess_html(self, options, html):
+ def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 4c0beebdd9..46a5e75977 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -53,8 +53,8 @@ class LITInput(InputFormatPlugin):
pre.append(ne)
- def preprocess_html(self, options, html):
- self.options = options
+ def preprocess_html(self, options, html):
+ self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index b93444f4c3..781048666b 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
then the problem is probably a corrupted font cache. You can clear the cache by following these
`instructions `_. If that doesn't
-solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
+solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
+check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts, and then
+choose "Validate fonts" from the File menu.
My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~