merge from trunk

2025-07-09 03:04:10 -04:00 · 2010-09-18 02:57:50 +08:00 · 2010-09-18 02:57:50 +08:00 · abcb95e69e
commit abcb95e69e
parent dbb35b5823 2c11080dc7
7 changed files with 39 additions and 14 deletions
--- a/resources/recipes/taz_rss.recipe
+++ b/resources/recipes/taz_rss.recipe
@ -0,0 +1,24 @@
 __license__   = 'GPL v3'
 __copyright__ = '2010, Alexander Schremmer <alex@alexanderweb.de>'
 from calibre.resources.recipes import BasicNewsRecipe
 class TazRSSRecipe(BasicNewsRecipe):
    # Declarative settings for the taz.de RSS feed. Everything below is a
    # plain class attribute; how each one is interpreted is up to
    # BasicNewsRecipe, which is defined outside this file.
    #
    # Identification and language metadata.
    title = u'Taz.de (die tageszeitung) RSS Feed - German'
    __author__ = 'Alexander Schremmer'
    publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
    language = 'de'
    lang = 'de-DE'
    # Limits on what is pulled from the feed.
    # NOTE(review): units of oldest_article are not visible here -- confirm
    # against the BasicNewsRecipe documentation.
    oldest_article = 7
    max_articles_per_feed = 100
    # Reuse the publisher and locale declared above as conversion options.
    conversion_options = {
        'publisher': publisher,
        'language': lang,
    }
    # A single feed, given as a (title, url) pair.
    feeds = [
        (u'TAZ main feed', u'http://www.taz.de/rss.xml'),
    ]
    # Tag selectors, each made of a tag name and a class attribute:
    # the article container div, the 'rack' div used as the cut-off point,
    # and the elements listed for removal.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'sect sect_article'}},
    ]
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'rack'}}
    remove_tags = [
        {'name': ['div'], 'attrs': {'class': 'rack'}},
        {'name': ['div'], 'attrs': {'class': 'artikelwerbung'}},
        {'name': ['ul'], 'attrs': {'class': 'toolbar'}},
    ]
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
        '''
        raise NotImplementedError()
-    def preprocess_html(self, html):
+    def preprocess_html(self, opts, html):
        '''
        This method is called by the conversion pipeline on all HTML before it
        is parsed. It is meant to be used to do any required preprocessing on
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
    # Fix pdftohtml markup
    PDFTOHTML  = [
                  # Fix umlauts
                  # ¨
                  (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
                  (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
                  (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -96,13 +96,13 @@ class PreProcessor(object):
                 html = convert_basic(html, epub_split_size_kb=0)
             else:
                 # Add markup naively
-                 # TODO - find out if there are cases where there are more than one <pre> tag or 
+                 # TODO - find out if there are cases where there are more than one <pre> tag or
                 # other types of unmarked html and handle them in some better fashion
                 add_markup = re.compile('(?<!>)(\n)')
                 html = add_markup.sub('</p>\n<p>', html)
-        
+
        ###### Mark Indents/Cleanup ######
-        #        
+        #
        # Replace series of non-breaking spaces with text-indent
        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
@ -176,8 +176,8 @@ class PreProcessor(object):
        self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
        #
        # Unwrap and/or delete soft-hyphens, hyphens
-        html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+        html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
        # Unwrap lines using punctuation and line length
        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
@ -193,8 +193,8 @@ class PreProcessor(object):
        # headings and titles, images, etc
        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-        
+
        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+'\g<openline>'+' '+'\g<closeline>', html)
+        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
-        
+
        return html
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -490,7 +490,7 @@ class HTMLInput(InputFormatPlugin):
            return (None, None)
        return (None, raw)
-	def preprocess_html(self, options, html):
+    def preprocess_html(self, options, html):
        self.options = options
        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
        return preprocessor(html)
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -53,8 +53,8 @@ class LITInput(InputFormatPlugin):
                        pre.append(ne)
-	def preprocess_html(self, options, html):
+    def preprocess_html(self, options, html):
-	    self.options = options
+        self.options = options
        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
        return preprocessor(html)
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
 then the problem is probably a corrupted font cache. You can clear the cache by following these
 `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
-solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
+solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
 check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File
 menu, choose "Validate fonts".
 My antivirus program claims |app| is a virus/trojan?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~