merge from trunk

This commit is contained in:
ldolse 2010-09-18 02:57:50 +08:00
commit abcb95e69e
7 changed files with 39 additions and 14 deletions

View File

@ -0,0 +1,24 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Alexander Schremmer <alex@alexanderweb.de>'
from calibre.resources.recipes import BasicNewsRecipe
class TazRSSRecipe(BasicNewsRecipe):
    """Recipe configuration for the taz.de (die tageszeitung) RSS feed.

    The class only sets class-level attributes and defines no methods of
    its own; all behaviour comes from ``BasicNewsRecipe``.
    """

    # --- Identification / metadata -------------------------------------
    __author__ = 'Alexander Schremmer'
    title = u'Taz.de (die tageszeitung) RSS Feed - German'
    publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
    language = 'de'
    lang = 'de-DE'

    # --- Download limits -----------------------------------------------
    # NOTE(review): units/semantics of these two limits are defined by the
    # BasicNewsRecipe API, not by this file -- confirm there before tuning.
    max_articles_per_feed = 100
    oldest_article = 7

    # Re-uses the publisher and the long language tag declared above, so
    # both must be assigned before this line.
    conversion_options = dict(publisher=publisher, language=lang)

    # A single feed, given as a (label, URL) pair.
    feeds = [
        (u'TAZ main feed', u'http://www.taz.de/rss.xml'),
    ]

    # --- Page clean-up selectors ---------------------------------------
    # Each selector is a mapping with a tag ``name`` and the ``attrs`` to
    # match. The attribute names suggest which parts of the page are kept
    # or dropped; exact matching rules are up to the base class.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'sect sect_article'}},
    ]
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'rack'}}
    remove_tags = [
        {'name': ['div'], 'attrs': {'class': 'rack'}},
        {'name': ['div'], 'attrs': {'class': 'artikelwerbung'}},
        {'name': ['ul'], 'attrs': {'class': 'toolbar'}},
    ]

View File

@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def preprocess_html(self, html): def preprocess_html(self, opts, html):
''' '''
This method is called by the conversion pipeline on all HTML before it This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on is parsed. It is meant to be used to do any required preprocessing on

View File

@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
# Fix umlauts # Fix umlauts
# ¨
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'), (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),

View File

@ -96,13 +96,13 @@ class PreProcessor(object):
html = convert_basic(html, epub_split_size_kb=0) html = convert_basic(html, epub_split_size_kb=0)
else: else:
# Add markup naively # Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or # TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion # other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)') add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html) html = add_markup.sub('</p>\n<p>', html)
###### Mark Indents/Cleanup ###### ###### Mark Indents/Cleanup ######
# #
# Replace series of non-breaking spaces with text-indent # Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html) html = txtindent.sub(self.insert_indent, html)
@ -176,8 +176,8 @@ class PreProcessor(object):
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
# #
# Unwrap and/or delete soft-hyphens, hyphens # Unwrap and/or delete soft-hyphens, hyphens
html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
# Unwrap lines using punctuation and line length # Unwrap lines using punctuation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
@ -193,8 +193,8 @@ class PreProcessor(object):
# headings and titles, images, etc # headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html) html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
# put back non-breaking spaces in empty paragraphs to preserve original formatting # put back non-breaking spaces in empty paragraphs to preserve original formatting
html = blankreg.sub('\n'+'\g<openline>'+' '+'\g<closeline>', html) html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
return html return html

View File

@ -490,7 +490,7 @@ class HTMLInput(InputFormatPlugin):
return (None, None) return (None, None)
return (None, raw) return (None, raw)
def preprocess_html(self, options, html): def preprocess_html(self, options, html):
self.options = options self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -53,8 +53,8 @@ class LITInput(InputFormatPlugin):
pre.append(ne) pre.append(ne)
def preprocess_html(self, options, html): def preprocess_html(self, options, html):
self.options = options self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
then the problem is probably a corrupted font cache. You can clear the cache by following these then the problem is probably a corrupted font cache. You can clear the cache by following these
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File
menu, choose "Validate fonts".
My antivirus program claims |app| is a virus/trojan? My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~