merge from trunk

This commit is contained in:
ldolse 2010-09-18 02:57:50 +08:00
commit abcb95e69e
7 changed files with 39 additions and 14 deletions

View File

@ -0,0 +1,24 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Alexander Schremmer <alex@alexanderweb.de>'
from calibre.resources.recipes import BasicNewsRecipe
class TazRSSRecipe(BasicNewsRecipe):
title = u'Taz.de (die tageszeitung) RSS Feed - German'
__author__ = 'Alexander Schremmer'
language = 'de'
lang = 'de-DE'
oldest_article = 7
max_articles_per_feed = 100
publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
conversion_options = {'publisher': publisher,
'language': lang,
}
feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')]
keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})]
remove_tags_after = dict(name='div', attrs={'class': 'rack'})
remove_tags = [dict(name=['div'], attrs={'class': 'rack'}),
dict(name=['div'], attrs={'class': 'artikelwerbung'}),
dict(name=['ul'], attrs={'class': 'toolbar'}),]

View File

@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
'''
raise NotImplementedError()
def preprocess_html(self, html):
def preprocess_html(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on

View File

@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup
PDFTOHTML = [
# Fix umlauts
# ¨
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),

View File

@ -96,13 +96,13 @@ class PreProcessor(object):
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
###### Mark Indents/Cleanup ######
#
#
# Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html)
@ -176,8 +176,8 @@ class PreProcessor(object):
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
#
# Unwrap and/or delete soft-hyphens, hyphens
html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
# Unwrap lines using punctation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
@ -193,8 +193,8 @@ class PreProcessor(object):
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
# put back non-breaking spaces in empty paragraphs to preserve original formatting
html = blankreg.sub('\n'+'\g<openline>'+' '+'\g<closeline>', html)
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
return html

View File

@ -490,7 +490,7 @@ class HTMLInput(InputFormatPlugin):
return (None, None)
return (None, raw)
def preprocess_html(self, options, html):
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -53,8 +53,8 @@ class LITInput(InputFormatPlugin):
pre.append(ne)
def preprocess_html(self, options, html):
self.options = options
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
then the problem is probably a corrupted font cache. You can clear the cache by following these
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File
menu, choose "Validate fonts".
My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~