mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
merge from trunk
This commit is contained in:
commit
abcb95e69e
24
resources/recipes/taz_rss.recipe
Normal file
24
resources/recipes/taz_rss.recipe
Normal file
@ -0,0 +1,24 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Alexander Schremmer <alex@alexanderweb.de>'
|
||||
|
||||
from calibre.resources.recipes import BasicNewsRecipe
|
||||
|
||||
class TazRSSRecipe(BasicNewsRecipe):
|
||||
title = u'Taz.de (die tageszeitung) RSS Feed - German'
|
||||
__author__ = 'Alexander Schremmer'
|
||||
language = 'de'
|
||||
lang = 'de-DE'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
|
||||
|
||||
conversion_options = {'publisher': publisher,
|
||||
'language': lang,
|
||||
}
|
||||
|
||||
feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')]
|
||||
keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})]
|
||||
remove_tags_after = dict(name='div', attrs={'class': 'rack'})
|
||||
remove_tags = [dict(name=['div'], attrs={'class': 'rack'}),
|
||||
dict(name=['div'], attrs={'class': 'artikelwerbung'}),
|
||||
dict(name=['ul'], attrs={'class': 'toolbar'}),]
|
@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
|
||||
'''
|
||||
raise NotImplementedError()
|
||||
|
||||
def preprocess_html(self, html):
|
||||
def preprocess_html(self, opts, html):
|
||||
'''
|
||||
This method is called by the conversion pipeline on all HTML before it
|
||||
is parsed. It is meant to be used to do any required preprocessing on
|
||||
|
@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
|
||||
# Fix pdftohtml markup
|
||||
PDFTOHTML = [
|
||||
# Fix umlauts
|
||||
# ¨
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
|
||||
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
|
||||
|
@ -96,13 +96,13 @@ class PreProcessor(object):
|
||||
html = convert_basic(html, epub_split_size_kb=0)
|
||||
else:
|
||||
# Add markup naively
|
||||
# TODO - find out if there are cases where there is more than one <pre> tag or
|
||||
# TODO - find out if there are cases where there is more than one <pre> tag or
|
||||
# other types of unmarked html and handle them in some better fashion
|
||||
add_markup = re.compile('(?<!>)(\n)')
|
||||
html = add_markup.sub('</p>\n<p>', html)
|
||||
|
||||
|
||||
###### Mark Indents/Cleanup ######
|
||||
#
|
||||
#
|
||||
# Replace series of non-breaking spaces with text-indent
|
||||
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
|
||||
html = txtindent.sub(self.insert_indent, html)
|
||||
@ -176,8 +176,8 @@ class PreProcessor(object):
|
||||
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
|
||||
#
|
||||
# Unwrap and/or delete soft-hyphens, hyphens
|
||||
html = re.sub(u'\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
||||
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
|
||||
html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
|
||||
|
||||
# Unwrap lines using punctuation and line length
|
||||
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||
@ -193,8 +193,8 @@ class PreProcessor(object):
|
||||
# headings and titles, images, etc
|
||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||
|
||||
|
||||
# put back non-breaking spaces in empty paragraphs to preserve original formatting
|
||||
html = blankreg.sub('\n'+'\g<openline>'+' '+'\g<closeline>', html)
|
||||
|
||||
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
|
||||
|
||||
return html
|
||||
|
@ -490,7 +490,7 @@ class HTMLInput(InputFormatPlugin):
|
||||
return (None, None)
|
||||
return (None, raw)
|
||||
|
||||
def preprocess_html(self, options, html):
|
||||
def preprocess_html(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
@ -53,8 +53,8 @@ class LITInput(InputFormatPlugin):
|
||||
pre.append(ne)
|
||||
|
||||
|
||||
def preprocess_html(self, options, html):
|
||||
self.options = options
|
||||
def preprocess_html(self, options, html):
|
||||
self.options = options
|
||||
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
||||
return preprocessor(html)
|
||||
|
||||
|
@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
|
||||
|
||||
then the problem is probably a corrupted font cache. You can clear the cache by following these
|
||||
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
|
||||
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
|
||||
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
|
||||
check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts, and then choose
|
||||
"Validate fonts" from the File menu.
|
||||
|
||||
My antivirus program claims |app| is a virus/trojan?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
Loading…
x
Reference in New Issue
Block a user