Merge from trunk

This commit is contained in:
Charles Haley 2010-09-17 19:39:49 +01:00
commit 6fc3a25556
17 changed files with 182 additions and 125 deletions

View File

@ -27,10 +27,19 @@ class Danas(BasicNewsRecipe):
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
.article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif}
.nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif}
.antrfileText{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; .antrfileText{border-left: 2px solid #999999;
margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} margin-left: 0.8em;
.antrfileNaslov{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; padding-left: 1.2em;
font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} margin-bottom: 0;
margin-top: 0}
h2,.datum,.lokacija,.autor{font-size: small}
.antrfileNaslov{border-left: 2px solid #999999;
margin-left: 0.8em;
padding-left: 1.2em;
font-weight:bold;
margin-bottom: 0;
margin-top: 0}
img{margin-bottom: 0.8em}
""" """
conversion_options = { conversion_options = {
@ -40,18 +49,7 @@ class Danas(BasicNewsRecipe):
, 'language' : language , 'language' : language
} }
preprocess_regexps = [ preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
(re.compile(u'\u0110'), lambda match: u'\u00D0')
,(re.compile(r'<?xml:namespace prefix = st1 ns = "urn:schemas-microsoft-com:office:smarttags".*?/>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'<st1:place.*?>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'</st1:place>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'<st1:city.*?>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'</st1:city>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'<st1:country-region.*?>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'</st1:country-region>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'<st1:state.*?>',re.DOTALL|re.IGNORECASE), lambda match: r'')
,(re.compile(r'</st1:state>',re.DOTALL|re.IGNORECASE), lambda match: r'')
]
keep_only_tags = [dict(name='div', attrs={'id':'left'})] keep_only_tags = [dict(name='div', attrs={'id':'left'})]
remove_tags = [ remove_tags = [
@ -59,7 +57,7 @@ class Danas(BasicNewsRecipe):
,dict(name='div', attrs={'id':'comments'}) ,dict(name='div', attrs={'id':'comments'})
,dict(name=['object','link','iframe','meta']) ,dict(name=['object','link','iframe','meta'])
] ]
remove_attributes = ['st'] remove_attributes = ['w:st','st']
feeds = [ feeds = [
(u'Politika' , u'http://www.danas.rs/rss/rss.asp?column_id=27') (u'Politika' , u'http://www.danas.rs/rss/rss.asp?column_id=27')
@ -91,6 +89,9 @@ class Danas(BasicNewsRecipe):
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
for tagn in ['st1:place','st1:city','st1:country-region','st1:state']:
for item in soup.body.findAll(tagn):
item.name='span'
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
for item in soup.findAll('a'): for item in soup.findAll('a'):

View File

@ -8,6 +8,7 @@ espn.com
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import TemporaryFile
class ESPN(BasicNewsRecipe): class ESPN(BasicNewsRecipe):
@ -78,12 +79,19 @@ class ESPN(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.set_handle_refresh(False) br.set_handle_refresh(False)
if self.username is not None and self.password is not None: url = ('https://r.espn.go.com/members/v3_1/login')
br.open('http://espn.com')#('http://espn.go.com/#myespn') raw = br.open(url).read()
br.select_form(nr=1) raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
br.form.find_control(name='username', type='text').value = self.username with TemporaryFile(suffix='.htm') as fname:
br.form['password'] = self.password with open(fname, 'wb') as f:
br.submit() f.write(raw)
br.open_local_file(fname)
br.form = br.forms().next()
br.form.find_control(name='username', type='text').value = self.username
br.form['password'] = self.password
br.submit().read()
br.open('http://espn.go.com').read()
br.set_handle_refresh(True) br.set_handle_refresh(True)
return br return br

View File

@ -8,8 +8,9 @@ __docformat__ = 'restructuredtext de'
''' '''
www.taz.de/digiabo www.taz.de/digiabo
''' '''
import os, urllib2, zipfile, tempfile import os, urllib2, zipfile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class TazDigiabo(BasicNewsRecipe): class TazDigiabo(BasicNewsRecipe):
@ -26,38 +27,39 @@ class TazDigiabo(BasicNewsRecipe):
} }
def build_index(self): def build_index(self):
if self.username is not None and self.password is not None: domain = "http://www.taz.de"
domain = "http://www.taz.de"
url = domain + "/epub/" url = domain + "/epub/"
auth_handler = urllib2.HTTPBasicAuthHandler() auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='TAZ-ABO', auth_handler.add_password(realm='TAZ-ABO',
uri=url, uri=url,
user=self.username, user=self.username,
passwd=self.password) passwd=self.password)
opener = urllib2.build_opener(auth_handler) opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(opener) urllib2.install_opener(opener)
try: try:
f = urllib2.urlopen(url) f = urllib2.urlopen(url)
except urllib2.HTTPError: except urllib2.HTTPError:
self.report_progress(0,_('Can\'t login to download issue')) self.report_progress(0,_('Can\'t login to download issue'))
raise ValueError('Failed to login, check your username and' raise ValueError('Failed to login, check your username and'
' password') ' password')
tmp = tempfile.TemporaryFile() tmp = PersistentTemporaryFile(suffix='.epub')
self.report_progress(0,_('downloading epub')) self.report_progress(0,_('downloading epub'))
tmp.write(f.read()) tmp.write(f.read())
tmp.close()
zfile = zipfile.ZipFile(tmp, 'r') zfile = zipfile.ZipFile(tmp.name, 'r')
self.report_progress(0,_('extracting epub')) self.report_progress(0,_('extracting epub'))
zfile.extractall(self.output_dir) zfile.extractall(self.output_dir)
tmp.close() tmp.close()
index = os.path.join(self.output_dir, 'content.opf') index = os.path.join(self.output_dir, 'content.opf')
self.report_progress(1,_('epub downloaded and extracted')) self.report_progress(1,_('epub downloaded and extracted'))
return index
return index

View File

@ -0,0 +1,24 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Alexander Schremmer <alex@alexanderweb.de>'
from calibre.resources.recipes import BasicNewsRecipe
class TazRSSRecipe(BasicNewsRecipe):
    """Recipe settings for the taz.de (die tageszeitung) main RSS feed.

    The class only declares attributes; all behaviour comes from the
    ``BasicNewsRecipe`` base class, which is imported from outside this
    block.
    """

    title = u'Taz.de (die tageszeitung) RSS Feed - German'
    __author__ = 'Alexander Schremmer'

    # Two language codes are kept: the short code as the ``language``
    # attribute and the regional code, which is only used for the
    # 'language' entry of ``conversion_options`` below.
    language = 'de'
    lang = 'de-DE'

    # NOTE(review): presumably an age limit in days and a per-feed article
    # cap -- confirm against the BasicNewsRecipe documentation.
    oldest_article = 7
    max_articles_per_feed = 100

    publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
    conversion_options = {
        'publisher': publisher,
        'language': lang,
    }

    # (feed title, feed URL) pairs.
    feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')]

    # Tag matchers handed to the base class. The 'rack' div appears both as
    # the ``remove_tags_after`` marker and in ``remove_tags`` itself.
    keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})]
    remove_tags_after = dict(name='div', attrs={'class': 'rack'})
    remove_tags = [
        dict(name=['div'], attrs={'class': 'rack'}),
        dict(name=['div'], attrs={'class': 'artikelwerbung'}),
        dict(name=['ul'], attrs={'class': 'toolbar'}),
    ]

View File

@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def preprocess_html(self, html): def preprocess_html(self, opts, html):
''' '''
This method is called by the conversion pipeline on all HTML before it This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on is parsed. It is meant to be used to do any required preprocessing on

View File

@ -144,7 +144,6 @@ class HTMLPreProcessor(object):
# Fix pdftohtml markup # Fix pdftohtml markup
PDFTOHTML = [ PDFTOHTML = [
# Fix umlauts # Fix umlauts
# ¨
(re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
(re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'), (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
@ -399,7 +398,7 @@ class HTMLPreProcessor(object):
html = unidecoder.decode(html) html = unidecoder.decode(html)
if self.plugin_preprocess: if self.plugin_preprocess:
html = self.input_plugin_preprocess(html) html = self.input_plugin_preprocess(self.extra_opts, html)
if getattr(self.extra_opts, 'smarten_punctuation', False): if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html) html = self.smarten_punctuation(html)

View File

@ -11,7 +11,7 @@ from calibre.utils.logging import default_log
class PreProcessor(object): class PreProcessor(object):
def __init__(self, log=None, extra_opts=None): def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log self.log = default_log if log is None else log
self.html_preprocess_sections = 0 self.html_preprocess_sections = 0
self.found_indents = 0 self.found_indents = 0
@ -77,6 +77,32 @@ class PreProcessor(object):
def __call__(self, html): def __call__(self, html):
self.log("********* Preprocessing HTML *********") self.log("********* Preprocessing HTML *********")
###### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
# check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub('\g<text>', html)
html = separate_paragraphs_single_line(html)
html = preserve_spaces(html)
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
# TODO - find out if there are cases where there is more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
###### Mark Indents/Cleanup ######
#
# Replace series of non-breaking spaces with text-indent # Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html) html = txtindent.sub(self.insert_indent, html)
@ -86,31 +112,27 @@ class PreProcessor(object):
html = re.sub(ur'\u00a0', ' ', html) html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty <o:p> tags to simplify other processing # Get rid of empty <o:p> tags to simplify other processing
html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
# Get rid of empty span tags # Get rid of empty span, bold, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE) #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
blanklines = blankreg.findall(html) blanklines = blankreg.findall(html)
lines = linereg.findall(html) lines = linereg.findall(html)
if len(lines) > 1: if len(lines) > 1:
self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40: if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
'remove_paragraph_spacing', False):
self.log("deleting blank lines") self.log("deleting blank lines")
html = blankreg.sub('', html) html = blankreg.sub('', html)
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</p>", "</p>\n", html) html = re.sub(r"\s*</p>", "</p>\n", html)
html = re.sub(r"\s*<p>\s*", "\n<p>", html) html = re.sub(r"\s*<p>\s*", "\n<p>", html)
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
# detect chapters/sections to match xpath or splitting logic # detect chapters/sections to match xpath or splitting logic
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html)) self.html_preprocess_sections = len(heading.findall(html))
@ -118,7 +140,7 @@ class PreProcessor(object):
# #
# Start with most typical chapter headings, get more aggressive until one works # Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10: if self.html_preprocess_sections < 10:
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html) html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10: if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
@ -127,10 +149,10 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10: if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html) html = chapdetect2.sub(self.chapter_head, html)
# Unwrap lines ###### Unwrap lines ######
# #
self.log("Unwrapping Lines") self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags # Some OCR sourced files have line breaks in the html using a combination of span & p tags
@ -149,13 +171,13 @@ class PreProcessor(object):
format = 'html' format = 'html'
# Calculate Length # Calculate Length
length = line_length('pdf', html, getattr(self.extra_opts, length = line_length(format, html, getattr(self.extra_opts,
'html_unwrap_factor', 0.4)) 'html_unwrap_factor', 0.4))
self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
# #
# Unwrap and/or delete soft-hyphens, hyphens # Unwrap and/or delete soft-hyphens, hyphens
html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
# Unwrap lines using punctuation and line length # Unwrap lines using punctuation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
@ -164,13 +186,15 @@ class PreProcessor(object):
# If still no sections after unwrapping mark split points on lines with no punctuation # If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10: if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
#self.log(html)
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html) html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another # search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter # top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc # headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
# put back non-breaking spaces in empty paragraphs to preserve original formatting
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
return html return html

View File

@ -490,7 +490,8 @@ class HTMLInput(InputFormatPlugin):
return (None, None) return (None, None)
return (None, raw) return (None, raw)
def preprocess_html(self, html): def preprocess_html(self, options, html):
preprocessor = PreProcessor(log=getattr(self, 'log', None)) self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -53,7 +53,8 @@ class LITInput(InputFormatPlugin):
pre.append(ne) pre.append(ne)
def preprocess_html(self, html): def preprocess_html(self, options, html):
preprocessor = PreProcessor(log=getattr(self, 'log', None)) self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -420,8 +420,9 @@ class LRFInput(InputFormatPlugin):
styles.write() styles.write()
return os.path.abspath('content.opf') return os.path.abspath('content.opf')
def preprocess_html(self, html): def preprocess_html(self, options, html):
preprocessor = PreProcessor(log=getattr(self, 'log', None)) self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -39,11 +39,11 @@ class MOBIInput(InputFormatPlugin):
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path return mr.created_opf_path
def preprocess_html(self, html): def preprocess_html(self, options, html):
# search for places where a first or second level heading is immediately followed by another # search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter # top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc # headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
return html return html

View File

@ -229,7 +229,7 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result) res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html: if self.options.preprocess_html:
preprocessor = PreProcessor(log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
res = preprocessor(res) res = preprocessor(res)
f.write(res) f.write(res)
self.write_inline_css(inline_class) self.write_inline_css(inline_class)

View File

@ -6,7 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
from threading import Thread from threading import Thread
import os, re, shutil import os, re, shutil
from PyQt4.Qt import SIGNAL, QDialog, QGridLayout from PyQt4.Qt import QDialog, QGridLayout
from PyQt4 import QtGui from PyQt4 import QtGui
from calibre.gui2.dialogs.metadata_bulk_ui import Ui_MetadataBulkDialog from calibre.gui2.dialogs.metadata_bulk_ui import Ui_MetadataBulkDialog
@ -136,12 +136,10 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.series.editTextChanged.connect(self.series_changed) self.series.editTextChanged.connect(self.series_changed)
self.tag_editor_button.clicked.connect(self.tag_editor) self.tag_editor_button.clicked.connect(self.tag_editor)
# Haven't yet figured out how to hide a single tab if len(db.custom_column_label_map) == 0:
# if len(db.custom_column_label_map) == 0: self.central_widget.removeTab(1)
# self.central_widget.widget(1).setVisible(False) else:
# else: self.create_custom_column_editors()
# self.create_custom_column_editors()
self.create_custom_column_editors()
self.prepare_search_and_replace() self.prepare_search_and_replace()
self.exec_() self.exec_()
@ -201,21 +199,11 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
self.s_r_obj = None self.s_r_obj = None
self.replace_func.addItems(sorted(self.s_r_functions.keys())) self.replace_func.addItems(sorted(self.s_r_functions.keys()))
self.connect(self.search_field, self.search_field.currentIndexChanged[str].connect(self.s_r_field_changed)
SIGNAL('currentIndexChanged(const QString &)'), self.replace_func.currentIndexChanged[str].connect(self.s_r_paint_results)
self.s_r_field_changed) self.search_for.editTextChanged[str].connect(self.s_r_paint_results)
self.connect(self.replace_func, self.replace_with.editTextChanged[str].connect(self.s_r_paint_results)
SIGNAL('currentIndexChanged(const QString &)'), self.test_text.editTextChanged[str].connect(self.s_r_paint_results)
self.s_r_paint_results)
self.connect(self.search_for,
SIGNAL('editTextChanged(const QString &)'),
self.s_r_paint_results)
self.connect(self.replace_with,
SIGNAL('editTextChanged(const QString &)'),
self.s_r_paint_results)
self.connect(self.test_text,
SIGNAL('editTextChanged(const QString &)'),
self.s_r_paint_results)
def s_r_field_changed(self, txt): def s_r_field_changed(self, txt):
txt = unicode(txt) txt = unicode(txt)

View File

@ -6,8 +6,8 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>526</width> <width>572</width>
<height>499</height> <height>554</height>
</rect> </rect>
</property> </property>
<property name="windowTitle"> <property name="windowTitle">
@ -200,14 +200,15 @@
</item> </item>
<item row="6" column="2"> <item row="6" column="2">
<widget class="QCheckBox" name="remove_all_tags"> <widget class="QCheckBox" name="remove_all_tags">
<property name="text">
<string>Remove all</string>
</property>
<property name="toolTip"> <property name="toolTip">
<string>Check this box to remove all tags from the books.</string> <string>Check this box to remove all tags from the books.</string>
</property> </property>
<property name="text">
<string>Remove all</string>
</property>
</widget> </widget>
</item><item row="7" column="0"> </item>
<item row="7" column="0">
<widget class="QLabel" name="label_7"> <widget class="QLabel" name="label_7">
<property name="text"> <property name="text">
<string>&amp;Series:</string> <string>&amp;Series:</string>
@ -303,7 +304,7 @@ Future conversion of these books will use the default settings.</string>
</widget> </widget>
<widget class="QWidget" name="tabWidgetPage3"> <widget class="QWidget" name="tabWidgetPage3">
<attribute name="title"> <attribute name="title">
<string>&amp;Search and replace</string> <string>&amp;Search and replace (experimental)</string>
</attribute> </attribute>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<property name="sizeConstraint"> <property name="sizeConstraint">

View File

@ -143,6 +143,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
SchemaUpgrade.__init__(self) SchemaUpgrade.__init__(self)
self.initialize_dynamic() self.initialize_dynamic()
def get_property(self, idx, index_is_id=False, loc=-1):
    """Return one field of the row selected by ``idx``.

    :param idx: Key used to select the row. It indexes
        ``self.data._data`` when ``index_is_id`` is true and
        ``self.data`` otherwise.
    :param index_is_id: Chooses which of the two containers is indexed.
        NOTE(review): presumably "idx is a book id" versus "idx is a row
        number" -- confirm against the callers.
    :param loc: Position of the wanted field inside the row; defaults to
        the last field (-1).
    :return: ``row[loc]``, or ``None`` when the selected row is ``None``.
    """
    if index_is_id:
        row = self.data._data[idx]
    else:
        row = self.data[idx]
    if row is None:
        # An empty slot has no fields; signal that with None instead of
        # raising on the subscript below.
        return None
    return row[loc]
def initialize_dynamic(self): def initialize_dynamic(self):
self.field_metadata = FieldMetadata() #Ensure we start with a clean copy self.field_metadata = FieldMetadata() #Ensure we start with a clean copy
self.prefs = DBPrefs(self) self.prefs = DBPrefs(self)
@ -324,19 +329,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
self.last_update_check = self.last_modified() self.last_update_check = self.last_modified()
def get_property(idx, index_is_id=False, loc=-1):
row = self.data._data[idx] if index_is_id else self.data[idx]
if row is not None:
return row[loc]
for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn', for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn',
'publisher', 'rating', 'series', 'series_index', 'tags', 'publisher', 'rating', 'series', 'series_index', 'tags',
'title', 'timestamp', 'uuid', 'pubdate', 'ondevice'): 'title', 'timestamp', 'uuid', 'pubdate', 'ondevice'):
setattr(self, prop, functools.partial(get_property, setattr(self, prop, functools.partial(self.get_property,
loc=self.FIELD_MAP['comments' if prop == 'comment' else prop])) loc=self.FIELD_MAP['comments' if prop == 'comment' else prop]))
setattr(self, 'title_sort', functools.partial(get_property, setattr(self, 'title_sort', functools.partial(self.get_property,
loc=self.FIELD_MAP['sort'])) loc=self.FIELD_MAP['sort']))
setattr(self, 'get_property', get_property)
def initialize_database(self): def initialize_database(self):
metadata_sqlite = open(P('metadata_sqlite.sql'), 'rb').read() metadata_sqlite = open(P('metadata_sqlite.sql'), 'rb').read()
@ -440,7 +439,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
if not f: if not f:
continue continue
stream = cStringIO.StringIO(f) stream = cStringIO.StringIO(f)
self.add_format(id, format, stream, index_is_id=True, path=tpath) self.add_format(id, format, stream, index_is_id=True,
path=tpath, notify=False)
self.conn.execute('UPDATE books SET path=? WHERE id=?', (path, id)) self.conn.execute('UPDATE books SET path=? WHERE id=?', (path, id))
if commit: if commit:
self.conn.commit() self.conn.commit()

View File

@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like::
then the problem is probably a corrupted font cache. You can clear the cache by following these then the problem is probably a corrupted font cache. You can clear the cache by following these
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to
check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File
menu, choose "Validate fonts".
My antivirus program claims |app| is a virus/trojan? My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -10,6 +10,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.constants import numeric_version from calibre.constants import numeric_version
from calibre import walk
class RecipeDisabled(Exception): class RecipeDisabled(Exception):
pass pass
@ -111,6 +112,10 @@ class RecipeInput(InputFormatPlugin):
if f.endswith('.opf'): if f.endswith('.opf'):
return os.path.abspath(f) return os.path.abspath(f)
for f in walk('.'):
if f.endswith('.opf'):
return os.path.abspath(f)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
if self.recipe_object is not None: if self.recipe_object is not None:
self.recipe_object.postprocess_book(oeb, opts, log) self.recipe_object.postprocess_book(oeb, opts, log)