diff --git a/resources/recipes/danas.recipe b/resources/recipes/danas.recipe index 6d6042b5c9..3543acd684 100644 --- a/resources/recipes/danas.recipe +++ b/resources/recipes/danas.recipe @@ -27,10 +27,19 @@ class Danas(BasicNewsRecipe): @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} - .antrfileText{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; - margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} - .antrfileNaslov{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; - font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} + .antrfileText{border-left: 2px solid #999999; + margin-left: 0.8em; + padding-left: 1.2em; + margin-bottom: 0; + margin-top: 0} + h2,.datum,.lokacija,.autor{font-size: small} + .antrfileNaslov{border-left: 2px solid #999999; + margin-left: 0.8em; + padding-left: 1.2em; + font-weight:bold; + margin-bottom: 0; + margin-top: 0} + img{margin-bottom: 0.8em} """ conversion_options = { @@ -40,18 +49,7 @@ class Danas(BasicNewsRecipe): , 'language' : language } - preprocess_regexps = [ - (re.compile(u'\u0110'), lambda match: u'\u00D0') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ,(re.compile(r'',re.DOTALL|re.IGNORECASE), lambda match: r'') - ] + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] keep_only_tags = [dict(name='div', attrs={'id':'left'})] remove_tags = [ @@ -59,7 +57,7 @@ class Danas(BasicNewsRecipe): ,dict(name='div', attrs={'id':'comments'}) ,dict(name=['object','link','iframe','meta']) ] - remove_attributes = ['st'] + remove_attributes = ['w:st','st'] feeds = [ (u'Politika' , u'http://www.danas.rs/rss/rss.asp?column_id=27') @@ -87,10 +85,13 @@ class Danas(BasicNewsRecipe): ,(u'Zvaka u pepeljari' , u'http://www.danas.rs/rss/rss.asp?column_id=56') ,(u'Vostani Serbie' , u'http://www.danas.rs/rss/rss.asp?column_id=57') ,(u'Med&Jad-a' , u'http://www.danas.rs/rss/rss.asp?column_id=58') - ,(u'Svetlosti pozornice' , u'http://www.danas.rs/rss/rss.asp?column_id=59') + ,(u'Svetlosti pozornice' , u'http://www.danas.rs/rss/rss.asp?column_id=59') ] def preprocess_html(self, soup): + for tagn in ['st1:place','st1:city','st1:country-region','st1:state']: + for item in soup.body.findAll(tagn): + item.name='span' for item in soup.findAll(style=True): del item['style'] for item in soup.findAll('a'): @@ -98,7 +99,7 @@ class Danas(BasicNewsRecipe): item.extract() for item in soup.findAll('img'): if not item.has_key('alt'): - item['alt'] = 'image' + item['alt'] = 'image' return soup def print_version(self, url): diff --git a/resources/recipes/espn.recipe b/resources/recipes/espn.recipe index 5d75c3977b..178dbf27a8 100644 --- a/resources/recipes/espn.recipe +++ b/resources/recipes/espn.recipe @@ -8,6 +8,7 @@ espn.com ''' import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ptempfile import TemporaryFile class ESPN(BasicNewsRecipe): @@ -78,12 +79,19 @@ class ESPN(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser() br.set_handle_refresh(False) - if self.username is not None and self.password is not None: - br.open('http://espn.com')#('http://espn.go.com/#myespn') - br.select_form(nr=1) - br.form.find_control(name='username', type='text').value = self.username - br.form['password'] = self.password - br.submit() + url = ('https://r.espn.go.com/members/v3_1/login') + raw = br.open(url).read() + raw = re.sub(r'(?s)
.*?id="regsigninbtn".*?
', '', raw) + with TemporaryFile(suffix='.htm') as fname: + with open(fname, 'wb') as f: + f.write(raw) + br.open_local_file(fname) + + br.form = br.forms().next() + br.form.find_control(name='username', type='text').value = self.username + br.form['password'] = self.password + br.submit().read() + br.open('http://espn.go.com').read() br.set_handle_refresh(True) return br diff --git a/resources/recipes/taz.recipe b/resources/recipes/taz.recipe index 45b414f3cd..93ce5fded0 100644 --- a/resources/recipes/taz.recipe +++ b/resources/recipes/taz.recipe @@ -8,8 +8,9 @@ __docformat__ = 'restructuredtext de' ''' www.taz.de/digiabo ''' -import os, urllib2, zipfile, tempfile +import os, urllib2, zipfile from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ptempfile import PersistentTemporaryFile class TazDigiabo(BasicNewsRecipe): @@ -26,38 +27,39 @@ class TazDigiabo(BasicNewsRecipe): } def build_index(self): - if self.username is not None and self.password is not None: - domain = "http://www.taz.de" + domain = "http://www.taz.de" - url = domain + "/epub/" + url = domain + "/epub/" - auth_handler = urllib2.HTTPBasicAuthHandler() - auth_handler.add_password(realm='TAZ-ABO', - uri=url, - user=self.username, - passwd=self.password) - opener = urllib2.build_opener(auth_handler) - urllib2.install_opener(opener) + auth_handler = urllib2.HTTPBasicAuthHandler() + auth_handler.add_password(realm='TAZ-ABO', + uri=url, + user=self.username, + passwd=self.password) + opener = urllib2.build_opener(auth_handler) + urllib2.install_opener(opener) - try: - f = urllib2.urlopen(url) - except urllib2.HTTPError: - self.report_progress(0,_('Can\'t login to download issue')) - raise ValueError('Failed to login, check your username and' - ' password') + try: + f = urllib2.urlopen(url) + except urllib2.HTTPError: + self.report_progress(0,_('Can\'t login to download issue')) + raise ValueError('Failed to login, check your username and' + ' password') - tmp = tempfile.TemporaryFile() - self.report_progress(0,_('downloading epub')) - tmp.write(f.read()) + tmp = PersistentTemporaryFile(suffix='.epub') + self.report_progress(0,_('downloading epub')) + tmp.write(f.read()) + tmp.close() - zfile = zipfile.ZipFile(tmp, 'r') - self.report_progress(0,_('extracting epub')) + zfile = zipfile.ZipFile(tmp.name, 'r') + self.report_progress(0,_('extracting epub')) - zfile.extractall(self.output_dir) + zfile.extractall(self.output_dir) - tmp.close() - index = os.path.join(self.output_dir, 'content.opf') + tmp.close() + index = os.path.join(self.output_dir, 'content.opf') - self.report_progress(1,_('epub downloaded and extracted')) + self.report_progress(1,_('epub downloaded and extracted')) + + return index - return index diff --git a/resources/recipes/taz_rss.recipe b/resources/recipes/taz_rss.recipe new file mode 100644 index 0000000000..6520a23b63 --- /dev/null +++ b/resources/recipes/taz_rss.recipe @@ -0,0 +1,24 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Alexander Schremmer ' + +from calibre.resources.recipes import BasicNewsRecipe + +class TazRSSRecipe(BasicNewsRecipe): + title = u'Taz.de (die tageszeitung) RSS Feed - German' + __author__ = 'Alexander Schremmer' + language = 'de' + lang = 'de-DE' + oldest_article = 7 + max_articles_per_feed = 100 + publisher = 'taz Entwicklungs GmbH & Co. Medien KG' + + conversion_options = {'publisher': publisher, + 'language': lang, + } + + feeds = [(u'TAZ main feed', u'http://www.taz.de/rss.xml')] + keep_only_tags = [dict(name='div', attrs={'class': 'sect sect_article'})] + remove_tags_after = dict(name='div', attrs={'class': 'rack'}) + remove_tags = [dict(name=['div'], attrs={'class': 'rack'}), + dict(name=['div'], attrs={'class': 'artikelwerbung'}), + dict(name=['ul'], attrs={'class': 'toolbar'}),] diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 72c067747d..8c8ce8c686 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -155,7 +155,7 @@ class InputFormatPlugin(Plugin): ''' raise NotImplementedError() - def preprocess_html(self, html): + def preprocess_html(self, opts, html): ''' This method is called by the conversion pipeline on all HTML before it is parsed. It is meant to be used to do any required preprocessing on diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e72e15c3d9..03a0047927 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -144,7 +144,6 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ # Fix umlauts - # ¨ (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'), @@ -399,7 +398,7 @@ class HTMLPreProcessor(object): html = unidecoder.decode(html) if self.plugin_preprocess: - html = self.input_plugin_preprocess(html) + html = self.input_plugin_preprocess(self.extra_opts, html) if getattr(self.extra_opts, 'smarten_punctuation', False): html = self.smarten_punctuation(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 3fe6ce0ed4..37fd169cb1 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -11,7 +11,7 @@ from calibre.utils.logging import default_log class PreProcessor(object): - def __init__(self, log=None, extra_opts=None): + def __init__(self, extra_opts=None, log=None): self.log = default_log if log is None else log self.html_preprocess_sections = 0 self.found_indents = 0 @@ -77,6 +77,32 @@ class PreProcessor(object): def __call__(self, html): self.log("********* Preprocessing HTML *********") + ###### Check Markup ###### + # + # some lit files don't have any

tags or equivalent (generally just plain text between + #

 tags), check and  mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+             self.log("not enough paragraph markers, adding now")
+             # check if content is in pre tags, use txt procesor to mark up if so
+             pre = re.compile(r'
', re.IGNORECASE)
+             if len(pre.findall(html)) == 1:
+                 self.log("Running Text Processing")
+                 from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+                 separate_paragraphs_single_line
+                 outerhtml = re.compile(r'.*?(?<=
)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL) + html = outerhtml.sub('\g', html) + html = separate_paragraphs_single_line(html) + html = preserve_spaces(html) + html = convert_basic(html, epub_split_size_kb=0) + else: + # Add markup naively + # TODO - find out if there are cases where there are more than one
 tag or
+                 # other types of unmarked html and handle them in some better fashion
+                 add_markup = re.compile('(?)(\n)')
+                 html = add_markup.sub('

\n

', html) + + ###### Mark Indents/Cleanup ###### + # # Replace series of non-breaking spaces with text-indent txtindent = re.compile(ur'[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) html = txtindent.sub(self.insert_indent, html) @@ -86,31 +112,27 @@ class PreProcessor(object): html = re.sub(ur'\u00a0', ' ', html) # Get rid of empty tags to simplify other processing html = re.sub(ur'\s*\s*', ' ', html) - # Get rid of empty span tags - html = re.sub(r"\s*]*>\s*", " ", html) + # Get rid of empty span, bold, & italics tags + html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) + html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*\s*){0,2}\s*", " ", html) + html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) - blankreg = re.compile(r'\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

', re.IGNORECASE) + blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) #multi_blank = re.compile(r'(\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) if len(lines) > 1: self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: + if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, + 'remove_paragraph_spacing', False): self.log("deleting blank lines") html = blankreg.sub('', html) # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*

", "

\n", html) html = re.sub(r"\s*

\s*", "\n

", html) - - # some lit files don't have any

tags or equivalent (generally just plain text between - #

 tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-             self.log("not enough paragraph markers, adding now")
-             add_markup = re.compile('(?)(\n)')
-             html = add_markup.sub('

\n

', html) - + #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) @@ -118,7 +140,7 @@ class PreProcessor(object): # # Start with most typical chapter headings, get more aggressive until one works if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(){0,2})\s*()?s*()?\s*(){0,2}\s*()\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") @@ -127,10 +149,10 @@ class PreProcessor(object): if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) - # Unwrap lines + ###### Unwrap lines ###### # self.log("Unwrapping Lines") # Some OCR sourced files have line breaks in the html using a combination of span & p tags @@ -149,13 +171,13 @@ class PreProcessor(object): format = 'html' # Calculate Length - length = line_length('pdf', html, getattr(self.extra_opts, + length = line_length(format, html, getattr(self.extra_opts, 'html_unwrap_factor', 0.4)) - self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***") + self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") # # Unwrap and/or delete soft-hyphens, hyphens - html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) - html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) # Unwrap lines using punctation and line length unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) @@ -164,13 +186,15 @@ class PreProcessor(object): # If still no sections after unwrapping mark split points on lines with no punctuation if self.html_preprocess_sections < 10: self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) - #self.log(html) chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html) + + # put back non-breaking spaces in empty paragraphs to preserve original formatting + html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html) return html diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 084d48e54b..603adadb53 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -490,7 +490,8 @@ class HTMLInput(InputFormatPlugin): return (None, None) return (None, raw) - def preprocess_html(self, html): - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + def preprocess_html(self, options, html): + self.options = options + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 65f5c607a2..46a5e75977 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -53,7 +53,8 @@ class LITInput(InputFormatPlugin): pre.append(ne) - def preprocess_html(self, html): - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + def preprocess_html(self, options, html): + self.options = options + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py index c54f3b071f..70529c0a04 100644 --- a/src/calibre/ebooks/lrf/input.py +++ b/src/calibre/ebooks/lrf/input.py @@ -420,8 +420,9 @@ class LRFInput(InputFormatPlugin): styles.write() return os.path.abspath('content.opf') - def preprocess_html(self, html): - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + def preprocess_html(self, options, html): + self.options = options + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) return preprocessor(html) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index b8dc7a9560..9ab7996a74 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -39,11 +39,11 @@ class MOBIInput(InputFormatPlugin): accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path - def preprocess_html(self, html): + def preprocess_html(self, options, html): # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html) return html diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 000c603c1c..078b30627f 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -229,7 +229,7 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - preprocessor = PreProcessor(log=getattr(self, 'log', None)) + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index 173c69910f..253686344a 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -6,7 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' from threading import Thread import os, re, shutil -from PyQt4.Qt import SIGNAL, QDialog, QGridLayout +from PyQt4.Qt import QDialog, QGridLayout from PyQt4 import QtGui from calibre.gui2.dialogs.metadata_bulk_ui import Ui_MetadataBulkDialog @@ -136,12 +136,10 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): self.series.editTextChanged.connect(self.series_changed) self.tag_editor_button.clicked.connect(self.tag_editor) -# Haven't yet figured out how to hide a single tab -# if len(db.custom_column_label_map) == 0: -# self.central_widget.widget(1).setVisible(False) -# else: -# self.create_custom_column_editors() - self.create_custom_column_editors() + if len(db.custom_column_label_map) == 0: + self.central_widget.removeTab(1) + else: + self.create_custom_column_editors() self.prepare_search_and_replace() self.exec_() @@ -201,21 +199,11 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): self.s_r_obj = None self.replace_func.addItems(sorted(self.s_r_functions.keys())) - self.connect(self.search_field, - SIGNAL('currentIndexChanged(const QString &)'), - self.s_r_field_changed) - self.connect(self.replace_func, - SIGNAL('currentIndexChanged(const QString &)'), - self.s_r_paint_results) - self.connect(self.search_for, - SIGNAL('editTextChanged(const QString &)'), - self.s_r_paint_results) - self.connect(self.replace_with, - SIGNAL('editTextChanged(const QString &)'), - self.s_r_paint_results) - self.connect(self.test_text, - SIGNAL('editTextChanged(const QString &)'), - self.s_r_paint_results) + self.search_field.currentIndexChanged[str].connect(self.s_r_field_changed) + self.replace_func.currentIndexChanged[str].connect(self.s_r_paint_results) + self.search_for.editTextChanged[str].connect(self.s_r_paint_results) + self.replace_with.editTextChanged[str].connect(self.s_r_paint_results) + self.test_text.editTextChanged[str].connect(self.s_r_paint_results) def s_r_field_changed(self, txt): txt = unicode(txt) diff --git a/src/calibre/gui2/dialogs/metadata_bulk.ui b/src/calibre/gui2/dialogs/metadata_bulk.ui index f51b93cafa..04fb3d4602 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.ui +++ b/src/calibre/gui2/dialogs/metadata_bulk.ui @@ -6,8 +6,8 @@ <rect> <x>0</x> <y>0</y> - <width>526</width> - <height>499</height> + <width>572</width> + <height>554</height> </rect> </property> <property name="windowTitle"> @@ -200,14 +200,15 @@ </item> <item row="6" column="2"> <widget class="QCheckBox" name="remove_all_tags"> - <property name="text"> - <string>Remove all</string> - </property> <property name="toolTip"> <string>Check this box to remove all tags from the books.</string> </property> + <property name="text"> + <string>Remove all</string> + </property> </widget> - </item><item row="7" column="0"> + </item> + <item row="7" column="0"> <widget class="QLabel" name="label_7"> <property name="text"> <string>&Series:</string> @@ -303,7 +304,7 @@ Future conversion of these books will use the default settings.</string> </widget> <widget class="QWidget" name="tabWidgetPage3"> <attribute name="title"> - <string>&Search and replace</string> + <string>&Search and replace (experimental)</string> </attribute> <layout class="QGridLayout" name="gridLayout"> <property name="sizeConstraint"> diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index c2d727e3c2..eb6e8336f9 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -143,6 +143,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): SchemaUpgrade.__init__(self) self.initialize_dynamic() + def get_property(self, idx, index_is_id=False, loc=-1): + row = self.data._data[idx] if index_is_id else self.data[idx] + if row is not None: + return row[loc] + def initialize_dynamic(self): self.field_metadata = FieldMetadata() #Ensure we start with a clean copy self.prefs = DBPrefs(self) @@ -324,19 +329,13 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self.last_update_check = self.last_modified() - def get_property(idx, index_is_id=False, loc=-1): - row = self.data._data[idx] if index_is_id else self.data[idx] - if row is not None: - return row[loc] - for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn', 'publisher', 'rating', 'series', 'series_index', 'tags', 'title', 'timestamp', 'uuid', 'pubdate', 'ondevice'): - setattr(self, prop, functools.partial(get_property, + setattr(self, prop, functools.partial(self.get_property, loc=self.FIELD_MAP['comments' if prop == 'comment' else prop])) - setattr(self, 'title_sort', functools.partial(get_property, + setattr(self, 'title_sort', functools.partial(self.get_property, loc=self.FIELD_MAP['sort'])) - setattr(self, 'get_property', get_property) def initialize_database(self): metadata_sqlite = open(P('metadata_sqlite.sql'), 'rb').read() @@ -440,7 +439,8 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): if not f: continue stream = cStringIO.StringIO(f) - self.add_format(id, format, stream, index_is_id=True, path=tpath) + self.add_format(id, format, stream, index_is_id=True, + path=tpath, notify=False) self.conn.execute('UPDATE books SET path=? WHERE id=?', (path, id)) if commit: self.conn.commit() diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index b93444f4c3..781048666b 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -376,7 +376,9 @@ be printed to it. If the debug output contains a line that looks like:: then the problem is probably a corrupted font cache. You can clear the cache by following these `instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't -solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. +solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like. An easy way to +check for corrupted fonts in OS X is to start the "Font Book" application, select all fonts and then in the File +menu, choose "Validate fonts". My antivirus program claims |app| is a virus/trojan? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/calibre/web/feeds/input.py b/src/calibre/web/feeds/input.py index 32d9075465..9b9a34be7d 100644 --- a/src/calibre/web/feeds/input.py +++ b/src/calibre/web/feeds/input.py @@ -10,6 +10,7 @@ import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.constants import numeric_version +from calibre import walk class RecipeDisabled(Exception): pass @@ -111,6 +112,10 @@ class RecipeInput(InputFormatPlugin): if f.endswith('.opf'): return os.path.abspath(f) + for f in walk('.'): + if f.endswith('.opf'): + return os.path.abspath(f) + def postprocess_book(self, oeb, opts, log): if self.recipe_object is not None: self.recipe_object.postprocess_book(oeb, opts, log)