diff --git a/resources/recipes/ajc.recipe b/resources/recipes/ajc.recipe index ccd0efebdd..ea989b4b4c 100644 --- a/resources/recipes/ajc.recipe +++ b/resources/recipes/ajc.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__author__ = 'Tony Stegall' +__author__ = 'Tony Stegall' __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com' __version__ = '1.03' __date__ = '27, September 2010' @@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en' import datetime +from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1282101454(BasicNewsRecipe): now = datetime.datetime.now() title = 'The AJC' @@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): oldest_article = 1 max_articles_per_feed = 100 no_stylesheets = True - + masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif' extra_css = ''' h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} - + p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;} p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;} - - + + p{font-family:Helvetica,Arial,sans-serif;font-size:small;} ''' - - + + keep_only_tags = [ dict(name='div', attrs={'class':['cxArticleHeader']}) ,dict(attrs={'id':['cxArticleText']}) ] - - + + remove_tags = [ dict(name='div' , attrs={'class':'cxArticleList' }) ,dict(name='div' , attrs={'class':'cxFeedTease' }) ,dict(name='div' , attrs={'class':'cxElementEnlarge' }) ,dict(name='div' , attrs={'id':'cxArticleTools' }) ] - - - + + + feeds = [ ('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'), # ------------------------------------------------------------------- - # Here are the different area feeds. Choose which ever one you wish to + # Here are the different area feeds. Choose which ever one you wish to # read by simply removing the pound sign from it. I currently have it # set to only get the Cobb area # -------------------------------------------------------------------- @@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): ('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'), ('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'), # ------------------------------------------------------------------------ - # Here are the different sports feeds. I only follow the Falcons, and Highschool + # Here are the different sports feeds. 
I only follow the Falcons, and Highschool # but again # You can enable which ever team you like by removing the pound sign # ------------------------------------------------------------------------ @@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): ('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'), ] - + def postprocess_html(self, soup, first): for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}): credit_tag.extract() - + return soup - + #def print_version(self, url): # return url.partition('?')[0] +'?printArticle=y' - - - - - - + + + + + + diff --git a/resources/recipes/boortz.recipe b/resources/recipes/boortz.recipe index dfb624c4bc..b281798ac8 100644 --- a/resources/recipes/boortz.recipe +++ b/resources/recipes/boortz.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__author__ = 'Tony Stegall' +__author__ = 'Tony Stegall' __copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com' __version__ = '1.04' __date__ = '27, September 2010' @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, re + class AdvancedUserRecipe1282101454(BasicNewsRecipe): title = 'Nealz Nuze' language = 'en' @@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): category = 'news, politics, USA, talkshow' oldest_article = 1 max_articles_per_feed = 100 - + no_stylesheets = True remove_javascript = True use_embedded_content = True @@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): conversion_options = {'linearize_tables' : True} feeds = [ ('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml') - + ] diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe index 1527a1bb71..5f66d048a6 100644 --- a/resources/recipes/popscience.recipe +++ b/resources/recipes/popscience.recipe @@ -1,5 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import re +import re class AdvancedUserRecipe1282101454(BasicNewsRecipe): title = 'Popular Science' diff --git a/resources/recipes/telegraph_uk.recipe b/resources/recipes/telegraph_uk.recipe index 2c261987b2..f79f0fa50c 100644 --- a/resources/recipes/telegraph_uk.recipe +++ b/resources/recipes/telegraph_uk.recipe @@ -1,6 +1,5 @@ -#!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' telegraph.co.uk ''' @@ -8,14 +7,16 @@ telegraph.co.uk from calibre.web.feeds.news import BasicNewsRecipe class TelegraphUK(BasicNewsRecipe): - title = u'Telegraph.co.uk' + title = 'Telegraph.co.uk' __author__ = 'Darko Miletic and Sujata Raman' description = 'News from United Kingdom' - oldest_article = 7 + oldest_article = 2 + category = 'news, politics, UK' + publisher = 'Telegraph Media Group ltd.' 
max_articles_per_feed = 100 no_stylesheets = True - language = 'en' - + language = 'en_GB' + remove_empty_feeds = True use_embedded_content = False extra_css = ''' @@ -27,13 +28,20 @@ class TelegraphUK(BasicNewsRecipe): .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;} ''' + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + + keep_only_tags = [ - dict(name='div', attrs={'class':'storyHead'}) - ,dict(name='div', attrs={'class':'story' }) - #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] }) + dict(name='div', attrs={'class':['storyHead','byline']}) + ,dict(name='div', attrs={'id':'mainBodyArea' }) ] - remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']}) - #,dict(name='div', attrs={'class':['toolshideoneQuarter']}) + remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']}) + ,dict(name='ul' , attrs={'class':['shareThis shareBottom']}) ,dict(name='span', attrs={'class':['num','placeComment']}) ] @@ -51,24 +59,7 @@ class TelegraphUK(BasicNewsRecipe): ] def get_article_url(self, article): - - url = article.get('guid', None) - + url = article.get('link', None) if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url : url = None - return url - - - def postprocess_html(self,soup,first): - - for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}): - for pTag in bylineTag.findAll(name='p'): - if getattr(pTag.contents[0],"Comments",True): - pTag.extract() - return soup - - - - - diff --git a/resources/recipes/twtfb.recipe b/resources/recipes/twtfb.recipe new file mode 100644 index 0000000000..bb2bfe2348 --- /dev/null +++ b/resources/recipes/twtfb.recipe @@ -0,0 +1,40 @@ +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +www.thewaythefutureblogs.com +Frederik Pohl's Blog +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class TheWayTheFutureBlogs(BasicNewsRecipe): + title = 'The Way the Future Blogs' + __author__ = 'Darko Miletic' + description = "Frederik Pohl's blog" + publisher = 'Frederik Pohl' + category = 'news, SF, books' + oldest_article = 30 + max_articles_per_feed = 200 + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en' + remove_empty_feeds = True + extra_css = ' body{font-family: Georgia,serif } ' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } + remove_tags =[dict(name=['meta','object','embed','iframe','base','link'])] + keep_only_tags=[dict(attrs={'class':['post','commentlist']})] + remove_attributes=['width','height','lang','border'] + + feeds = [(u'Posts', u'http://www.thewaythefutureblogs.com/feed/')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index cf6995d3bb..d4d4ee5d4e 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -446,7 +446,7 @@ from calibre.devices.eb600.driver import EB600, COOL_ER, SHINEBOOK, \ BOOQ, ELONEX, POCKETBOOK301, MENTOR from calibre.devices.iliad.driver import ILIAD from calibre.devices.irexdr.driver import 
IREXDR1000, IREXDR800 -from calibre.devices.jetbook.driver import JETBOOK, MIBUK +from calibre.devices.jetbook.driver import JETBOOK, MIBUK, JETBOOK_MINI from calibre.devices.kindle.driver import KINDLE, KINDLE2, KINDLE_DX from calibre.devices.nook.driver import NOOK from calibre.devices.prs505.driver import PRS505 @@ -468,14 +468,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ - LibraryThingCovers + LibraryThingCovers, DoubanCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers] + Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] plugins += [ ComicInput, EPUBInput, @@ -520,6 +520,7 @@ plugins += [ IREXDR1000, IREXDR800, JETBOOK, + JETBOOK_MINI, MIBUK, SHINEBOOK, POCKETBOOK360, diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py index 2b5eb5011e..0310f09242 100644 --- a/src/calibre/customize/profiles.py +++ b/src/calibre/customize/profiles.py @@ -1,3 +1,4 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' @@ -251,6 +252,9 @@ class OutputProfile(Plugin): #: The character used to represent a star in ratings ratings_char = u'*' + #: Unsupported unicode characters to be replaced during preprocessing + unsupported_unicode_chars = [] + @classmethod def tags_to_string(cls, tags): return escape(', '.join(tags)) @@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile): dpi = 168.451 fbase = 12 fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24] + unsupported_unicode_chars = [u'\u201f', u'\u201b'] + class KoboReaderOutput(OutputProfile): diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 231cc0e225..844269e453 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name): config['enabled_plugins'] = ep default_disabled_plugins = set([ - 'Douban Books', + 'Douban Books', 'Douban.com covers', ]) def is_disabled(plugin): diff --git a/src/calibre/devices/__init__.py b/src/calibre/devices/__init__.py index 956d18e903..1918a36cc8 100644 --- a/src/calibre/devices/__init__.py +++ b/src/calibre/devices/__init__.py @@ -56,6 +56,7 @@ def get_connected_device(): return dev def debug(ioreg_to_tmp=False, buf=None): + import textwrap from calibre.customize.ui import device_plugins from calibre.devices.scanner import DeviceScanner, win_pnp_drives from calibre.constants import iswindows, isosx, __version__ @@ -95,13 +96,19 @@ def debug(ioreg_to_tmp=False, buf=None): ioreg += 'Output from osx_get_usb_drives:\n'+drives+'\n\n' ioreg += Device.run_ioreg() connected_devices = [] - for dev in sorted(device_plugins(), cmp=lambda - x,y:cmp(x.__class__.__name__, y.__class__.__name__)): - out('Looking for', dev.__class__.__name__) + devplugins = list(sorted(device_plugins(), cmp=lambda + x,y:cmp(x.__class__.__name__, y.__class__.__name__))) + out('Available plugins:', textwrap.fill(' '.join([x.__class__.__name__ for x in + devplugins]))) + out(' ') + out('Looking for devices...') + for dev in devplugins: connected, 
det = s.is_device_connected(dev, debug=True) if connected: + out('\t\tDetected possible device', dev.__class__.__name__) connected_devices.append((dev, det)) + out(' ') errors = {} success = False out('Devices possibly connected:', end=' ') diff --git a/src/calibre/devices/jetbook/driver.py b/src/calibre/devices/jetbook/driver.py index 6ee1c07464..f108de3347 100644 --- a/src/calibre/devices/jetbook/driver.py +++ b/src/calibre/devices/jetbook/driver.py @@ -99,4 +99,30 @@ class MIBUK(USBMS): VENDOR_NAME = 'LINUX' WINDOWS_MAIN_MEM = 'WOLDERMIBUK' +class JETBOOK_MINI(USBMS): + + ''' + ['0x4b8', + '0x507', + '0x100', + 'ECTACO', + 'ECTACO ATA/ATAPI Bridge (Bulk-Only)', + 'Rev.0.20'] + ''' + FORMATS = ['fb2', 'txt'] + + gui_name = 'JetBook Mini' + name = 'JetBook Mini Device Interface' + description = _('Communicate with the JetBook Mini reader.') + author = 'Kovid Goyal' + + VENDOR_ID = [0x4b8] + PRODUCT_ID = [0x507] + BCD = [0x100] + VENDOR_NAME = 'ECTACO' + WINDOWS_MAIN_MEM = '' # Matches PROD_ + MAIN_MEMORY_VOLUME_LABEL = 'Jetbook Mini' + + SUPPORTS_SUB_DIRS = True + diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 92c2fe5954..bb5c26a50c 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,49 +62,104 @@ def wrap_lines(match): else: return ital+' ' -def line_length(format, raw, percent): +class DocAnalysis(object): ''' - raw is the raw text to find the line length to use for wrapping. - percentage is a decimal number, 0 - 1 which is used to determine - how far in the list of line lengths to use. The list of line lengths is - ordered smallest to larged and does not include duplicates. 0.5 is the - median value. + Provides various text analysis functions to determine how the document is structured. + format is the type of document analysis will be done against. + raw is the raw text to determine the line length to use for wrapping. + Blank lines are excluded from analysis ''' - raw = raw.replace(' ', ' ') - if format == 'html': - linere = re.compile('(?<=)', re.DOTALL) - elif format == 'pdf': - linere = re.compile('(?<=
<br>).*?(?=<br>
)', re.DOTALL) - elif format == 'spanned_html': - linere = re.compile('(?<=)', re.DOTALL) - lines = linere.findall(raw) - lengths = [] - for line in lines: - if len(line) > 0: - lengths.append(len(line)) + def __init__(self, format='html', raw=''): + raw = raw.replace(' ', ' ') + if format == 'html': + linere = re.compile('(?<=]*>\s*
</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>
)', re.DOTALL) + elif format == 'spanned_html': + linere = re.compile('(?<=)', re.DOTALL) + self.lines = linere.findall(raw) - if not lengths: - return 0 + def line_length(self, percent): + ''' + Analyses the document to find the median line length. + percentage is a decimal number, 0 - 1 which is used to determine + how far in the list of line lengths to use. The list of line lengths is + ordered smallest to larged and does not include duplicates. 0.5 is the + median value. + ''' + lengths = [] + for line in self.lines: + if len(line) > 0: + lengths.append(len(line)) - lengths = list(set(lengths)) - total = sum(lengths) - avg = total / len(lengths) - max_line = avg * 2 + if not lengths: + return 0 - lengths = sorted(lengths) - for i in range(len(lengths) - 1, -1, -1): - if lengths[i] > max_line: - del lengths[i] + lengths = list(set(lengths)) + total = sum(lengths) + avg = total / len(lengths) + max_line = avg * 2 - if percent > 1: - percent = 1 - if percent < 0: - percent = 0 + lengths = sorted(lengths) + for i in range(len(lengths) - 1, -1, -1): + if lengths[i] > max_line: + del lengths[i] - index = int(len(lengths) * percent) - 1 + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 - return lengths[index] + index = int(len(lengths) * percent) - 1 + + return lengths[index] + + def line_histogram(self, percent): + ''' + Creates a broad histogram of the document to determine whether it incorporates hard + line breaks. Lines are sorted into 20 'buckets' based on length. + percent is the percentage of lines that should be in a single bucket to return true + The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks + ''' + minLineLength=20 # Ignore lines under 20 chars (typical of spaces) + maxLineLength=1900 # Discard larger than this to stay in range + buckets=20 # Each line is divided into a bucket based on length + + #print "there are "+str(len(lines))+" lines" + #max = 0 + #for line in self.lines: + # l = len(line) + # if l > max: + # max = l + #print "max line found is "+str(max) + # Build the line length histogram + hRaw = [ 0 for i in range(0,buckets) ] + for line in self.lines: + l = len(line) + if l > minLineLength and l < maxLineLength: + l = int(l/100) + #print "adding "+str(l) + hRaw[l]+=1 + + # Normalize the histogram into percents + totalLines = len(self.lines) + h = [ float(count)/totalLines for count in hRaw ] + #print "\nhRaw histogram lengths are: "+str(hRaw) + #print " percents are: "+str(h)+"\n" + + # Find the biggest bucket + maxValue = 0 + for i in range(0,len(h)): + if h[i] > maxValue: + maxValue = h[i] + + if maxValue < percent: + #print "Line lengths are too variable. Not unwrapping." 
+ return False + else: + #print str(maxValue)+" of the lines were in one bucket" + return True class Dehyphenator(object): ''' @@ -117,42 +172,62 @@ class Dehyphenator(object): def __init__(self): # Add common suffixes to the regex below to increase the likelihood of a match - # don't add suffixes which are also complete words, such as 'able' or 'sex' - self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) # remove prefixes if the prefix was not already the point of hyphenation - self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE) - self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE) + self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE) + self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE) def dehyphenate(self, match): firsthalf = match.group('firstpart') secondhalf = match.group('secondpart') + try: + wraptags = match.group('wraptags') + except: + wraptags = '' hyphenated = str(firsthalf) + "-" + str(secondhalf) dehyphenated = str(firsthalf) + str(secondhalf) lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) - match = booklookup.search(self.html) - if match: - #print "returned dehyphenated word: " + str(dehyphenated) - return dehyphenated - else: - #print "returned hyphenated word: " + str(hyphenated) + try: + searchresult = self.html.find(str.lower(lookupword)) + except: return hyphenated + if self.format == 'html_cleanup': + if self.html.find(lookupword) != -1 or searchresult != -1: + #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + return dehyphenated + elif self.html.find(hyphenated) != -1: + #print "Cleanup:returned hyphenated word: " + str(hyphenated) + return hyphenated + else: + #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + return firsthalf+u'\u2014'+wraptags+secondhalf + + else: + if self.html.find(lookupword) != -1 or searchresult != -1: + #print "returned dehyphenated word: " + str(dehyphenated) + return dehyphenated + else: + #print " returned hyphenated word: " + str(hyphenated) + return hyphenated def __call__(self, html, format, length=1): self.html = html + self.format = format if format == 'html': - intextmatch = re.compile(u'(?<=.{%i})(?P[^“"\s>]+)-\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) + intextmatch = re.compile(u'(?<=.{%i})(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) elif format == 'pdf': - intextmatch = re.compile(u'(?<=.{%i})(?P[^“"\s>]+)-\s*(
<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>
\s*<[iub]>)\s*(?P[\w\d]+)'% length) elif format == 'individual_words': - intextmatch = re.compile('>[^<]*\b(?P[^"\s>]+)-(?P[^<]*\b(?P[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)') html = intextmatch.sub(self.dehyphenate, html) return html - class CSSPreProcessor(object): PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') @@ -286,7 +361,7 @@ class HTMLPreProcessor(object): (re.compile(r']+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r'
<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), # Cover the case where every letter in a chapter title is separated by a space (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), @@ -374,10 +449,8 @@ class HTMLPreProcessor(object): print 'Failed to parse remove_footer regexp' traceback.print_exc() - # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal + # delete soft hyphens - moved here so it's executed after header/footer removal if is_pdftohtml: - # unwrap em/en dashes - end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting @@ -391,12 +464,15 @@ class HTMLPreProcessor(object): length = -1 if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: - length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) + docanalysis = DocAnalysis('pdf', html) + length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor')) if length: - # print "The pdf line length returned is " + str(length) + #print "The pdf line length returned is " + str(length) + # unwrap em/en dashes + end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -454,6 +530,14 @@ class HTMLPreProcessor(object): if getattr(self.extra_opts, 'smarten_punctuation', False): html = self.smarten_punctuation(html) + unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars + if unsupported_unicode_chars: + from calibre.ebooks.unidecode.unidecoder import Unidecoder + unidecoder = Unidecoder() + for char in unsupported_unicode_chars: + asciichar = unidecoder.decode(char) + html = html.replace(char, asciichar) + return html def smarten_punctuation(self, html): diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6a5eaa4a34..5f5c12a703 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' import re -from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator +from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.utils.logging import default_log class PreProcessor(object): @@ -77,13 +77,18 @@ class PreProcessor(object): def __call__(self, html): self.log("********* Preprocessing HTML *********") + + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + 
html = re.sub(r"\s*</p>", "</p>\n", html) + html = re.sub(r"\s*<p>\s*", "\n<p>", html) + ###### Check Markup ###### # # some lit files don't have any <p> tags or equivalent (generally just plain text between # <pre> tags), check and mark up line endings if required before proceeding if self.no_markup(html, 0.1): self.log("not enough paragraph markers, adding now") - # check if content is in pre tags, use txt procesor to mark up if so + # check if content is in pre tags, use txt processor to mark up if so pre = re.compile(r'<pre>', re.IGNORECASE) if len(pre.findall(html)) == 1: self.log("Running Text Processing") @@ -113,47 +118,77 @@ class PreProcessor(object): # Get rid of empty <o:p> tags to simplify other processing html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) # Get rid of empty span, bold, & italics tags - html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html) + html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html) - # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + # If more than 40% of the lines are empty paragraphs and the user has enabled remove + # paragraph spacing then delete blank lines to clean up spacing linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) + blanks_between_paragraphs = False if len(lines) > 1: self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, 'remove_paragraph_spacing', False): self.log("deleting blank lines") html = blankreg.sub('', html) - # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*</p>", "</p>\n", html) - html = re.sub(r"\s*<p>\s*", "\n<p>", html) + elif float(len(blanklines)) / float(len(lines)) > 0.40: + blanks_between_paragraphs = True + #print "blanks between paragraphs is marked True" + else: + blanks_between_paragraphs = False + #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic + # + # Build the Regular Expressions in pieces + lookahead = "(?=<(p|div))" + chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" + chapter_header_open = r"(?P<chap>" + chapter_header_close = ")\s*" + chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*" + if blanks_between_paragraphs: + blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" + else: + blank_lines = "" + opt_title_open = "(" + title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*" + title_header_open = "(?P<title>" + title_header_close = ")\s*" + title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>" + opt_title_close = ")?" 
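# Illustration (not part of the patch): the fragments above are concatenated
# below into one verbose chapter_marker regex. Assembled with typical_chapters
# it matches a chapter heading line such as <p><b>Chapter 12</b></p>, optionally
# followed by a title line like <p><i>The Long Road</i></p>, exposing the named
# groups 'chap' and 'title' that self.chapter_head consumes.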
+ + default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)" + typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}" + numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*" + uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*" + + chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + #print chapter_marker heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") # # Start with most typical chapter headings, get more aggressive until one works if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE) + chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close + chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) - ###### Unwrap 
lines ###### # - self.log("Unwrapping Lines") # Some OCR sourced files have line breaks in the html using a combination of span & p tags # span are used for hard line breaks, p for new paragraphs. Determine which is used so # that lines can be un-wrapped across page boundaries @@ -168,25 +203,40 @@ class PreProcessor(object): format = 'html' else: format = 'html' - + # Check Line histogram to determine if the document uses hard line breaks, If 50% or + # more of the lines break in the same region of the document then unwrapping is required + docanalysis = DocAnalysis(format, html) + hardbreaks = docanalysis.line_histogram(.50) + self.log("Hard line breaks check returned "+str(hardbreaks)) # Calculate Length - length = line_length(format, html, getattr(self.extra_opts, - 'html_unwrap_factor', 0.4)) + unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4) + length = docanalysis.line_length(unwrap_factor) self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") - max_length = length * 1.4 - min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})") - # - # Unwrap em/en dashes, delete soft-hyphens - #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") - html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) - html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html) - # Dehyphenate - dehyphenator = Dehyphenator() - html = dehyphenator(html,'html', length) + # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor + if hardbreaks or unwrap_factor < 0.4: + self.log("Unwrapping required, unwrapping Lines") + # Unwrap em/en dashes + html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html) + # Dehyphenate + self.log("Unwrapping/Removing hyphens") + dehyphenator = Dehyphenator() + html = dehyphenator(html,'html', length) + self.log("Done dehyphenating") + # Unwrap lines using punctation and line length + unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + html = unwrap.sub(' ', html) + #check any remaining hyphens, but only unwrap if there is a match + dehyphenator = Dehyphenator() + html = dehyphenator(html,'html_cleanup', length) + else: + # dehyphenate in cleanup mode to fix anything previous conversions/editing missed + self.log("Cleaning up hyphenation") + dehyphenator = Dehyphenator() + html = dehyphenator(html,'html_cleanup', length) + self.log("Done dehyphenating") - # Unwrap lines using punctation and line length - unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - html = unwrap.sub(' ', html) + # delete soft hyphens + html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) # If still no sections after unwrapping mark split points on 
lines with no punctuation if self.html_preprocess_sections < 10: diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index b05444c1c6..2f6fb46540 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ b/src/calibre/ebooks/metadata/covers.py @@ -9,6 +9,7 @@ import traceback, socket, re, sys from functools import partial from threading import Thread, Event from Queue import Queue, Empty +from lxml import etree import mechanize @@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{ # }}} +class DoubanCovers(CoverDownload): # {{{ + 'Download covers from Douban.com' + + DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + name = 'Douban.com covers' + description = _('Download covers from Douban.com') + author = 'Li Fanxi' + + def get_cover_url(self, isbn, br, timeout=5.): + try: + url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY + src = br.open(url, timeout=timeout).read() + except Exception, err: + if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + err = Exception(_('Douban.com API timed out. Try again later.')) + raise err + else: + feed = etree.fromstring(src) + NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'db': 'http://www.douban.com/xmlns/' + } + XPath = partial(etree.XPath, namespaces=NAMESPACES) + entries = XPath('//atom:entry')(feed) + if len(entries) < 1: + return None + try: + cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") + u = cover_url(entries[0])[0].replace('/spic/', '/lpic/'); + # If URL contains "book-default", the book doesn't have a cover + if u.find('book-default') != -1: + return None + except: + return None + return u + + def has_cover(self, mi, ans, timeout=5.): + if not mi.isbn: + return False + br = browser() + try: + if self.get_cover_url(mi.isbn, br, timeout=timeout) != None: + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + br = browser() + try: + url = self.get_cover_url(mi.isbn, br, timeout=timeout) + cover_data = br.open_novisit(url).read() + result_queue.put((True, cover_data, 'jpg', self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) +# }}} + def download_cover(mi, timeout=5.): # {{{ results = Queue() download_covers(mi, results, max_covers=1, timeout=timeout) diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 68deca5e10..b02ae2dbff 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -181,7 +181,7 @@ def metadata_from_filename(name, pat=None): mi.isbn = si except (IndexError, ValueError): pass - if not mi.title: + if mi.is_null('title'): mi.title = name return mi diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py index bd9728989b..cc74b3c515 100644 --- a/src/calibre/gui2/actions/edit_metadata.py +++ b/src/calibre/gui2/actions/edit_metadata.py @@ -184,7 +184,7 @@ class EditMetadataAction(InterfaceAction): self.gui.tags_view.blockSignals(True) try: changed = MetadataBulkDialog(self.gui, rows, - self.gui.library_view.model().db).changed + self.gui.library_view.model()).changed finally: self.gui.tags_view.blockSignals(False) if changed: diff 
--git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index 9c83b3aee5..b0ce0a1e6d 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -142,12 +142,13 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): _('Append to field'), ] - def __init__(self, window, rows, db): + def __init__(self, window, rows, model): QDialog.__init__(self, window) Ui_MetadataBulkDialog.__init__(self) self.setupUi(self) - self.db = db - self.ids = [db.id(r) for r in rows] + self.model = model + self.db = model.db + self.ids = [self.db.id(r) for r in rows] self.box_title.setText('<p>' + _('Editing meta information for <b>%d books</b>') % len(rows)) @@ -170,7 +171,7 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): self.tag_editor_button.clicked.connect(self.tag_editor) self.autonumber_series.stateChanged[int].connect(self.auto_number_changed) - if len(db.custom_field_keys(include_composites=False)) == 0: + if len(self.db.custom_field_keys(include_composites=False)) == 0: self.central_widget.removeTab(1) else: self.create_custom_column_editors() @@ -617,8 +618,15 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog): self.worker = Worker(args, self.db, self.ids, getattr(self, 'custom_column_widgets', []), Dispatcher(bb.accept, parent=bb)) - self.worker.start() - bb.exec_() + + # The metadata backup thread causes database commits + # which can slow down bulk editing of large numbers of books + self.model.stop_metadata_backup() + try: + self.worker.start() + bb.exec_() + finally: + self.model.start_metadata_backup() if self.worker.error is not None: return error_dialog(self, _('Failed'), diff --git a/src/calibre/gui2/dialogs/scheduler.py b/src/calibre/gui2/dialogs/scheduler.py index fd8184933f..30f4a2d8a2 100644 --- a/src/calibre/gui2/dialogs/scheduler.py +++ b/src/calibre/gui2/dialogs/scheduler.py @@ -57,6 +57,10 @@ class SchedulerDialog(QDialog, Ui_Dialog): self.old_news.setValue(gconf['oldest_news']) + def keyPressEvent(self, ev): + if ev.key() not in (Qt.Key_Enter, Qt.Key_Return): + return QDialog.keyPressEvent(self, ev) + def break_cycles(self): self.disconnect(self.recipe_model, SIGNAL('searched(PyQt_PyObject)'), self.search_done) diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index b2a7f08055..9da5420681 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -159,17 +159,24 @@ class BooksModel(QAbstractTableModel): # {{{ # do something on the GUI thread. Deadlock. self.cover_cache = CoverCache(db, FunctionDispatcher(self.db.cover)) self.cover_cache.start() - if self.metadata_backup is not None: - self.metadata_backup.stop() - # Would like to to a join here, but the thread might be waiting to - # do something on the GUI thread. Deadlock. - self.metadata_backup = MetadataBackup(db) - self.metadata_backup.start() + self.stop_metadata_backup() + self.start_metadata_backup() def refresh_cover(event, ids): if event == 'cover' and self.cover_cache is not None: self.cover_cache.refresh(ids) db.add_listener(refresh_cover) + def start_metadata_backup(self): + self.metadata_backup = MetadataBackup(self.db) + self.metadata_backup.start() + + def stop_metadata_backup(self): + if getattr(self, 'metadata_backup', None) is not None: + self.metadata_backup.stop() + # Would like to to a join here, but the thread might be waiting to + # do something on the GUI thread. Deadlock. 
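# Usage sketch (not part of the patch): callers bracket long-running bulk jobs
# with this pair so the backup thread's periodic database commits don't slow
# the operation down, mirroring the metadata_bulk.py and misc.py changes above:
#
#     m = gui.library_view.model()
#     m.stop_metadata_backup()
#     try:
#         run_bulk_edit(m.db)   # hypothetical long-running operation
#     finally:
#         m.start_metadata_backup()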
+ + def refresh_ids(self, ids, current_row=-1): rows = self.db.refresh_ids(ids) if rows: diff --git a/src/calibre/gui2/preferences/misc.py b/src/calibre/gui2/preferences/misc.py index 865115c2ed..582d110c6c 100644 --- a/src/calibre/gui2/preferences/misc.py +++ b/src/calibre/gui2/preferences/misc.py @@ -106,14 +106,13 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): d.exec_() def compact(self, *args): - from calibre.library.caches import MetadataBackup m = self.gui.library_view.model() - if m.metadata_backup is not None: - m.metadata_backup.stop() - d = CheckIntegrity(m.db, self) - d.exec_() - m.metadata_backup = MetadataBackup(m.db) - m.metadata_backup.start() + m.stop_metadata_backup() + try: + d = CheckIntegrity(m.db, self) + d.exec_() + finally: + m.start_metadata_backup() def open_config_dir(self, *args): from calibre.utils.config import config_dir diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 2f0452a773..c068168247 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -217,9 +217,12 @@ def fetch_scheduled_recipe(arg): if 'output_profile' in ps: recs.append(('output_profile', ps['output_profile'], OptionRecommendation.HIGH)) - if ps['output_profile'] == 'kindle': - recs.append(('no_inline_toc', True, - OptionRecommendation.HIGH)) + # Disabled since apparently some people use + # K4PC and, surprise, surprise, it doesn't support + # indexed MOBIs. + #if ps['output_profile'] == 'kindle': + # recs.append(('no_inline_toc', True, + # OptionRecommendation.HIGH)) lf = load_defaults('look_and_feel') if lf.get('base_font_size', 0.0) != 0.0: diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py index f3234d48d5..37b7c7bd7c 100644 --- a/src/calibre/gui2/wizard/__init__.py +++ b/src/calibre/gui2/wizard/__init__.py @@ -73,6 +73,14 @@ class JetBook(Device): manufacturer = 'Ectaco' id = 'jetbook' +class JetBookMini(Device): + + output_profile = 'jetbook5' + output_format = 'FB2' + name = 'JetBook Mini' + manufacturer = 'Ectaco' + id = 'jetbookmini' + class KindleDX(Kindle): output_profile = 'kindle_dx' @@ -584,12 +592,42 @@ class LibraryPage(QWizardPage, LibraryUI): qt_app.load_translations() self.emit(SIGNAL('retranslate()')) self.init_languages() + try: + if prefs['language'].lower().startswith('zh'): + from calibre.customize.ui import enable_plugin + for name in ('Douban Books', 'Douban.com covers'): + enable_plugin(name) + except: + pass + + def is_library_dir_suitable(self, x): + return LibraryDatabase2.exists_at(x) or not os.listdir(x) + + def validatePage(self): + newloc = unicode(self.location.text()) + if not self.is_library_dir_suitable(newloc): + self.show_library_dir_error(newloc) + return False + return True def change(self): - dir = choose_dir(self, 'database location dialog', + x = choose_dir(self, 'database location dialog', _('Select location for books')) - if dir: - self.location.setText(dir) + if x: + if self.is_library_dir_suitable(x): + self.location.setText(x) + else: + self.show_library_dir_error(x) + + def show_library_dir_error(self, x): + if not isinstance(x, unicode): + try: + x = x.decode(filesystem_encoding) + except: + x = unicode(repr(x)) + error_dialog(self, _('Bad location'), + _('You must choose an empty folder for ' + 'the calibre library. %s is not empty.')%x, show=True) def initializePage(self): lp = prefs['library_path']
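For reference, a minimal usage sketch of the DocAnalysis and Dehyphenator classes added by this patch (not part of the patch itself; the sample string is hypothetical pdftohtml-style output, and the 0.50 histogram threshold mirrors the value used in utils.py above):

    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator

    # Hypothetical pdftohtml-style output: one <br> per printed line
    raw = ('<br>The quick brown fox jumped over the'
           '<br>lazy dog, then trotted down the long'
           '<br>dusty road toward the riverbank.<br>')

    docanalysis = DocAnalysis('pdf', raw)
    length = docanalysis.line_length(0.5)          # 0.5 = median of the line-length list
    hardbreaks = docanalysis.line_histogram(0.50)  # True if >=50% of lines fall in one bucket

    if hardbreaks:
        # Hard line breaks detected: attempt to rejoin hyphenated words, as the
        # html path in utils.py does before unwrapping
        dehyphenator = Dehyphenator()
        raw = dehyphenator(raw, 'pdf', length)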