diff --git a/imgsrc/trim.svg b/imgsrc/trim.svg new file mode 100644 index 0000000000..8c8810fc66 --- /dev/null +++ b/imgsrc/trim.svg @@ -0,0 +1,688 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + Oxygen team + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/resources/images/trim.png b/resources/images/trim.png new file mode 100644 index 0000000000..3cb93adfa6 Binary files /dev/null and b/resources/images/trim.png differ diff --git a/resources/images/tweak_epub.png b/resources/images/tweak_epub.png deleted file mode 100644 index b9fd9e5f1e..0000000000 Binary files a/resources/images/tweak_epub.png and /dev/null differ diff --git a/resources/recipes/le_journal.recipe b/resources/recipes/le_journal.recipe new file mode 100644 index 0000000000..24a7d52164 --- /dev/null +++ b/resources/recipes/le_journal.recipe @@ -0,0 +1,43 @@ +__author__ = ' (lrfurtado@yahoo.com.br)' + +from calibre.web.feeds.news import BasicNewsRecipe + +class LeJournalDeMontrealRecipe(BasicNewsRecipe): + + title = u'Le Journal de Montreal' + description = u'Le Journal de Montreal' + __author__ = 'Luciano Furtado' + language = 'fr' + + oldest_article = 7 + use_embedded_content=0 + max_articles_per_feed = 15 + + remove_tags = [ + dict(name='ul',attrs={'id':'mainNav'}), + dict(name='div',attrs={'id':'boxPolitique'}), + dict(name='div',attrs={'id':'boxScoop'}), + dict(name='div',attrs={'id':'DossierSpec'}), + dict(name='div',attrs={'id':'channelBoxes'}), + dict(name='div',attrs={'id':'sectionBoxes'}), + dict(name='div',attrs={'id':'header'}), + dict(name='div',attrs={'id':'footer'}), + dict(name='div',attrs={'id':'navbarCanoe_container'}), + dict(name='div',attrs={'id':'popularCanoe'}), + dict(name='div',attrs={'id':'textAds'}), + dict(name='div',attrs={'id':'24heures'}), + 
dict(name='div',attrs={'class':'bottomBox clear'}), + dict(name='div',attrs={'class':'articleControls thin'}), + ] + + + feeds = [ + (u'Actualites', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_actualites.xml'), + (u'Arts et spectacle', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_arts.xml'), + (u'Sports', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_sports.xml'), + (u'Chroniques', + u'http://www.canoe.com/rss/feed/nouvelles/ljm_chroniques.xml'), + ] diff --git a/resources/recipes/tagesan.recipe b/resources/recipes/tagesan.recipe new file mode 100644 index 0000000000..8514162598 --- /dev/null +++ b/resources/recipes/tagesan.recipe @@ -0,0 +1,45 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1284927619(BasicNewsRecipe): + title = u'Tagesanzeiger' + publisher = u'Tamedia AG' + oldest_article = 2 + __author__ = 'noxxx' + max_articles_per_feed = 100 + description = 'tagesanzeiger.ch: Nichts verpassen' + category = 'News, Politik, Nachrichten, Schweiz, Zürich' + language = 'de' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher' : publisher + } + + remove_tags = [ + dict(name='img') + ,dict(name='div',attrs={'class':['swissquote ad','boxNews','centerAD','contentTabs2','sbsLabel']}) + ,dict(name='div',attrs={'id':['colRightAd','singleRight','singleSmallRight','MailInfo','metaLine','sidebarSky','contentFooter','commentInfo','commentInfo2','commentInfo3','footerBottom','clear','boxExclusiv','singleLogo','navSearch','headerLogin','headerBottomRight','horizontalNavigation','subnavigation','googleAdSense','footerAd','contentbox','articleGalleryNav']}) + ,dict(name='form',attrs={'id':['articleMailForm','commentform']}) + ,dict(name='div',attrs={'style':['position:absolute']}) + ,dict(name='script',attrs={'type':['text/javascript']}) + ,dict(name='p',attrs={'class':['schreiben','smallPrint','charCounter','caption']}) + ] + feeds = [ + (u'Front', 
u'http://www.tagesanzeiger.ch/rss.html') + ,(u'Zürich', u'http://www.tagesanzeiger.ch/zuerich/rss.html') + ,(u'Schweiz', u'http://www.tagesanzeiger.ch/schweiz/rss.html') + ,(u'Ausland', u'http://www.tagesanzeiger.ch/ausland/rss.html') + ,(u'Digital', u'http://www.tagesanzeiger.ch/digital/rss.html') + ,(u'Wissen', u'http://www.tagesanzeiger.ch/wissen/rss.html') + ,(u'Panorama', u'http://www.tagesanzeiger.ch/panorama/rss.html') + ,(u'Wirtschaft', u'http://www.tagesanzeiger.ch/wirtschaft/rss.html') + ,(u'Sport', u'http://www.tagesanzeiger.ch/sport/rss.html') + ,(u'Kultur', u'http://www.tagesanzeiger.ch/kultur/rss.html') + ,(u'Leben', u'http://www.tagesanzeiger.ch/leben/rss.html') + ,(u'Auto', u'http://www.tagesanzeiger.ch/auto/rss.html')] + + def print_version(self, url): + return url + '/print.html' + diff --git a/resources/recipes/the_marker.recipe b/resources/recipes/the_marker.recipe new file mode 100644 index 0000000000..e5f1ffc761 --- /dev/null +++ b/resources/recipes/the_marker.recipe @@ -0,0 +1,52 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1283848012(BasicNewsRecipe): + description = 'TheMarker Financial News in Hebrew' + __author__ = 'TonyTheBookworm, Marbs' + cover_url = 'http://static.ispot.co.il/wp-content/upload/2009/09/themarker.jpg' + title = u'TheMarker' + language = 'he' + simultaneous_downloads = 5 + remove_javascript = True + timefmt = '[%a, %d %b, %Y]' + oldest_article = 1 + remove_tags = [dict(name='tr', attrs={'bgcolor':['#738A94']}) ] + max_articles_per_feed = 10 + extra_css='body{direction: rtl;} .article_description{direction: rtl; } a.article{direction: rtl; } .calibre_feed_description{direction: rtl; }' + feeds = [(u'Head Lines', u'http://www.themarker.com/tmc/content/xml/rss/hpfeed.xml'), + (u'TA Market', u'http://www.themarker.com/tmc/content/xml/rss/sections/marketfeed.xml'), + (u'Real Estate', u'http://www.themarker.com/tmc/content/xml/rss/sections/realEstaterfeed.xml'), + (u'Wall Street 
& Global', u'http://www.themarker.com/tmc/content/xml/rss/sections/wallsfeed.xml'), + (u'Law', u'http://www.themarker.com/tmc/content/xml/rss/sections/lawfeed.xml'), + (u'Media', u'http://www.themarker.com/tmc/content/xml/rss/sections/mediafeed.xml'), + (u'Consumer', u'http://www.themarker.com/tmc/content/xml/rss/sections/consumerfeed.xml'), + (u'Career', u'http://www.themarker.com/tmc/content/xml/rss/sections/careerfeed.xml'), + (u'Car', u'http://www.themarker.com/tmc/content/xml/rss/sections/carfeed.xml'), + (u'High Tech', u'http://www.themarker.com/tmc/content/xml/rss/sections/hightechfeed.xml'), + (u'Investor Guide', u'http://www.themarker.com/tmc/content/xml/rss/sections/investorGuidefeed.xml')] + + def print_version(self, url): + split1 = url.split("=") + weblinks = url + + if weblinks is not None: + for link in weblinks: + #--------------------------------------------------------- + #here we need some help with some regexpressions + #we are trying to find it.themarker.com in a url + #----------------------------------------------------------- + re1='.*?' 
# Non-greedy match on filler + re2='(it\\.themarker\\.com)' # Fully Qualified Domain Name 1 + rg = re.compile(re1+re2,re.IGNORECASE|re.DOTALL) + m = rg.search(url) + + + if m: + split2 = url.split("article/") + print_url = 'http://it.themarker.com/tmit/PrintArticle/' + split2[1] + + else: + print_url = 'http://www.themarker.com/ibo/misc/printFriendly.jhtml?ElementId=%2Fibo%2Frepositories%2Fstories%2Fm1_2000%2F' + split1[1]+'.xml' + + return print_url diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index fd5e977d10..88e07bcea3 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -70,13 +70,16 @@ class WallStreetJournal(BasicNewsRecipe): def wsj_add_feed(self,feeds,title,url): self.log('Found section:', title) - if url.endswith('whatsnews'): - articles = self.wsj_find_wn_articles(url) - else: - articles = self.wsj_find_articles(url) + try: + if url.endswith('whatsnews'): + articles = self.wsj_find_wn_articles(url) + else: + articles = self.wsj_find_articles(url) + except: + articles = [] if articles: feeds.append((title, articles)) - return feeds + return feeds def parse_index(self): soup = self.wsj_get_index() @@ -99,7 +102,7 @@ class WallStreetJournal(BasicNewsRecipe): url = 'http://online.wsj.com' + a['href'] feeds = self.wsj_add_feed(feeds,title,url) title = 'What''s News' - url = url.replace('pageone','whatsnews') + url = url.replace('pageone','whatsnews') feeds = self.wsj_add_feed(feeds,title,url) else: title = self.tag_to_string(a) @@ -141,7 +144,7 @@ class WallStreetJournal(BasicNewsRecipe): articles = [] flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x}) - if flavorarea is not None: + if flavorarea is not None: flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article')) if flavorstory is not None: flavorstory['class'] = 'mjLinkItem' diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index 7f3664f1c4..df8234e8e2 100644 --- 
a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -54,10 +54,13 @@ class WallStreetJournal(BasicNewsRecipe): def wsj_add_feed(self,feeds,title,url): self.log('Found section:', title) - if url.endswith('whatsnews'): - articles = self.wsj_find_wn_articles(url) - else: - articles = self.wsj_find_articles(url) + try: + if url.endswith('whatsnews'): + articles = self.wsj_find_wn_articles(url) + else: + articles = self.wsj_find_articles(url) + except: + articles = [] if articles: feeds.append((title, articles)) return feeds diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 762a05d193..1171b74f5c 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -443,9 +443,9 @@ class KOBO(USBMS): # Reset Im_Reading list in the database if oncard == 'carda': - query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null and ContentID like \'file:///mnt/sd/%\'' + query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null and ReadStatus = 1 and ContentID like \'file:///mnt/sd/%\'' elif oncard != 'carda' and oncard != 'cardb': - query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null and ContentID not like \'file:///mnt/sd/%\'' + query= 'update content set ReadStatus=0, FirstTimeReading = \'true\' where BookID is Null and ReadStatus = 1 and ContentID not like \'file:///mnt/sd/%\'' try: cursor.execute (query) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 3ea2926461..395447edba 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -241,7 +241,7 @@ OptionRecommendation(name='toc_filter', OptionRecommendation(name='chapter', recommended_value="//*[((name()='h1' or name()='h2') and " - r"re:test(., 'chapter|book|section|part\s+', 'i')) or @class " + r"re:test(., 
'chapter|book|section|part|prologue|epilogue\s+', 'i')) or @class " "= 'chapter']", level=OptionRecommendation.LOW, help=_('An XPath expression to detect chapter titles. The default ' 'is to consider

or

tags that contain the words ' diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e87a8021f9..3b1239814a 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -106,6 +106,52 @@ def line_length(format, raw, percent): return lengths[index] +class Dehyphenator(object): + ''' + Analyzes words to determine whether hyphens should be retained/removed. Uses the document + itself is as a dictionary. This method handles all languages along with uncommon, made-up, and + scientific words. The primary disadvantage is that words appearing only once in the document + retain hyphens. + ''' + + def __init__(self): + # Add common suffixes to the regex below to increase the likelihood of a match - + # don't add suffixes which are also complete words, such as 'able' or 'sex' + self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) + # remove prefixes if the prefix was not already the point of hyphenation + self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE) + self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE) + + def dehyphenate(self, match): + firsthalf = match.group('firstpart') + secondhalf = match.group('secondpart') + hyphenated = str(firsthalf) + "-" + str(secondhalf) + dehyphenated = str(firsthalf) + str(secondhalf) + lookupword = self.removesuffixes.sub('', dehyphenated) + if self.prefixes.match(firsthalf) is None: + lookupword = self.removeprefix.sub('', lookupword) + booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) + #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + match = booklookup.search(self.html) + if match: + #print "returned dehyphenated word: " + str(dehyphenated) + return dehyphenated + else: + #print "returned hyphenated word: " + 
str(hyphenated) + return hyphenated + + def __call__(self, html, format, length=1): + self.html = html + if format == 'html': + intextmatch = re.compile(u'(?<=.{%i})(?P[^“"\s>]+)-\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?P[\w\d]+)' % length) + elif format == 'pdf': + intextmatch = re.compile(u'(?<=.{%i})(?P[^“"\s>]+)-\s*(

|\s*

\s*<[iub]>)\s*(?P[\w\d]+)'% length) + elif format == 'individual_words': + intextmatch = re.compile('>[^<]*\b(?P[^"\s>]+)-(?P\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap em/en dashes + end_rules.append((re.compile(u'(?<=[–—])\s*

\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens end_rules.append((re.compile(u'[­](\s*

)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting @@ -350,7 +395,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}([a-z,:)\-IA]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: @@ -380,6 +425,11 @@ class HTMLPreProcessor(object): for rule in rules + end_rules: html = rule[0].sub(rule[1], html) + if is_pdftohtml: + # Dehyphenate + dehyphenator = Dehyphenator() + html = dehyphenator(html,'pdf', length) + #dump(html, 'post-preprocess') # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 37fd169cb1..6a5eaa4a34 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' import re -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator from calibre.utils.logging import default_log class PreProcessor(object): @@ -114,7 +114,7 @@ class PreProcessor(object): html = re.sub(ur'\s*\s*', ' ', html) # Get rid of empty span, bold, & italics tags html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) - html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*\s*){0,2}\s*", " ", html) + html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*\s*){0,2}\s*", " ", html) html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing @@ -132,7 +132,6 @@ class PreProcessor(object): # Arrange line feeds and

tags so the line_length and no_markup functions work correctly html = re.sub(r"\s*

", "

\n", html) html = re.sub(r"\s*

\s*", "\n

", html) - #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) @@ -140,16 +139,16 @@ class PreProcessor(object): # # Start with most typical chapter headings, get more aggressive until one works if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(){0,2})\s*()?s*(){0,2}\s*()?\s*()\s*\s*(\s*]*>\s*

){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE) + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + 
str(self.html_preprocess_sections) + ", trying with uppercase words") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) ###### Unwrap lines ###### @@ -174,10 +173,16 @@ class PreProcessor(object): length = line_length(format, html, getattr(self.extra_opts, 'html_unwrap_factor', 0.4)) self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") + max_length = length * 1.4 + min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})") # - # Unwrap and/or delete soft-hyphens, hyphens + # Unwrap em/en dashes, delete soft-hyphens + #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) - html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html) + # Dehyphenate + dehyphenator = Dehyphenator() + html = dehyphenator(html,'html', length) # Unwrap lines using punctation 
and line length unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) @@ -186,7 +191,7 @@ class PreProcessor(object): # If still no sections after unwrapping mark split points on lines with no punctuation if self.html_preprocess_sections < 10: self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) - chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. 
demote the second heading to h3 to prevent splitting between chapter diff --git a/src/calibre/gui2/actions/tweak_epub.py b/src/calibre/gui2/actions/tweak_epub.py index 96331e2887..67ec34c12b 100755 --- a/src/calibre/gui2/actions/tweak_epub.py +++ b/src/calibre/gui2/actions/tweak_epub.py @@ -5,7 +5,6 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -from calibre.constants import iswindows, isosx from calibre.gui2 import error_dialog from calibre.gui2.actions import InterfaceAction from calibre.gui2.dialogs.tweak_epub import TweakEpub @@ -13,53 +12,45 @@ from calibre.gui2.dialogs.tweak_epub import TweakEpub class TweakEpubAction(InterfaceAction): name = 'Tweak ePub' - action_spec = (_('Tweak ePub'), 'tweak_epub.png', 'Edit ePub in situ', - _('T')) + action_spec = (_('Tweak ePub'), 'trim.png', + _('Make small changes to ePub format books'), + _('T')) dont_add_to = frozenset(['toolbar-device', 'context-menu-device']) action_type = 'current' def genesis(self): - self.qaction.triggered.connect(self._edit_epub_in_situ) + self.qaction.triggered.connect(self.edit_epub_in_situ) - def _edit_epub_in_situ(self, *args): - - # Assure exactly one row selected - rows = self.gui.library_view.selectionModel().selectedRows() - if not rows or len(rows) == 0: - d = error_dialog(self.gui, _('Cannot tweak ePub'), _('No book selected')) - d.exec_() - return - if len(rows) > 1: - d = error_dialog(self.gui, _('Cannot tweak ePub'), _('Multiple books selected')) - d.exec_() - return + def edit_epub_in_situ(self, *args): + row = self.gui.library_view.currentIndex() + if not row.isValid(): + return error_dialog(self.gui, _('Cannot tweak ePub'), + _('No book selected'), show=True) # Confirm 'EPUB' in formats - row = rows[0].row() - formats = self.gui.library_view.model().db.formats(row).upper().split(',') - if not 'EPUB' in formats: - d = error_dialog(self.gui, _('Cannot tweak ePub'), _('No EPUB available')) - d.exec_() - 
return + book_id = self.gui.library_view.model().id(row) + try: + path_to_epub = self.gui.library_view.model().db.format_abspath( + book_id, 'EPUB', index_is_id=True) + except: + path_to_epub = None + + if not path_to_epub: + return error_dialog(self.gui, _('Cannot tweak ePub'), + _('No ePub available. First convert the book to ePub.'), + show=True) - path_to_epub = self.gui.library_view.model().db.format_abspath(row, 'EPUB') - id = self._get_selected_id() # Launch a modal dialog waiting for user to complete or cancel dlg = TweakEpub(self.gui, path_to_epub) if dlg.exec_() == dlg.Accepted: - self._update_db(id, dlg._output) + self.update_db(book_id, dlg._output) dlg.cleanup() - def _get_selected_id(self): - rows = self.gui.library_view.selectionModel().selectedRows() - return map(self.gui.library_view.model().id, rows)[0] - - def _update_db(self, id, rebuilt): + def update_db(self, book_id, rebuilt): ''' Update the calibre db with the tweaked epub ''' - print "gui2.actions.tweak_epub:TweakEpubAction._update_db()" - print " updating id %d from %s" % (id, rebuilt) - self.gui.library_view.model().db.add_format_with_hooks(id, 'EPUB', rebuilt, index_is_id=True) + self.gui.library_view.model().db.add_format(book_id, 'EPUB', + open(rebuilt, 'rb'), index_is_id=True) diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index ae3141db56..a7e55c4619 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -800,7 +800,7 @@ class DeviceMixin(object): # {{{ # if set_books_in_library did not. if not self.set_books_in_library(self.booklists(), reset=True): self.upload_booklists() - self.book_on_device(None, None, reset=True) + self.book_on_device(None, reset=True) # We need to reset the ondevice flags in the library. Use a big hammer, # so we don't need to worry about whether some succeeded or not. 
self.refresh_ondevice_info(device_connected=True, reset_only=False) @@ -1309,7 +1309,7 @@ class DeviceMixin(object): # {{{ for f in files: getattr(f, 'close', lambda : True)() - def book_on_device(self, id, format=None, reset=False): + def book_on_device(self, id, reset=False): ''' Return an indication of whether the given book represented by its db id is on the currently connected device. It returns a 5 element list. The @@ -1338,8 +1338,6 @@ class DeviceMixin(object): # {{{ self.book_db_id_cache.append(set()) for book in l: db_id = getattr(book, 'application_id', None) - if db_id is None: - db_id = book.db_id if db_id is not None: # increment the count of books on the device with this # db_id. diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index 26dbda6ca4..53788809b6 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -300,6 +300,24 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.cpixmap = pix self.cover_data = cdata + def trim_cover(self, *args): + from calibre.utils.magick import Image + cdata = self.cover_data + if not cdata: + return + im = Image() + im.load(cdata) + im.trim(10) + cdata = im.export('jpg') + pix = QPixmap() + pix.loadFromData(cdata) + self.cover.setPixmap(pix) + self.cover_changed = True + self.cpixmap = pix + self.cover_data = cdata + + + def sync_formats(self): old_extensions, new_extensions, paths = set(), set(), {} for row in range(self.formats.count()): @@ -380,6 +398,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): self.remove_unused_series) QObject.connect(self.auto_author_sort, SIGNAL('clicked()'), self.deduce_author_sort) + self.trim_cover_button.clicked.connect(self.trim_cover) self.connect(self.author_sort, SIGNAL('textChanged(const QString&)'), self.author_sort_box_changed) self.connect(self.authors, SIGNAL('editTextChanged(const QString&)'), diff --git 
a/src/calibre/gui2/dialogs/metadata_single.ui b/src/calibre/gui2/dialogs/metadata_single.ui index 74febf9c29..dbf825e706 100644 --- a/src/calibre/gui2/dialogs/metadata_single.ui +++ b/src/calibre/gui2/dialogs/metadata_single.ui @@ -625,6 +625,17 @@ Using this button to create author sort will change author sort from red to gree </property> </widget> </item> + <item> + <widget class="QToolButton" name="trim_cover_button"> + <property name="toolTip"> + <string>Remove border (if any) from cover</string> + </property> + <property name="icon"> + <iconset resource="../../../../resources/images.qrc"> + <normaloff>:/images/trim.png</normaloff>:/images/trim.png</iconset> + </property> + </widget> + </item> <item> <widget class="QToolButton" name="reset_cover"> <property name="toolTip"> diff --git a/src/calibre/gui2/dialogs/tweak_epub.py b/src/calibre/gui2/dialogs/tweak_epub.py index a967ca310a..fb3643884b 100755 --- a/src/calibre/gui2/dialogs/tweak_epub.py +++ b/src/calibre/gui2/dialogs/tweak_epub.py @@ -6,15 +6,12 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import os, shutil, subprocess, sys +import os, shutil from contextlib import closing from zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED -from PyQt4 import QtGui -from PyQt4.Qt import QDialog, SIGNAL +from PyQt4.Qt import QDialog -from calibre import prints -from calibre.constants import iswindows, isosx, DEBUG from calibre.gui2 import open_local_file from calibre.gui2.dialogs.tweak_epub_ui import Ui_Dialog from calibre.libunzip import extract as zipextract @@ -26,7 +23,6 @@ class TweakEpub(QDialog, Ui_Dialog): To do: - need way to kill file browser proc in cleanup() - - linux file browser launch ''' def __init__(self, parent, epub): @@ -40,36 +36,17 @@ class TweakEpub(QDialog, Ui_Dialog): # Run the dialog setup generated from tweak_epub.ui self.setupUi(self) - self.connect(self.cancel_button, - SIGNAL("clicked()"), - self.cancel) - 
self.connect(self.explode_button, - SIGNAL("clicked()"), - self.explode) - self.connect(self.rebuild_button, - SIGNAL("clicked()"), - self.rebuild) + self.cancel_button.clicked.connect(self.reject) + self.explode_button.clicked.connect(self.explode) + self.rebuild_button.clicked.connect(self.rebuild) # Position update dialog overlaying top left of app window parent_loc = parent.pos() self.move(parent_loc.x(),parent_loc.y()) - def cancel(self): - if DEBUG: - prints("gui2.dialogs.tweak_epub:TweakEpub.cancel()") - return QDialog.reject(self) - def cleanup(self): - ''' - Kill the file browser - ''' - if DEBUG: - prints("gui2.dialogs.tweak_epub:TweakEpub.cleanup()") - # Delete directory containing exploded ePub if self._exploded is not None: - if DEBUG: - prints(" removing exploded dir\n %s" % self._exploded) shutil.rmtree(self._exploded, ignore_errors=True) @@ -78,37 +55,17 @@ class TweakEpub(QDialog, Ui_Dialog): Generic subprocess launch of native file browser User can use right-click to 'Open with ...' 
''' - if DEBUG: - prints("gui2.dialogs.tweak_epub:TweakEpub.display_exploded()") - ''' - if isosx: - cmd = 'open %s' % self._exploded - elif iswindows: - cmd = 'start explorer.exe /e,/root,%s' % self._exploded - else: - # *** Kovid - need proper linux invocation here *** - cmd = '<linux command to open native file browser>' - - # *** Kovid - need a way of launching this process than can be killed in cleanup() *** - self._file_browser_proc = subprocess.Popen(cmd, shell=True) - ''' open_local_file(self._exploded) - def explode(self): - if DEBUG: - prints("gui2.dialogs.tweak_epub:TweakEpub.explode()") + def explode(self, *args): if self._exploded is None: - if DEBUG: - prints(" exploding %s" % self._epub) self._exploded = PersistentTemporaryDirectory("_exploded", prefix='') zipextract(self._epub, self._exploded) self.display_exploded() self.rebuild_button.setEnabled(True) self.explode_button.setEnabled(False) - def rebuild(self): - if DEBUG: - prints("gui2.dialogs.tweak_epub:TweakEpub.rebuild()") + def rebuild(self, *args): self._output = os.path.join(self._exploded, 'rebuilt.epub') with closing(ZipFile(self._output, 'w', compression=ZIP_DEFLATED)) as zf: # Write mimetype @@ -120,7 +77,8 @@ class TweakEpub(QDialog, Ui_Dialog): if fn in exclude_files: continue absfn = os.path.join(root, fn) - zfn = absfn[len(self._exploded) + len(os.sep):] + zfn = os.path.relpath(absfn, + self._exploded).replace(os.sep, '/') zf.write(absfn, zfn) return QDialog.accept(self) diff --git a/src/calibre/gui2/dialogs/tweak_epub.ui b/src/calibre/gui2/dialogs/tweak_epub.ui index 36c9a7bc22..9daa5a8f67 100644 --- a/src/calibre/gui2/dialogs/tweak_epub.ui +++ b/src/calibre/gui2/dialogs/tweak_epub.ui @@ -9,8 +9,8 @@ <rect> <x>0</x> <y>0</y> - <width>161</width> - <height>132</height> + <width>382</width> + <height>242</height> </rect> </property> <property name="windowTitle"> @@ -22,65 +22,66 @@ <property name="modal"> <bool>false</bool> </property> - <widget class="QWidget" 
name="verticalLayoutWidget"> - <property name="geometry"> - <rect> - <x>10</x> - <y>10</y> - <width>141</width> - <height>110</height> - </rect> - </property> - <layout class="QVBoxLayout" name="verticalLayout"> - <item> - <widget class="QPushButton" name="explode_button"> - <property name="statusTip"> - <string>Display contents of exploded ePub</string> - </property> - <property name="text"> - <string>Explode ePub</string> - </property> - </widget> - </item> - <item> - <widget class="QPushButton" name="rebuild_button"> - <property name="enabled"> - <bool>false</bool> - </property> - <property name="statusTip"> - <string>Rebuild ePub from exploded contents</string> - </property> - <property name="text"> - <string>Rebuild ePub</string> - </property> - </widget> - </item> - <item> - <widget class="QPushButton" name="cancel_button"> - <property name="statusTip"> - <string>Discard changes</string> - </property> - <property name="text"> - <string>Cancel</string> - </property> - </widget> - </item> - <item> - <spacer name="verticalSpacer"> - <property name="orientation"> - <enum>Qt::Vertical</enum> - </property> - <property name="sizeHint" stdset="0"> - <size> - <width>20</width> - <height>40</height> - </size> - </property> - </spacer> - </item> - </layout> - </widget> + <layout class="QGridLayout" name="gridLayout"> + <item row="1" column="0"> + <widget class="QPushButton" name="explode_button"> + <property name="statusTip"> + <string>Display contents of exploded ePub</string> + </property> + <property name="text"> + <string>&Explode ePub</string> + </property> + <property name="icon"> + <iconset resource="../../../../resources/images.qrc"> + <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset> + </property> + </widget> + </item> + <item row="2" column="0"> + <widget class="QPushButton" name="rebuild_button"> + <property name="enabled"> + <bool>false</bool> + </property> + <property name="statusTip"> + <string>Rebuild ePub from exploded 
contents</string> + </property> + <property name="text"> + <string>&Rebuild ePub</string> + </property> + <property name="icon"> + <iconset resource="../../../../resources/images.qrc"> + <normaloff>:/images/exec.png</normaloff>:/images/exec.png</iconset> + </property> + </widget> + </item> + <item row="3" column="0"> + <widget class="QPushButton" name="cancel_button"> + <property name="statusTip"> + <string>Discard changes</string> + </property> + <property name="text"> + <string>&Cancel</string> + </property> + <property name="icon"> + <iconset resource="../../../../resources/images.qrc"> + <normaloff>:/images/window-close.png</normaloff>:/images/window-close.png</iconset> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="label"> + <property name="text"> + <string>First, explode the epub. Then edit its contents by right clicking on the individual files and selecting the editor of your choice. When you are done, click rebuild epub and the epub in your calibre library will be updated with the changes you have made.</string> + </property> + <property name="wordWrap"> + <bool>true</bool> + </property> + </widget> + </item> + </layout> </widget> - <resources/> + <resources> + <include location="../../../../resources/images.qrc"/> + </resources> <connections/> </ui> diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 7a516bb4ff..2f0452a773 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -217,6 +217,10 @@ def fetch_scheduled_recipe(arg): if 'output_profile' in ps: recs.append(('output_profile', ps['output_profile'], OptionRecommendation.HIGH)) + if ps['output_profile'] == 'kindle': + recs.append(('no_inline_toc', True, + OptionRecommendation.HIGH)) + lf = load_defaults('look_and_feel') if lf.get('base_font_size', 0.0) != 0.0: recs.append(('base_font_size', lf['base_font_size'], diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index
95794a8c1d..aeba8a3218 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -184,7 +184,7 @@ class ContentServer(object): if path and os.path.exists(path): updated = fromtimestamp(os.stat(path).st_mtime) cherrypy.response.headers['Last-Modified'] = self.last_modified(updated) - return fmt.read() + return fmt # }}}