From 441b4e20cc9661579e09b6bfa6aa00b19c54eb3d Mon Sep 17 00:00:00 2001 From: Lee Date: Sun, 24 Apr 2011 22:43:25 +0800 Subject: [PATCH 01/37] re-factored the query logic for overdrive to handle titles including punctuation --- src/calibre/ebooks/metadata/sources/overdrive.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 759da45610..62a3ca2091 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -198,12 +198,16 @@ class OverDrive(Source): title_tokens = list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)) - if len(title_tokens) >= len(author_tokens): + xref_q = '' + if len(author_tokens) <= 1: initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) else: initial_q = ' '.join(author_tokens) - xref_q = '+'.join(title_tokens) + for token in title_tokens: + if len(xref_q) < len(token): + xref_q = token + #log.error('Initial query is %s'%initial_q) #log.error('Cross reference query is %s'%xref_q) @@ -234,10 +238,12 @@ class OverDrive(Source): if xref_q.find('+') != -1: xref_tokens = xref_q.split('+') xref_q = xref_tokens[0] - #log.error('xref_q is '+xref_q) + for token in xref_tokens: + if len(xref_q) < len(token): + xref_q = token + #log.error('rewrote xref_q, new query is '+xref_q) else: xref_q = '' - xref_q = '' q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q elif int(m.group('totalrecords')) == 0: return '' @@ -264,6 +270,7 @@ class OverDrive(Source): else: if creators: creators = creators.split(', ') + # if an exact match in a preferred format occurs if ((author and creators[0] == author[0]) or (not author and not creators)) and od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage: return self.format_results(reserveid, od_title, subtitle, series, publisher, From 
88a54e805405cbc72f7eb72ba469a711a3285777 Mon Sep 17 00:00:00 2001 From: Lee Date: Mon, 25 Apr 2011 10:51:20 +0800 Subject: [PATCH 02/37] ... --- src/calibre/ebooks/metadata/sources/overdrive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 62a3ca2091..67eac7e337 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -337,9 +337,9 @@ class OverDrive(Source): def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None): q = base_url if ovrdrv_id is None: - return self.overdrive_search(br, log, q, title, author) + return self.overdrive_search(br, log, q, title, author) else: - return self.overdrive_get_record(br, log, q, ovrdrv_id) + return self.overdrive_get_record(br, log, q, ovrdrv_id) From a1bbba3198f0945ade6d4d70f51a87f22c1d284a Mon Sep 17 00:00:00 2001 From: Lee Date: Sun, 7 Aug 2011 22:18:47 +0800 Subject: [PATCH 03/37] fix a number of issues with scene break formatting in heuristics --- src/calibre/ebooks/conversion/utils.py | 38 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 7488df4609..9962335da3 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -451,27 +451,33 @@ class HeuristicProcessor(object): return html def detect_whitespace(self, html): - blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_around_headings = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P\d+)[^>]*>.*?)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) + blanks_around_scene_breaks = re.compile(r'(?P(<(p|div)[^>]*>\s*\s*){1,}\s*)?(?P

]*>.*?

)(?P\s*(<(p|div)[^>]*>\s*\s*){1,})?', re.IGNORECASE|re.DOTALL) blanks_n_nopunct = re.compile(r'(?P(]*>\s*

\s*){1,}\s*)?]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](\s*)*

(?P\s*(]*>\s*

\s*){1,})?', re.IGNORECASE|re.DOTALL) def merge_header_whitespace(match): initblanks = match.group('initparas') - endblanks = match.group('initparas') - heading = match.group('heading') + endblanks = match.group('endparas') + content = match.group('content') top_margin = '' bottom_margin = '' if initblanks is not None: + print "initial blanks are:\n"+initblanks top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;' if endblanks is not None: - bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;' + print "endblanks blanks are:\n"+endblanks + bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;' if initblanks == None and endblanks == None: - return heading + return content + elif content.find('scenebreak') != -1: + return content else: - heading = re.sub('(?i)\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', heading) - return heading + content = re.sub('(?i)\d+)[^>]*>', '\n\n'+' style="'+top_margin+bottom_margin+'">', content) + return content html = blanks_around_headings.sub(merge_header_whitespace, html) + html = blanks_around_scene_breaks.sub(merge_header_whitespace, html) def markup_whitespaces(match): blanks = match.group(0) @@ -506,6 +512,12 @@ class HeuristicProcessor(object): html = self.blankreg.sub('\n

', html) return html + def detect_scene_breaks(self, html): + scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close + scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) + html = scene_breaks.sub(self.scene_break_open+'\g'+'

', html) + return html + def markup_user_break(self, replacement_break): ''' Takes string a user supplies and wraps it in markup that will be centered with @@ -765,25 +777,25 @@ class HeuristicProcessor(object): # If non-blank scene breaks exist they are center aligned and styled with appropriate margins. if getattr(self.extra_opts, 'format_scene_breaks', False): html = re.sub('(?i)]*>\s*\s*', '

', html) + html = self.detect_scene_breaks(html) html = self.detect_whitespace(html) html = self.detect_soft_breaks(html) blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) - scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))(?P((?P((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close - scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE) + detected_scene_break = re.compile(r'

]*>.*?

') + scene_break_count = len(detected_scene_break.findall(html)) # If the user has enabled scene break replacement, then either softbreaks # or 'hard' scene breaks are replaced, depending on which is in use # Otherwise separator lines are centered, use a bit larger margin in this case replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None) if replacement_break: replacement_break = self.markup_user_break(replacement_break) - if len(scene_break.findall(html)) >= 1: - html = scene_break.sub(replacement_break, html) + if scene_break_count >= 1: + html = detected_scene_break.sub(replacement_break, html) + html = re.sub(']*>\s*

', replacement_break, html) else: html = re.sub(']*>\s*

', replacement_break, html) - else: - html = scene_break.sub(self.scene_break_open+'\g'+'

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs so they render correctly From d07b4556e97c15b373fbf6b40c4fcc29b3872c10 Mon Sep 17 00:00:00 2001 From: Lee Date: Mon, 8 Aug 2011 03:51:56 +0800 Subject: [PATCH 04/37] fix issue where overdrive can get in an ifinite loop --- .../ebooks/metadata/sources/overdrive.py | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 1faacaa3ef..0af41538b0 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -208,8 +208,8 @@ class OverDrive(Source): if len(xref_q) < len(token): xref_q = token - #log.error('Initial query is %s'%initial_q) - #log.error('Cross reference query is %s'%xref_q) + log.error('Initial query is %s'%initial_q) + log.error('Cross reference query is %s'%xref_q) q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q query = '{"szKeyword":"'+initial_q+'"}' @@ -224,29 +224,30 @@ class OverDrive(Source): # get the search results object results = False + iterations = 0 while results == False: + iterations += 1 xreq = mechanize.Request(q_xref) xreq.add_header('X-Requested-With', 'XMLHttpRequest') xreq.add_header('Referer', q_init_search) xreq.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(xreq).read() for m in re.finditer(ur'"iTotalDisplayRecords":(?P\d+).*?"iTotalRecords":(?P\d+)', raw): - if int(m.group('displayrecords')) >= 1: - results = True - elif int(m.group('totalrecords')) >= 1: - if int(m.group('totalrecords')) >= 100: - if xref_q.find('+') != -1: - xref_tokens = xref_q.split('+') - xref_q = xref_tokens[0] - for token in xref_tokens: - if len(xref_q) < len(token): - xref_q = token - #log.error('rewrote xref_q, new query is '+xref_q) - else: - xref_q = '' - q_xref = 
q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q - elif int(m.group('totalrecords')) == 0: + if int(m.group('totalrecords')) == 0: return '' + elif int(m.group('displayrecords')) >= 1: + results = True + elif int(m.group('totalrecords')) >= 1 and iterations < 3: + if xref_q.find('+') != -1: + xref_tokens = xref_q.split('+') + xref_q = xref_tokens[0] + for token in xref_tokens: + if len(xref_q) < len(token): + xref_q = token + #log.error('rewrote xref_q, new query is '+xref_q) + else: + xref_q = '' + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens) @@ -461,10 +462,10 @@ if __name__ == '__main__': [ ( - {'title':'Foundation and Earth', - 'authors':['Asimov']}, - [title_test('Foundation and Earth', exact=True), - authors_test(['Isaac Asimov'])] + {'title':'The Sea Kings Daughter', + 'authors':['Elizabeth Peters']}, + [title_test('The Sea Kings Daughter', exact=False), + authors_test(['Elizabeth Peters'])] ), ( From b72aaf7fa60f5ed2508ac1e96834c67ad3cc99ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 07:55:24 +0530 Subject: [PATCH 05/37] Acim Bilim Dergisi by thomass --- recipes/acim_bilim_dergisi.recipe | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 recipes/acim_bilim_dergisi.recipe diff --git a/recipes/acim_bilim_dergisi.recipe b/recipes/acim_bilim_dergisi.recipe new file mode 100644 index 0000000000..5d674fe93a --- /dev/null +++ b/recipes/acim_bilim_dergisi.recipe @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1334868409(BasicNewsRecipe): + title = u'AÇIK BİLİM DERGİSİ' + description = ' Aylık çevrimiçi bilim dergisi' + __author__ = u'thomass' + oldest_article = 30 + max_articles_per_feed = 300 + auto_cleanup = True + encoding = 'UTF-8' + publisher = 'açık bilim' + category = 'haber, bilim,TR,dergi' + 
language = 'tr' + publication_type = 'magazine ' + conversion_options = { + 'tags' : category + ,'language' : language + ,'publisher' : publisher + ,'linearize_tables': True + } + cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg' + masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg' + + + feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')] From 6545602342c4309f02f552ffe926507754818668 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 08:14:03 +0530 Subject: [PATCH 06/37] version 0.8.48 --- Changelog.yaml | 51 ++++++++++++++++++++++++++++++++++++++++ src/calibre/constants.py | 2 +- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/Changelog.yaml b/Changelog.yaml index 17f3ebcf97..01425ec2ca 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,57 @@ # new recipes: # - title: +- version: 0.8.48 + date: 2012-04-20 + + new features: + - title: "Conversion: The search and replace feature has been completely revamped." + description: "You can now use any number of search and replace + expression, not just three. You can also store and load frequently used + sets of search and replace expressions. Also, the wizard generates its + preview in a separate process to protect against crashes/memory leaks." + tickets: [983476,983484,983478] + + - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free." + + - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X" + tickets: [981185] + + bug fixes: + - title: "Get Books: Support the new website design of Barnes & Noble" + + - title: "T1 driver: Fix books sent to SD card sometimes resulting problems when deleted." + tickets: [943586] + + - title: "Do not allow author names to be set to blank via the Manage authors function. 
Blank authors are now automatically set to 'Unknown'" + + - title: "MOBI Output: Handle background color specified on and in addition to tags." + tickets: [980813] + + - title: "MOBI Output: Fix underline style applied to parent element not getting inherited by children." + tickets: [985711] + + improved recipes: + - xkcd + - Metro Nieuws + - Calgary Herald + - Orlando Sentinel + - countryfile + - Heise + + new recipes: + - title: Various new Polish news sources + author: fenuks + + - title: Various Italian news sources + author: faber1971 + + - title: Jakarta Globe + author: rty + + - title: Acim Bilim Dergisi + author: thomass + - version: 0.8.47 date: 2012-04-13 diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 402fef4c67..1db9c90466 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -4,7 +4,7 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' __appname__ = u'calibre' -numeric_version = (0, 8, 47) +numeric_version = (0, 8, 48) __version__ = u'.'.join(map(unicode, numeric_version)) __author__ = u"Kovid Goyal " From 15ec14ab52844bfa88fc6cf3bb712f3f78b6c3a6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 08:18:25 +0530 Subject: [PATCH 07/37] ... 
--- src/calibre/devices/kobo/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index f68ea8feff..1384ec0810 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -298,7 +298,7 @@ class KOBO(USBMS): changed = False for i, row in enumerate(cursor): # self.report_progress((i+1) / float(numrows), _('Getting list of books on device...')) - if row[3].startswith("file:///usr/local/Kobo/help/"): + if not hasattr(row[3], 'startswith') or row[3].startswith("file:///usr/local/Kobo/help/"): # These are internal to the Kobo device and do not exist continue path = self.path_from_contentid(row[3], row[5], row[4], oncard) From eb2d0761b0682f9f4b1580672c51b1f12252357d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 10:38:41 +0530 Subject: [PATCH 08/37] IGN:Tag release --- src/calibre/translations/calibre.pot | 331 ++++++++++++++++----------- 1 file changed, 203 insertions(+), 128 deletions(-) diff --git a/src/calibre/translations/calibre.pot b/src/calibre/translations/calibre.pot index 7c3bb3b302..5b0c096104 100644 --- a/src/calibre/translations/calibre.pot +++ b/src/calibre/translations/calibre.pot @@ -4,9 +4,9 @@ # msgid "" msgstr "" -"Project-Id-Version: calibre 0.8.47\n" -"POT-Creation-Date: 2012-04-13 09:24+IST\n" -"PO-Revision-Date: 2012-04-13 09:24+IST\n" +"Project-Id-Version: calibre 0.8.48\n" +"POT-Creation-Date: 2012-04-20 08:19+IST\n" +"PO-Revision-Date: 2012-04-20 08:19+IST\n" "Last-Translator: Automatically generated\n" "Language-Team: LANGUAGE\n" "MIME-Version: 1.0\n" @@ -24,8 +24,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/db/cache.py:106 #: /home/kovid/work/calibre/src/calibre/db/cache.py:109 #: /home/kovid/work/calibre/src/calibre/db/cache.py:120 -#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:317 -#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:318 +#: 
/home/kovid/work/calibre/src/calibre/devices/android/driver.py:319 +#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:320 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:100 #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:101 #: /home/kovid/work/calibre/src/calibre/devices/jetbook/driver.py:74 @@ -36,9 +36,9 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:71 #: /home/kovid/work/calibre/src/calibre/devices/prs500/books.py:267 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:661 -#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:337 -#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:338 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:493 +#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:347 +#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:348 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:496 #: /home/kovid/work/calibre/src/calibre/ebooks/chm/metadata.py:57 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/chm_input.py:109 #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/chm_input.py:112 @@ -109,7 +109,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/ebooks/mobi/writer2/indexer.py:497 #: /home/kovid/work/calibre/src/calibre/ebooks/odt/input.py:168 #: /home/kovid/work/calibre/src/calibre/ebooks/odt/input.py:170 -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:836 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:850 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:353 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:356 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/parse_utils.py:360 @@ -183,14 +183,15 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/library/database2.py:580 #: /home/kovid/work/calibre/src/calibre/library/database2.py:2087 #: 
/home/kovid/work/calibre/src/calibre/library/database2.py:2241 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3303 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:2657 #: /home/kovid/work/calibre/src/calibre/library/database2.py:3305 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3442 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3307 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3444 #: /home/kovid/work/calibre/src/calibre/library/server/content.py:250 #: /home/kovid/work/calibre/src/calibre/library/server/content.py:251 #: /home/kovid/work/calibre/src/calibre/library/server/mobile.py:245 -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:157 #: /home/kovid/work/calibre/src/calibre/library/server/opds.py:160 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:163 #: /home/kovid/work/calibre/src/calibre/library/server/xml.py:79 #: /home/kovid/work/calibre/src/calibre/utils/localization.py:162 #: /home/kovid/work/calibre/src/calibre/utils/podofo/__init__.py:46 @@ -894,15 +895,15 @@ msgstr "" msgid "Communicate with Android phones." msgstr "" -#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:167 +#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:168 msgid "Comma separated list of directories to send e-books to on the device. The first one that exists will be used" msgstr "" -#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:257 +#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:259 msgid "Communicate with S60 phones." msgstr "" -#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:276 +#: /home/kovid/work/calibre/src/calibre/devices/android/driver.py:278 msgid "Communicate with WebOS tablets." 
msgstr "" @@ -1002,8 +1003,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/nook/driver.py:102 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:448 #: /home/kovid/work/calibre/src/calibre/devices/prs505/sony_cache.py:471 -#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:558 -#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:577 +#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:568 +#: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:587 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1051 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1057 #: /home/kovid/work/calibre/src/calibre/devices/usbms/device.py:1092 @@ -1013,7 +1014,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/tag_browser/model.py:1165 #: /home/kovid/work/calibre/src/calibre/library/database2.py:346 #: /home/kovid/work/calibre/src/calibre/library/database2.py:359 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3160 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3162 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:187 msgid "News" msgstr "" @@ -1021,8 +1022,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/apple/driver.py:2727 #: /home/kovid/work/calibre/src/calibre/gui2/catalog/catalog_epub_mobi.py:65 #: /home/kovid/work/calibre/src/calibre/library/catalogs/epub_mobi.py:65 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3116 -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3134 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3118 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3136 msgid "Catalog" msgstr "" @@ -1067,20 +1068,20 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:111 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:337 #: /home/kovid/work/calibre/src/calibre/devices/prst1/driver.py:155 -#: 
/home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:141 #: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:144 #: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:147 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:215 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:222 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:245 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:150 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:218 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:225 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:248 msgid "Getting list of books on device..." msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:264 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:268 #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:324 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:274 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:276 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:277 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:279 msgid "Transferring books to device..." msgstr "" @@ -1088,8 +1089,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:344 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:474 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:509 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:298 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:329 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:301 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:332 msgid "Adding books to device metadata listing..." 
msgstr "" @@ -1099,8 +1100,8 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/hanvon/driver.py:126 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:426 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:458 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:335 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:356 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:338 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:359 msgid "Removing books from device..." msgstr "" @@ -1108,13 +1109,13 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:374 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:462 #: /home/kovid/work/calibre/src/calibre/devices/kobo/driver.py:469 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:363 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:368 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:366 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:371 msgid "Removing books from device metadata listing..." msgstr "" #: /home/kovid/work/calibre/src/calibre/devices/bambook/driver.py:442 -#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:398 +#: /home/kovid/work/calibre/src/calibre/devices/usbms/driver.py:401 msgid "Sending metadata to device..." msgstr "" @@ -1364,11 +1365,11 @@ msgstr "" msgid "If you have a custom column in your library that you use to store the page count of books, you can have calibre use that information, instead of calculating a page count. Specify the name of the custom column here, for example, #pages. " msgstr "" -#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:415 +#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:419 msgid "Communicate with the Kindle DX eBook reader." 
msgstr "" -#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:424 +#: /home/kovid/work/calibre/src/calibre/devices/kindle/driver.py:428 msgid "Communicate with the Kindle Fire" msgstr "" @@ -1900,31 +1901,31 @@ msgid "Modify the document text and structure using common patterns. Disabled by msgstr "" #: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:157 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:16 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:20 msgid "Modify the document text and structure using user defined patterns." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:166 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:167 msgid "Control auto-detection of document structure." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:176 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:177 msgid "Control the automatic generation of a Table of Contents. By default, if the source file has a Table of Contents, it will be used in preference to the automatically generated one." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:186 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:187 msgid "Options to set metadata in the output" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:189 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:190 msgid "Options to help with debugging the conversion" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:217 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:219 msgid "List builtin recipe names. 
You can create an ebook from a builtin recipe like this: ebook-convert \"Recipe Name.recipe\" output.epub" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:289 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/cli.py:322 msgid "Output saved to" msgstr "" @@ -2163,48 +2164,48 @@ msgstr "" msgid "Comic" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:23 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:21 msgid "When present, use author sort field as author." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:27 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:25 msgid "Don't add Table of Contents to the book. Useful if the book has its own table of contents." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:30 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:28 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/htmltoc.py:57 msgid "Title for any generated in-line table of contents." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:34 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:32 msgid "Disable compression of the file contents." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:37 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:35 msgid "Tag marking book to be filed with Personal Docs" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:41 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:39 msgid "Ignore margins in the input document. If False, then the MOBI output plugin will try to convert margins specified in the input document, otherwise it will ignore them." 
msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:47 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:45 msgid "When adding the Table of Contents to the book, add it at the start of the book instead of the end. Not recommended." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:51 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:49 msgid "Extract the contents of the MOBI file to the specified directory. If the directory already exists, it will be deleted." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:56 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:54 msgid "Enable sharing of book content via Facebook etc. on the Kindle. WARNING: Using this feature means that the book will not auto sync its last read position on multiple devices. Complain to Amazon." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:63 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:61 msgid "By default calibre converts all images to JPEG format in the output MOBI file. This is for maximum compatibility as some older MOBI viewers have problems with other image formats. This option tells calibre not to do this. Useful if your document contains lots of GIF/PNG images that become very large when converted to JPEG." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:114 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plugins/mobi_output.py:103 #: /home/kovid/work/calibre/src/calibre/ebooks/epub/periodical.py:125 msgid "All articles" msgstr "" @@ -2714,27 +2715,31 @@ msgstr "" msgid "Replacement to replace the text found with sr3-search." 
msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:733 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:632 +msgid "Path to a file containing search and replace regular expressions. The file must contain alternating lines of regular expression followed by replacement pattern (which can be an empty line). The regular expression must be in the python regex syntax and the file must be UTF-8 encoded." +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:741 msgid "Could not find an ebook inside the archive" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:791 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:799 msgid "Values of series index and rating must be numbers. Ignoring" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:798 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:806 msgid "Failed to parse date/time" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:957 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:965 msgid "Converting input to HTML..." msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:984 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:992 msgid "Running transforms on ebook..." 
msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:1088 +#: /home/kovid/work/calibre/src/calibre/ebooks/conversion/plumber.py:1096 msgid "Creating" msgstr "" @@ -3032,7 +3037,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/store/search/models.py:41 #: /home/kovid/work/calibre/src/calibre/gui2/store/stores/mobileread/models.py:23 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:375 -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:580 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:583 msgid "Title" msgstr "" @@ -3200,7 +3205,7 @@ msgid "" msgstr "" #: /home/kovid/work/calibre/src/calibre/ebooks/metadata/opf2.py:1434 -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1244 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1258 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:937 #: /home/kovid/work/calibre/src/calibre/gui2/store/search/models.py:41 msgid "Cover" @@ -3310,70 +3315,70 @@ msgstr "" msgid "No details available" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1245 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1259 msgid "Title Page" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1246 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260 #: /home/kovid/work/calibre/src/calibre/ebooks/oeb/transforms/htmltoc.py:15 #: /home/kovid/work/calibre/src/calibre/gui2/viewer/main.py:57 #: /home/kovid/work/calibre/src/calibre/gui2/viewer/main_ui.py:199 msgid "Table of Contents" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1247 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1261 msgid "Index" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1248 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1262 msgid "Glossary" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1249 +#: 
/home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1263 msgid "Acknowledgements" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1250 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1264 msgid "Bibliography" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1251 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1265 msgid "Colophon" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1252 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1266 msgid "Copyright" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1253 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1267 msgid "Dedication" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1254 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1268 msgid "Epigraph" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1255 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1269 msgid "Foreword" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1256 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1270 msgid "List of Illustrations" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1257 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1271 msgid "List of Tables" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1258 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1272 msgid "Notes" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1259 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1273 msgid "Preface" msgstr "" -#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1260 +#: /home/kovid/work/calibre/src/calibre/ebooks/oeb/base.py:1274 msgid "Main Text" msgstr "" @@ -4073,7 +4078,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/actions/choose_library.py:147 #: 
/home/kovid/work/calibre/src/calibre/gui2/preferences/toolbar.py:58 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:171 -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:126 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:129 #, python-format msgid "%d books" msgstr "" @@ -5043,7 +5048,7 @@ msgid "Selected books have no formats" msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/actions/view.py:153 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:128 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:138 msgid "Choose the format to view" msgstr "" @@ -5416,7 +5421,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/convert/pdf_output_ui.py:54 #: /home/kovid/work/calibre/src/calibre/gui2/convert/pmlz_output_ui.py:46 #: /home/kovid/work/calibre/src/calibre/gui2/convert/rb_output_ui.py:33 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:147 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:110 #: /home/kovid/work/calibre/src/calibre/gui2/convert/snb_output_ui.py:42 #: /home/kovid/work/calibre/src/calibre/gui2/convert/structure_detection_ui.py:59 #: /home/kovid/work/calibre/src/calibre/gui2/convert/toc_ui.py:70 @@ -6631,23 +6636,32 @@ msgstr "" msgid "RB Output" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:134 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:146 msgid "No formats available" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:135 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:147 msgid "Cannot build regex using the GUI builder without a book." 
msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:144 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:156 msgid "Could not open file" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:145 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:157 msgid "Could not open the file, do you have it open in another program?" msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:175 +msgid "Failed to generate markup for testing. Click \"Show Details\" to learn more." +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:181 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:185 +msgid "Failed to generate preview" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/regex_builder.py:191 msgid "Open book" msgstr "" @@ -6699,50 +6713,124 @@ msgstr "" msgid "Preview" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:15 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:19 msgid "" "Search\n" "&\n" "Replace" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:29 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:32 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:35 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:43 msgid "&Search Regular Expression" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:72 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:56 +msgid "Replacement Text" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:56 +msgid "Search Regular Expression" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:100 +msgid "Load Calibre Search-Replace definitions file" +msgstr 
"" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:102 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:119 +msgid "Calibre Search-Replace definitions file" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:110 +msgid "Failed to read" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:111 +#, python-format +msgid "Failed to load patterns from %s, click Show details to learn more." +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:117 +msgid "Save Calibre Search-Replace definitions file" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:168 +msgid "Unused Search & Replace definition" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:169 +msgid "The search / replace definition being edited has not been added to the list of definitions. Do you wish to continue with the conversion (the definition will not be used)?" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:180 #: /home/kovid/work/calibre/src/calibre/gui2/widgets.py:112 msgid "Invalid regular expression" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:73 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:181 #: /home/kovid/work/calibre/src/calibre/gui2/widgets.py:113 #, python-format msgid "Invalid regular expression: %s" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:148 -msgid "First expression" +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace.py:265 +msgid "The list of search/replace definitions that will be applied to this conversion." 
msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:149 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:151 -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:153 +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:111 +msgid "Search/Replace Definition Edit" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:112 msgid "&Replacement Text" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:150 -msgid "Second Expression" +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:113 +msgid "Add the current expression to the list of expressions that will be applied" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:152 -msgid "Third expression" +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:114 +msgid "&Add" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:154 -msgid "

Search and replace uses regular expressions. See the regular expressions tutorial to get started with regular expressions. Also clicking the wizard buttons below will allow you to test your regular expression against the current input document." +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:115 +msgid "Edit the currently selected expression" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:116 +#: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:59 +msgid "&Change" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:117 +msgid "Remove the currently selected expression" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:118 +#: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:886 +msgid "&Remove" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:119 +msgid "Load a listof expression from a previously saved file" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:120 +msgid "&Load" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:121 +msgid "Save this list of expression so that you can re-use it easily" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:122 +#: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:131 +#: /usr/src/qt-everywhere-opensource-src-4.8.0/src/gui/widgets/qdialogbuttonbox.cpp:661 +msgid "&Save" +msgstr "" + +#: /home/kovid/work/calibre/src/calibre/gui2/convert/search_and_replace_ui.py:123 +msgid "

Search and replace uses regular expressions. See the regular expressions tutorial to get started with regular expressions. Also clicking the wizard button below will allow you to test your regular expression against the current input document. When you are happy with an expression, click the Add button to add it to the list of expressions." msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/convert/single.py:181 @@ -7808,7 +7896,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/dialogs/confirm_delete_location_ui.py:77 #: /home/kovid/work/calibre/src/calibre/gui2/layout.py:73 -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:233 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:236 msgid "Library" msgstr "" @@ -7843,7 +7931,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/preferences/create_custom_column.py:35 #: /home/kovid/work/calibre/src/calibre/gui2/preferences/create_custom_column.py:76 #: /home/kovid/work/calibre/src/calibre/library/field_metadata.py:365 -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:579 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:582 msgid "Date" msgstr "" @@ -10811,10 +10899,6 @@ msgstr "" msgid "T&rim" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:886 -msgid "&Remove" -msgstr "" - #: /home/kovid/work/calibre/src/calibre/gui2/metadata/basic_widgets.py:892 msgid "Download co&ver" msgstr "" @@ -12867,11 +12951,6 @@ msgid "" "of a search term by changing the value box then pressing Save." msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:131 -#: /usr/src/qt-everywhere-opensource-src-4.8.0/src/gui/widgets/qdialogbuttonbox.cpp:661 -msgid "&Save" -msgstr "" - #: /home/kovid/work/calibre/src/calibre/gui2/preferences/search_ui.py:132 msgid "Make &user categories from:" msgstr "" @@ -14924,10 +15003,6 @@ msgstr "" msgid "

Choose a location for your books. When you add books to calibre, they will be copied here. Use an empty folder for a new calibre library:" msgstr "" -#: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:59 -msgid "&Change" -msgstr "" - #: /home/kovid/work/calibre/src/calibre/gui2/wizard/library_ui.py:60 msgid "If you have an existing calibre library, it will be copied to the new location. If a calibre library already exists at the new location, calibre will switch to using it." msgstr "" @@ -15984,17 +16059,17 @@ msgstr "" msgid "%(tt)sAverage rating is %(rating)3.1f" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3468 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3470 #, python-format msgid "

Migrating old database to ebook library in %s

" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3497 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3499 #, python-format msgid "Copying %s" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/database2.py:3514 +#: /home/kovid/work/calibre/src/calibre/library/database2.py:3516 msgid "Compacting database" msgstr "" @@ -16198,7 +16273,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/library/server/ajax.py:295 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:341 #: /home/kovid/work/calibre/src/calibre/library/server/browse.py:625 -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:579 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:582 msgid "Newest" msgstr "" @@ -16355,40 +16430,40 @@ msgstr "" msgid "The full interface gives you many more features, but it may not work well on a small screen" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:126 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:129 #, python-format msgid "%d book" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:149 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:152 #, python-format msgid "%d items" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:167 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:170 #, python-format msgid "RATING: %s
" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:170 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:173 #, python-format msgid "TAGS: %s
" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:175 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:178 #, python-format msgid "SERIES: %(series)s [%(sidx)s]
" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:273 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:276 msgid "Books in your library" msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:279 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:282 msgid "By " msgstr "" -#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:280 +#: /home/kovid/work/calibre/src/calibre/library/server/opds.py:283 msgid "Books sorted by " msgstr "" From abe11a17dcaf51dd5fc5b5a21ba6b148676b0d4f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 14:19:22 +0530 Subject: [PATCH 09/37] Finished testing the KF8 chunker --- src/calibre/ebooks/mobi/writer8/skeleton.py | 71 +++++++++++++++++++-- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index b8c90e0e36..04555467f3 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -13,6 +13,7 @@ from collections import namedtuple from lxml import etree from calibre.ebooks.oeb.base import XHTML_NS +from calibre.constants import ispy3 CHUNK_SIZE = 8192 @@ -48,6 +49,24 @@ def node_from_path(root, path): parent = parent[idx] return parent +mychr = chr if ispy3 else unichr + +def tostring(raw, **kwargs): + ''' lxml *sometimes* represents non-ascii characters as hex entities in + attribute values. I can't figure out exactly what circumstances cause it. + It seems to happen when serializing a part of a larger tree. Since we need + serialization to be the same when serializing full and partial trees, we + manually replace all hex entities with their unicode codepoints. 
''' + + xml_declaration = kwargs.pop('xml_declaration', False) + kwargs['encoding'] = unicode + kwargs['xml_declaration'] = False + ans = etree.tostring(raw, **kwargs) + if xml_declaration: + ans = '\n' + ans + return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)), + ans) + class Chunk(object): def __init__(self, raw): @@ -63,6 +82,12 @@ class Chunk(object): self.raw += chunk.raw self.ends_tags = chunk.ends_tags + def __repr__(self): + return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%( + len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags) + + __str__ = __repr__ + class Skeleton(object): def __init__(self, file_number, item, root, chunks): @@ -76,8 +101,8 @@ class Skeleton(object): self.calculate_insert_positions() def render(self, root): - raw = etree.tostring(root, encoding='UTF-8', xml_declaration=True) - raw = raw.replace('')[0]) + len(text) + 1 end_length = len(raw.rpartition(b'<')[-1]) + 1 self.metrics[tag.get('aid')] = Metric(start_length, end_length) @@ -101,6 +125,13 @@ class Skeleton(object): for tag in chunk.ends_tags: pos += self.metrics[tag].end + def rebuild(self): + ans = self.skeleton + for chunk in self.chunks: + i = chunk.insert_pos + ans = ans[:i] + chunk.raw + ans[i:] + return ans + class Chunker(object): def __init__(self, oeb, data_func): @@ -109,10 +140,20 @@ class Chunker(object): self.skeletons = [] + # Set this to a list to enable dumping of the original and rebuilt + # html files for debugging + self.orig_dumps = [] + for i, item in enumerate(self.oeb.spine): root = self.remove_namespaces(self.data(item)) body = root.xpath('//body')[0] body.tail = '\n' + if self.orig_dumps is not None: + self.orig_dumps.append(tostring(root, xml_declaration=True, + with_tail=True)) + self.orig_dumps[-1] = close_self_closing_tags( + self.orig_dumps[-1].replace(b' CHUNK_SIZE and child.get('aid', None): self.step_into_tag(child, chunks) @@ -230,3 +273,19 @@ class Chunker(object): prev.merge(chunk) return ans + 
def dump(self): + import tempfile, shutil, os + tdir = os.path.join(tempfile.gettempdir(), 'skeleton') + self.log('Skeletons dumped to:', tdir) + if os.path.exists(tdir): + shutil.rmtree(tdir) + orig = os.path.join(tdir, 'orig') + rebuilt = os.path.join(tdir, 'rebuilt') + for x in (orig, rebuilt): + os.makedirs(x) + for i, skeleton in enumerate(self.skeletons): + with open(os.path.join(orig, '%04d.html'%i), 'wb') as f: + f.write(self.orig_dumps[i]) + with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: + f.write(skeleton.rebuild()) + From 819b76f6575716ab35b43bf70af3b6ef42f8af93 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 14:20:09 +0530 Subject: [PATCH 10/37] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 04555467f3..d369e36b9d 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -65,7 +65,7 @@ def tostring(raw, **kwargs): if xml_declaration: ans = '\n' + ans return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)), - ans) + ans).encode('utf-8') class Chunk(object): From 67d93b84d69d093da5b62dc6546b624d281f77ec Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 14:22:42 +0530 Subject: [PATCH 11/37] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index d369e36b9d..4e83791962 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -59,13 +59,14 @@ def tostring(raw, **kwargs): manually replace all hex entities with their unicode codepoints. 
''' xml_declaration = kwargs.pop('xml_declaration', False) + encoding = kwargs.pop('encoding', 'UTF-8') kwargs['encoding'] = unicode kwargs['xml_declaration'] = False ans = etree.tostring(raw, **kwargs) if xml_declaration: - ans = '\n' + ans + ans = '\n'%encoding + ans return re.sub(r'&#x([0-9A-Fa-f]+);', lambda m:mychr(int(m.group(1), 16)), - ans).encode('utf-8') + ans).encode(encoding) class Chunk(object): From f2b734a12095a707d697a508524a80a834a8315f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 14:25:03 +0530 Subject: [PATCH 12/37] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 4e83791962..f28fbb8588 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -143,7 +143,7 @@ class Chunker(object): # Set this to a list to enable dumping of the original and rebuilt # html files for debugging - self.orig_dumps = [] + self.orig_dumps = None for i, item in enumerate(self.oeb.spine): root = self.remove_namespaces(self.data(item)) From 85d357bd47583c13bf849f8008a34699edcf6891 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 14:25:33 +0530 Subject: [PATCH 13/37] ... 
--- src/calibre/ebooks/mobi/writer8/skeleton.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index f28fbb8588..201d2b63d4 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -149,6 +149,7 @@ class Chunker(object): root = self.remove_namespaces(self.data(item)) body = root.xpath('//body')[0] body.tail = '\n' + if self.orig_dumps is not None: self.orig_dumps.append(tostring(root, xml_declaration=True, with_tail=True)) From 4b93ebc99068aa5493dcce549f84a9ee9f094488 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 14:41:53 +0530 Subject: [PATCH 14/37] Fix #986070 (Typos in tooltip messages in trunk) --- src/calibre/gui2/convert/search_and_replace.ui | 4 ++-- src/calibre/translations/calibre.pot | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui index 850f59885e..2497855abd 100644 --- a/src/calibre/gui2/convert/search_and_replace.ui +++ b/src/calibre/gui2/convert/search_and_replace.ui @@ -137,7 +137,7 @@ - Load a listof expression from a previously saved file + Load a list of expressions from a previously saved file &Load @@ -147,7 +147,7 @@ - Save this list of expression so that you can re-use it easily + Save this list of expressions so that you can re-use it easily &Save diff --git a/src/calibre/translations/calibre.pot b/src/calibre/translations/calibre.pot index 5b0c096104..aecd4e2625 100644 --- a/src/calibre/translations/calibre.pot +++ b/src/calibre/translations/calibre.pot @@ -5,8 +5,8 @@ msgid "" msgstr "" "Project-Id-Version: calibre 0.8.48\n" -"POT-Creation-Date: 2012-04-20 08:19+IST\n" -"PO-Revision-Date: 2012-04-20 08:19+IST\n" +"POT-Creation-Date: 2012-04-20 14:41+IST\n" +"PO-Revision-Date: 2012-04-20 14:41+IST\n" "Last-Translator: Automatically generated\n" "Language-Team: 
LANGUAGE\n" "MIME-Version: 1.0\n" From 13abe2bb6efb537bd2b5d404a7eda1c81ce80b1f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 18:49:22 +0530 Subject: [PATCH 15/37] KF8 Output: Text processing layer is complete --- src/calibre/ebooks/mobi/utils.py | 51 ++++++++ src/calibre/ebooks/mobi/writer2/__init__.py | 1 - src/calibre/ebooks/mobi/writer2/indexer.py | 3 +- src/calibre/ebooks/mobi/writer2/main.py | 61 +-------- src/calibre/ebooks/mobi/writer8/main.py | 25 +++- src/calibre/ebooks/mobi/writer8/skeleton.py | 136 ++++++++++++++++++-- 6 files changed, 201 insertions(+), 76 deletions(-) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 0ae992f438..fe5cd7eaf2 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -14,6 +14,7 @@ from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 +RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) def decode_string(raw, codec='utf-8', ordt_map=''): length, = struct.unpack(b'>B', raw[0]) @@ -498,3 +499,53 @@ def write_font_record(data, obfuscate=True, compress=True): # }}} +def create_text_record(text): + ''' + Return a Palmdoc record of size RECORD_SIZE from the text file object. + In case the record ends in the middle of a multibyte character return + the overlap as well. + + Returns data, overlap: where both are byte strings. overlap is the + extra bytes needed to complete the truncated multibyte character. 
+ ''' + opos = text.tell() + text.seek(0, 2) + # npos is the position of the next record + npos = min((opos + RECORD_SIZE, text.tell())) + # Number of bytes from the next record needed to complete the last + # character in this record + extra = 0 + + last = b'' + while not last.decode('utf-8', 'ignore'): + # last contains no valid utf-8 characters + size = len(last) + 1 + text.seek(npos - size) + last = text.read(size) + + # last now has one valid utf-8 char and possibly some bytes that belong + # to a truncated char + + try: + last.decode('utf-8', 'strict') + except UnicodeDecodeError: + # There are some truncated bytes in last + prev = len(last) + while True: + text.seek(npos - prev) + last = text.read(len(last) + 1) + try: + last.decode('utf-8') + except UnicodeDecodeError: + pass + else: + break + extra = len(last) - prev + + text.seek(opos) + data = text.read(RECORD_SIZE) + overlap = text.read(extra) + text.seek(npos) + + return data, overlap + diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py index bc8dbbf7de..df3dcefb94 100644 --- a/src/calibre/ebooks/mobi/writer2/__init__.py +++ b/src/calibre/ebooks/mobi/writer2/__init__.py @@ -12,5 +12,4 @@ UNCOMPRESSED = 1 PALMDOC = 2 HUFFDIC = 17480 PALM_MAX_IMAGE_SIZE = 63 * 1024 -RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index e349172d95..134fbadc60 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -12,9 +12,8 @@ from struct import pack from cStringIO import StringIO from collections import OrderedDict, defaultdict -from calibre.ebooks.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, - encode_tbs, align_block, utf8_text) + encode_tbs, align_block, utf8_text, RECORD_SIZE) class CNCX(object): # {{{ diff --git 
a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index b7a0d76424..c930609489 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -16,9 +16,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename -from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE) +from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED) from calibre.ebooks.mobi.utils import (encint, encode_trailing_data, - align_block, detect_periodical) + align_block, detect_periodical, RECORD_SIZE, create_text_record) from calibre.ebooks.mobi.writer2.indexer import Indexer EXTH_CODES = { @@ -163,9 +163,7 @@ class MobiWriter(object): # }}} - # Text {{{ - - def generate_text(self): + def generate_text(self): # {{{ self.oeb.logger.info('Serializing markup content...') self.serializer = Serializer(self.oeb, self.image_map, self.is_periodical, @@ -180,7 +178,7 @@ class MobiWriter(object): self.oeb.logger.info(' Compressing markup content...') while text.tell() < self.text_length: - data, overlap = self.read_text_record(text) + data, overlap = create_text_record(text) if self.compression == PALMDOC: data = compress_doc(data) @@ -197,57 +195,6 @@ class MobiWriter(object): if records_size % 4 != 0: self.records.append(b'\x00'*(records_size % 4)) self.first_non_text_record_idx += 1 - - def read_text_record(self, text): - ''' - Return a Palmdoc record of size RECORD_SIZE from the text file object. - In case the record ends in the middle of a multibyte character return - the overlap as well. - - Returns data, overlap: where both are byte strings. overlap is the - extra bytes needed to complete the truncated multibyte character. 
- ''' - opos = text.tell() - text.seek(0, 2) - # npos is the position of the next record - npos = min((opos + RECORD_SIZE, text.tell())) - # Number of bytes from the next record needed to complete the last - # character in this record - extra = 0 - - last = b'' - while not last.decode('utf-8', 'ignore'): - # last contains no valid utf-8 characters - size = len(last) + 1 - text.seek(npos - size) - last = text.read(size) - - # last now has one valid utf-8 char and possibly some bytes that belong - # to a truncated char - - try: - last.decode('utf-8', 'strict') - except UnicodeDecodeError: - # There are some truncated bytes in last - prev = len(last) - while True: - text.seek(npos - prev) - last = text.read(len(last) + 1) - try: - last.decode('utf-8') - except UnicodeDecodeError: - pass - else: - break - extra = len(last) - prev - - text.seek(opos) - data = text.read(RECORD_SIZE) - overlap = text.read(extra) - text.seek(npos) - - return data, overlap - # }}} def generate_record0(self): # MOBI header {{{ diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 79ff7c3d96..b924a4df7c 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -19,15 +19,13 @@ from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, extract, XHTML, urlnormalize) from calibre.ebooks.oeb.parse_utils import barename -from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags +from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href XML_DOCS = OEB_DOCS | {SVG_MIME} # References to record numbers in KF8 are stored as base-32 encoded integers, # with 4 digits to_ref = partial(to_base, base=32, min_num_digits=4) -# References in links are stored with 10 digits -to_href = partial(to_base, base=32, min_num_digits=10) class KF8Writer(object): @@ -167,7 +165,7 @@ class KF8Writer(object): self.link_map = {} count = 
0 hrefs = {item.href for item in self.oeb.spine} - for item in self.oeb.spine: + for i, item in enumerate(self.oeb.spine): root = self.data(item) for a in XPath('//h:a[@href]')(root): @@ -176,7 +174,8 @@ class KF8Writer(object): href, _, frag = ref.partition('#') href = urlnormalize(href) if href in hrefs: - placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count) + placeholder = 'kindle:pos:fid:%04d:off:%s'%(i, + to_href(count)) self.link_map[placeholder] = (href, frag) a.set('href', placeholder) @@ -199,7 +198,19 @@ class KF8Writer(object): j += 1 def chunk_it_up(self): - chunker = Chunker(self.oeb, self.data) - chunker + placeholder_map = {} + for placeholder, x in self.link_map.iteritems(): + href, frag = x + aid = self.id_map.get(x, None) + if aid is None: + aid = self.id_map.get((href, '')) + placeholder_map[placeholder] = aid + chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress, + placeholder_map) + + for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records', + 'last_text_record_idx', 'first_non_text_record_idx', + 'text_length'): + setattr(self, x, getattr(chunker, x)) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 201d2b63d4..da3b9407bd 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -9,14 +9,22 @@ __docformat__ = 'restructuredtext en' import re from collections import namedtuple +from io import BytesIO +from struct import pack +from functools import partial from lxml import etree from calibre.ebooks.oeb.base import XHTML_NS from calibre.constants import ispy3 +from calibre.ebooks.mobi.utils import create_text_record, to_base +from calibre.ebooks.compression.palmdoc import compress_doc CHUNK_SIZE = 8192 +# References in links are stored with 10 digits +to_href = partial(to_base, base=32, min_num_digits=10) + # Tags to which kindlegen adds the aid attribute aid_able_tags = {'a', 'abbr', 'address', 'article', 
'aside', 'audio', 'b', 'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details', @@ -70,11 +78,15 @@ def tostring(raw, **kwargs): class Chunk(object): - def __init__(self, raw): + def __init__(self, raw, parent_tag): self.raw = raw self.starts_tags = [] self.ends_tags = [] self.insert_pos = None + self.parent_tag = parent_tag + self.parent_is_body = False + self.is_last_chunk = False + self.is_first_chunk = False def __len__(self): return len(self.raw) @@ -87,6 +99,11 @@ class Chunk(object): return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%( len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags) + @property + def selector(self): + typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P' + return "%s-//*[@aid='%s']"%(typ, self.parent_tag) + __str__ = __repr__ class Skeleton(object): @@ -133,11 +150,20 @@ class Skeleton(object): ans = ans[:i] + chunk.raw + ans[i:] return ans + def __len__(self): + return len(self.skeleton) + sum([len(x.raw) for x in self.chunks]) + + @property + def raw_text(self): + return b''.join([self.skeleton] + [x.raw for x in self.chunks]) + class Chunker(object): - def __init__(self, oeb, data_func): + def __init__(self, oeb, data_func, compress, placeholder_map): self.oeb, self.log = oeb, oeb.log self.data = data_func + self.compress = compress + self.placeholder_map = placeholder_map self.skeletons = [] @@ -174,6 +200,19 @@ class Chunker(object): if self.orig_dumps: self.dump() + # Create the SKEL and Chunk tables + self.skel_table = [] + self.chunk_table = [] + self.create_tables() + + # Set internal links + text = b''.join(x.raw_text for x in self.skeletons) + text = self.set_internal_links(text) + + # Create text records + self.records = [] + self.create_text_records(text) + def remove_namespaces(self, root): lang = None for attr, val in root.attrib.iteritems(): @@ -206,15 +245,15 @@ class Chunker(object): return nroot - def step_into_tag(self, tag, chunks): aid = 
tag.get('aid') + is_body = tag.tag == 'body' first_chunk_idx = len(chunks) # First handle any text if tag.text and tag.text.strip(): # Leave pure whitespace in the skel - chunks.extend(self.chunk_up_text(tag.text)) + chunks.extend(self.chunk_up_text(tag.text, aid)) tag.text = None # Now loop over children @@ -224,15 +263,15 @@ class Chunker(object): if len(raw) > CHUNK_SIZE and child.get('aid', None): self.step_into_tag(child, chunks) if child.tail and child.tail.strip(): # Leave pure whitespace - chunks.extend(self.chunk_up_text(child.tail)) + chunks.extend(self.chunk_up_text(child.tail, aid)) child.tail = None else: if len(raw) > CHUNK_SIZE: self.log.warn('Tag %s has no aid and a too large chunk' ' size. Adding anyway.'%child.tag) - chunks.append(Chunk(raw)) + chunks.append(Chunk(raw, aid)) if child.tail: - chunks.extend(self.chunk_up_text(child.tail)) + chunks.extend(self.chunk_up_text(child.tail, aid)) tag.remove(child) if len(chunks) <= first_chunk_idx and chunks: @@ -242,8 +281,15 @@ class Chunker(object): if chunks: chunks[first_chunk_idx].starts_tags.append(aid) chunks[-1].ends_tags.append(aid) + my_chunks = chunks[first_chunk_idx:] + if my_chunks: + my_chunks[0].is_first_chunk = True + my_chunks[-1].is_last_chunk = True + if is_body: + for chunk in my_chunks: + chunk.parent_is_body = True - def chunk_up_text(self, text): + def chunk_up_text(self, text, parent_tag): text = text.encode('utf-8') ans = [] @@ -259,7 +305,7 @@ class Chunker(object): while rest: start, rest = split_multibyte_text(rest) ans.append(b'' + start + '') - return [Chunk(x) for x in ans] + return [Chunk(x, parent_tag) for x in ans] def merge_small_chunks(self, chunks): ans = chunks[:1] @@ -275,6 +321,77 @@ class Chunker(object): prev.merge(chunk) return ans + def create_tables(self): + Skel = namedtuple('Skel', + 'file_number name chunk_count start_pos length') + sp = 0 + for s in self.skeletons: + s.start_pos = sp + sp += len(s) + self.skel_table = [Skel(s.file_number, 
'SKEL%010d'%s.file_number, + len(s.chunks), s.start_pos, len(s.skeleton)) for x in self.skeletons] + + Chunk = namedtuple('Chunk', + 'insert_pos selector file_number sequence_number start_pos length') + num = cp = 0 + for skel in self.skeletons: + cp = skel.start_pos + for chunk in skel.chunks: + self.chunk_table.append( + Chunk(chunk.insert_pos + skel.start_pos, chunk.selector, + skel.file_number, num, cp, len(chunk.raw))) + cp += len(chunk.raw) + num += 1 + + def set_internal_links(self, text): + # First find the start pos of all tags with aids + aid_map = {} + for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text): + aid_map[match.group(1)] = match.start() + self.aid_offset_map = aid_map + placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in + self.placeholder_map.iteritems()} + + # Now update the links + def sub(match): + raw = match.group() + pl = match.group(1) + try: + return raw[:-10] + placeholder_map[pl] + except KeyError: + pass + return raw + + return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) + + def create_text_records(self, text): + self.text_length = len(text) + text = BytesIO(text) + nrecords = 0 + records_size = 0 + + if self.compress: + self.oeb.logger.info(' Compressing markup content...') + + while text.tell() < self.text_length: + data, overlap = create_text_record(text) + if self.compress: + data = compress_doc(data) + + data += overlap + data += pack(b'>B', len(overlap)) + + self.records.append(data) + records_size += len(data) + nrecords += 1 + + self.last_text_record_idx = nrecords + self.first_non_text_record_idx = nrecords + 1 + # Pad so that the next records starts at a 4 byte boundary + if records_size % 4 != 0: + self.records.append(b'\x00'*(records_size % 4)) + self.first_non_text_record_idx += 1 + def dump(self): import tempfile, shutil, os tdir = os.path.join(tempfile.gettempdir(), 'skeleton') @@ -291,3 +408,4 @@ class Chunker(object): with open(os.path.join(rebuilt, '%04d.html'%i), 
'wb') as f: f.write(skeleton.rebuild()) + From a6efef3d3159348665323cda2f9e8c9bffd2d990 Mon Sep 17 00:00:00 2001 From: Lee Date: Fri, 20 Apr 2012 21:52:57 +0800 Subject: [PATCH 16/37] removed dash unwrap regression from bug #822744 --- src/calibre/ebooks/conversion/utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 26b800f55b..e2a02702df 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -322,7 +322,6 @@ class HeuristicProcessor(object): lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?\s*()?" blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*" @@ -331,23 +330,19 @@ class HeuristicProcessor(object): unwrap_regex = lookahead+line_ending+blanklines+line_opening em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening - dash_unwrap_regex = dash+line_ending+blanklines+line_opening if format == 'txt': unwrap_regex = lookahead+txt_line_wrap em_en_unwrap_regex = em_en_lookahead+txt_line_wrap shy_unwrap_regex = soft_hyphen+txt_line_wrap - dash_unwrap_regex = dash+txt_line_wrap unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) - dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE) content = unwrap.sub(' ', content) content = em_en_unwrap.sub('', content) content = shy_unwrap.sub('', content) - content = dash_unwrap.sub('', content) return content def txt_process(self, match): From 7800024bac39d901c575f4369dd4528691faaf90 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 20:04:13 +0530 Subject: [PATCH 17/37] ... 
--- src/calibre/ebooks/mobi/writer8/main.py | 61 ++++++++++++++++++--- src/calibre/ebooks/mobi/writer8/skeleton.py | 42 +------------- 2 files changed, 57 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index b924a4df7c..d8ef501eb6 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en' import copy from functools import partial -from collections import defaultdict +from collections import defaultdict, namedtuple +from io import BytesIO +from struct import pack import cssutils from lxml import etree from calibre import isbytestring, force_unicode -from calibre.ebooks.mobi.utils import to_base +from calibre.ebooks.mobi.utils import create_text_record, to_base +from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, extract, XHTML, urlnormalize) from calibre.ebooks.oeb.parse_utils import barename @@ -31,11 +34,14 @@ class KF8Writer(object): def __init__(self, oeb, opts, resources): self.oeb, self.opts, self.log = oeb, opts, oeb.log + self.compress = not self.opts.dont_compress self.log.info('Creating KF8 output') self.used_images = set() self.resources = resources self.dup_data() self.flows = [None] # First flow item is reserved for the text + self.records = [] + self.fdst_table = [] self.replace_resource_links() self.extract_css_into_flows() @@ -43,6 +49,10 @@ class KF8Writer(object): self.replace_internal_links_with_placeholders() self.insert_aid_attributes() self.chunk_it_up() + # Dump the cloned data as it is no longer needed + del self._data_cache + self.create_text_records() + self.create_fdst_table() def dup_data(self): ''' Duplicate data so that any changes we make to markup/CSS only @@ -205,12 +215,49 @@ class KF8Writer(object): if aid is None: aid = self.id_map.get((href, '')) placeholder_map[placeholder] 
= aid - chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress, - placeholder_map) + chunker = Chunker(self.oeb, self.data, placeholder_map) - for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records', - 'last_text_record_idx', 'first_non_text_record_idx', - 'text_length'): + for x in ('skel_table', 'chunk_table', 'aid_offset_map'): setattr(self, x, getattr(chunker, x)) + self.flows[0] = chunker.text + + def create_text_records(self): + self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x + in self.flows] + text = b''.join(self.flows) + self.text_length = len(text) + text = BytesIO(text) + nrecords = 0 + records_size = 0 + + if self.compress: + self.oeb.logger.info(' Compressing markup content...') + + while text.tell() < self.text_length: + data, overlap = create_text_record(text) + if self.compress: + data = compress_doc(data) + + data += overlap + data += pack(b'>B', len(overlap)) + + self.records.append(data) + records_size += len(data) + nrecords += 1 + + self.last_text_record_idx = nrecords + self.first_non_text_record_idx = nrecords + 1 + # Pad so that the next records starts at a 4 byte boundary + if records_size % 4 != 0: + self.records.append(b'\x00'*(records_size % 4)) + self.first_non_text_record_idx += 1 + + def create_fdst_table(self): + FDST = namedtuple('Flow', 'start end') + for i, flow in enumerate(self.flows): + start = 0 if i == 0 else self.fdst_table[-1].end + self.fdst_table.append(FDST(start, start + len(flow))) + + diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index da3b9407bd..eff03c9de4 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en' import re from collections import namedtuple -from io import BytesIO -from struct import pack from functools import partial from lxml import etree from calibre.ebooks.oeb.base import XHTML_NS from 
calibre.constants import ispy3 -from calibre.ebooks.mobi.utils import create_text_record, to_base -from calibre.ebooks.compression.palmdoc import compress_doc +from calibre.ebooks.mobi.utils import to_base CHUNK_SIZE = 8192 @@ -159,10 +156,9 @@ class Skeleton(object): class Chunker(object): - def __init__(self, oeb, data_func, compress, placeholder_map): + def __init__(self, oeb, data_func, placeholder_map): self.oeb, self.log = oeb, oeb.log self.data = data_func - self.compress = compress self.placeholder_map = placeholder_map self.skeletons = [] @@ -207,11 +203,7 @@ class Chunker(object): # Set internal links text = b''.join(x.raw_text for x in self.skeletons) - text = self.set_internal_links(text) - - # Create text records - self.records = [] - self.create_text_records(text) + self.text = self.set_internal_links(text) def remove_namespaces(self, root): lang = None @@ -364,34 +356,6 @@ class Chunker(object): return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) - def create_text_records(self, text): - self.text_length = len(text) - text = BytesIO(text) - nrecords = 0 - records_size = 0 - - if self.compress: - self.oeb.logger.info(' Compressing markup content...') - - while text.tell() < self.text_length: - data, overlap = create_text_record(text) - if self.compress: - data = compress_doc(data) - - data += overlap - data += pack(b'>B', len(overlap)) - - self.records.append(data) - records_size += len(data) - nrecords += 1 - - self.last_text_record_idx = nrecords - self.first_non_text_record_idx = nrecords + 1 - # Pad so that the next records starts at a 4 byte boundary - if records_size % 4 != 0: - self.records.append(b'\x00'*(records_size % 4)) - self.first_non_text_record_idx += 1 - def dump(self): import tempfile, shutil, os tdir = os.path.join(tempfile.gettempdir(), 'skeleton') From fbcd3eb279b8acd0d450d42069827fda8d9c9d0f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 20:11:12 +0530 Subject: [PATCH 18/37] ... 
--- recipes/tpm_uk.recipe | 2 ++ src/calibre/ebooks/mobi/writer8/main.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/tpm_uk.recipe b/recipes/tpm_uk.recipe index aa042de951..0ccad32fa9 100644 --- a/recipes/tpm_uk.recipe +++ b/recipes/tpm_uk.recipe @@ -11,6 +11,8 @@ class TPM_uk(BasicNewsRecipe): __author__ = 'Darko Miletic' description = 'Title says it all' publisher = "The Philosophers' Magazine" + recipe_disabled = ('This recipe has been disabled as the website has' + ' started providing articles only in PDF form') category = 'philosophy, news' oldest_article = 25 max_articles_per_feed = 200 diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index d8ef501eb6..c9334b22a3 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -259,5 +259,3 @@ class KF8Writer(object): start = 0 if i == 0 else self.fdst_table[-1].end self.fdst_table.append(FDST(start, start + len(flow))) - - From 03ed4010f58ebb8499d0b9d49ecc5c275214a3d4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 20:49:23 +0530 Subject: [PATCH 19/37] ... 
--- src/calibre/ebooks/mobi/writer8/skeleton.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index eff03c9de4..4b39d0cb15 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -165,7 +165,7 @@ class Chunker(object): # Set this to a list to enable dumping of the original and rebuilt # html files for debugging - self.orig_dumps = None + orig_dumps = None for i, item in enumerate(self.oeb.spine): root = self.remove_namespaces(self.data(item)) @@ -193,8 +193,8 @@ class Chunker(object): # for all chunks self.skeletons.append(Skeleton(i, item, root, chunks)) - if self.orig_dumps: - self.dump() + if orig_dumps: + self.dump(orig_dumps) # Create the SKEL and Chunk tables self.skel_table = [] @@ -356,7 +356,7 @@ class Chunker(object): return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) - def dump(self): + def dump(self, orig_dumps): import tempfile, shutil, os tdir = os.path.join(tempfile.gettempdir(), 'skeleton') self.log('Skeletons dumped to:', tdir) @@ -368,7 +368,7 @@ class Chunker(object): os.makedirs(x) for i, skeleton in enumerate(self.skeletons): with open(os.path.join(orig, '%04d.html'%i), 'wb') as f: - f.write(self.orig_dumps[i]) + f.write(orig_dumps[i]) with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: f.write(skeleton.rebuild()) From 8d44e8d83f4b7f84051463117cef5cfcfdad5252 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 21:08:27 +0530 Subject: [PATCH 20/37] ... 
--- src/calibre/ebooks/mobi/writer8/main.py | 9 ++++----- src/calibre/ebooks/mobi/writer8/skeleton.py | 9 +++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index c9334b22a3..430d695fd1 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -175,7 +175,7 @@ class KF8Writer(object): self.link_map = {} count = 0 hrefs = {item.href for item in self.oeb.spine} - for i, item in enumerate(self.oeb.spine): + for item in self.oeb.spine: root = self.data(item) for a in XPath('//h:a[@href]')(root): @@ -184,8 +184,7 @@ class KF8Writer(object): href, _, frag = ref.partition('#') href = urlnormalize(href) if href in hrefs: - placeholder = 'kindle:pos:fid:%04d:off:%s'%(i, - to_href(count)) + placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count) self.link_map[placeholder] = (href, frag) a.set('href', placeholder) @@ -201,9 +200,9 @@ class KF8Writer(object): aid = aidbase + j tag.attrib['aid'] = to_base(aid, base=32) if tag.tag == XHTML('body'): - self.id_map[(item.href, '')] = tag.attrib['aid'] + self.id_map[(item.href, '')] = (i, tag.attrib['aid']) if id_ is not None: - self.id_map[(item.href, id_)] = tag.attrib['aid'] + self.id_map[(item.href, id_)] = (i, tag.attrib['aid']) j += 1 diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 4b39d0cb15..494aa30def 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -341,7 +341,12 @@ class Chunker(object): for match in re.finditer(br'<[^>]+? 
aid=[\'"]([A-Z0-9]+)[\'"]', text): aid_map[match.group(1)] = match.start() self.aid_offset_map = aid_map - placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in + + def to_placeholder(x): + file_number, aid = x + return bytes('%04d:%s'%(file_number, to_href(aid_map[aid]))) + + placeholder_map = {bytes(k):to_placeholder(v) for k, v in self.placeholder_map.iteritems()} # Now update the links @@ -349,7 +354,7 @@ class Chunker(object): raw = match.group() pl = match.group(1) try: - return raw[:-10] + placeholder_map[pl] + return raw[:-15] + placeholder_map[pl] except KeyError: pass return raw From 6c631e0e64ce2ce7604367ebed60457d51924af2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 21:15:12 +0530 Subject: [PATCH 21/37] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 494aa30def..d04f119316 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -371,10 +371,19 @@ class Chunker(object): rebuilt = os.path.join(tdir, 'rebuilt') for x in (orig, rebuilt): os.makedirs(x) + error = False for i, skeleton in enumerate(self.skeletons): + oraw, rraw = orig_dumps[i], skeleton.rebuild() with open(os.path.join(orig, '%04d.html'%i), 'wb') as f: - f.write(orig_dumps[i]) + f.write(oraw) with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: - f.write(skeleton.rebuild()) + f.write(rraw) + if oraw != rraw: + error = True + if error: + raise ValueError('The before and after HTML differs. 
Run a diff ' + 'tool on the orig and rebuilt directories') + else: + self.log('Skeleton HTML before and after is identical.') From 9217e6bed381211842f56d25c9ed4957324a2b7e Mon Sep 17 00:00:00 2001 From: Lee Date: Sat, 21 Apr 2012 00:24:32 +0800 Subject: [PATCH 22/37] remove full stop punctuation from the line unwrap heuristic --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index e2a02702df..24528d1fb8 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -319,7 +319,7 @@ class HeuristicProcessor(object): ''' # define the pieces of the regex - lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?\s*()?" From 740c812de24e38120b33ba7d094ad288bc7cf234 Mon Sep 17 00:00:00 2001 From: Lee Date: Sat, 21 Apr 2012 00:52:13 +0800 Subject: [PATCH 23/37] expanded comments --- src/calibre/ebooks/conversion/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 24528d1fb8..6dc3973213 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -316,9 +316,17 @@ class HeuristicProcessor(object): ''' Unwraps lines based on line length and punctuation supports a range of html markup and text files + + the lookahead regex below is meant look for any non-full stop characters - punctuation + characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc + the reason for this is to prevent false positive wrapping. 
False positives are more + difficult to detect than false negatives during a manual review of the doc + + This function intentionally leaves hyphenated content alone as that is handled by the + dehyphenate routine in a future step ''' - # define the pieces of the regex + # define the pieces of the regex lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(? Date: Sat, 21 Apr 2012 00:54:35 +0800 Subject: [PATCH 24/37] ... --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6dc3973213..acfa80e877 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -323,7 +323,7 @@ class HeuristicProcessor(object): difficult to detect than false negatives during a manual review of the doc This function intentionally leaves hyphenated content alone as that is handled by the - dehyphenate routine in a future step + dehyphenate routine in a separate step ''' # define the pieces of the regex From b717749138e144155edc86c7d61ff8c1413e7d9a Mon Sep 17 00:00:00 2001 From: Lee Date: Sat, 21 Apr 2012 00:59:30 +0800 Subject: [PATCH 25/37] fix the pattern in preprocess --- src/calibre/ebooks/conversion/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c526cba8a9..16acaad383 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -559,7 +559,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*

\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: From 081897ae5723958830db099240dd461c521b822f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 22:39:32 +0530 Subject: [PATCH 26/37] KF8 Output: Start work on the index layer --- src/calibre/ebooks/mobi/utils.py | 46 +++++++++++++ src/calibre/ebooks/mobi/writer2/indexer.py | 49 +++----------- src/calibre/ebooks/mobi/writer8/index.py | 78 ++++++++++++++++++++++ 3 files changed, 132 insertions(+), 41 deletions(-) create mode 100644 src/calibre/ebooks/mobi/writer8/index.py diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index fe5cd7eaf2..319af30f86 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en' import struct, string, imghdr, zlib, os from collections import OrderedDict +from io import BytesIO from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail from calibre.ebooks import normalize @@ -549,3 +550,48 @@ def create_text_record(text): return data, overlap +class CNCX(object): # {{{ + + ''' + Create the CNCX records. These are records containing all the strings from + an index. Each record is of the form: + ''' + + MAX_STRING_LENGTH = 500 + + def __init__(self, strings=()): + self.strings = OrderedDict((s, 0) for s in strings) + + self.records = [] + offset = 0 + buf = BytesIO() + for key in tuple(self.strings.iterkeys()): + utf8 = utf8_text(key[:self.MAX_STRING_LENGTH]) + l = len(utf8) + sz_bytes = encint(l) + raw = sz_bytes + utf8 + if 0xfbf8 - buf.tell() < 6 + len(raw): + # Records in PDB files cannot be larger than 0x10000, so we + # stop well before that. 
+ pad = 0xfbf8 - buf.tell() + buf.write(b'\0' * pad) + self.records.append(buf.getvalue()) + buf.truncate(0) + offset = len(self.records) * 0x10000 + buf.write(raw) + self.strings[key] = offset + offset += len(raw) + + self.records.append(align_block(buf.getvalue())) + + def __getitem__(self, string): + return self.strings[string] + + def __bool__(self): + return bool(self.records) + __nonzero__ = __bool__ + +# }}} + + diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index 134fbadc60..be926a80a0 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -13,54 +13,21 @@ from cStringIO import StringIO from collections import OrderedDict, defaultdict from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, - encode_tbs, align_block, utf8_text, RECORD_SIZE) + encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_) -class CNCX(object): # {{{ - - ''' - Create the CNCX records. These are records containing all the strings from - the NCX. Each record is of the form: - ''' - - MAX_STRING_LENGTH = 500 +class CNCX(CNCX_): # {{{ def __init__(self, toc, is_periodical): - self.strings = OrderedDict() - + strings = [] for item in toc.iterdescendants(breadth_first=True): - self.strings[item.title] = 0 + strings.append(item.title) if is_periodical: - self.strings[item.klass] = 0 + strings.append(item.klass) if item.author: - self.strings[item.author] = 0 + strings.append(item.author) if item.description: - self.strings[item.description] = 0 - - self.records = [] - offset = 0 - buf = StringIO() - for key in tuple(self.strings.iterkeys()): - utf8 = utf8_text(key[:self.MAX_STRING_LENGTH]) - l = len(utf8) - sz_bytes = encint(l) - raw = sz_bytes + utf8 - if 0xfbf8 - buf.tell() < 6 + len(raw): - # Records in PDB files cannot be larger than 0x10000, so we - # stop well before that. 
- pad = 0xfbf8 - buf.tell() - buf.write(b'\0' * pad) - self.records.append(buf.getvalue()) - buf.truncate(0) - offset = len(self.records) * 0x10000 - buf.write(raw) - self.strings[key] = offset - offset += len(raw) - - self.records.append(align_block(buf.getvalue())) - - def __getitem__(self, string): - return self.strings[string] + strings.append(item.description) + CNCX_.__init__(self, strings) # }}} class TAGX(object): # {{{ diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py new file mode 100644 index 0000000000..a2b991a612 --- /dev/null +++ b/src/calibre/ebooks/mobi/writer8/index.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) +from future_builtins import map + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from collections import namedtuple +from struct import pack + +from calibre.ebooks.mobi.utils import CNCX + +TagMeta = namedtuple('TagMeta', + 'name number values_per_entry bitmask end_flag') +EndTagTable = TagMeta('eof', 0, 0, 0, 1) + +class Index(object): + + control_byte_count = 1 + cncx = CNCX() + tag_types = (EndTagTable,) + + @classmethod + def generate_tagx(cls): + header = b'TAGX' + byts = bytearray() + for tag_meta in cls.tag_types: + byts.extend(tag_meta[1:]) + # table length, control byte count + header += pack(b'>II', 12+len(byts), cls.control_byte_count) + return header + bytes(byts) + +class SkelIndex(Index): + + tag_types = tuple(map(TagMeta, ( + ('chunk_count', 1, 1, 3, 0), + ('geometry', 6, 2, 12, 0), + EndTagTable + ))) + + def __init__(self, skel_table): + self.entries = [ + (s.name, { + # Dont ask me why these entries have to be repeated twice + 'chunk_count':(s.chunk_count, s.chunk_count), + 'geometry':(s.start_pos, s.length, s.start_pos, s.length), + }) for s in skel_table + ] + + +class 
ChunkIndex(Index): + + tag_types = tuple(map(TagMeta, ( + ('cncx_offset', 2, 1, 1, 0), + ('file_number', 3, 1, 2, 0), + ('sequence_number', 4, 1, 4, 0), + ('geometry', 6, 2, 8, 0), + EndTagTable + ))) + + def __init__(self, chunk_table): + self.cncx = CNCX(c.selector for c in chunk_table) + + self.entries = [ + ('%010d'%c.insert_pos, { + + 'cncx_offset':self.cncx[c.selector], + 'file_number':c.file_number, + 'sequence_number':c.sequence_number, + 'geometry':(c.start_pos, c.length), + }) for s in chunk_table + ] + + + From 22ee4152416a98e84a587f9fcf1a1f5aa52f4960 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 23:32:53 +0530 Subject: [PATCH 27/37] ... --- src/calibre/ebooks/mobi/writer8/index.py | 55 ++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py index a2b991a612..1ee20857fb 100644 --- a/src/calibre/ebooks/mobi/writer8/index.py +++ b/src/calibre/ebooks/mobi/writer8/index.py @@ -10,13 +10,20 @@ __docformat__ = 'restructuredtext en' from collections import namedtuple from struct import pack +from io import BytesIO -from calibre.ebooks.mobi.utils import CNCX +from calibre.ebooks.mobi.utils import CNCX, encint TagMeta = namedtuple('TagMeta', 'name number values_per_entry bitmask end_flag') EndTagTable = TagMeta('eof', 0, 0, 0, 1) +# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks +# could also be extended to 4 bit wide ones as well +mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6, + 128:7, 192: 6 } + + class Index(object): control_byte_count = 1 @@ -33,6 +40,50 @@ class Index(object): header += pack(b'>II', 12+len(byts), cls.control_byte_count) return header + bytes(byts) + @classmethod + def calculate_control_bytes_for_each_entry(cls, entries): + control_bytes = [] + for lead_text, tags in entries: + cbs = [] + ans = 0 + for (name, number, vpe, mask, endi) in 
cls.tag_types: + if endi == 1: + cbs.append(ans) + ans = 0 + continue + nvals = len(tags.get(name, ())) + nentries = nvals // vpe + shifts = mask_to_bit_shifts[mask] + ans |= mask & (nentries << shifts) + if len(cbs) != cls.control_byte_count: + raise ValueError('The entry %r is invalid'%[lead_text, tags]) + control_bytes.append(cbs) + return control_bytes + + def build_records(self): + self.control_bytes = self.calculate_control_bytes_for_each_entry( + self.entries) + + self.rendered_entries = [] + offset = 0 + IndexEntry = namedtuple('IndexEntry', 'offset length raw') + for i, x in enumerate(self.entries): + control_bytes = self.control_bytes[i] + leading_text, tags = x + buf = BytesIO() + raw = bytearray(leading_text) + raw.insert(0, len(leading_text)) + buf.write(bytes(raw)) + buf.write(control_bytes) + for tag in self.tag_types: + values = tags.get(tag.name, None) + if values: + for val in values: + buf.write(encint(val)) + raw = buf.getvalue() + self.rendered_entries.append(IndexEntry(offset, len(raw), raw)) + offset += len(raw) + class SkelIndex(Index): tag_types = tuple(map(TagMeta, ( @@ -74,5 +125,3 @@ class ChunkIndex(Index): }) for s in chunk_table ] - - From 5d3e24e1053e6078dfe3a7e9a0fe135baeb69286 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Apr 2012 07:50:27 +0530 Subject: [PATCH 28/37] Fix set_internal_links() --- src/calibre/ebooks/mobi/writer8/main.py | 4 +- src/calibre/ebooks/mobi/writer8/skeleton.py | 42 +++++++++++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 430d695fd1..955fbab460 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -200,9 +200,9 @@ class KF8Writer(object): aid = aidbase + j tag.attrib['aid'] = to_base(aid, base=32) if tag.tag == XHTML('body'): - self.id_map[(item.href, '')] = (i, tag.attrib['aid']) + self.id_map[(item.href, '')] = tag.attrib['aid'] if 
id_ is not None: - self.id_map[(item.href, id_)] = (i, tag.attrib['aid']) + self.id_map[(item.href, id_)] = tag.attrib['aid'] j += 1 diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index d04f119316..4da540cac6 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -172,11 +172,11 @@ class Chunker(object): body = root.xpath('//body')[0] body.tail = '\n' - if self.orig_dumps is not None: - self.orig_dumps.append(tostring(root, xml_declaration=True, + if orig_dumps is not None: + orig_dumps.append(tostring(root, xml_declaration=True, with_tail=True)) - self.orig_dumps[-1] = close_self_closing_tags( - self.orig_dumps[-1].replace(b']+? aid=[\'"]([A-Z0-9]+)[\'"]', text): - aid_map[match.group(1)] = match.start() + offset = match.start() + pos_fid = None + for chunk in self.chunk_table: + if chunk.insert_pos <= offset < chunk.insert_pos + chunk.length: + pos_fid = (chunk.sequence_number, offset-chunk.insert_pos) + break + if chunk.insert_pos > offset: + # This aid is in the skeleton, not in a chunk, so we use + # the chunk immediately after + pos_fid = (chunk.sequence_number, 0) + break + if pos_fid is None: + raise ValueError('Could not find chunk for aid: %r'% + match.group(1)) + aid_map[match.group(1)] = (to_base(chunk.sequence_number, + base=32, min_num_digits=4), + to_href(offset-chunk.insert_pos)) + self.aid_offset_map = aid_map - def to_placeholder(x): - file_number, aid = x - return bytes('%04d:%s'%(file_number, to_href(aid_map[aid]))) + def to_placeholder(aid): + return bytes(':'.join(aid_map[aid])) placeholder_map = {bytes(k):to_placeholder(v) for k, v in self.placeholder_map.iteritems()} @@ -359,7 +379,7 @@ class Chunker(object): pass return raw - return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) + return re.sub(br'<[^>]+(kindle:pos:fid:0000:\d{10})', sub, text) def dump(self, orig_dumps): import tempfile, shutil, os From 
5c72ad513b982741b6dc0777d89cda837f9566a8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Apr 2012 07:52:27 +0530 Subject: [PATCH 29/37] ... --- src/calibre/ebooks/mobi/writer8/skeleton.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 4da540cac6..8f0a3795db 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -336,7 +336,9 @@ class Chunker(object): num += 1 def set_internal_links(self, text): - # A kindle pos:fid link contains two base 32 numbers of the form + ''' Update the internal link placeholders to point to the correct + location, based on the chunk table.''' + # A kindle:pos:fid link contains two base 32 numbers of the form # XXXX:YYYYYYYYYY # The first number is an index into the chunk table and the second is # an offset from the start of the chunk to the start of the tag pointed From 9ab4ff1840a7b3735a6e94e4c1465295285bfc4f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Apr 2012 11:15:31 +0530 Subject: [PATCH 30/37] A nice framework for generating MOBI header records --- .../ebooks/conversion/plugins/mobi_output.py | 2 +- src/calibre/ebooks/mobi/debug/index.py | 4 +- src/calibre/ebooks/mobi/debug/mobi8.py | 2 +- src/calibre/ebooks/mobi/utils.py | 7 +- src/calibre/ebooks/mobi/writer8/header.py | 77 +++++++++++ src/calibre/ebooks/mobi/writer8/index.py | 125 +++++++++++++++++- 6 files changed, 206 insertions(+), 11 deletions(-) create mode 100644 src/calibre/ebooks/mobi/writer8/header.py diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index 89ab91f8eb..971d11df3b 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -169,6 +169,7 @@ class MOBIOutput(OutputFormatPlugin): self.remove_html_cover() resources = Resources(oeb, 
opts, self.is_periodical, add_fonts=create_kf8) + self.check_for_periodical() kf8 = self.create_kf8(resources) if create_kf8 else None @@ -203,7 +204,6 @@ class MOBIOutput(OutputFormatPlugin): resources.add_extra_images() mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables) mobimlizer(oeb, opts) - self.check_for_periodical() write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz') from calibre.ebooks.mobi.writer2.main import MobiWriter writer = MobiWriter(opts, resources, kf8, diff --git a/src/calibre/ebooks/mobi/debug/index.py b/src/calibre/ebooks/mobi/debug/index.py index 1af1611918..94f252e231 100644 --- a/src/calibre/ebooks/mobi/debug/index.py +++ b/src/calibre/ebooks/mobi/debug/index.py @@ -17,7 +17,7 @@ from calibre.ebooks.mobi.reader.ncx import (tag_fieldname_map, default_entry) File = namedtuple('File', 'file_number name divtbl_count start_position length') -Elem = namedtuple('Elem', +Elem = namedtuple('Chunk', 'insert_pos toc_text file_number sequence_number start_pos ' 'length') @@ -110,7 +110,7 @@ class SECTIndex(Index): for i, text in enumerate(self.table.iterkeys()): tag_map = self.table[text] if set(tag_map.iterkeys()) != {2, 3, 4, 6}: - raise ValueError('SECT Index has unknown tags: %s'% + raise ValueError('Chunk Index has unknown tags: %s'% (set(tag_map.iterkeys())-{2, 3, 4, 6})) toc_text = self.cncx[tag_map[2][0]] diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index 1c61690d42..e3e26af0b1 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -198,7 +198,7 @@ def inspect_mobi(mobi_file, ddir): with open(os.path.join(ddir, 'skel.record'), 'wb') as fo: fo.write(str(f.skel_index).encode('utf-8')) - with open(os.path.join(ddir, 'sect.record'), 'wb') as fo: + with open(os.path.join(ddir, 'chunks.record'), 'wb') as fo: fo.write(str(f.sect_index).encode('utf-8')) with open(os.path.join(ddir, 'ncx.record'), 'wb') as fo: diff --git 
a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 319af30f86..aa59ee2217 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -583,7 +583,9 @@ class CNCX(object): # {{{ self.strings[key] = offset offset += len(raw) - self.records.append(align_block(buf.getvalue())) + val = buf.getvalue() + if val: + self.records.append(align_block(val)) def __getitem__(self, string): return self.strings[string] @@ -592,6 +594,9 @@ class CNCX(object): # {{{ return bool(self.records) __nonzero__ = __bool__ + def __len__(self): + return len(self.records) + # }}} diff --git a/src/calibre/ebooks/mobi/writer8/header.py b/src/calibre/ebooks/mobi/writer8/header.py new file mode 100644 index 0000000000..31571d0f5f --- /dev/null +++ b/src/calibre/ebooks/mobi/writer8/header.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from io import BytesIO +from collections import OrderedDict +from struct import pack + +from calibre.ebooks.mobi.utils import align_block + +NULL = 0xffffffff +zeroes = lambda x: b'\0'*x +nulls = lambda x: b'\xff'*x + +class Header(OrderedDict): + + HEADER_NAME = b'' + + DEFINITION = ''' + ''' + + ALIGN_BLOCK = False + POSITIONS = {} + + def __init__(self): + OrderedDict.__init__(self) + + for line in self.DEFINITION.splitlines(): + line = line.strip() + if not line or line.startswith('#'): continue + name, val = [x.strip() for x in line.partition('=')[0::2]] + if val: + val = eval(val, {'zeroes':zeroes, 'NULL':NULL, 'DYN':None, + 'nulls':nulls}) + else: + val = 0 + if name in self: + raise ValueError('Duplicate field in definition: %r'%name) + self[name] = val + + def __call__(self, **kwargs): + positions = {} + for name, val in kwargs.iteritems(): + if name not in self: + raise 
KeyError('Not a valid header field: %r'%name) + self[name] = val + + buf = BytesIO() + buf.write(bytes(self.HEADER_NAME)) + for name, val in self.iteritems(): + val = self.format_value(name, val) + positions[name] = buf.tell() + if val is None: + raise ValueError('Dynamic field %r not set'%name) + if isinstance(val, (int, long)): + val = pack(b'>I', val) + buf.write(val) + + for pos_field, field in self.POSITIONS.iteritems(): + buf.seek(positions[pos_field]) + buf.write(pack(b'>I', positions[field])) + + ans = buf.getvalue() + if self.ALIGN_BLOCK: + ans = align_block(ans) + return ans + + + def format_value(self, name, val): + return val + + diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py index 1ee20857fb..153e140b06 100644 --- a/src/calibre/ebooks/mobi/writer8/index.py +++ b/src/calibre/ebooks/mobi/writer8/index.py @@ -12,7 +12,8 @@ from collections import namedtuple from struct import pack from io import BytesIO -from calibre.ebooks.mobi.utils import CNCX, encint +from calibre.ebooks.mobi.utils import CNCX, encint, align_block +from calibre.ebooks.mobi.writer8.header import Header TagMeta = namedtuple('TagMeta', 'name number values_per_entry bitmask end_flag') @@ -23,13 +24,79 @@ EndTagTable = TagMeta('eof', 0, 0, 0, 1) mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6, 128:7, 192: 6 } +class IndexHeader(Header): # {{{ -class Index(object): + HEADER_NAME = b'INDX' + ALIGN_BLOCK = True + HEADER_LENGTH = 192 + + DEFINITION = ''' + # 4 - 8: Header Length + header_length = {header_length} + + # 8 - 16: Unknown + unknown1 = zeroes(8) + + # 16 - 20: Index type: 0 - normal 2 - inflection + type = 2 + + # 20 - 24: IDXT offset (filled in later) + idxt_offset + + # 24 - 28: Number of index records + num_of_records = 1 + + # 28 - 32: Index encoding (65001 = utf-8) + encoding = 65001 + + # 32 - 36: Unknown + unknown2 = NULL + + # 36 - 40: Number of Index entries + num_of_entries = DYN + + # 40 - 44: 
ORDT offset + ordt_offset + + # 44 - 48: LIGT offset + ligt_offset + + # 48 - 52: Number of ORDT/LIGT? entries + num_of_ordt_entries + + # 52 - 56: Number of CNCX records + num_of_cncx = DYN + + # 56 - 180: Unknown + unknown3 = zeroes(124) + + # 180 - 184: TAGX offset + tagx_offset = {header_length} + + # 184 - 192: Unknown + unknown4 = zeroes(8) + + # TAGX + tagx = DYN + + # Last Index entry + last_index = DYN + + # IDXT + idxt = DYN + '''.format(header_length=HEADER_LENGTH) + + POSITIONS = {'idxt_offset':'idxt'} +# }}} + +class Index(object): # {{{ control_byte_count = 1 cncx = CNCX() tag_types = (EndTagTable,) + HEADER_LENGTH = IndexHeader.HEADER_LENGTH + @classmethod def generate_tagx(cls): header = b'TAGX' @@ -60,17 +127,18 @@ class Index(object): control_bytes.append(cbs) return control_bytes - def build_records(self): + def __call__(self): self.control_bytes = self.calculate_control_bytes_for_each_entry( self.entries) - self.rendered_entries = [] + rendered_entries = [] offset = 0 + index, idxt, buf = BytesIO(), BytesIO(), BytesIO() IndexEntry = namedtuple('IndexEntry', 'offset length raw') for i, x in enumerate(self.entries): control_bytes = self.control_bytes[i] leading_text, tags = x - buf = BytesIO() + buf.truncate(0) raw = bytearray(leading_text) raw.insert(0, len(leading_text)) buf.write(bytes(raw)) @@ -81,8 +149,53 @@ class Index(object): for val in values: buf.write(encint(val)) raw = buf.getvalue() - self.rendered_entries.append(IndexEntry(offset, len(raw), raw)) + rendered_entries.append(IndexEntry(offset, len(raw), raw)) + idxt.write(pack(b'>H', self.HEADER_LENGTH+offset)) offset += len(raw) + index.write(raw) + + index_block = align_block(index.getvalue()) + idxt_block = align_block(b'IDXT' + idxt.getvalue()) + body = index_block + idxt_block + if len(body) + self.HEADER_LENGTH >= 0x10000: + raise ValueError('Index has too many entries, calibre does not' + ' support generating multiple index records at this' + ' time.') + + header = b'INDX' + 
buf.truncate(0) + buf.write(pack(b'>I', self.HEADER_LENGTH)) + buf.write(b'\0'*4) # Unknown + buf.write(pack(b'>I', 1)) # Header type? Or index record number? + buf.write(b'\0'*4) # Unknown + + # IDXT block offset + buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block))) + + # Number of index entries + buf.write(pack(b'>I', len(rendered_entries))) + + buf.write(b'\xff'*8) # Unknown + + buf.write(b'\0'*156) # Unknown + + header += buf.getvalue() + index_record = header + body + + tagx = self.generate_tagx() + idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) + + b'\0') + header = { + 'num_of_entries': len(rendered_entries), + 'num_of_cncx': len(self.cncx), + 'tagx':tagx, + 'idxt':idxt + } + header = IndexHeader()(**header) + self.records = [header, index_record] + self.records.extend(self.cncx.records) + return self.records +# }}} class SkelIndex(Index): From 687586f9a16f55b7c675690e130c7a61be145f7e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Apr 2012 13:20:14 +0530 Subject: [PATCH 31/37] ... --- src/calibre/manual/faq.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index a248962abd..f0d9aa8bd3 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -29,7 +29,7 @@ It can convert every input format in the following list, to every output format. PRC is a generic format, |app| supports PRC files with TextRead and MOBIBook headers. PDB is also a generic format. |app| supports eReder, Plucker, PML and zTxt PDB files. DJVU support is only for converting DJVU files that contain embedded text. These are typically generated by OCR software. - MOBI books can be of two types Mobi6 and KF8. |app| currently fully supports Mobi6 and supports conversion from, but not to, KF8 + MOBI books can be of two types Mobi6 and KF8. |app| fully supports both. MOBI files often have .azw or .azw3 file extensions .. 
_best-source-formats: From 0db1fcb10396f81e7a1bbf13e7900d125eeeb88d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 21 Apr 2012 23:38:52 +0530 Subject: [PATCH 32/37] Fix #986598 (New recipe for News agency Telam) --- recipes/icons/telam.png | Bin 0 -> 1992 bytes recipes/telam.recipe | 62 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 recipes/icons/telam.png create mode 100644 recipes/telam.recipe diff --git a/recipes/icons/telam.png b/recipes/icons/telam.png new file mode 100644 index 0000000000000000000000000000000000000000..f86dcc1dbf16db5e41411f0dd20c4af478336877 GIT binary patch literal 1992 zcmV;(2RHbMP)t)QrX=s)%^L5e6e zVnBo6IDE|Jqk9#@q ze9mwCo*xuKfd8ez|35%3NC03A$aN4ffK&|u1mS-HkO<)gc)GSQ>-pCz*kqUjTGT+^ zNfom8X#Y@mT7&l^-5Vf3x<38mU(dZ&ZO?~3@x;}b7P0i8j4lTMMWmfJI2cyik( zyBzD|0p!8ooGHKk?upY;He}73z6wHbkoq@G-HLD%lb)#IA(rN#Cl1tn?nj?~Y|F&D zk8<<}|{$bB4NPdHK)04ZgD<3TN)+`04V z&+ND-V+9}(;*GP{UYeS6oq-ucje~H^Bzt=LH#&MpE~6^8TuK?f+$>)y-8eUO`GSBv zR1D;$HSGtV+V<7QxBn|+1>oFF>G@+lYCbq|1izW(?5A0IjP)Wpzt zKJl>5S;-|=IH`r&g}5~Bb;jje9b7~_R-?89Svn2BIJ*Av@nP8zK^W?ev;X&R{qoqU zuRXT)+1*dv9jIkMsU80O;L!&k=$hQV@o#hWw_B~tHF$pDmhgix*7c)&H z>wEQv7lME0brC>0Z#2BlV($xoIs9@fnpgBST4XW9r|m>rXn2aEatr1x#3IQbTZ^J2&~hySDiqJ;xhz(QXa_!uTTy?r;wrHS zSgzy?jan^_K|6A(9a2ptgvvxn@3GD!K(zvpG7bR>7DXK;Ntm${S;KG?nP`8b|C|U@BNJ za&TNHWKk-FDC@k}P|}IK>y%UFifX_n*P$WzOBQ>VsYAI3G?}C&fDF}0PuF0^(J5^s zkd9p@i%G;hL|MOjk?Ng(o9b$aF#!ZqT4NBpVfWy$scM{&<1;fkq6;S-Lk=PXYH*z$ zj~8tPe$Z$#6~p<|qr<&^yOP7a0aOBTJ7glJ6%mb*+JffO5QOqtmOTlhEQ_wvWUN;T zIGQJLwpyu~S!fhg#R5ndf&aM;gQFd(CAC=AUfVKuZC;;AXMh`;j8UlcvbutOf}KjF zNTKfYJdq?axFC?330L`Z3GnMh;YS{tRfQQZWWq$yxXyweH6GcpK7wHX$xBU?13aC`aDl); z!802QJ9-N%#BRwdtYa&L~2#lg4XC#@FPm z>5DfGTx}GyIWX7_U65D3@AlGX)(xntvO?_cSYM&|{m(x++R>#(F$^e^V%OpbL@{Ht zROump8)|Bjs5tz>mI(xuz5Cyw+^%Me02ro)ck~(Gd}K=|W!$~z3V`%$)9C77P43#X zaZKoh6YiO^sWjPQ86e6q?-;le`CGe&C&tEqaq!4%wOYGx807~?66<@T-TQVtG?G=o zO1sC{BEOW}@X+y-uS`vyC{-M!6^P!yC=L%-@w|5Z9SPU 
zjr8vt$msh0rivAqL|mFLpS^J5otgQoR~pmYrKCNIw2gzqnS5uRwE8>x)~+63GniRt zTb+*npUy=eGR~M!Xb_Q*l7(iVB(%39qnf6nDO}aFx+bgf-%pPpW4gYxa%o5Tdf&lR a+CKo7=wztDq%i#e0000 Date: Sun, 22 Apr 2012 10:17:06 +0530 Subject: [PATCH 33/37] KF8 Output: Create NCX and Guide records --- src/calibre/ebooks/mobi/utils.py | 4 + src/calibre/ebooks/mobi/writer2/serializer.py | 5 +- src/calibre/ebooks/mobi/writer8/index.py | 118 ++++++++++++++++-- src/calibre/ebooks/mobi/writer8/main.py | 99 ++++++++++++++- src/calibre/ebooks/mobi/writer8/skeleton.py | 8 +- 5 files changed, 208 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index aa59ee2217..3b8ce61ba8 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -599,4 +599,8 @@ class CNCX(object): # {{{ # }}} +def is_guide_ref_start(ref): + return (ref.title.lower() == 'start' or + (ref.type and ref.type.lower() in {'start', + 'other.start', 'text'})) diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index d8d63bcff4..2dda657a93 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -12,6 +12,7 @@ import re from calibre.ebooks.oeb.base import (OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize) from calibre.ebooks.mobi.mobiml import MBP_NS +from calibre.ebooks.mobi.utils import is_guide_ref_start from collections import defaultdict from urlparse import urldefrag @@ -161,9 +162,7 @@ class Serializer(object): buf.write(b'title="') self.serialize_text(ref.title, quot=True) buf.write(b'" ') - if (ref.title.lower() == 'start' or - (ref.type and ref.type.lower() in {'start', - 'other.start', 'text'})): + if is_guide_ref_start(ref): self._start_href = ref.href self.serialize_href(ref.href) # Space required or won't work, I kid you not diff --git a/src/calibre/ebooks/mobi/writer8/index.py b/src/calibre/ebooks/mobi/writer8/index.py 
index 153e140b06..1cf9f02d4b 100644 --- a/src/calibre/ebooks/mobi/writer8/index.py +++ b/src/calibre/ebooks/mobi/writer8/index.py @@ -15,9 +15,10 @@ from io import BytesIO from calibre.ebooks.mobi.utils import CNCX, encint, align_block from calibre.ebooks.mobi.writer8.header import Header -TagMeta = namedtuple('TagMeta', +TagMeta_ = namedtuple('TagMeta', 'name number values_per_entry bitmask end_flag') -EndTagTable = TagMeta('eof', 0, 0, 0, 1) +TagMeta = lambda x:TagMeta_(*x) +EndTagTable = TagMeta(('eof', 0, 0, 0, 1)) # map of mask to number of shifts needed, works with 1 bit and two-bit wide masks # could also be extended to 4 bit wide ones as well @@ -118,7 +119,10 @@ class Index(object): # {{{ cbs.append(ans) ans = 0 continue - nvals = len(tags.get(name, ())) + try: + nvals = len(tags.get(name, ())) + except TypeError: + nvals = 1 nentries = nvals // vpe shifts = mask_to_bit_shifts[mask] ans |= mask & (nentries << shifts) @@ -132,36 +136,51 @@ class Index(object): # {{{ self.entries) rendered_entries = [] - offset = 0 index, idxt, buf = BytesIO(), BytesIO(), BytesIO() IndexEntry = namedtuple('IndexEntry', 'offset length raw') + last_lead_text = b'' + too_large = ValueError('Index has too many entries, calibre does not' + ' support generating multiple index records at this' + ' time.') + for i, x in enumerate(self.entries): control_bytes = self.control_bytes[i] leading_text, tags = x - buf.truncate(0) + buf.seek(0), buf.truncate(0) + leading_text = (leading_text.encode('utf-8') if + isinstance(leading_text, unicode) else leading_text) raw = bytearray(leading_text) raw.insert(0, len(leading_text)) buf.write(bytes(raw)) - buf.write(control_bytes) + buf.write(bytes(bytearray(control_bytes))) for tag in self.tag_types: values = tags.get(tag.name, None) + if values is None: continue + try: + len(values) + except TypeError: + values = [values] if values: for val in values: - buf.write(encint(val)) + try: + buf.write(encint(val)) + except ValueError: + raise 
ValueError('Invalid values for %r: %r'%( + tag, values)) raw = buf.getvalue() + offset = index.tell() + if offset + self.HEADER_LENGTH >= 0x10000: + raise too_large rendered_entries.append(IndexEntry(offset, len(raw), raw)) idxt.write(pack(b'>H', self.HEADER_LENGTH+offset)) - offset += len(raw) index.write(raw) + last_lead_text = leading_text index_block = align_block(index.getvalue()) idxt_block = align_block(b'IDXT' + idxt.getvalue()) body = index_block + idxt_block if len(body) + self.HEADER_LENGTH >= 0x10000: - raise ValueError('Index has too many entries, calibre does not' - ' support generating multiple index records at this' - ' time.') - + raise too_large header = b'INDX' buf.truncate(0) buf.write(pack(b'>I', self.HEADER_LENGTH)) @@ -185,10 +204,15 @@ class Index(object): # {{{ tagx = self.generate_tagx() idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) + b'\0') + # Last index + idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text + idx += pack(b'>H', len(rendered_entries)) + header = { 'num_of_entries': len(rendered_entries), 'num_of_cncx': len(self.cncx), 'tagx':tagx, + 'last_index':align_block(idx), 'idxt':idxt } header = IndexHeader()(**header) @@ -235,6 +259,74 @@ class ChunkIndex(Index): 'file_number':c.file_number, 'sequence_number':c.sequence_number, 'geometry':(c.start_pos, c.length), - }) for s in chunk_table + }) for c in chunk_table ] +class GuideIndex(Index): + + tag_types = tuple(map(TagMeta, ( + ('title', 1, 1, 1, 0), + ('pos_fid', 6, 2, 2, 0), + EndTagTable + ))) + + def __init__(self, guide_table): + self.cncx = CNCX(c.title for c in guide_table) + + self.entries = [ + (r.type, { + + 'title':self.cncx[r.title], + 'pos_fid':r.pos_fid, + }) for r in guide_table + ] + + +class NCXIndex(Index): + + control_byte_count = 2 + tag_types = tuple(map(TagMeta, ( + ('offset', 1, 1, 1, 0), + ('length', 2, 1, 2, 0), + ('label', 3, 1, 4, 0), + ('depth', 4, 1, 8, 0), + ('parent', 21, 1, 16, 0), + ('first_child', 22, 1, 32, 0), 
+ ('last_child', 23, 1, 64, 0), + ('pos_fid', 6, 2, 128, 0), + EndTagTable, + ('image', 69, 1, 1, 0), + ('description', 70, 1, 2, 0), + ('author', 71, 1, 4, 0), + ('caption', 72, 1, 8, 0), + ('attribution', 73, 1, 16, 0), + EndTagTable + ))) + + def __init__(self, toc_table): + strings = [] + for entry in toc_table: + strings.append(entry['label']) + aut = entry.get('author', None) + if aut: + strings.append(aut) + desc = entry.get('description', None) + if desc: + strings.append(desc) + self.cncx = CNCX(strings) + + def to_entry(x): + ans = {} + for f in ('offset', 'length', 'depth', 'pos_fid', 'parent', + 'first_child', 'last_child'): + if f in x: + ans[f] = x[f] + for f in ('label', 'description', 'author'): + if f in x: + ans[f] = self.cncx[x[f]] + return ('%02x'%x['index'], ans) + + self.entries = list(map(to_entry, toc_table)) + + + diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 955fbab460..76492cb9a9 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -17,12 +17,15 @@ import cssutils from lxml import etree from calibre import isbytestring, force_unicode -from calibre.ebooks.mobi.utils import create_text_record, to_base +from calibre.ebooks.mobi.utils import (create_text_record, to_base, + is_guide_ref_start) from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, extract, XHTML, urlnormalize) from calibre.ebooks.oeb.parse_utils import barename from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href +from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex, + ChunkIndex, GuideIndex) XML_DOCS = OEB_DOCS | {SVG_MIME} @@ -38,11 +41,11 @@ class KF8Writer(object): self.log.info('Creating KF8 output') self.used_images = set() self.resources = resources - self.dup_data() self.flows = [None] # First flow item is reserved for the text self.records = [] 
- self.fdst_table = [] + self.log('\tGenerating KF8 markup...') + self.dup_data() self.replace_resource_links() self.extract_css_into_flows() self.extract_svg_into_flows() @@ -52,7 +55,10 @@ class KF8Writer(object): # Dump the cloned data as it is no longer needed del self._data_cache self.create_text_records() - self.create_fdst_table() + self.log('\tCreating indices...') + self.create_fdst_records() + self.create_indices() + self.create_guide() def dup_data(self): ''' Duplicate data so that any changes we make to markup/CSS only @@ -231,7 +237,7 @@ class KF8Writer(object): records_size = 0 if self.compress: - self.oeb.logger.info(' Compressing markup content...') + self.oeb.logger.info('\tCompressing markup...') while text.tell() < self.text_length: data, overlap = create_text_record(text) @@ -252,9 +258,90 @@ class KF8Writer(object): self.records.append(b'\x00'*(records_size % 4)) self.first_non_text_record_idx += 1 - def create_fdst_table(self): + def create_fdst_records(self): FDST = namedtuple('Flow', 'start end') + entries = [] + self.fdst_table = [] for i, flow in enumerate(self.flows): start = 0 if i == 0 else self.fdst_table[-1].end self.fdst_table.append(FDST(start, start + len(flow))) + entries.extend(self.fdst_table[-1]) + rec = (b'FDST' + pack(b'>LL', len(self.fdst_table), 12) + + pack(b'>%dL'%len(entries), *entries)) + self.fdst_records = [rec] + + def create_indices(self): + self.skel_records = SkelIndex(self.skel_table)() + self.chunk_records = ChunkIndex(self.chunk_table)() + self.ncx_records = [] + toc = self.oeb.toc + max_depth = toc.depth() + entries = [] + is_periodical = self.opts.mobi_periodical + if toc.count() < 2: + self.log.warn('Document has no ToC, MOBI will have no NCX index') + return + + # Flatten the ToC into a depth first list + fl = toc.iter() if is_periodical else toc.iterdescendants() + for i, item in enumerate(fl): + entry = {'index':i, 'depth': max_depth - item.depth() - (0 if + is_periodical else 1), 'href':item.href, 
'label':(item.title or + _('Unknown'))} + entries.append(entry) + for child in item: + child.ncx_parent = entry + p = getattr(item, 'ncx_parent', None) + if p is not None: + entry['parent'] = p['index'] + if is_periodical: + if item.author: + entry['author'] = item.author + if item.description: + entry['description'] = item.description + + for entry in entries: + children = [e for e in entries if e.get('parent', -1) == entry['index']] + if children: + entry['first_child'] = children[0]['index'] + entry['last_child'] = children[-1]['index'] + href = entry.pop('href') + href, frag = href.partition('#')[0::2] + aid = self.id_map.get((href, frag), None) + if aid is None: + aid = self.id_map.get((href, ''), None) + if aid is None: + pos, fid = 0, 0 + else: + pos, fid = self.aid_offset_map[aid] + chunk = self.chunk_table[pos] + offset = chunk.insert_pos + fid + length = chunk.length + entry['pos_fid'] = (pos, fid) + entry['offset'] = offset + entry['length'] = length + + self.ncx_records = NCXIndex(entries)() + + def create_guide(self): + self.start_offset = None + self.guide_table = [] + self.guide_records = [] + GuideRef = namedtuple('GuideRef', 'title type pos_fid') + for ref in self.oeb.guide: + ref = self.oeb.guide[ref] + href, frag = ref.href.partition('#')[0::2] + aid = self.id_map.get((href, frag), None) + if aid is None: + aid = self.id_map.get((href, '')) + if aid is None: + continue + pos, fid = self.aid_offset_map[aid] + if is_guide_ref_start(ref): + self.start_offset = pos + self.guide_table.append(GuideRef(ref.title or + _('Unknown'), ref.type, (pos, fid))) + + if self.guide_table: + self.guide_records = GuideIndex(self.guide_table)() diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 8f0a3795db..398c684e43 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -359,14 +359,14 @@ class Chunker(object): if pos_fid is None: raise ValueError('Could 
not find chunk for aid: %r'% match.group(1)) - aid_map[match.group(1)] = (to_base(chunk.sequence_number, - base=32, min_num_digits=4), - to_href(offset-chunk.insert_pos)) + aid_map[match.group(1)] = pos_fid self.aid_offset_map = aid_map def to_placeholder(aid): - return bytes(':'.join(aid_map[aid])) + pos, fid = aid_map[aid] + pos, fid = to_base(pos, min_num_digits=4), to_href(fid) + return bytes(':'.join((pos, fid))) placeholder_map = {bytes(k):to_placeholder(v) for k, v in self.placeholder_map.iteritems()} From 3269b8c3611ec68855f60c46c675cde2a4e3dc5e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Apr 2012 10:20:47 +0530 Subject: [PATCH 34/37] ... --- src/calibre/ebooks/mobi/writer8/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 76492cb9a9..ffc806cb5a 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -337,7 +337,10 @@ class KF8Writer(object): if aid is None: continue pos, fid = self.aid_offset_map[aid] - if is_guide_ref_start(ref): + if is_guide_ref_start(ref) and fid == 0: + # If fid != 0 then we cannot represent the start position as a + # single number in the EXTH header, so we do not write it to + # EXTH self.start_offset = pos self.guide_table.append(GuideRef(ref.title or _('Unknown'), ref.type, (pos, fid))) From e5e2bfd8f359df52428d000662613bab89b1a621 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Apr 2012 10:34:28 +0530 Subject: [PATCH 35/37] ... 
--- src/calibre/ebooks/mobi/writer8/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index ffc806cb5a..e061da7df6 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -328,8 +328,7 @@ class KF8Writer(object): self.guide_table = [] self.guide_records = [] GuideRef = namedtuple('GuideRef', 'title type pos_fid') - for ref in self.oeb.guide: - ref = self.oeb.guide[ref] + for ref in self.oeb.guide.values(): href, frag = ref.href.partition('#')[0::2] aid = self.id_map.get((href, frag), None) if aid is None: From e0002deb1fba920695c88147b415d583ac79f517 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Apr 2012 12:48:29 +0530 Subject: [PATCH 36/37] Sol Haber by Onur Gungor --- recipes/sol_haber.recipe | 141 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 recipes/sol_haber.recipe diff --git a/recipes/sol_haber.recipe b/recipes/sol_haber.recipe new file mode 100644 index 0000000000..29db88019c --- /dev/null +++ b/recipes/sol_haber.recipe @@ -0,0 +1,141 @@ +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2012, Onur Gungor onurgu@gmail.com' +__docformat__ = 'restructuredtext en' + +''' +www.sol.org.tr +''' + +import datetime + +import re + +from calibre.web.feeds.recipes import BasicNewsRecipe + +class SolHaberRecipe(BasicNewsRecipe): + title = u'soL Haber' + oldest_article = 7 + max_articles_per_feed = 100 + + language = 'tr' + __author__ = 'Onur Güngör' + description = 'Hayata soL''dan bakın..' 
+ publisher = 'soL Haber' + tags = 'news, haberler, siyaset, türkiye, turkey, politics' + + + conversion_options = { + 'comment' : description + , 'tags' : tags + , 'publisher' : publisher + , 'language' : language + } + + category_dict = { 'sonuncu-kavga':'Sonuncu Kavga', + 'devlet-ve-siyaset':'Devlet ve Siyaset', + 'ekonomi':'Ekonomi', + 'enternasyonal-gundem':'Enternasyonel Gündem', + 'kent-gundemleri':'Kent Gündemleri', + 'kultur-sanat':'Kültür Sanat', + 'dunyadan':'Dünyadan', + 'serbest-kursu':'Serbest Kürsü', + 'medya':'Medya', + 'liseliler':'Liseliler', + 'yazarlar':'Köşe Yazıları'} + + end_date = datetime.date.today().isoformat() + start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat() + + + section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)], + ['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)], + ['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)], + ['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]] + + + # Disable stylesheets from site. 
+ no_stylesheets = True + + cover_margins = (20, 20, '#ffffff') + + storybody_reg_exp = '^\s*(haber|kose)\s*$' + + comments_reg_exp = '^\s*makale-elestiri\s*$' + + remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})] + + keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})] + + def get_masthead_title(self): + return self.title + "(" + self.end_date + ")" + + def parse_index(self): + + result = [] + articles_dict = dict() + + author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$') + category_regexp = re.compile('^http://.*?/(.+?)/.*$') + + for section_tuple in self.section_tuples: + + section_title = section_tuple[0] + section_index_url = section_tuple[1] + + self.log('Bölüm:', section_title, 'URL:', section_index_url) + + soup = self.index_to_soup(section_index_url) + + logo = soup.find('div', id='logo').find('img', src=True) + if logo is not None: + self.cover_url = logo['src'] + if self.cover_url.startswith('/'): + self.cover_url = 'http://haber.sol.org.tr'+self.cover_url + + view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'}) + if view_content == None: + break + rows = view_content.find('tbody').findAll('tr') + + self.log('Row sayısı', len(rows)) + for row in rows: + cells = row.findAll('td') + + a = cells[1].find('a', href=True) + + url = a['href'] + title = self.tag_to_string(a) + + if url.startswith('/'): + url = 'http://haber.sol.org.tr'+url + + category = section_title + category_match_result = category_regexp.match(url) + if category_match_result: + category = category_match_result.group(1) + + date = self.tag_to_string(cells[2]) + + author = 'soL haber' + + author_match_result = author_regexp.match(url) + if author_match_result: + author = author_match_result.group(1) + + self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author) + article = {'title':title, 'url':url, 'description':None, 'date':date, 
'author':author} + if category in articles_dict: + articles_dict[category].append(article) + else: + articles_dict[category] = [article] + + for category in articles_dict.keys(): + if category in self.category_dict: + result.append((self.category_dict[category], articles_dict[category])) + else: + result.append((category, articles_dict[category])) + + return result From fe1e29082003058efbcdaf4f8610021bc3b393f1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Apr 2012 15:52:12 +0530 Subject: [PATCH 37/37] Bash completion for ebook-viewer should complete all file types for which calibre has an input plugin --- src/calibre/linux.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/linux.py b/src/calibre/linux.py index 64bc9a5a0b..e3bfe04e75 100644 --- a/src/calibre/linux.py +++ b/src/calibre/linux.py @@ -228,8 +228,8 @@ class PostInstall: from calibre.utils.smtp import option_parser as smtp_op from calibre.library.server.main import option_parser as serv_op from calibre.ebooks.epub.fix.main import option_parser as fix_op - any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip', - 'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt', 'lrf', 'snb'] + from calibre.ebooks import BOOK_EXTENSIONS + input_formats = sorted(all_input_formats()) bc = os.path.join(os.path.dirname(self.opts.staging_sharedir), 'bash-completion') if os.path.exists(bc): @@ -249,11 +249,11 @@ class PostInstall: self.info('Installing bash completion to', f) with open(f, 'wb') as f: f.write('# calibre Bash Shell Completion\n') - f.write(opts_and_exts('calibre', guiop, any_formats)) + f.write(opts_and_exts('calibre', guiop, BOOK_EXTENSIONS)) f.write(opts_and_exts('lrf2lrs', lrf2lrsop, ['lrf'])) f.write(opts_and_exts('ebook-meta', metaop, list(meta_filetypes()))) f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf'])) - f.write(opts_and_exts('ebook-viewer', viewer_op, any_formats)) + f.write(opts_and_exts('ebook-viewer', viewer_op,
input_formats)) f.write(opts_and_words('fetch-ebook-metadata', fem_op, [])) f.write(opts_and_words('calibre-smtp', smtp_op, [])) f.write(opts_and_words('calibre-server', serv_op, []))