From 1882c36a719dececfc2cf5425e108d9543477fb2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 12:35:55 +0530 Subject: [PATCH 01/13] calibre-smtp: Fix filter mode operation and handle multiple to addresses separated by commas --- src/calibre/utils/smtp.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/calibre/utils/smtp.py b/src/calibre/utils/smtp.py index e496975507..e15afbd56d 100644 --- a/src/calibre/utils/smtp.py +++ b/src/calibre/utils/smtp.py @@ -211,23 +211,25 @@ def main(args=sys.argv): msg = compose_mail(args[1], args[2], args[3], subject=opts.subject, attachment=opts.attachment) from_, to = args[1:3] - efrom, eto = map(extract_email_address, (from_, to)) - eto = [eto] + eto = [extract_email_address(x.strip()) for x in to.split(',')] + efrom = extract_email_address(from_) else: msg = sys.stdin.read() - from email.parser import Parser + from email import message_from_string from email.utils import getaddresses - eml = Parser.parsestr(msg, headersonly=True) + eml = message_from_string(msg) tos = eml.get_all('to', []) - ccs = eml.get_all('cc', []) - eto = getaddresses(tos + ccs) + ccs = eml.get_all('cc', []) + eml.get_all('bcc', []) + all_tos = [] + for x in tos + ccs: + all_tos.extend(y.strip() for y in x.split(',')) + eto = list(map(extract_email_address, all_tos)) if not eto: raise ValueError('Email from STDIN does not specify any recipients') efrom = getaddresses(eml.get_all('from', [])) if not efrom: raise ValueError('Email from STDIN does not specify a sender') - efrom = efrom[0] - + efrom = efrom[0][1] outbox = None if opts.outbox is not None: From 343c0a8d1b06dcd0039e49d7b51075e6068b4b95 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 14:28:02 +0530 Subject: [PATCH 02/13] Kobo driver: Delay load a few dependencies to lower startup time of calibre worker processes --- src/calibre/devices/kobo/driver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index f4128d2fdc..9805510c9f 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -12,19 +12,17 @@ Originally developed by Timothy Legge . Extended to support Touch firmware 2.0.0 and later and newer devices by David Forrester ''' -import os, time, calendar +import os, time from contextlib import closing from calibre.devices.usbms.books import BookList from calibre.devices.usbms.books import CollectionsBookList from calibre.devices.kobo.books import KTCollectionsBookList from calibre.devices.kobo.books import Book from calibre.devices.kobo.books import ImageWrapper -from calibre.devices.kobo.bookmark import Bookmark from calibre.devices.mime import mime_type_ext from calibre.devices.usbms.driver import USBMS, debug_print from calibre import prints from calibre.ptempfile import PersistentTemporaryFile - from calibre.constants import DEBUG from calibre.utils.config import prefs @@ -994,6 +992,7 @@ class KOBO(USBMS): return USBMS.create_annotations_path(self, mdata) def get_annotations(self, path_map): + from calibre.devices.kobo.bookmark import Bookmark EPUB_FORMATS = [u'epub'] epub_formats = set(EPUB_FORMATS) @@ -1056,6 +1055,7 @@ class KOBO(USBMS): return bookmarked_books def generate_annotation_html(self, bookmark): + import calendar from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString # Returns
...
#last_read_location = bookmark.last_read_location From 7cd23bbeaa74d853c793a39c6d42a0446074def2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 22:27:10 +0530 Subject: [PATCH 03/13] Updated NME --- recipes/nme.recipe | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/recipes/nme.recipe b/recipes/nme.recipe index 48d7ce7503..6a62b00902 100644 --- a/recipes/nme.recipe +++ b/recipes/nme.recipe @@ -4,7 +4,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): title = u'New Musical Express Magazine' description = 'Author D.Asbury. UK Rock & Pop Mag. ' __author__ = 'Dave Asbury' - # last updated 9/6/12 + # last updated 7/10/12 remove_empty_feeds = True remove_javascript = True no_stylesheets = True @@ -14,26 +14,24 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): language = 'en_GB' def get_cover_url(self): - soup = self.index_to_soup('http://www.magazinesdirect.com/categories/mens/tv-and-music/') - cov = soup.find(attrs={'title' : 'NME magazine subscriptions'}) - cov2 = 'http://www.magazinesdirect.com'+cov['src'] - print '***cov = ',cov2,' ***' + soup = self.index_to_soup('http://www.nme.com/component/subscribe') + cov = soup.find(attrs={'id' : 'magazine_cover'}) + cov2 = str(cov['src']) + # print '**** Cov url =*', cover_url,'***' + #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***' - cover_url = str(cov2) - # print '**** Cov url =*', cover_url,'***' - #print '**** Cov url =*','http://www.magazinesdirect.com/article_images/articledir_3138/1569221/1_largelisting.jpg','***' - br = browser() - br.set_handle_redirect(False) - try: - br.open_novisit(cov2) - cover_url = str(cov2) - except: - cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg' - return cover_url + br = browser() + br.set_handle_redirect(False) + try: + br.open_novisit(cov2) + cover_url = str(cov2) + except: + cover_url = 
'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg' + return cover_url + masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg' - remove_tags = [ dict( attrs={'class':'clear_icons'}), dict( attrs={'class':'share_links'}), @@ -61,9 +59,15 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe): feeds = [ - (u'NME News', u'http://feeds2.feedburner.com/nmecom/rss/newsxml'), + (u'NME News', u'http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'), #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'), - (u'Reviews',u'http://feed43.com/4138608576351646.xml'), + (u'Reviews',u'http://feed43.com/1817687144061333.xml'), (u'Bloggs',u'http://feed43.com/3326754333186048.xml'), ] + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} + p{font-family:Arial,Helvetica,sans-serif;font-size:small;} + body{font-family:Helvetica,Arial,sans-serif;font-size:small;} + ''' From c35a81dafa40e1e35dcd4e2760c6e290cbf8561d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 22:50:04 +0530 Subject: [PATCH 04/13] News download: Add a field to allow recipe authors to tell calibre to remove duplicate articles that are present in more than one section from the download. 
--- src/calibre/web/feeds/__init__.py | 6 ++++++ src/calibre/web/feeds/news.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index 746afefaef..ee8072cda4 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -265,6 +265,12 @@ class Feed(object): if i > -1: self.articles[i:i+1] = [] + def remove_article(self, article): + try: + self.articles.remove(article) + except ValueError: + pass + class FeedCollection(list): def __init__(self, feeds): diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index f494618eaa..5502244007 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -321,6 +321,15 @@ class BasicNewsRecipe(Recipe): #: The string will be used as the disabled message recipe_disabled = None + #: Ignore duplicates of articles that are present in more than one section. + #: A duplicate article is an article that has the same title and/or URL. + #: To ignore articles with the same title, set this to: + #: ignore_duplicate_articles = {'title'} + #: To use URLs instead, set it to: + #: ignore_duplicate_articles = {'url'} + #: To match on title or URL, set it to: + #: ignore_duplicate_articles = {'title', 'url'} + ignore_duplicate_articles = None # See the built-in profiles for examples of these settings. 
@@ -1019,6 +1028,24 @@ class BasicNewsRecipe(Recipe): url = ('file:'+pt.name) if iswindows else ('file://'+pt.name) return self._fetch_article(url, dir, f, a, num_of_feeds) + def remove_duplicate_articles(self, feeds): + seen_keys = defaultdict(set) + remove = [] + for f in feeds: + for article in f: + for key in self.ignore_duplicate_articles: + val = getattr(article, key) + seen = seen_keys[key] + if val: + if val in seen: + remove.append((f, article)) + else: + seen.add(val) + + for feed, article in remove: + self.log.debug('Removing duplicate article: %s from section: %s'%( + article.title, feed.title)) + feed.remove_article(article) def build_index(self): self.report_progress(0, _('Fetching feeds...')) @@ -1033,6 +1060,9 @@ class BasicNewsRecipe(Recipe): if not feeds: raise ValueError('No articles found, aborting') + if self.ignore_duplicate_articles is not None: + self.remove_duplicate_articles(feeds) + #feeds = FeedCollection(feeds) self.report_progress(0, _('Trying to download cover...')) From ccbf0f50bd899c4023036998a738ea97b17ba8f3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 23:06:27 +0530 Subject: [PATCH 05/13] Follow the remove_empty_feeds option when using ignore_duplicate_articles --- src/calibre/web/feeds/news.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 5502244007..14834ff88c 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -167,9 +167,10 @@ class BasicNewsRecipe(Recipe): extra_css = None #: If True empty feeds are removed from the output. - #: This option has no effect if parse_index is overriden in + #: This option has no effect if parse_index is overridden in #: the sub class. It is meant only for recipes that return a list - #: of feeds using `feeds` or :meth:`get_feeds`. + #: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use + #: the ignore_duplicate_articles option. 
remove_empty_feeds = False #: List of regular expressions that determines which links to follow @@ -1047,6 +1048,10 @@ class BasicNewsRecipe(Recipe): article.title, feed.title)) feed.remove_article(article) + if self.remove_empty_feeds: + feeds = [f for f in feeds if len(f) > 0] + return feeds + def build_index(self): self.report_progress(0, _('Fetching feeds...')) try: @@ -1061,7 +1066,7 @@ class BasicNewsRecipe(Recipe): raise ValueError('No articles found, aborting') if self.ignore_duplicate_articles is not None: - self.remove_duplicate_articles(feeds) + feeds = self.remove_duplicate_articles(feeds) #feeds = FeedCollection(feeds) From 10d371230de03952d706e6f4f358804809e1ebd7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 23:09:58 +0530 Subject: [PATCH 06/13] Update Cosmo UK and add PVP Online by Krittika Goyal --- recipes/cosmopolitan_uk.recipe | 1 + recipes/pvp_online.recipe | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 recipes/pvp_online.recipe diff --git a/recipes/cosmopolitan_uk.recipe b/recipes/cosmopolitan_uk.recipe index ae23be224d..078718f6f7 100644 --- a/recipes/cosmopolitan_uk.recipe +++ b/recipes/cosmopolitan_uk.recipe @@ -15,6 +15,7 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe): max_articles_per_feed = 20 remove_empty_feeds = True remove_javascript = True + ignore_duplicate_articles = {'title'} preprocess_regexps = [ (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match: '')] diff --git a/recipes/pvp_online.recipe b/recipes/pvp_online.recipe new file mode 100644 index 0000000000..5b6c466062 --- /dev/null +++ b/recipes/pvp_online.recipe @@ -0,0 +1,18 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1344926684(BasicNewsRecipe): + title = u'PVP online' + __author__ = 'Krittika Goyal' + oldest_article = 7 + max_articles_per_feed = 100 + #auto_cleanup = True + no_stylesheets = True + use_embedded_content = False + language = 'en' + remove_javascript = True + 
+ keep_only_tags = [dict(name='div', attrs={'class':'body'})] + remove_tags = [dict(name='div', attrs={'class':'prevBg'}),dict(name='div', attrs={'class':'nextBg'}),dict(name='div', attrs={'class':'postMeta'})] + + feeds = [(u'Comics', u'http://pvponline.com/feed'), ] + From d02bc170756e2f1ec9cd3643f960c5e62aec8651 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 23:12:28 +0530 Subject: [PATCH 07/13] Update Countryfile --- recipes/countryfile.recipe | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/recipes/countryfile.recipe b/recipes/countryfile.recipe index 71977048c7..4f2e8cd95f 100644 --- a/recipes/countryfile.recipe +++ b/recipes/countryfile.recipe @@ -1,11 +1,13 @@ from calibre import browser from calibre.web.feeds.news import BasicNewsRecipe +import re + class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'Countryfile.com' #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg' __author__ = 'Dave Asbury' description = 'The official website of Countryfile Magazine' - # last updated 9/9//12 + # last updated 7/10/12 language = 'en_GB' oldest_article = 30 max_articles_per_feed = 25 @@ -13,12 +15,14 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): no_stylesheets = True auto_cleanup = True #articles_are_obfuscated = True + ignore_duplicate_articles = {'title'} def get_cover_url(self): soup = self.index_to_soup('http://www.countryfile.com/') - cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'}) + + cov = soup.find(attrs={'width' : '160', 'class' : re.compile('imagecache imagecache-160px_wide')}) print '******** ',cov,' ***' cov2 = str(cov) - cov2=cov2[140:223] + cov2=cov2[10:101] print '******** ',cov2,' ***' #cov2='http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/1b_0.jpg' # try to get cover - if can't get known cover @@ -40,3 +44,6 @@ class 
AdvancedUserRecipe1325006965(BasicNewsRecipe): (u'Country News', u'http://www.countryfile.com/rss/news'), (u'Countryside', u'http://www.countryfile.com/rss/countryside'), ] + + + From 66bd718702d49a162d263ead3db2f8e1eafec9e8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 23:13:57 +0530 Subject: [PATCH 08/13] Update FHM UK --- recipes/fhm_uk.recipe | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe index 84455ddd3c..ffcb411807 100644 --- a/recipes/fhm_uk.recipe +++ b/recipes/fhm_uk.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1325006965(BasicNewsRecipe): title = u'FHM UK' description = 'Good News for Men.' @@ -7,14 +8,15 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg' masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif' __author__ = 'Dave Asbury' - # last updated 1/7/12 + # last updated 7/10/12 language = 'en_GB' - oldest_article = 28 - max_articles_per_feed = 8 + oldest_article = 31 + max_articles_per_feed = 15 remove_empty_feeds = True no_stylesheets = True #auto_cleanup = True # articles_are_obfuscated = True + keep_only_tags = [ dict(name='h1'), dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}), @@ -28,14 +30,12 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe): ] feeds = [ - (u'Homepage 1',u'http://feed43.com/6655867614547036.xml'), - (u'Homepage 2',u'http://feed43.com/4167731873103110.xml'), - (u'Homepage 3',u'http://feed43.com/7667138788771570.xml'), - (u'Homepage 4',u'http://feed43.com/6550421522527341.xml'), - (u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'), - (u'Gaming',u'http://feed43.com/6537162612465672.xml'), - (u'Girls',u'http://feed43.com/4574262733341068.xml'),# edit link 
http://feed43.com/feed.html?name=4574262733341068 - ] + # repeatable search = {|}{%}{|}

{*}

+ (u'Homepage',u'http://rss.feedsportal.com/c/375/f/434908/index.rss'), + (u'Funny',u'http://rss.feedsportal.com/c/375/f/434910/index.rss'), + (u'Girls',u'http://rss.feedsportal.com/c/375/f/434913/index.rss'), +] + extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} From 71c70ad5c9ddf40c7bdb94a7b4faaa99df61827d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Oct 2012 23:30:17 +0530 Subject: [PATCH 09/13] Update shortlist --- recipes/shortlist.recipe | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/recipes/shortlist.recipe b/recipes/shortlist.recipe index 155cbd25aa..e481213691 100644 --- a/recipes/shortlist.recipe +++ b/recipes/shortlist.recipe @@ -5,13 +5,15 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe): title = u'Shortlist' description = 'Articles From Shortlist.com' # I've set oldest article to 7 days as the website updates weekly - oldest_article = 7 - max_articles_per_feed = 12 + oldest_article = 8 + max_articles_per_feed = 20 remove_empty_feeds = True remove_javascript = True no_stylesheets = True + ignore_duplicate_articles = {'title'} + __author__ = 'Dave Asbury' - # last updated 19/5/12 + # last updated 7/10/12 language = 'en_GB' def get_cover_url(self): soup = self.index_to_soup('http://www.shortlist.com') @@ -45,17 +47,16 @@ class AdvancedUserRecipe1324663493(BasicNewsRecipe): ] feeds = [ - (u'Home carousel',u'http://feed43.com/7106317222455380.xml'), - (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'), - (u'Cool Stuff',u'http://feed43.com/6253845228768456.xml'), - (u'Style',u'http://feed43.com/7217107577215678.xml'), - (u'Films',u'http://feed43.com/3101308515277265.xml'), - (u'Music',u'http://feed43.com/2416400550560162.xml'), - (u'TV',u'http://feed43.com/4781172470717123.xml'), - (u'Sport',u'http://feed43.com/5303151885853308.xml'), - (u'Gaming',u'http://feed43.com/8883764600355347.xml'), - 
(u'Women',u'http://feed43.com/2648221746514241.xml'), - (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'), + #edit http://feed43.com/feed.html?name=3156308700147005 + # repeatable pattern =

{_}{%}{*}

+ + (u'This Weeks Issue', u'http://feed43.com/5205766657404804.xml'), + (u'Home Page',u'http://feed43.com/3156308700147005.xml'), + (u'Cool Stuff',u'http://feed43.com/1557051772026706.xml'), + (u'Style',u'http://feed43.com/4168836374571502.xml'), + (u'Entertainment',u'http://feed43.com/4578504030588024.xml'), + - #(u'Articles', u'http://feed43.com/3428534448355545.xml') ] + + From 59fa97189891642f525ba389b7216f163313fda1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Oct 2012 09:58:37 +0530 Subject: [PATCH 10/13] Fix Der Spiegel --- recipes/der_spiegel.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/der_spiegel.recipe b/recipes/der_spiegel.recipe index 1e94785233..3a12378405 100644 --- a/recipes/der_spiegel.recipe +++ b/recipes/der_spiegel.recipe @@ -72,7 +72,7 @@ class DerSpiegel(BasicNewsRecipe): for article in section.findNextSiblings(['dd','dt']): if article.name == 'dt': break - link = article.find('a') + link = article.find('a', href=True) title = self.tag_to_string(link).strip() if title in self.empty_articles: continue From 4a4fb6390955d8c6c222b9de5b52965ecbc624f3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Oct 2012 10:33:31 +0530 Subject: [PATCH 11/13] Fix #1063533 (Evo 3d sdcard support data) --- src/calibre/devices/android/driver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index a2d556d583..b26072d333 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -40,6 +40,7 @@ class ANDROID(USBMS): 0xca4 : HTC_BCDS, 0xca9 : HTC_BCDS, 0xcac : HTC_BCDS, + 0xcba : HTC_BCDS, 0xccf : HTC_BCDS, 0xcd6 : HTC_BCDS, 0xce5 : HTC_BCDS, From 9e3385177777ad2ac1e575f2e95499d3cc5ce441 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Oct 2012 12:25:01 +0530 Subject: [PATCH 12/13] ... 
--- src/calibre/web/feeds/recipes/collection.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py index 6ab5764302..e5613f2690 100644 --- a/src/calibre/web/feeds/recipes/collection.py +++ b/src/calibre/web/feeds/recipes/collection.py @@ -68,7 +68,12 @@ def serialize_collection(mapping_of_recipe_classes): key=lambda key: force_unicode( getattr(mapping_of_recipe_classes[key], 'title', 'zzz'), 'utf-8')): - recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn]) + try: + recipe = serialize_recipe(urn, mapping_of_recipe_classes[urn]) + except: + import traceback + traceback.print_exc() + continue collection.append(recipe) collection.set('count', str(len(collection))) return etree.tostring(collection, encoding='utf-8', xml_declaration=True, From 9b8c6f218ef994e686a1f99ce4065f1559ba6f5d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 8 Oct 2012 13:36:39 +0530 Subject: [PATCH 13/13] ODT Input: More workarounds for the image positioning markup produced by newer versions of LibreOffice. 
Fixes #1063207 (odt to anything [alignment]) --- src/calibre/ebooks/odt/input.py | 78 ++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py index f0d2335a30..43d33b8566 100644 --- a/src/calibre/ebooks/odt/input.py +++ b/src/calibre/ebooks/odt/input.py @@ -6,15 +6,19 @@ __docformat__ = 'restructuredtext en' ''' Convert an ODT file into a Open Ebook ''' -import os +import os, logging from lxml import etree +from cssutils import CSSParser +from cssutils.css import CSSRule + from odf.odf2xhtml import ODF2XHTML from odf.opendocument import load as odLoad from odf.draw import Frame as odFrame, Image as odImage from odf.namespaces import TEXTNS as odTEXTNS from calibre import CurrentDir, walk +from calibre.ebooks.oeb.base import _css_logger class Extract(ODF2XHTML): @@ -29,14 +33,14 @@ class Extract(ODF2XHTML): def fix_markup(self, html, log): root = etree.fromstring(html) - self.epubify_markup(root, log) self.filter_css(root, log) - self.extract_css(root) + self.extract_css(root, log) + self.epubify_markup(root, log) html = etree.tostring(root, encoding='utf-8', xml_declaration=True) return html - def extract_css(self, root): + def extract_css(self, root, log): ans = [] for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'): ans.append(s.text) @@ -51,9 +55,21 @@ class Extract(ODF2XHTML): etree.SubElement(head, ns+'link', {'type':'text/css', 'rel':'stylesheet', 'href':'odfpy.css'}) - with open('odfpy.css', 'wb') as f: - f.write((u'\n\n'.join(ans)).encode('utf-8')) + css = u'\n\n'.join(ans) + parser = CSSParser(loglevel=logging.WARNING, + log=_css_logger) + self.css = parser.parseString(css, validate=False) + with open('odfpy.css', 'wb') as f: + f.write(css.encode('utf-8')) + + def get_css_for_class(self, cls): + if not cls: return None + for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE): + for sel in rule.selectorList: + q = 
sel.selectorText + if q == '.' + cls: + return rule def epubify_markup(self, root, log): from calibre.ebooks.oeb.base import XPath, XHTML @@ -84,16 +100,54 @@ class Extract(ODF2XHTML): div.attrib['style'] = style img.attrib['style'] = 'max-width: 100%; max-height: 100%' - # A div/div/img construct causes text-align:center to not work in ADE - # so set the display of the second div to inline. This should have no - # effect (apart from minor vspace issues) in a compliant HTML renderer - # but it fixes the centering of the image via a text-align:center on - # the first div in ADE + # Handle anchored images. The default markup + CSS produced by + # odf2xhtml works with WebKit but not with ADE. So we convert the + # common cases of left/right/center aligned block images to work on + # both webkit and ADE. We detect the case of setting the side margins + # to auto and map it to an appropriate text-align directive, which + # works in both WebKit and ADE. + # https://bugs.launchpad.net/bugs/1063207 + # https://bugs.launchpad.net/calibre/+bug/859343 imgpath = XPath('descendant::h:div/h:div/h:img') for img in imgpath(root): div2 = img.getparent() div1 = div2.getparent() - if len(div1) == len(div2) == 1: + if (len(div1), len(div2)) != (1, 1): continue + cls = div1.get('class', '') + first_rules = filter(None, [self.get_css_for_class(x) for x in + cls.split()]) + has_align = False + for r in first_rules: + if r.style.getProperty(u'text-align') is not None: + has_align = True + ml = mr = None + if not has_align: + aval = None + cls = div2.get(u'class', u'') + rules = filter(None, [self.get_css_for_class(x) for x in + cls.split()]) + for r in rules: + ml = r.style.getPropertyCSSValue(u'margin-left') or ml + mr = r.style.getPropertyCSSValue(u'margin-right') or mr + ml = getattr(ml, 'value', None) + mr = getattr(mr, 'value', None) + if ml == mr == u'auto': + aval = u'center' + elif ml == u'auto' and mr != u'auto': + aval = 'right' + elif ml != u'auto' and mr == u'auto': + aval = 
'left' + if aval is not None: + style = div1.attrib.get('style', '').strip() + if style and not style.endswith(';'): + style = style + ';' + style += 'text-align:%s'%aval + has_align = True + div1.attrib['style'] = style + + if has_align: + # This is needed for ADE, without it the text-align has no + # effect style = div2.attrib['style'] div2.attrib['style'] = 'display:inline;'+style