From b8018f59f29eaf7c5036c704f6c4baa92fda9165 Mon Sep 17 00:00:00 2001 From: GRiker Date: Thu, 27 Oct 2011 13:53:43 -0600 Subject: [PATCH 01/11] WIP - conform epub metadata with calibre --- src/calibre/devices/apple/driver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py index 645b2fb606..289d5079f4 100644 --- a/src/calibre/devices/apple/driver.py +++ b/src/calibre/devices/apple/driver.py @@ -1305,6 +1305,8 @@ class ITUNES(DriverBase): if DEBUG: self.log.info(" ITUNES._add_new_copy()") + self._update_epub_metadata(fpath, metadata) + db_added = None lb_added = None @@ -2663,6 +2665,7 @@ class ITUNES(DriverBase): metadata.timestamp = now() if DEBUG: self.log.info(" add timestamp: %s" % metadata.timestamp) + else: metadata.timestamp = now() if DEBUG: @@ -2699,7 +2702,7 @@ class ITUNES(DriverBase): if iswindows and metadata.series: metadata.tags = None - set_metadata(zfo, metadata, update_timestamp=True) + set_metadata(zfo, metadata, apply_null=True, update_timestamp=True) def _update_device(self, msg='', wait=True): ''' From 5c7bf560c2ec37867f7a1fd5e128ca5dafbb3a9a Mon Sep 17 00:00:00 2001 From: GRiker Date: Fri, 18 Nov 2011 05:18:15 -0700 Subject: [PATCH 02/11] Added iPhone 4S device fingerprint --- src/calibre/devices/apple/driver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py index 289d5079f4..9a242b245d 100644 --- a/src/calibre/devices/apple/driver.py +++ b/src/calibre/devices/apple/driver.py @@ -217,10 +217,11 @@ class ITUNES(DriverBase): # 0x1297 iPhone 4 # 0x129a iPad # 0x129f iPad2 (WiFi) + # 0x12a0 iPhone 4S # 0x12a2 iPad2 (GSM) # 0x12a3 iPad2 (CDMA) VENDOR_ID = [0x05ac] - PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a,0x129f,0x12a2,0x12a3] + PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a,0x129f,0x12a0,0x12a2,0x12a3] BCD = [0x01] # Plugboard ID From 055b17c68a29ffaf8b4b9985de31429969d0c836 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Nov 2011 14:52:57 +0530 Subject: [PATCH 03/11] Fix handling of comments in the jacket template --- resources/jacket/template.xhtml | 9 ++++++--- src/calibre/ebooks/oeb/transforms/jacket.py | 8 +++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/resources/jacket/template.xhtml b/resources/jacket/template.xhtml index 17d0493a82..671ca5a04d 100644 --- a/resources/jacket/template.xhtml +++ b/resources/jacket/template.xhtml @@ -38,9 +38,12 @@
{comments}
diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index 987fe0ce86..ede34ef17c 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -16,6 +16,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML from calibre.library.comments import comments_to_html from calibre.utils.date import is_date_undefined +from calibre.ebooks.chardet import strip_encoding_declarations JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]' @@ -180,10 +181,14 @@ def render_jacket(mi, output_profile, except: pass + args['_genre_label'] = args.get('_genre_label', '') + args['_genre'] = args.get('_genre', '') + generated_html = P('jacket/template.xhtml', data=True).decode('utf-8').format(**args) # Post-process the generated html to strip out empty header items + soup = BeautifulSoup(generated_html) if not series: series_tag = soup.find(attrs={'class':'cbj_series'}) @@ -206,7 +211,8 @@ def render_jacket(mi, output_profile, if hr_tag is not None: hr_tag.extract() - return soup.renderContents(None) + return strip_encoding_declarations( + soup.renderContents('utf-8').decode('utf-8')) from calibre.ebooks.oeb.base import RECOVER_PARSER From 133338565f0bc61d264007d9e9e2859e914701ab Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Nov 2011 14:53:37 +0530 Subject: [PATCH 04/11] ... --- src/calibre/ebooks/oeb/transforms/jacket.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index ede34ef17c..429e10acf4 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -181,6 +181,7 @@ def render_jacket(mi, output_profile, except: pass + # Used in the comment describing use of custom columns in templates args['_genre_label'] = args.get('_genre_label', '') args['_genre'] = args.get('_genre', '') From abb41dcbb4269adb9481ed9f4a0b965c7e47e936 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Nov 2011 15:16:37 +0530 Subject: [PATCH 05/11] Irex driver: Put books into the top level directory instead of into /ebooks or /Books. Fixes #883616 (IREX DR Driver root directory) --- src/calibre/devices/irexdr/driver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/irexdr/driver.py b/src/calibre/devices/irexdr/driver.py index 32e98f9353..bdc77b3193 100644 --- a/src/calibre/devices/irexdr/driver.py +++ b/src/calibre/devices/irexdr/driver.py @@ -33,7 +33,7 @@ class IREXDR1000(USBMS): MAIN_MEMORY_VOLUME_LABEL = 'IRex Digital Reader 1000 Main Memory' - EBOOK_DIR_MAIN = 'ebooks' + EBOOK_DIR_MAIN = '' DELETE_EXTS = ['.mbp'] SUPPORTS_SUB_DIRS = True @@ -44,7 +44,7 @@ class IREXDR800(IREXDR1000): WINDOWS_MAIN_MEM = 'DR800' FORMATS = ['epub', 'pdb', 'html', 'pdf', 'txt'] - EBOOK_DIR_MAIN = 'Books' + EBOOK_DIR_MAIN = '' DELETE_EXTS = [] SUPPORTS_SUB_DIRS = True From b9765b8f529de6be40dc09d3b2e8a8c38f58ab8e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Nov 2011 15:54:28 +0530 Subject: [PATCH 06/11] ... --- src/calibre/ebooks/oeb/transforms/jacket.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index 429e10acf4..79524c19eb 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -176,14 +176,14 @@ def render_jacket(mi, output_profile, try: display_name, val = mi.format_field_extended(key)[:2] key = key.replace('#', '_') - args[key] = val - args[key+'_label'] = display_name + args[key] = escape(val) + args[key+'_label'] = escape(display_name) except: pass # Used in the comment describing use of custom columns in templates - args['_genre_label'] = args.get('_genre_label', '') - args['_genre'] = args.get('_genre', '') + args['_genre_label'] = args.get('_genre_label', '{_genre_label}') + args['_genre'] = args.get('_genre', '{_genre}') generated_html = P('jacket/template.xhtml', data=True).decode('utf-8').format(**args) From 5d6706b5fe8eee56e5a7a97adee9fd112e8c378f Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 19 Nov 2011 09:52:21 -0500 Subject: [PATCH 07/11] Fix Cyrillic encoding issues, patch from Roman. --- src/calibre/ebooks/metadata/book/base.py | 3 +- src/calibre/ebooks/metadata/sources/ozon.py | 45 ++++++++++++------- .../gui2/store/stores/ozon_ru_plugin.py | 5 ++- src/calibre/translations/ru.po | 10 ++--- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py index 53d336a23d..286bcee9d0 100644 --- a/src/calibre/ebooks/metadata/book/base.py +++ b/src/calibre/ebooks/metadata/book/base.py @@ -710,7 +710,8 @@ class Metadata(object): fmt('Title sort', self.title_sort) if self.authors: fmt('Author(s)', authors_to_string(self.authors) + \ - ((' [' + self.author_sort + ']') if self.author_sort else '')) + ((' [' + self.author_sort + ']') + if self.author_sort and self.author_sort != _('Unknown') else '')) if self.publisher: fmt('Publisher', self.publisher) if getattr(self, 'book_producer', False): diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index fa9951c40c..ecec13662f 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -11,7 +11,7 @@ import datetime from urllib import quote_plus from Queue import Queue, Empty from lxml import etree, html -from calibre import as_unicode +from calibre import prints, as_unicode from calibre.ebooks.chardet import xml_to_unicode @@ -54,7 +54,8 @@ class Ozon(Source): def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ # div_book -> search only books, ebooks and audio books search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' - + + # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) # TODO: format isbn! qItems = set([isbn, title]) @@ -64,7 +65,7 @@ class Ozon(Source): qItems.discard('') qItems = map(_quoteString, qItems) - q = ' '.join(qItems).strip() + q = u' '.join(qItems).strip() log.info(u'search string: ' + q) if isinstance(q, unicode): @@ -78,13 +79,13 @@ class Ozon(Source): return search_url # }}} - def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ - identifiers={}, timeout=30): + def identify(self, log, result_queue, abort, title=None, authors=None, + identifiers={}, timeout=30): # {{{ if not self.is_configured(): return query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: - err = 'Insufficient metadata to construct query' + err = u'Insufficient metadata to construct query' log.error(err) return err @@ -109,7 +110,7 @@ class Ozon(Source): # }}} def get_metadata(self, log, entries, title, authors, identifiers): # {{{ - # some book titles have extra charactes like this + # some book titles have extra characters like this # TODO: make a twick reRemoveFromTitle = None #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') @@ -160,7 +161,7 @@ class Ozon(Source): mi.source_relevance = i if ensure_metadata_match(mi): metadata.append(mi) - # log.debug(u'added metadata %s %s. '%(mi.title, mi.authors)) + #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) else: log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors)) return metadata @@ -285,12 +286,12 @@ class Ozon(Source): url = self.get_book_url(metadata.get_identifiers())[2] raw = self.browser.open_novisit(url, timeout=timeout).read() - doc = html.fromstring(raw) + doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)' xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")' - # series + # series Серия/Серии xpt = xpt_prod_det_at % u'Сери' # % u'Серия:' series = doc.xpath(xpt) @@ -300,7 +301,7 @@ class Ozon(Source): xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))' isbn_str = doc.xpath(xpt) if isbn_str: - all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] + all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)] if all_isbns: metadata.all_isbns = all_isbns metadata.isbn = all_isbns[0] @@ -333,10 +334,10 @@ class Ozon(Source): xpt = u'//table[@id="detail_description"]//tr/td' comment_elem = doc.xpath(xpt) if comment_elem: - comments = unicode(etree.tostring(comment_elem[0])) + comments = unicode(etree.tostring(comment_elem[0], encoding=unicode)) if comments: # cleanup root tag, TODO: remove tags like object/embeded - comments = re.sub(r'\A.*?|.*\Z', u'', comments.strip(), re.MULTILINE).strip() + comments = re.sub(ur'\A.*?|.*\Z', u'', comments.strip(), re.MULTILINE).strip() if comments and (not metadata.comments or len(comments) > len(metadata.comments)): metadata.comments = comments else: @@ -345,8 +346,16 @@ class Ozon(Source): log.debug('No book description found in HTML') # }}} -def _quoteString(str): # {{{ - return '"' + str + '"' if str and str.find(' ') != -1 else str +def _quoteString(strToQuote): # {{{ + return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote +# }}} + +def _verifyISBNIntegrity(log, isbn): # {{{ + # Online ISBN-Check http://www.isbn-check.de/ + res = check_isbn(isbn) + if not res: + log.error(u'ISBN integrity check failed for "%s"'%isbn) + return res is not None # }}} # TODO: make customizable @@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{ return res # }}} -def toPubdate(log, yearAsString): +def toPubdate(log, yearAsString): # {{{ res = None if yearAsString: try: @@ -448,7 +457,11 @@ def toPubdate(log, yearAsString): except: log.error('cannot parse to date %s'%yearAsString) return res +# }}} +def _listToUnicodePrintStr(lst): # {{{ + return u'[' + u', '.join(unicode(x) for x in lst) + u']' +# }}} if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 3934ebbbb3..5d977700c8 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): result = False with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) + raw = xml_to_unicode(f.read(), verbose=True)[0] + doc = html.fromstring(raw) # example where we are going to find formats #
@@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): #
#

.epub, .fb2.zip, .pdf

#
- xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])' + xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])' formats = doc.xpath(xpt) if formats: result = True diff --git a/src/calibre/translations/ru.po b/src/calibre/translations/ru.po index c515e6213e..89f44b0b6f 100644 --- a/src/calibre/translations/ru.po +++ b/src/calibre/translations/ru.po @@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226 msgid "Configure download metadata" -msgstr "" +msgstr "Настроить загрузку метаданных" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230 msgid "Change how calibre downloads metadata" @@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788 msgid "&Comments" -msgstr "Комментарии" +msgstr "&Комментарии" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854 msgid "Basic metadata" @@ -12603,11 +12603,11 @@ msgstr "Основные метаданные" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133 msgid "Has cover" -msgstr "Есть обложка" +msgstr "Обложка" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133 msgid "Has summary" -msgstr "" +msgstr "Аннотация" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190 msgid "" @@ -12619,7 +12619,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268 msgid "See at" -msgstr "" +msgstr "Посмотреть на" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403 msgid "calibre is downloading metadata from: " From 6ddaa374ce7e2a73037a223ce3aec5d07411a8a6 Mon Sep 17 00:00:00 2001 From: GRiker Date: Sat, 19 Nov 2011 08:59:58 -0700 Subject: [PATCH 08/11] Rewrite metadata header, removing dc:subject tags, added more error handling for Windows/iTunes artwork error, added iPhone 4S device ID (not enabled) --- src/calibre/devices/apple/driver.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py index 9a242b245d..2f7c1a9d20 100644 --- a/src/calibre/devices/apple/driver.py +++ b/src/calibre/devices/apple/driver.py @@ -221,7 +221,7 @@ class ITUNES(DriverBase): # 0x12a2 iPad2 (GSM) # 0x12a3 iPad2 (CDMA) VENDOR_ID = [0x05ac] - PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a,0x129f,0x12a0,0x12a2,0x12a3] + PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a,0x129f,0x12a2,0x12a3] BCD = [0x01] # Plugboard ID @@ -1412,10 +1412,16 @@ class ITUNES(DriverBase): tmp_cover.write(cover_data) if lb_added: - if lb_added.Artwork.Count: - lb_added.Artwork.Item(1).SetArtworkFromFile(tc) - else: - lb_added.AddArtworkFromFile(tc) + try: + if lb_added.Artwork.Count: + lb_added.Artwork.Item(1).SetArtworkFromFile(tc) + else: + lb_added.AddArtworkFromFile(tc) + except: + if DEBUG: + self.log.warning(" iTunes automation interface reported an error" + " when adding artwork to '%s' in the iTunes Library" % metadata.title) + pass if db_added: if db_added.Artwork.Count: @@ -2775,6 +2781,8 @@ class ITUNES(DriverBase): lb_added.sort_name.set(metadata_x.title_sort) if db_added: + self.log.warning(" waiting for db_added to become writeable ") + time.sleep(1.0) db_added.name.set(metadata_x.title) db_added.album.set(metadata_x.title) db_added.artist.set(authors_to_string(metadata_x.authors)) From 8f3fff04e31b3046dde8a30ec0fa3c9f6b1e49c3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Nov 2011 08:13:11 +0530 Subject: [PATCH 09/11] Have downloaded periodicals recognized when transferred via USB to the Kindle Fire --- src/calibre/ebooks/mobi/writer2/main.py | 46 +++++++++++++------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index 655aa12c9e..760b444cd3 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -302,7 +302,19 @@ class MobiWriter(object): def generate_record0(self): # MOBI header {{{ metadata = self.oeb.metadata - exth = self.build_exth() + bt = 0x002 + if self.primary_index_record_idx is not None: + if False and self.indexer.is_flat_periodical: + # Disabled as setting this to 0x102 causes the Kindle to not + # auto archive the issues + bt = 0x102 + elif self.indexer.is_periodical: + # If you change this, remember to change the cdetype in the EXTH + # header as well + bt = {'newspaper':0x101}.get(self.publication_type, 0x103) + + + exth = self.build_exth(bt) first_image_record = None if self.image_records: first_image_record = len(self.records) @@ -351,17 +363,6 @@ class MobiWriter(object): # 0x10 - 0x13 : UID # 0x14 - 0x17 : Generator version - bt = 0x002 - if self.primary_index_record_idx is not None: - if False and self.indexer.is_flat_periodical: - # Disabled as setting this to 0x102 causes the Kindle to not - # auto archive the issues - bt = 0x102 - elif self.indexer.is_periodical: - # If you change this, remember to change the cdetype in the EXTH - # header as well - bt = {'newspaper':0x101}.get(self.publication_type, 0x103) - record0.write(pack(b'>IIIII', 0xe8, bt, 65001, uid, 6)) @@ -479,7 +480,7 @@ class MobiWriter(object): self.records[0] = align_block(record0) # }}} - def build_exth(self): # EXTH Header {{{ + def build_exth(self, mobi_doctype): # EXTH Header {{{ oeb = self.oeb exth = StringIO() nrecs = 0 @@ -535,16 +536,17 @@ class MobiWriter(object): nrecs += 1 # Write cdetype - if not self.is_periodical and not self.opts.share_not_sync: - exth.write(pack(b'>II', 501, 12)) - exth.write(b'EBOK') - nrecs += 1 + if not self.is_periodical: + if not self.opts.share_not_sync: + exth.write(pack(b'>II', 501, 12)) + exth.write(b'EBOK') + nrecs += 1 else: - # Should be b'NWPR' for doc type of 0x101 and b'MAGZ' for doctype - # of 0x103 but the old writer didn't write them, and I dont know - # what it should be for type 0x102 (b'BLOG'?) so write nothing - # instead - pass + ids = {0x101:b'NWPR', 0x103:b'MAGZ'}.get(mobi_doctype, None) + if ids: + exth.write(pack(b'>II', 501, 12)) + exth.write(ids) + nrecs += 1 # Add a publication date entry if oeb.metadata['date']: From c73b2569a152cffb2410ea40d5ce2db9fd341a3b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Nov 2011 08:30:29 +0530 Subject: [PATCH 10/11] Updated Expansion (spanish) --- recipes/expansion_spanish.recipe | 160 +++++++++++++++----- src/calibre/ebooks/metadata/sources/ozon.py | 22 +-- 2 files changed, 130 insertions(+), 52 deletions(-) diff --git a/recipes/expansion_spanish.recipe b/recipes/expansion_spanish.recipe index f2229e90e6..07a0c99761 100644 --- a/recipes/expansion_spanish.recipe +++ b/recipes/expansion_spanish.recipe @@ -1,35 +1,43 @@ #!/usr/bin/env python -__license__ = 'GPL v3' -__author__ = 'Gerardo Diez' -__copyright__ = 'Gerardo Diez' -description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)' -__docformat__ = 'restructuredtext en' +__license__ = 'GPL v3' +__copyright__ = '5, January 2011 Gerardo Diez & desUBIKado' +__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez' +__version__ = 'v1.01' +__date__ = '13, November 2011' ''' -expansion.es +[url]http://www.expansion.com/[/url] ''' + +import time +import re from calibre.web.feeds.recipes import BasicNewsRecipe -class Publico(BasicNewsRecipe): - title =u'Expansion.com' - __author__ ='Gerardo Diez' - publisher =u'Unidad Editorial Información Económica, S.L.' - category ='finances, catalunya' - oldest_article =1 + +class expansion_spanish(BasicNewsRecipe): + __author__ ='Gerardo Diez & desUBIKado' + description ='Financial news from Spain' + title =u'Expansion' + publisher =u'Unidad Editorial Internet, S.L.' + category ='news, finances, Spain' + oldest_article = 2 + simultaneous_downloads = 10 max_articles_per_feed =100 - simultaneous_downloads =10 - cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png' - timefmt ='[%A, %d %B, %Y]' - encoding ='latin' + timefmt = '[%a, %d %b, %Y]' + encoding ='iso-8859-15' language ='es' - remove_javascript =True - no_stylesheets =True + use_embedded_content = False + remove_javascript = True + no_stylesheets = True + remove_empty_feeds = True + keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']}) + remove_tags =[ - dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}), - dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}), + dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}), + dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}), dict(name='span', attrs={'class':['comentarios']}), dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}), - dict(name='div', attrs={'id':['comentarios_lectores_listado']}) + dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']}) ] feeds =[ (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'), @@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe): (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'), (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'), (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'), - (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'), (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'), - (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'), + (u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'), (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'), - (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'), + (u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'), (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'), - (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'), (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'), (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'), - (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'), - (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'), + (u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'), + (u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'), (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'), - (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'), - (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'), - (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'), + (u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'), + (u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'), + (u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'), (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'), (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'), (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'), - - (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'), - (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'), + (u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'), + (u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'), (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'), - - (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'), + (u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'), (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'), (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'), - - (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'), + (u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'), (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'), - (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'), + (u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'), (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'), - (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'), - (u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'), - (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml') + (u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'), + (u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml') ] + # Obtener la imagen de portada + + def get_cover_url(self): + cover = None + st = time.localtime() + year = str(st.tm_year) + month = "%.2d" % st.tm_mon + day = "%.2d" % st.tm_mday + #[url]http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg[/url] + cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + self.log("\nPortada no disponible") + cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif' + return cover + + + + # Para que no salte la publicidad al recuperar la noticia, y que siempre se recupere + # la página web, mando la variable "t" con la hora "linux" o "epoch" actual + # haciendole creer al sitio web que justo se acaba de ver la publicidad + + def print_version(self, url): + st = time.time() + segundos = str(int(st)) + parametros = '.html?t=' + segundos + return url.replace('.html', parametros) + + + + _processed_links = [] + + def get_article_url(self, article): + + # Para obtener la url original del artículo a partir de la de "feedsportal" + + link = article.get('link', None) + if link is None: + return article + if link.split('/')[-1]=="story01.htm": + link=link.split('/')[-2] + a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A'] + b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0'] + for i in range(0,len(a)): + link=link.replace(a[i],b[i]) + link="http://"+link + + # Eliminar artículos duplicados en otros feeds + + if not (link in self._processed_links): + self._processed_links.append(link) + else: + link = None + + return link + + + + # Un poco de css para mejorar la presentación de las noticias + + extra_css = ''' + .entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;} + .fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;} + ''' + + + + # Para presentar la imagen de los videos incrustados + + preprocess_regexps = [ + (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '-->'), + (re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '