diff --git a/recipes/heise_online.recipe b/recipes/heise_online.recipe index 338b54782c..4d82570698 100644 --- a/recipes/heise_online.recipe +++ b/recipes/heise_online.recipe @@ -1,11 +1,11 @@ from calibre.web.feeds.news import BasicNewsRecipe -import re - class AdvancedUserRecipe(BasicNewsRecipe): - title = 'heise online' + title = 'Heise-online' description = 'News vom Heise-Verlag' __author__ = 'schuster' + masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif' + publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG' use_embedded_content = False language = 'de' oldest_article = 2 @@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe): remove_empty_feeds = True timeout = 5 no_stylesheets = True - encoding = 'utf-8' remove_tags_after = dict(name ='p', attrs={'class':'editor'}) - remove_tags = [{'class':'navi_top_container'}, + remove_tags = [dict(id='navi_top_container'), dict(id='navi_bottom'), dict(id='mitte_rechts'), dict(id='navigation'), @@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe): dict(id='seiten_navi'), dict(id='adbottom'), dict(id='sitemap'), - dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')), - ] + dict(name='div', attrs={'id':'sitemap'}), + dict(name='ul', attrs={'class':'erste_zeile'}), + dict(name='ul', attrs={'class':'zweite_zeile'}), + dict(name='div', attrs={'class':'navi_top_container'})] feeds = [ ('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'), - ('iX', 'http://www.heise.de/ix/news/news.rdf'), - ('Technology Review', 'http://www.heise.de/tr/news-atom.xml'), - ('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'), - ('Security', 'http://www.heise.de/security/news/news-atom.xml'), - ('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'), - ('Open Source', 'http://www.heise.de/open/news/news-atom.xml'), - ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'), + ('Auto', 'http://www.heise.de/autos/rss/news.rdf'), ('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'), - ('Autos', 'http://www.heise.de/autos/rss/news.rdf'), - ('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'), + ('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'), + ('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'), + ('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'), + ('Open ', 'http://www.heise.de/open/news/news-atom.xml'), + ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'), + ('Security ', 'http://www.heise.de/security/news/news-atom.xml'), + ('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'), + ('iX', 'http://www.heise.de/ix/news/news.rdf'), + ('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'), ('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'), ('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'), ('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'), - ('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'), - ('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf') - ] + ('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'), + ('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')] def print_version(self, url): return url + '?view=print' + + diff --git a/recipes/the_week_magazine_free.recipe b/recipes/the_week_magazine_free.recipe index 6e033eaf82..89d9b128b2 100644 --- a/recipes/the_week_magazine_free.recipe +++ b/recipes/the_week_magazine_free.recipe @@ -5,7 +5,6 @@ www.theweek.com ''' from calibre.web.feeds.news import BasicNewsRecipe -import re class TheWeek(BasicNewsRecipe): title = 'The Week Magazine' @@ -21,23 +20,7 @@ class TheWeek(BasicNewsRecipe): encoding = 'utf-8' use_embedded_content = False language = 'en' - preprocess_regexps = [(re.compile(r'

', re.DOTALL), lambda match: '')] - remove_tags_before = dict(name='h1') - remove_tags_after = dict(name='div', attrs={'class':'articleSubscribe4free'}) - remove_tags = [ - dict(name='div', attrs={'class':['floatLeft','imageCaption','slideshowImageAttribution','postDate','utilities','cartoonInfo','left','middle','col300','articleSubscribe4free',' articleFlyout','articleFlyout floatRight','fourFreeBar']}) - ,dict(name='div', attrs={'id':['cartoonThumbs','rightColumn','header','partners']}) - ,dict(name='ul', attrs={'class':['slideshowNav','hotTopicsList topicList']}) - ] - remove_attributes = ['width','height', 'style', 'font', 'color'] - extra_css = ''' - h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;} - h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;} - h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;} - p {font-family:Arial,Helvetica,sans-serif;} - ''' - filter_regexps = [r'www\.palmcoastdata\.com'] - + auto_cleanup = True feeds = [ (u'News-Opinion', u'http://theweek.com/section/index/news_opinion.rss'), (u'Business', u'http://theweek.com/section/index/business.rss'), diff --git a/resources/jacket/template.xhtml b/resources/jacket/template.xhtml index f76a126309..671ca5a04d 100644 --- a/resources/jacket/template.xhtml +++ b/resources/jacket/template.xhtml @@ -38,10 +38,12 @@
{comments}
diff --git a/src/calibre/devices/irexdr/driver.py b/src/calibre/devices/irexdr/driver.py index 32e98f9353..bdc77b3193 100644 --- a/src/calibre/devices/irexdr/driver.py +++ b/src/calibre/devices/irexdr/driver.py @@ -33,7 +33,7 @@ class IREXDR1000(USBMS): MAIN_MEMORY_VOLUME_LABEL = 'IRex Digital Reader 1000 Main Memory' - EBOOK_DIR_MAIN = 'ebooks' + EBOOK_DIR_MAIN = '' DELETE_EXTS = ['.mbp'] SUPPORTS_SUB_DIRS = True @@ -44,7 +44,7 @@ class IREXDR800(IREXDR1000): WINDOWS_MAIN_MEM = 'DR800' FORMATS = ['epub', 'pdb', 'html', 'pdf', 'txt'] - EBOOK_DIR_MAIN = 'Books' + EBOOK_DIR_MAIN = '' DELETE_EXTS = [] SUPPORTS_SUB_DIRS = True diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py index 1770a793ec..35a73f86f0 100644 --- a/src/calibre/devices/kindle/driver.py +++ b/src/calibre/devices/kindle/driver.py @@ -388,13 +388,9 @@ class KINDLE_FIRE(KINDLE2): EBOOK_DIR_MAIN = 'Documents' SUPPORTS_SUB_DIRS = False + SCAN_FROM_ROOT = True + SUPPORTS_SUB_DIRS_FOR_SCAN = True VENDOR_NAME = 'AMAZON' WINDOWS_MAIN_MEM = 'KINDLE' - def get_main_ebook_dir(self, for_upload=False): - if for_upload: - return self.EBOOK_DIR_MAIN - return '' - - diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index 2a4cfc6f56..9a328c7498 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -81,7 +81,7 @@ class NOOK(USBMS): return [x.replace('#', '_') for x in components] class NOOK_COLOR(NOOK): - description = _('Communicate with the Nook Color and TSR eBook readers.') + description = _('Communicate with the Nook Color, TSR and Tablet eBook readers.') PRODUCT_ID = [0x002, 0x003, 0x004] BCD = [0x216] diff --git a/src/calibre/devices/usbms/deviceconfig.py b/src/calibre/devices/usbms/deviceconfig.py index 3f669f1e24..bc7dc116e2 100644 --- a/src/calibre/devices/usbms/deviceconfig.py +++ b/src/calibre/devices/usbms/deviceconfig.py @@ -28,6 +28,8 @@ class DeviceConfig(object): EXTRA_CUSTOMIZATION_DEFAULT = None SUPPORTS_SUB_DIRS = False + SUPPORTS_SUB_DIRS_FOR_SCAN = False # This setting is used when scanning for + # books when SUPPORTS_SUB_DIRS is False MUST_READ_METADATA = False SUPPORTS_USE_AUTHOR_SORT = False diff --git a/src/calibre/devices/usbms/driver.py b/src/calibre/devices/usbms/driver.py index e09876081b..ff2b6f3891 100644 --- a/src/calibre/devices/usbms/driver.py +++ b/src/calibre/devices/usbms/driver.py @@ -202,7 +202,7 @@ class USBMS(CLI, Device): debug_print('USBMS: scan from root', self.SCAN_FROM_ROOT, ebook_dir) if not os.path.exists(ebook_dir): continue # Get all books in the ebook_dir directory - if self.SUPPORTS_SUB_DIRS: + if self.SUPPORTS_SUB_DIRS or self.SUPPORTS_SUB_DIRS_FOR_SCAN: # build a list of files to check, so we can accurately report progress flist = [] for path, dirs, files in os.walk(ebook_dir): diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py index 53d336a23d..286bcee9d0 100644 --- a/src/calibre/ebooks/metadata/book/base.py +++ b/src/calibre/ebooks/metadata/book/base.py @@ -710,7 +710,8 @@ class Metadata(object): fmt('Title sort', self.title_sort) if self.authors: fmt('Author(s)', authors_to_string(self.authors) + \ - ((' [' + self.author_sort + ']') if self.author_sort else '')) + ((' [' + self.author_sort + ']') + if self.author_sort and self.author_sort != _('Unknown') else '')) if self.publisher: fmt('Publisher', self.publisher) if getattr(self, 'book_producer', False): diff --git a/src/calibre/ebooks/metadata/book/json_codec.py b/src/calibre/ebooks/metadata/book/json_codec.py index 28bf3178ef..a14e18569a 100644 --- a/src/calibre/ebooks/metadata/book/json_codec.py +++ b/src/calibre/ebooks/metadata/book/json_codec.py @@ -6,11 +6,12 @@ Created on 4 Jun 2010 from base64 import b64encode, b64decode import json, traceback +from datetime import datetime, time from calibre.ebooks.metadata.book import SERIALIZABLE_FIELDS from calibre.constants import filesystem_encoding, preferred_encoding from calibre.library.field_metadata import FieldMetadata -from calibre.utils.date import parse_date, isoformat, UNDEFINED_DATE +from calibre.utils.date import parse_date, isoformat, UNDEFINED_DATE, local_tz from calibre.utils.magick import Image from calibre import isbytestring @@ -22,7 +23,13 @@ def string_to_datetime(src): return parse_date(src) def datetime_to_string(dateval): - if dateval is None or dateval == UNDEFINED_DATE: + if dateval is None: + return "None" + if not isinstance(dateval, datetime): + dateval = datetime.combine(dateval, time()) + if hasattr(dateval, 'tzinfo') and dateval.tzinfo is None: + dateval = dateval.replace(tzinfo=local_tz) + if dateval <= UNDEFINED_DATE: return "None" return isoformat(dateval) diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index fa9951c40c..ecec13662f 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -11,7 +11,7 @@ import datetime from urllib import quote_plus from Queue import Queue, Empty from lxml import etree, html -from calibre import as_unicode +from calibre import prints, as_unicode from calibre.ebooks.chardet import xml_to_unicode @@ -54,7 +54,8 @@ class Ozon(Source): def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ # div_book -> search only books, ebooks and audio books search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' - + + # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) # TODO: format isbn! qItems = set([isbn, title]) @@ -64,7 +65,7 @@ class Ozon(Source): qItems.discard('') qItems = map(_quoteString, qItems) - q = ' '.join(qItems).strip() + q = u' '.join(qItems).strip() log.info(u'search string: ' + q) if isinstance(q, unicode): @@ -78,13 +79,13 @@ class Ozon(Source): return search_url # }}} - def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ - identifiers={}, timeout=30): + def identify(self, log, result_queue, abort, title=None, authors=None, + identifiers={}, timeout=30): # {{{ if not self.is_configured(): return query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: - err = 'Insufficient metadata to construct query' + err = u'Insufficient metadata to construct query' log.error(err) return err @@ -109,7 +110,7 @@ class Ozon(Source): # }}} def get_metadata(self, log, entries, title, authors, identifiers): # {{{ - # some book titles have extra charactes like this + # some book titles have extra characters like this # TODO: make a twick reRemoveFromTitle = None #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') @@ -160,7 +161,7 @@ class Ozon(Source): mi.source_relevance = i if ensure_metadata_match(mi): metadata.append(mi) - # log.debug(u'added metadata %s %s. '%(mi.title, mi.authors)) + #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) else: log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors)) return metadata @@ -285,12 +286,12 @@ class Ozon(Source): url = self.get_book_url(metadata.get_identifiers())[2] raw = self.browser.open_novisit(url, timeout=timeout).read() - doc = html.fromstring(raw) + doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)' xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")' - # series + # series Серия/Серии xpt = xpt_prod_det_at % u'Сери' # % u'Серия:' series = doc.xpath(xpt) @@ -300,7 +301,7 @@ class Ozon(Source): xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))' isbn_str = doc.xpath(xpt) if isbn_str: - all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] + all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)] if all_isbns: metadata.all_isbns = all_isbns metadata.isbn = all_isbns[0] @@ -333,10 +334,10 @@ class Ozon(Source): xpt = u'//table[@id="detail_description"]//tr/td' comment_elem = doc.xpath(xpt) if comment_elem: - comments = unicode(etree.tostring(comment_elem[0])) + comments = unicode(etree.tostring(comment_elem[0], encoding=unicode)) if comments: # cleanup root tag, TODO: remove tags like object/embeded - comments = re.sub(r'\A.*?|.*\Z', u'', comments.strip(), re.MULTILINE).strip() + comments = re.sub(ur'\A.*?|.*\Z', u'', comments.strip(), re.MULTILINE).strip() if comments and (not metadata.comments or len(comments) > len(metadata.comments)): metadata.comments = comments else: @@ -345,8 +346,16 @@ class Ozon(Source): log.debug('No book description found in HTML') # }}} -def _quoteString(str): # {{{ - return '"' + str + '"' if str and str.find(' ') != -1 else str +def _quoteString(strToQuote): # {{{ + return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote +# }}} + +def _verifyISBNIntegrity(log, isbn): # {{{ + # Online ISBN-Check http://www.isbn-check.de/ + res = check_isbn(isbn) + if not res: + log.error(u'ISBN integrity check failed for "%s"'%isbn) + return res is not None # }}} # TODO: make customizable @@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{ return res # }}} -def toPubdate(log, yearAsString): +def toPubdate(log, yearAsString): # {{{ res = None if yearAsString: try: @@ -448,7 +457,11 @@ def toPubdate(log, yearAsString): except: log.error('cannot parse to date %s'%yearAsString) return res +# }}} +def _listToUnicodePrintStr(lst): # {{{ + return u'[' + u', '.join(unicode(x) for x in lst) + u']' +# }}} if __name__ == '__main__': # tests {{{ # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py diff --git a/src/calibre/ebooks/oeb/transforms/jacket.py b/src/calibre/ebooks/oeb/transforms/jacket.py index 987fe0ce86..79524c19eb 100644 --- a/src/calibre/ebooks/oeb/transforms/jacket.py +++ b/src/calibre/ebooks/oeb/transforms/jacket.py @@ -16,6 +16,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML from calibre.library.comments import comments_to_html from calibre.utils.date import is_date_undefined +from calibre.ebooks.chardet import strip_encoding_declarations JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]' @@ -175,15 +176,20 @@ def render_jacket(mi, output_profile, try: display_name, val = mi.format_field_extended(key)[:2] key = key.replace('#', '_') - args[key] = val - args[key+'_label'] = display_name + args[key] = escape(val) + args[key+'_label'] = escape(display_name) except: pass + # Used in the comment describing use of custom columns in templates + args['_genre_label'] = args.get('_genre_label', '{_genre_label}') + args['_genre'] = args.get('_genre', '{_genre}') + generated_html = P('jacket/template.xhtml', data=True).decode('utf-8').format(**args) # Post-process the generated html to strip out empty header items + soup = BeautifulSoup(generated_html) if not series: series_tag = soup.find(attrs={'class':'cbj_series'}) @@ -206,7 +212,8 @@ def render_jacket(mi, output_profile, if hr_tag is not None: hr_tag.extract() - return soup.renderContents(None) + return strip_encoding_declarations( + soup.renderContents('utf-8').decode('utf-8')) from calibre.ebooks.oeb.base import RECOVER_PARSER diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index eb44ec3123..84bf7f6f57 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -372,13 +372,13 @@ class MetadataBulkDialog(ResizableDialog, Ui_MetadataBulkDialog): self.apply_pubdate.setChecked(True) def clear_pubdate(self, *args): - self.pubdate.setMinimumDateTime(UNDEFINED_QDATETIME) + self.pubdate.setDateTime(UNDEFINED_QDATETIME) def do_apply_adddate(self, *args): self.apply_adddate.setChecked(True) def clear_adddate(self, *args): - self.adddate.setMinimumDateTime(UNDEFINED_QDATETIME) + self.adddate.setDateTime(UNDEFINED_QDATETIME) def button_clicked(self, which): if which == self.button_box.button(QDialogButtonBox.Apply): diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 3934ebbbb3..5d977700c8 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): result = False with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) + raw = xml_to_unicode(f.read(), verbose=True)[0] + doc = html.fromstring(raw) # example where we are going to find formats #
@@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): #
#

.epub, .fb2.zip, .pdf

#
- xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])' + xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])' formats = doc.xpath(xpt) if formats: result = True diff --git a/src/calibre/translations/ru.po b/src/calibre/translations/ru.po index c515e6213e..89f44b0b6f 100644 --- a/src/calibre/translations/ru.po +++ b/src/calibre/translations/ru.po @@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226 msgid "Configure download metadata" -msgstr "" +msgstr "Настроить загрузку метаданных" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230 msgid "Change how calibre downloads metadata" @@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788 msgid "&Comments" -msgstr "Комментарии" +msgstr "&Комментарии" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854 msgid "Basic metadata" @@ -12603,11 +12603,11 @@ msgstr "Основные метаданные" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133 msgid "Has cover" -msgstr "Есть обложка" +msgstr "Обложка" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133 msgid "Has summary" -msgstr "" +msgstr "Аннотация" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190 msgid "" @@ -12619,7 +12619,7 @@ msgstr "" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268 msgid "See at" -msgstr "" +msgstr "Посмотреть на" #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403 msgid "calibre is downloading metadata from: " diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py index 0de50e4122..faac8795d4 100644 --- a/src/calibre/utils/date.py +++ b/src/calibre/utils/date.py @@ -291,6 +291,11 @@ def clean_date_for_sort(dt, format): if not isinstance(dt, datetime): dt = datetime.combine(dt, time()) + if hasattr(dt, 'tzinfo'): + if dt.tzinfo is None: + dt = dt.replace(tzinfo=_local_tz) + dt = as_local_time(dt) + if format == 'iso': format = 'yyMdhms'