diff --git a/recipes/corren2.recipe b/recipes/corren2.recipe index 494be88f10..f53da20fd1 100644 --- a/recipes/corren2.recipe +++ b/recipes/corren2.recipe @@ -1,39 +1,34 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPLv3' + from calibre.web.feeds.news import BasicNewsRecipe -class AdvancedUserRecipe1255797795(BasicNewsRecipe): - title = u'Corren' - language = 'sv' - __author__ = 'Jonas Svensson' - simultaneous_downloads = 1 - no_stylesheets = True - oldest_article = 7 +class AdvancedUserRecipe1311446032(BasicNewsRecipe): + title = 'Corren' + __author__ = 'Jonas Svensson' + description = 'News from Sweden' + publisher = 'Corren' + category = 'news, politics, Sweden' + oldest_article = 2 + delay = 1 max_articles_per_feed = 100 - remove_attributes = ['onload'] - timefmt = '' + no_stylesheets = True + use_embedded_content = False + encoding = 'iso-8859-1' + language = 'sv' - feeds = [ - (u'Toppnyheter (alla kategorier)', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/'), - (u'Bostad', u'http://www.corren.se/inc/RssHandler.ashx?id=4122174&ripurl=http://www.corren.se/bostad/'), - (u'Ekonomi & Jobb', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/'), - (u'Kultur & Nöje', u'http://www.corren.se/inc/RssHandler.ashx?id=4122192&ripurl=http://www.corren.se/kultur/'), - (u'Mat & dryck', u'http://www.corren.se/inc/RssHandler.ashx?id=4122201&ripurl=http://www.corren.se/mat-dryck/'), - (u'Motor', u'http://www.corren.se/inc/RssHandler.ashx?id=4122203&ripurl=http://www.corren.se/motor/'), - (u'Sport', u'http://www.corren.se/inc/RssHandler.ashx?id=4122206&ripurl=http://www.corren.se/sport/'), - (u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223&ripurl=http://www.corren.se/asikter/'), - (u'Mjölby', u'http://www.corren.se/inc/RssHandler.ashx?id=4122235&ripurl=http://www.corren.se/ostergotland/mjolby/'), - (u'Motala', u'http://www.corren.se/inc/RssHandler.ashx?id=4122236&ripurl=http://www.corren.se/ostergotland/motala/') - ] - - def print_version(self, url): - url = url.replace("ekonomi/artikel.aspx", "Print.aspx") - url = url.replace("bostad/artikel.aspx", "Print.aspx") - url = url.replace("kultur/artikel.aspx", "Print.aspx") - url = url.replace("motor/artikel.aspx", "Print.aspx") - url = url.replace("mat-dryck/artikel.aspx", "Print.aspx") - url = url.replace("sport/artikel.aspx", "Print.aspx") - url = url.replace("asikter/artikel.aspx", "Print.aspx") - url = url.replace("mat-dryck/artikel.aspx", "Print.aspx") - url = url.replace("ostergotland/mjolby/artikel.aspx", "Print.aspx") - url = url.replace("ostergotland/motala/artikel.aspx", "Print.aspx") - return url.replace("nyheter/artikel.aspx", "Print.aspx") + feeds = [ + (u'Toppnyheter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/') + ,(u'Ekonomi', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/') + ,(u'Link\xf6ping', u'http://www.corren.se/inc/RssHandler.ashx?id=4122234') + ,(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223,4122224,4122226,4122227,4122228,4122229,4122230') + ] + keep_only_tags = [dict(name='div', attrs={'id':'article'}),dict(name='div', attrs={'class':'body'})] + remove_tags = [ + dict(name='ul',attrs={'class':'functions'}) + ,dict(name='a',attrs={'href':'javascript*'}) + ,dict(name='div',attrs={'class':'box'}) + ,dict(name='div',attrs={'class':'functionsbottom'}) + ] diff --git a/recipes/dagens_industri.recipe 
b/recipes/dagens_industri.recipe new file mode 100644 index 0000000000..c9b60c72b1 --- /dev/null +++ b/recipes/dagens_industri.recipe @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPLv3' + +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1311450855(BasicNewsRecipe): + title = u'Dagens Industri' + __author__ = 'Jonas Svensson' + description = 'Economy news from Sweden' + publisher = 'DI' + category = 'news, politics, Sweden' + oldest_article = 2 + delay = 1 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + language = 'sv' + + feeds = [(u'DI', u'http://di.se/rss')] + + keep_only_tags = [dict(name='h1', attrs={'id':'ctl00_ExtraWideContentRegion_WideContentRegion_MainRegion_MainContentRegion_MainBodyRegion_headlineNormal'}),dict(name='div', attrs={'id':'articleBody'})] + + remove_tags = [ + dict(name='div',attrs={'class':'article-actions clear'}) + ,dict(name='div',attrs={'class':'article-action-popup'}) + ,dict(name='div',attrs={'class':'header'}) + ,dict(name='div',attrs={'class':'content clear'}) + ,dict(name='div',attrs={'id':'articleAdvertisementDiv'}) + ,dict(name='ul',attrs={'class':'action-list'}) + ] diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe index c5021cb91d..124820d0a1 100644 --- a/recipes/guardian.recipe +++ b/recipes/guardian.recipe @@ -12,7 +12,7 @@ from datetime import date class Guardian(BasicNewsRecipe): - title = u'The Guardian / The Observer' + title = u'The Guardian and The Observer' if date.today().weekday() == 6: base_url = "http://www.guardian.co.uk/theobserver" else: @@ -28,7 +28,7 @@ class Guardian(BasicNewsRecipe): # List of section titles to ignore # For example: ['Sport'] ignore_sections = [] - + timefmt = ' [%a, %d %b %Y]' keep_only_tags = [ dict(name='div', attrs={'id':["content","article_header","main-article-info",]}), @@ -94,7 +94,7 @@ class Guardian(BasicNewsRecipe): prefix = section_title + ': ' for subsection in s.parent.findAll('a', attrs={'class':'book-section'}): yield (prefix + self.tag_to_string(subsection), subsection['href']) - + def find_articles(self, url): soup = self.index_to_soup(url) div = soup.find('div', attrs={'class':'book-index'}) @@ -115,7 +115,7 @@ class Guardian(BasicNewsRecipe): 'title': title, 'url':url, 'description':desc, 'date' : strftime('%a, %d %b'), } - + def parse_index(self): try: feeds = [] diff --git a/recipes/instapaper.recipe b/recipes/instapaper.recipe index c6175a783f..d182e556a2 100644 --- a/recipes/instapaper.recipe +++ b/recipes/instapaper.recipe @@ -43,7 +43,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): lfeeds = self.get_feeds() for feedobj in lfeeds: feedtitle, feedurl = feedobj - self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl)) articles = [] soup = self.index_to_soup(feedurl) for item in soup.findAll('div', attrs={'class':'cornerControls'}): @@ -63,3 +63,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe): def populate_article_metadata(self, article, soup, first): article.title = soup.find('title').contents[0].strip() + def postprocess_html(self, soup, first_fetch): + for link_tag in soup.findAll(attrs={"id" : "story"}): + link_tag.insert(0,'
<h1>'+soup.find('title').contents[0].strip()+'</h1>
') + + return soup diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 91e81bd46f..a79078988a 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -1258,6 +1258,16 @@ class StoreEHarlequinStore(StoreBase): formats = ['EPUB', 'PDF'] affiliate = True +class StoreEKnigiStore(StoreBase): + name = u'еКниги' + author = 'Alex Stanev' + description = u'Онлайн книжарница за електронни книги и аудио риалити романи' + actual_plugin = 'calibre.gui2.store.stores.eknigi_plugin:eKnigiStore' + + headquarters = 'BG' + formats = ['EPUB', 'PDF', 'HTML'] + #affiliate = True + class StoreEpubBudStore(StoreBase): name = 'ePub Bud' description = 'Well, it\'s pretty much just "YouTube for Children\'s eBooks. A not-for-profit organization devoted to brining self published childrens books to the world.' @@ -1483,6 +1493,7 @@ plugins += [ StoreEBookShoppeUKStore, # StoreEPubBuyDEStore, StoreEHarlequinStore, + StoreEKnigiStore, StoreEpubBudStore, StoreFeedbooksStore, StoreFoylesUKStore, diff --git a/src/calibre/db/tables.py b/src/calibre/db/tables.py index b75effff4b..fa7b001851 100644 --- a/src/calibre/db/tables.py +++ b/src/calibre/db/tables.py @@ -12,7 +12,7 @@ from datetime import datetime from dateutil.tz import tzoffset from calibre.constants import plugins -from calibre.utils.date import parse_date, local_tz +from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE from calibre.ebooks.metadata import author_to_author_sort _c_speedup = plugins['speedup'][0] @@ -29,8 +29,11 @@ def _c_convert_timestamp(val): if ret is None: return parse_date(val, as_utc=False) year, month, day, hour, minutes, seconds, tzsecs = ret - return datetime(year, month, day, hour, minutes, seconds, + try: + return datetime(year, month, day, hour, minutes, seconds, tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz) + except OverflowError: + return UNDEFINED_DATE.astimezone(local_tz) class Table(object): diff --git a/src/calibre/ebooks/metadata/sources/isbndb.py b/src/calibre/ebooks/metadata/sources/isbndb.py index b33a625ca7..31c5e69d65 100644 --- a/src/calibre/ebooks/metadata/sources/isbndb.py +++ b/src/calibre/ebooks/metadata/sources/isbndb.py @@ -151,7 +151,7 @@ class ISBNDB(Source): bl = feed.find('BookList') if bl is None: - err = tostring(etree.find('errormessage')) + err = tostring(feed.find('errormessage')) raise ValueError('ISBNDb query failed:' + err) total_results = int(bl.get('total_results')) shown_results = int(bl.get('shown_results')) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 514cf9c246..67f20e691f 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, - get_trailing_data) + get_trailing_data, decode_tbs) from calibre.utils.magick.draw import identify_data # PalmDB {{{ @@ -399,6 +399,7 @@ class IndexHeader(object): # {{{ def __init__(self, record): self.record = record raw = self.record.raw + #open('/t/index_header.bin', 'wb').write(raw) if raw[:4] != b'INDX': raise ValueError('Invalid Primary Index Record') @@ -948,22 +949,25 @@ class TBSIndexing(object): # {{{ ans.append(('\t\tIndex Entry: %d (Parent index: %d, ' 'Depth: %d, Offset: %d, Size: %d) [%s]')%( x.index, x.parent_index, x.depth, x.offset, x.size, x.label)) - def 
bin3(num): + def bin4(num): ans = bin(num)[2:] - return '0'*(3-len(ans)) + ans + return bytes('0'*(4-len(ans)) + ans) + + def repr_extra(x): + return str({bin4(k):v for k, v in extra.iteritems()}) tbs_type = 0 if len(byts): - outer, consumed = decint(byts) + outermost_index, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - tbs_type = outer & 0b111 - ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type)) - ans.append('Outer Index entry: %d'%(outer >> 3)) - arg1, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi: always 0?): %d'%arg1) + for k in extra: + tbs_type |= k + ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type))) + ans.append('Outermost index: %d'%outermost_index) + ans.append('Unknown extra start bytes: %s'%repr_extra(extra)) if self.doc_type in (257, 259): # Hierarchical periodical - byts, a = self.interpret_periodical(tbs_type, byts) + byts, a = self.interpret_periodical(tbs_type, byts, + dat['geom'][0]) ans += a if byts: sbyts = tuple(hex(b)[2:] for b in byts) @@ -972,159 +976,87 @@ class TBSIndexing(object): # {{{ ans.append('') return tbs_type, ans - def interpret_periodical(self, tbs_type, byts): + def interpret_periodical(self, tbs_type, byts, record_offset): ans = [] - def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{ + def read_section_transitions(byts, psi=None): # {{{ if psi is None: - # Assume parent section is 1 + # Assume previous section is 1 psi = self.get_index(1) - if msg is None: - msg = ('Article index at start of record or first article' - ' index, relative to parent section') - if byts: - # byts could be empty - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - ai = (arg >> 4) - ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai, - ai+psi.index)) - if flags == 1: - arg, consumed = decint(byts) - if arg == 0: - # EOF of record, otherwise ignore and hope someone else - # will deal with these bytes - byts = byts[consumed:] - ans.append('EOF (vwi: should be 0): %d'%arg) - elif flags in (4, 5): - num = byts[0] - byts = byts[1:] - ans.append('Number of article nodes in the record (byte): %d'%num) - if flags == 5: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('%s (vwi)): %d'%(fmsg, arg)) - elif flags == 0: - pass - else: - raise ValueError('Unknown flags: %d'%flags) - return byts - # }}} - - if tbs_type == 3: # {{{ - arg2, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi: always 0?): %d'%arg2) - - arg3, consumed = decint(byts) - byts = byts[consumed:] - fsi = arg3 >> 4 - flags = arg3 & 0b1111 - ans.append('First section index (fvwi): %d'%fsi) - psi = self.get_index(fsi) - ans.append('Flags (flag: always 0?): %d'%flags) - if flags == 4: - ans.append('Number of articles in this section: %d'%byts[0]) - byts = byts[1:] - elif flags == 0: - pass - else: - raise ValueError('Unknown flags value: %d'%flags) - - - if byts: - byts = tbs_type_6(byts, psi=psi, - msg=('First article of ending section, relative to its' - ' parent\'s index'), - fmsg=('->Offset from start of record to beginning of' - ' last starting section')) while byts: - # We have a transition not just an opening first section - psi = self.get_index(psi.index+1) - arg, consumed = decint(byts) - off = arg >> 4 + ai, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - flags = arg & 0b1111 - ans.append('Last article of ending section w.r.t. 
starting' - ' section offset (fvwi): %d [%d absolute]'%(off, - psi.index+off)) - ans.append('Flags (always 8?): %d'%flags) - byts = tbs_type_6(byts, psi=psi) - if byts: - # Ended with flag 1,and not EOF, which means there's - # another section transition in this record - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('->Offset from start of record to beginning of ' - 'last starting section: %d'%(arg)) + if extra.get(0b0010, None) is not None: + raise ValueError('Dont know how to interpret flag 0b0010' + ' while reading section transitions') + if extra.get(0b1000, None) is not None: + if len(extra) > 1: + raise ValueError('Dont know how to interpret flags' + ' %r while reading section transitions'%extra) + nsi = self.get_index(psi.index+1) + ans.append('Last article in this record of section %d' + ' (relative to next section index [%d]): ' + '%d [%d absolute index]'%(psi.index, nsi.index, ai, + ai+nsi.index)) + psi = nsi + continue + + ans.append('First article in this record of section %d' + ' (relative to its parent section): ' + '%d [%d absolute index]'%(psi.index, ai, ai+psi.index)) + + num = extra.get(0b0100, None) + if num is None: + msg = ('The section %d has at most one article' + ' in this record')%psi.index else: - break + msg = ('Number of articles in this record of ' + 'section %d: %d')%(psi.index, num) + ans.append(msg) - # }}} + offset = extra.get(0b0001, None) + if offset is not None: + if offset == 0: + ans.append('This record is spanned by the article:' + '%d'%(ai+psi.index)) + else: + ans.append('->Offset to start of next section (%d) from start' + ' of record: %d [%d absolute offset]'%(psi.index+1, + offset, offset+record_offset)) + return byts + # }}} - elif tbs_type == 7: # {{{ - # This occurs for records that have no section nodes and - # whose parent section's index == 1 - ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2])) - byts = byts[2:] - arg, consumed = decint(byts) + def read_starting_section(byts): # {{{ + si, extra, consumed = decode_tbs(byts) byts = byts[consumed:] - ai = arg >> 4 - flags = arg & 0b1111 - ans.append('Article at start of record (fvwi): %d'%ai) - if flags == 4: - num = byts[0] - byts = byts[1:] - ans.append('Number of articles in record (byte): %d'%num) - elif flags == 0: - pass - elif flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('EOF (vwi: should be 0): %d'%arg) - else: - raise ValueError('Unknown flags value: %d'%flags) + if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra: + raise ValueError('Dont know how to interpret flags %r' + ' when reading starting section'%extra) + si = self.get_index(si) + ans.append('The section at the start of this record is:' + ' %d'%si.index) + if 0b0100 in extra: + num = extra[0b0100] + ans.append('The number of articles from the section %d' + ' in this record: %d'%(si.index, num)) + elif 0b0001 in extra: + eof = extra[0b0001] + if eof != 0: + raise ValueError('Unknown eof value %s when reading' + ' starting section'%eof) + ans.append('This record is spanned by an article from' + ' the section: %d'%si.index) + return si, byts # }}} - elif tbs_type == 6: # {{{ - # This is used for records spanned by an article whose parent - # section's index == 1 or for the opening record if it contains the - # periodical start, section 1 start and at least one article. The - # two cases are distinguished by the flags on the article index - # vwi. 
- unk = byts[0] - byts = byts[1:] - ans.append('Unknown (byte: always 2?): %d'%unk) - byts = tbs_type_6(byts) - # }}} + if tbs_type & 0b0100: + # Starting section is the first section + ssi = self.get_index(1) + else: + ssi, byts = read_starting_section(byts) - elif tbs_type == 2: # {{{ - # This occurs for records with no section nodes and whose parent - # section's index != 1 (undefined (records before the first - # section) or > 1) - # This is also used for records that are spanned by an article - # whose parent section index > 1. In this case the flags of the - # vwi referring to the article at the start - # of the record are set to 1 instead of 4. - arg, consumed = decint(byts) - byts = byts[consumed:] - flags = (arg & 0b1111) - psi = (arg >> 4) - ans.append('Parent section index (fvwi): %d'%psi) - psi = self.get_index(psi) - ans.append('Flags: %d'%flags) - if flags == 1: - arg, consumed = decint(byts) - byts = byts[consumed:] - ans.append('Unknown (vwi?: always 0?): %d'%arg) - byts = tbs_type_6(byts, psi=psi) - elif flags == 0: - byts = tbs_type_6(byts, psi=psi) - else: - raise ValueError('Unkown flags: %d'%flags) - # }}} + byts = read_section_transitions(byts, ssi) return byts, ans diff --git a/src/calibre/ebooks/mobi/tbs_periodicals.rst b/src/calibre/ebooks/mobi/tbs_periodicals.rst index d770133625..2fa6ec90f3 100644 --- a/src/calibre/ebooks/mobi/tbs_periodicals.rst +++ b/src/calibre/ebooks/mobi/tbs_periodicals.rst @@ -3,6 +3,20 @@ Reverse engineering the trailing byte sequences for hierarchical periodicals In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing. +Sequence encoding: + +0b1000 : Continuation bit + +First sequences: +0b0010 : 80 +0b0011 : 80 80 +0b0110 : 80 2 +0b0111 : 80 2 80 + +Other sequences: +0b0101 : 4 1a +0b0001 : c b1 + Opening record ---------------- @@ -52,10 +66,60 @@ The text record that contains the opening node for the periodical (depth=0 node If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record. 
+ Starting record with two section transitions:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 7 index entries (0 ends, 4 complete, 3 starts) + TBS bytes: 86 80 2 c0 b8 c4 3 + Complete: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review] + Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader] + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net] + Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Remaining bytes: b8 c4 3 + + Starting record with three section transitions:: + + Record #1: Starts at: 0 Ends at: 4095 + Contains: 10 index entries (0 ends, 7 complete, 3 starts) + TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4 + Complete: + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica] + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net] + Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz] + Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review] + Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results] + Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement] + Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents] + Starts: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader] + Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews] + Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute] + Remaining bytes: b8 c0 b8 c4 4 + + + + Records with no nodes ------------------------ +subtype = 010 + These records are spanned by a single article. They are of two types: 1. If the parent section index is 1, TBS type of 6, like this:: @@ -247,7 +311,7 @@ In such a record there is a transition from one section to the next. As such the Last article of ending section w.r.t. 
starting section offset (fvwi): 12 [15 absolute] Flags (always 8?): 8 Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute] - Number of article nodes in the record (byte): 4 + Number of article nodes in the record belonging to the last section (byte): 4 Ending record ---------------- @@ -274,3 +338,26 @@ Logically, ending records must have at least one article ending, one section end If the record had only a single article end, the last two bytes would be replaced with: f0 +If the last record has multiple section transitions, it is of type 6 and looks like:: + + Record #9: Starts at: 32768 Ends at: 34953 + Contains: 9 index entries (3 ends, 6 complete, 0 starts) + TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0 + Ends: + Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader] + Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica] + Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe] + Complete: + Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net] + Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews] + Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot] + Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review] + Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results] + Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80] + TBS Type: 110 (6) + Outer Index entry: 0 + Unknown (vwi: always 0?): 0 + Unknown (byte: always 2?): 2 + Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute] + Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0 + diff --git a/src/calibre/ebooks/mobi/utils.py index cf03c613f4..37d2093066 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -11,6 +11,7 @@ import struct from collections import OrderedDict from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail +from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 @@ -39,7 +40,7 @@ def encode_number_as_hex(num): The bytes that follow are simply the hexadecimal representation of the number. ''' - num = bytes(hex(num)[2:]) + num = bytes(hex(num)[2:].upper()) ans = bytearray(num) ans.insert(0, len(num)) return bytes(ans) @@ -197,3 +198,96 @@ def encode_trailing_data(raw): lsize += 1 return raw + encoded +def encode_fvwi(val, flags): + ''' + Encode the value val and the 4 bit flags flags as a fvwi. This encoding is + used in the trailing byte sequences for indexing. Returns encoded + bytestring. + ''' + ans = (val << 4) | (flags & 0b1111) + return encint(ans) + + +def decode_fvwi(byts): + ''' + Decode encoded fvwi. Returns number, flags, consumed + ''' + arg, consumed = decint(bytes(byts)) + return (arg >> 4), (arg & 0b1111), consumed + +def decode_tbs(byts): + ''' + Trailing byte sequences for indexing consist of a series of fvwi numbers. + This function reads the fvwi number and its associated flags. It then uses + the flags to read any more numbers that belong to the series. The flags are + the lowest 4 bits of the vwi (see the encode_fvwi function above). 
+ + Returns the fvwi number, a dictionary mapping flag bits to the associated + data and the number of bytes consumed. + ''' + byts = bytes(byts) + val, flags, consumed = decode_fvwi(byts) + extra = {} + byts = byts[consumed:] + if flags & 0b1000: + extra[0b1000] = True + if flags & 0b0010: + x, consumed2 = decint(byts) + byts = byts[consumed2:] + extra[0b0010] = x + consumed += consumed2 + if flags & 0b0100: + extra[0b0100] = ord(byts[0]) + byts = byts[1:] + consumed += 1 + if flags & 0b0001: + x, consumed2 = decint(byts) + byts = byts[consumed2:] + extra[0b0001] = x + consumed += consumed2 + return val, extra, consumed + +def encode_tbs(val, extra): + ''' + Encode the number val and the extra data in the extra dict as an fvwi. See + decode_tbs above. + ''' + flags = 0 + for flag in extra: + flags |= flag + ans = encode_fvwi(val, flags) + + if 0b0010 in extra: + ans += encint(extra[0b0010]) + if 0b0100 in extra: + ans += bytes(bytearray([extra[0b0100]])) + if 0b0001 in extra: + ans += encint(extra[0b0001]) + return ans + +def utf8_text(text): + ''' + Convert a possibly null string to utf-8 bytes, guaranteeing to return a non + empty, normalized bytestring. + ''' + if text and text.strip(): + text = text.strip() + if not isinstance(text, unicode): + text = text.decode('utf-8', 'replace') + text = normalize(text).encode('utf-8') + else: + text = _('Unknown').encode('utf-8') + return text + +def align_block(raw, multiple=4, pad=b'\0'): + ''' + Return raw with enough pad bytes appended to ensure its length is a multiple + of 4. + ''' + extra = len(raw) % multiple + if extra == 0: return raw + return raw + pad*(multiple - extra) + + + + diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py index 48b1d82c04..0f7a670cff 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -10,34 +10,13 @@ __docformat__ = 'restructuredtext en' from struct import pack from cStringIO import StringIO -from collections import OrderedDict +from collections import OrderedDict, defaultdict -from calibre.ebooks import normalize -from calibre.ebook.mobi.writer2 import RECORD_SIZE -from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex) +from calibre.ebooks.mobi.writer2 import RECORD_SIZE +from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, + encode_trailing_data, encode_tbs, align_block, utf8_text) +from calibre.ebooks.mobi.langcodes import iana2mobi -def utf8_text(text): - ''' - Convert a possibly null string to utf-8 bytes, guaranteeing to return a non - empty, normalized bytestring. - ''' - if text and text.strip(): - text = text.strip() - if not isinstance(text, unicode): - text = text.decode('utf-8', 'replace') - text = normalize(text).encode('utf-8') - else: - text = _('Unknown').encode('utf-8') - return text - -def align_block(raw, multiple=4, pad=b'\0'): - ''' - Return raw with enough pad bytes append to ensure its length is a multiple - of 4. 
- ''' - extra = len(raw) % multiple - if extra == 0: return raw - return raw + pad*(multiple - extra) class CNCX(object): # {{{ @@ -85,7 +64,7 @@ class CNCX(object): # {{{ return self.strings[string] # }}} -class IndexEntry(object): +class IndexEntry(object): # {{{ TAG_VALUES = { 'offset': 1, @@ -97,7 +76,7 @@ class IndexEntry(object): 'first_child_index': 22, 'last_child_index': 23, } - RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys()) + RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()} BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,] @@ -112,6 +91,35 @@ class IndexEntry(object): self.first_child_index = None self.last_child_index = None + @classmethod + def tagx_block(cls, for_periodical=True): + buf = bytearray() + + def add_tag(tag, num_values=1): + buf.append(tag) + buf.append(num_values) + # bitmask + buf.append(1 << (cls.BITMASKS.index(tag))) + # eof + buf.append(0) + + for tag in xrange(1, 5): + add_tag(tag) + + if for_periodical: + for tag in (5, 21, 22, 23): + add_tag(tag) + + # End of TAGX record + for i in xrange(3): buf.append(0) + buf.append(1) + + header = b'TAGX' + header += pack(b'>I', len(buf)) # table length + header += pack(b'>I', 1) # control byte count + + return header + bytes(buf) + @property def next_offset(self): return self.offset + self.length @@ -147,8 +155,135 @@ class IndexEntry(object): ans = buf.get_value() return ans +# }}} -class Indexer(object): +class TBS(object): # {{{ + + ''' + Take the list of index nodes starting/ending on a record and calculate the + trailing byte sequence for the record. + ''' + + def __init__(self, data, is_periodical, first=False, all_sections=[]): + if not data: + self.bytestring = encode_trailing_data(b'') + else: + self.section_map = OrderedDict((i.index, i) for i in + sorted(all_sections, key=lambda x:x.offset)) + + if is_periodical: + # The starting bytes. + # The value is zero which I think indicates the periodical + # index entry. The values for the various flags seem to be + # unused. If the 0b0100 is present, it means that the record + # deals with section 1 (or is the final record with section + # transitions). + self.type_010 = encode_tbs(0, {0b0010: 0}) + self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0}) + self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0}) + self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0}) + + depth_map = defaultdict(list) + for x in ('starts', 'ends', 'completes'): + for idx in data[x]: + depth_map[idx.depth].append(idx) + for l in depth_map.itervalues(): + l.sort(key=lambda x:x.offset) + self.periodical_tbs(data, first, depth_map) + else: + self.book_tbs(data, first) + + def periodical_tbs(self, data, first, depth_map): + buf = StringIO() + + has_section_start = (depth_map[1] and depth_map[1][0] in + data['starts']) + spanner = data['spans'] + first_node = None + for nodes in depth_map.values(): + for node in nodes: + if (first_node is None or (node.offset, node.depth) < + (first_node.offset, first_node.depth)): + first_node = node + + parent_section_index = -1 + if depth_map[0]: + # We have a terminal record + typ = (self.type_110 if has_section_start else self.type_010) + if first_node.depth > 0: + parent_section_index = (first_node.index if first_node.depth + == 1 else first_node.parent_index) + else: + if spanner is not None: + # record is spanned by a single article + parent_section_index = spanner.parent_index + typ = (self.type_110 if parent_section_index == 1 else + self.type_010) + elif not depth_map[1]: + # has only article nodes, i.e. 
spanned by a section + parent_section_index = depth_map[2][0].parent_index + typ = (self.type_111 if parent_section_index == 1 else + self.type_010) + else: + # has section transitions + parent_section_index = depth_map[2][0].parent_index + + buf.write(typ) + + if parent_section_index > 1: + # Write starting section information + if spanner is None: + num_articles = len(depth_map[1]) + extra = {} + if num_articles > 1: + extra = {0b0100: num_articles} + else: + extra = {0b0001: 0} + buf.write(encode_tbs(parent_section_index, extra)) + + if spanner is None: + articles = depth_map[2] + sections = [self.section_map[a.parent_index] for a in articles] + sections.sort(key=lambda x:x.offset) + section_map = {s:[a for a in articles if a.parent_index == + s.index] for s in sections} + for i, section in enumerate(sections): + # All the articles in this record that belong to section + articles = section_map[section] + first_article = articles[0] + last_article = articles[-1] + num = len(articles) + + try: + next_sec = sections[i+1] + except: + next_sec = None + + extra = {} + if num > 1: + extra[0b0100] = num + if i == 0 and next_sec is not None: + # Write offset to next section from start of record + # For some reason kindlegen only writes this offset + # for the first section transition. Imitate it. + extra[0b0001] = next_sec.offset - data['offset'] + + buf.write(encode_tbs(first_article.index-section.index, extra)) + + if next_sec is not None: + buf.write(encode_tbs(last_article.index-next_sec.index, + {0b1000: 0})) + else: + buf.write(encode_tbs(spanner.index - parent_section_index, + {0b0001: 0})) + + self.bytestring = encode_trailing_data(buf.getvalue()) + + def book_tbs(self, data, first): + self.bytestring = encode_trailing_data(b'') +# }}} + +class Indexer(object): # {{{ def __init__(self, serializer, number_of_text_records, size_of_last_text_record, opts, oeb): @@ -160,7 +295,9 @@ class Indexer(object): self.log = oeb.log self.opts = opts - self.is_periodical = opts.mobi_periodical + self.is_periodical = self.detect_periodical() + self.log('Generating MOBI index for a %s'%('periodical' if + self.is_periodical else 'book')) self.is_flat_periodical = False if opts.mobi_periodical: periodical_node = iter(oeb.toc).next() @@ -172,15 +309,42 @@ class Indexer(object): self.cncx = CNCX(oeb.toc, opts) if self.is_periodical: - indices = self.create_periodical_index() + self.indices = self.create_periodical_index() else: - raise NotImplementedError() + self.indices = self.create_book_index() - self.records.append(self.create_index_record(indices)) + self.records.append(self.create_index_record()) + self.records.insert(0, self.create_header()) + self.records.extend(self.cncx.records) - def create_index_record(self, indices): + self.calculate_trailing_byte_sequences() + + def detect_periodical(self): # {{{ + for node in self.oeb.toc.iterdescendants(): + if node.depth() == 1 and node.klass != 'article': + self.log.debug( + 'Not a periodical: Deepest node does not have ' + 'class="article"') + return False + if node.depth() == 2 and node.klass != 'section': + self.log.debug( + 'Not a periodical: Second deepest node does not have' + ' class="section"') + return False + if node.depth() == 3 and node.klass != 'periodical': + self.log.debug('Not a periodical: Third deepest node' + ' does not have class="periodical"') + return False + if node.depth() > 3: + self.log.debug('Not a periodical: Has nodes of depth > 3') + return False + return True + # }}} + + def create_index_record(self): # {{{ 
header_length = 192 buf = StringIO() + indices = self.indices # Write index entries offsets = [] @@ -218,6 +382,135 @@ class Indexer(object): if len(ans) > 0x10000: raise ValueError('Too many entries (%d) in the TOC'%len(offsets)) return ans + # }}} + + def create_header(self): # {{{ + buf = StringIO() + tagx_block = IndexEntry.tagx_block(self.is_periodical) + header_length = 192 + + # Ident 0 - 4 + buf.write(b'INDX') + + # Header length 4 - 8 + buf.write(pack(b'>I', header_length)) + + # Unknown 8-16 + buf.write(b'\0'*8) + + # Index type: 0 - normal, 2 - inflection 16 - 20 + buf.write(pack(b'>I', 2)) + + # IDXT offset 20-24 + buf.write(pack(b'>I', 0)) # Filled in later + + # Number of index records 24-28 + buf.write(pack(b'>I', len(self.records))) + + # Index Encoding 28-32 + buf.write(pack(b'>I', 65001)) # utf-8 + + # Index language 32-36 + buf.write(iana2mobi( + str(self.oeb.metadata.language[0]))) + + # Number of index entries 36-40 + buf.write(pack(b'>I', len(self.indices))) + + # ORDT offset 40-44 + buf.write(pack(b'>I', 0)) + + # LIGT offset 44-48 + buf.write(pack(b'>I', 0)) + + # Number of LIGT entries 48-52 + buf.write(pack(b'>I', 0)) + + # Number of CNCX records 52-56 + buf.write(pack(b'>I', len(self.cncx.records))) + + # Unknown 56-180 + buf.write(b'\0'*124) + + # TAGX offset 180-184 + buf.write(pack(b'>I', header_length)) + + # Unknown 184-192 + buf.write(b'\0'*8) + + # TAGX block + buf.write(tagx_block) + + num = len(self.indices) + + # The index of the last entry in the NCX + buf.write(encode_number_as_hex(num-1)) + + # The number of entries in the NCX + buf.write(pack(b'>H', num)) + + # Padding + pad = (4 - (buf.tell()%4))%4 + if pad: + buf.write(b'\0'*pad) + + idxt_offset = buf.tell() + + buf.write(b'IDXT') + buf.write(pack(b'>H', header_length + len(tagx_block))) + buf.write(b'\0') + buf.seek(20) + buf.write(pack(b'>I', idxt_offset)) + + return align_block(buf.getvalue()) + # }}} + + def create_book_index(self): # {{{ + indices = [] + seen = set() + id_offsets = self.serializer.id_offsets + + for node in self.oeb.toc.iterdescendants(): + try: + offset = id_offsets[node.href] + label = self.cncx[node.title] + except: + self.log.warn('TOC item %s not found in document'%node.href) + continue + if offset in seen: + continue + seen.add(offset) + index = IndexEntry(offset, label) + indices.append(index) + + indices.sort(key=lambda x:x.offset) + + # Set lengths + for i, index in enumerate(indices): + try: + next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + # Remove empty nodes + indices = [i for i in indices if i.length > 0] + + # Set index values + for i, index in enumerate(indices): + index.index = i + + # Set lengths again to close up any gaps left by filtering + for i, index in enumerate(indices): + try: + next_offset = indices[i+1].offset + except: + next_offset = self.serializer.body_end_offset + index.length = next_offset - index.offset + + return indices + + # }}} def create_periodical_index(self): # {{{ periodical_node = iter(self.oeb.toc).next() @@ -361,14 +654,48 @@ class Indexer(object): return indices # }}} - def create_header(self): - buf = StringIO() + # TBS {{{ + def calculate_trailing_byte_sequences(self): + self.tbs_map = {} + found_node = False + sections = [i for i in self.indices if i.depth == 1] + for i in xrange(self.number_of_text_records): + offset = i * RECORD_SIZE + next_offset = offset + RECORD_SIZE + data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]), + ('spans', None), ('offset', 
offset)]) + for index in self.indices: + if index.offset >= next_offset: + # Node starts after current record + break + if index.next_offset <= offset: + # Node ends before current record + continue + if index.offset >= offset: + # Node starts in current record + if index.next_offset <= next_offset: + # Node ends in current record + data['completes'].append(index) + else: + data['starts'].append(index) + else: + # Node starts before current record + if index.next_offset <= next_offset: + # Node ends in current record + data['ends'].append(index) + else: + data['spans'] = index + if (data['ends'] or data['completes'] or data['starts'] or + data['spans'] is not None): + self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not + found_node, all_sections=sections) + found_node = True + else: + self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False) - # Ident - buf.write(b'INDX') + def get_trailing_byte_sequence(self, num): + return self.tbs_map[num].bytestring + # }}} - # Header length - buf.write(pack(b'>I', 192)) +# }}} - # Index type: 0 - normal, 2 - inflection - buf.write(pack(b'>I', 2)) diff --git a/src/calibre/ebooks/mobi/writer2/main.py index ea67007168..06572f48c4 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -93,6 +93,15 @@ class MobiWriter(object): self.opts, self.oeb) except: self.log.exception('Failed to generate MOBI index:') + else: + self.primary_index_record_idx = len(self.records) + for i in xrange(len(self.records)): + if i == 0: continue + tbs = self.indexer.get_trailing_byte_sequence(i) + self.records[i] += tbs + self.records.extend(self.indexer.records) + + # }}} def write_uncrossable_breaks(self): # {{{ diff --git a/src/calibre/gui2/store/__init__.py b/src/calibre/gui2/store/__init__.py index d58ccbda84..ae42d82032 100644 --- a/src/calibre/gui2/store/__init__.py +++ b/src/calibre/gui2/store/__init__.py @@ -6,6 +6,8 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember <john@nachtimwald.com>' __docformat__ = 'restructuredtext en' +from calibre.utils.filenames import ascii_filename + class StorePlugin(object): # {{{ ''' A plugin representing an online ebook repository (store). The store can @@ -43,7 +45,7 @@ class StorePlugin(object): # {{{ The easiest way to handle affiliate money payouts is to randomly select between the author's affiliate id and calibre's affiliate id so that 70% of the time the author's id is used. - + See declined.txt for a list of stores that do not want to be included. 
''' @@ -53,7 +55,7 @@ class StorePlugin(object): # {{{ self.gui = gui self.name = name self.base_plugin = None - self.config = JSONConfig('store/stores/' + self.name) + self.config = JSONConfig('store/stores/' + ascii_filename(self.name)) def open(self, gui, parent=None, detail_item=None, external=False): ''' diff --git a/src/calibre/gui2/store/stores/chitanka_plugin.py b/src/calibre/gui2/store/stores/chitanka_plugin.py index 6f84acba33..3e4364d9fa 100644 --- a/src/calibre/gui2/store/stores/chitanka_plugin.py +++ b/src/calibre/gui2/store/stores/chitanka_plugin.py @@ -54,36 +54,21 @@ class ChitankaStore(BasicStoreConfig, StorePlugin): if counter <= 0: break - id = ''.join(data.xpath('.//a[@class="booklink"]/@href')) + id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip() if not id: continue - cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')) - title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')) - author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')) - fb2 = ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')) - epub = ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')) - txt = ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')) - - # remove .zip extensions - if fb2.find('.zip') != -1: - fb2 = fb2[:fb2.find('.zip')] - if epub.find('.zip') != -1: - epub = epub[:epub.find('.zip')] - if txt.find('.zip') != -1: - txt = txt[:txt.find('.zip')] - counter -= 1 s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() + s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip() + s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip() + s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip() + s.detail_item = id s.drm = SearchResult.DRM_UNLOCKED - s.downloads['FB2'] = base_url + fb2.strip() - s.downloads['EPUB'] = base_url + epub.strip() - s.downloads['TXT'] = base_url + txt.strip() + s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '') + s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '') + s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '') s.formats = 'FB2, EPUB, TXT, SFB' yield s @@ -105,35 +90,20 @@ class ChitankaStore(BasicStoreConfig, StorePlugin): if counter <= 0: break - id = ''.join(data.xpath('.//a[@class="booklink"]/@href')) + id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip() if not id: continue - cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')) - title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')) - author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')) - fb2 = ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')) - epub = ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')) - txt = ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')) - - # remove .zip extensions - if fb2.find('.zip') != -1: - fb2 = fb2[:fb2.find('.zip')] - if epub.find('.zip') != -1: - epub = epub[:epub.find('.zip')] - if txt.find('.zip') != -1: - txt = txt[:txt.find('.zip')] - counter -= 1 s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() + s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip() + s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip() + s.author = 
''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip() + s.detail_item = id s.drm = SearchResult.DRM_UNLOCKED - s.downloads['FB2'] = base_url + fb2.strip() - s.downloads['EPUB'] = base_url + epub.strip() - s.downloads['TXT'] = base_url + txt.strip() + s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '') + s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '') + s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '') s.formats = 'FB2, EPUB, TXT, SFB' yield s diff --git a/src/calibre/gui2/store/stores/eknigi_plugin.py b/src/calibre/gui2/store/stores/eknigi_plugin.py new file mode 100644 index 0000000000..b2f5f170b6 --- /dev/null +++ b/src/calibre/gui2/store/stores/eknigi_plugin.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +from __future__ import (unicode_literals, division, absolute_import, print_function) + +__license__ = 'GPL 3' +__copyright__ = '2011, Alex Stanev ' +__docformat__ = 'restructuredtext en' + +import random +import urllib2 +from contextlib import closing + +from lxml import html + +from PyQt4.Qt import QUrl + +from calibre import browser, url_slash_cleaner +from calibre.gui2 import open_url +from calibre.gui2.store import StorePlugin +from calibre.gui2.store.basic_config import BasicStoreConfig +from calibre.gui2.store.search_result import SearchResult +from calibre.gui2.store.web_store_dialog import WebStoreDialog + +class eKnigiStore(BasicStoreConfig, StorePlugin): + + def open(self, parent=None, detail_item=None, external=False): + # Use Kovid's affiliate id 30% of the time + if random.randint(1, 10) in (1, 2, 3): + aff_suffix = '&amigosid=23' + else: + aff_suffix = '&amigosid=22' + url = 'http://e-knigi.net/?' 
+ aff_suffix[1:] + + if external or self.config.get('open_external', False): + if detail_item: + url = detail_item + aff_suffix + open_url(QUrl(url_slash_cleaner(url))) + else: + detail_url = None + if detail_item: + url = detail_item + aff_suffix + d = WebStoreDialog(self.gui, url, parent, detail_url) + d.setWindowTitle(self.name) + d.set_tags(self.config.get('tags', '')) + d.exec_() + + def search(self, query, max_results=10, timeout=60): + base_url = 'http://e-knigi.net' + url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&limitstart=0&limit=' + str(max_results) + '&keyword=' + urllib2.quote(query) + + br = browser() + + counter = max_results + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read()) + + # if the store finds only one product, it opens directly detail view + for data in doc.xpath('//div[@class="prod_details"]'): + s = SearchResult() + s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip() + s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip() + s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip() + s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip() + s.detail_item = url + s.drm = SearchResult.DRM_UNLOCKED + + yield s + return + + # search in store results + for data in doc.xpath('//div[@class="browseProductContainer"]'): + if counter <= 0: + break + id = ''.join(data.xpath('.//a[1]/@href')).strip() + if not id: + continue + + counter -= 1 + + s = SearchResult() + s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip() + s.title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip() + s.author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '') + s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip() + s.detail_item = base_url + id + s.drm = SearchResult.DRM_UNLOCKED + + yield s diff --git a/src/calibre/gui2/update.py b/src/calibre/gui2/update.py index f76d4b8e65..caa1d3f3dc 100644 --- a/src/calibre/gui2/update.py +++ b/src/calibre/gui2/update.py @@ -15,6 +15,7 @@ from calibre.gui2 import config, dynamic, open_url from calibre.gui2.dialogs.plugin_updater import get_plugin_updates_available URL = 'http://status.calibre-ebook.com/latest' +#URL = 'http://localhost:8000/latest' NO_CALIBRE_UPDATE = '-0.0.0' VSEP = '|' diff --git a/src/calibre/library/sqlite.py b/src/calibre/library/sqlite.py index a2a85806f5..b5917f1a55 100644 --- a/src/calibre/library/sqlite.py +++ b/src/calibre/library/sqlite.py @@ -17,7 +17,7 @@ from datetime import datetime from functools import partial from calibre.ebooks.metadata import title_sort, author_to_author_sort -from calibre.utils.date import parse_date, isoformat, local_tz +from calibre.utils.date import parse_date, isoformat, local_tz, UNDEFINED_DATE from calibre import isbytestring, force_unicode from calibre.constants import iswindows, DEBUG, plugins from calibre.utils.icu import strcmp @@ -39,8 +39,11 @@ def _c_convert_timestamp(val): if ret is None: return parse_date(val, as_utc=False) year, month, day, hour, minutes, seconds, tzsecs = ret - return datetime(year, month, day, hour, minutes, seconds, + try: + return datetime(year, month, day, hour, minutes, seconds, tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz) + except 
OverflowError: + return UNDEFINED_DATE.astimezone(local_tz) def _py_convert_timestamp(val): if val: diff --git a/src/calibre/utils/ipc/job.py b/src/calibre/utils/ipc/job.py index f4b54aee95..e75884d387 100644 --- a/src/calibre/utils/ipc/job.py +++ b/src/calibre/utils/ipc/job.py @@ -141,7 +141,8 @@ class BaseJob(object): def log_file(self): if self.log_path: return open(self.log_path, 'rb') - return cStringIO.StringIO(_('No details available.')) + return cStringIO.StringIO(_('No details available.').encode('utf-8', + 'replace')) @property def details(self):
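
A note on the fvwi encoding that the mobi/utils.py and mobi/debug.py changes above revolve around. The sketch below is standalone and illustrative only: encint/decint are re-implemented here under the assumption of the standard forward-encoded MOBI vwi (seven payload bits per byte, most significant bits first, high bit set on the final byte), while the flag handling mirrors the decode_tbs helper the patch adds to src/calibre/ebooks/mobi/utils.py::

    def encint(value):
        # Forward-encoded MOBI vwi: 7 payload bits per byte, most
        # significant bits first; the final byte has its high bit set.
        byts = bytearray()
        while True:
            byts.append(value & 0b1111111)
            value >>= 7
            if not value:
                break
        byts.reverse()
        byts[-1] |= 0b10000000
        return bytes(byts)

    def decint(raw):
        # Inverse of encint. Returns (value, number of bytes consumed).
        val = 0
        for i, b in enumerate(bytearray(raw)):
            val = (val << 7) | (b & 0b1111111)
            if b & 0b10000000:
                return val, i + 1
        raise ValueError('Truncated vwi')

    def decode_fvwi(raw):
        # An fvwi is a vwi whose lowest four bits carry flags.
        arg, consumed = decint(raw)
        return arg >> 4, arg & 0b1111, consumed

    def decode_tbs(raw):
        # Flag semantics as in the patched decode_tbs: 0b1000 is the
        # continuation marker, 0b0010 and 0b0001 are each followed by a
        # vwi, and 0b0100 is followed by a single byte (a count).
        val, flags, consumed = decode_fvwi(raw)
        raw, extra = raw[consumed:], {}
        if flags & 0b1000:
            extra[0b1000] = True
        if flags & 0b0010:
            x, c = decint(raw)
            raw, consumed = raw[c:], consumed + c
            extra[0b0010] = x
        if flags & 0b0100:
            extra[0b0100] = bytearray(raw[:1])[0]
            raw, consumed = raw[1:], consumed + 1
        if flags & 0b0001:
            x, c = decint(raw)
            consumed += c
            extra[0b0001] = x
        return val, extra, consumed

    # Round trip a value through the vwi layer:
    assert decint(encint(0x1234)) == (0x1234, 2)

    # The starting records documented in tbs_periodicals.rst open with
    # the bytes 86 80 2:
    val, extra, consumed = decode_tbs(b'\x86\x80\x02')
    assert (val, consumed) == (0, 3)
    assert extra == {0b0010: 0, 0b0100: 2}

Decoding 86 80 2 this way yields an outermost index of 0 with flags {0b0010: 0, 0b0100: 2}, which is exactly the type_110 sequence the new TBS class builds with encode_tbs(0, {0b0100: 2, 0b0010: 0}) for a record that opens with section 1.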
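
Behind Indexer.calculate_trailing_byte_sequences (and the data dict handed to the TBS class) is a four-way classification of every index node against every text record. Condensed into a standalone sketch, with an illustrative function name and signature that are not part of the patch::

    RECORD_SIZE = 4096  # the text record size used by writer2

    def classify(node_start, node_end, rec_start, rec_end):
        # Same offset comparisons as calculate_trailing_byte_sequences:
        # a node either misses the record entirely, lies wholly inside
        # it, starts in it, ends in it, or spans it.
        if node_start >= rec_end or node_end <= rec_start:
            return None  # no overlap with this record
        if node_start >= rec_start:
            return 'completes' if node_end <= rec_end else 'starts'
        return 'ends' if node_end <= rec_end else 'spans'

    # Entries from the first worked example in tbs_periodicals.rst: the
    # periodical node (offset 215, size 35372) starts in record 0 and
    # the first article (offset 564, size 375) completes in it.
    assert classify(215, 215 + 35372, 0, RECORD_SIZE) == 'starts'
    assert classify(564, 564 + 375, 0, RECORD_SIZE) == 'completes'

Records that contain no node boundary at all get an empty trailing byte sequence, which the patch produces by passing an empty dict to TBS so that it emits encode_trailing_data(b'').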
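
The TAGX table embedded by the new create_header describes which tags each index entry may carry. A standalone restatement of the IndexEntry.tagx_block classmethod from the patch, with only the struct import added here::

    import struct

    def tagx_block(for_periodical=True):
        # One four-byte entry per tag (tag, number of values, bitmask,
        # end-of-entry flag), then a terminating entry, as in the patch.
        BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23]
        buf = bytearray()

        def add_tag(tag, num_values=1):
            buf.append(tag)
            buf.append(num_values)
            buf.append(1 << BITMASKS.index(tag))  # bitmask
            buf.append(0)                         # end-of-entry flag

        for tag in range(1, 5):
            add_tag(tag)
        if for_periodical:
            for tag in (5, 21, 22, 23):
                add_tag(tag)
        buf.extend(b'\x00\x00\x00\x01')           # end-of-record entry

        header = b'TAGX'
        header += struct.pack(b'>I', len(buf))    # table length
        header += struct.pack(b'>I', 1)           # control byte count
        return header + bytes(buf)

    blk = tagx_block(True)
    # 12 header bytes, eight 4-byte tag entries, one terminating entry:
    assert blk[:4] == b'TAGX' and len(blk) == 12 + 8*4 + 4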
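
Two small helpers from mobi/utils.py that create_header leans on, restated with worked values; the bodies match the patch, with only the hex handling rewritten so the snippet runs on Python 2 or 3::

    def encode_number_as_hex(num):
        # A length byte followed by the upper-case hexadecimal digits
        # of the number, as in the patched encode_number_as_hex.
        digits = hex(num)[2:].upper().encode('ascii')
        return bytes(bytearray([len(digits)])) + digits

    def align_block(raw, multiple=4, pad=b'\0'):
        # Pad raw with enough pad bytes to reach a multiple of
        # `multiple` bytes, as in the patched align_block.
        extra = len(raw) % multiple
        return raw if extra == 0 else raw + pad * (multiple - extra)

    # The last-entry marker for a 32-entry NCX (last index is 31, 0x1F):
    assert encode_number_as_hex(31) == b'\x021F'
    # Index records are padded out to four-byte boundaries:
    assert align_block(b'\x01\x02\x03\x04\x05') == b'\x01\x02\x03\x04\x05\x00\x00\x00'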
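
The identical OverflowError guards added to src/calibre/db/tables.py and src/calibre/library/sqlite.py exist because a stored timestamp near the edge of the representable datetime range cannot survive a timezone conversion. A minimal reproduction of the failure they catch, assuming python-dateutil is available (calibre already depends on it)::

    from datetime import datetime
    from dateutil.tz import tzoffset

    # 0001-01-01 00:00 at UTC+1 is 0000-12-31 23:00 in UTC, which
    # datetime cannot represent, so astimezone() overflows. The patch
    # catches this and substitutes UNDEFINED_DATE instead of crashing.
    try:
        datetime(1, 1, 1, tzinfo=tzoffset(None, 3600)).astimezone(
            tzoffset(None, 0))
        raise AssertionError('expected an OverflowError')
    except OverflowError:
        pass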