Merge from trunk
This commit is contained in:
commit 9cbda53ab2

@@ -1,39 +1,34 @@
# -*- coding: utf-8 -*-

__license__ = 'GPLv3'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1255797795(BasicNewsRecipe):
    title = u'Corren'
    language = 'sv'
    __author__ = 'Jonas Svensson'
    simultaneous_downloads = 1
    no_stylesheets = True
    oldest_article = 7
class AdvancedUserRecipe1311446032(BasicNewsRecipe):
    title = 'Corren'
    __author__ = 'Jonas Svensson'
    description = 'News from Sweden'
    publisher = 'Corren'
    category = 'news, politics, Sweden'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    remove_attributes = ['onload']
    timefmt = ''
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'iso-8859-1'
    language = 'sv'

    feeds = [
        (u'Toppnyheter (alla kategorier)', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/'),
        (u'Bostad', u'http://www.corren.se/inc/RssHandler.ashx?id=4122174&ripurl=http://www.corren.se/bostad/'),
        (u'Ekonomi & Jobb', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/'),
        (u'Kultur & Nöje', u'http://www.corren.se/inc/RssHandler.ashx?id=4122192&ripurl=http://www.corren.se/kultur/'),
        (u'Mat & dryck', u'http://www.corren.se/inc/RssHandler.ashx?id=4122201&ripurl=http://www.corren.se/mat-dryck/'),
        (u'Motor', u'http://www.corren.se/inc/RssHandler.ashx?id=4122203&ripurl=http://www.corren.se/motor/'),
        (u'Sport', u'http://www.corren.se/inc/RssHandler.ashx?id=4122206&ripurl=http://www.corren.se/sport/'),
        (u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223&ripurl=http://www.corren.se/asikter/'),
        (u'Mjölby', u'http://www.corren.se/inc/RssHandler.ashx?id=4122235&ripurl=http://www.corren.se/ostergotland/mjolby/'),
        (u'Motala', u'http://www.corren.se/inc/RssHandler.ashx?id=4122236&ripurl=http://www.corren.se/ostergotland/motala/')
    ]

    def print_version(self, url):
        url = url.replace("ekonomi/artikel.aspx", "Print.aspx")
        url = url.replace("bostad/artikel.aspx", "Print.aspx")
        url = url.replace("kultur/artikel.aspx", "Print.aspx")
        url = url.replace("motor/artikel.aspx", "Print.aspx")
        url = url.replace("mat-dryck/artikel.aspx", "Print.aspx")
        url = url.replace("sport/artikel.aspx", "Print.aspx")
        url = url.replace("asikter/artikel.aspx", "Print.aspx")
        url = url.replace("mat-dryck/artikel.aspx", "Print.aspx")
        url = url.replace("ostergotland/mjolby/artikel.aspx", "Print.aspx")
        url = url.replace("ostergotland/motala/artikel.aspx", "Print.aspx")
        return url.replace("nyheter/artikel.aspx", "Print.aspx")

    feeds = [
        (u'Toppnyheter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122151&ripurl=http://www.corren.se/nyheter/')
        ,(u'Ekonomi', u'http://www.corren.se/inc/RssHandler.ashx?id=4122176&ripurl=http://www.corren.se/ekonomi/')
        ,(u'Link\xf6ping', u'http://www.corren.se/inc/RssHandler.ashx?id=4122234')
        ,(u'Åsikter', u'http://www.corren.se/inc/RssHandler.ashx?id=4122223,4122224,4122226,4122227,4122228,4122229,4122230')
    ]

    keep_only_tags = [dict(name='div', attrs={'id':'article'}),dict(name='div', attrs={'class':'body'})]
    remove_tags = [
        dict(name='ul',attrs={'class':'functions'})
        ,dict(name='a',attrs={'href':'javascript*'})
        ,dict(name='div',attrs={'class':'box'})
        ,dict(name='div',attrs={'class':'functionsbottom'})
    ]
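The print_version hook above only rewrites section URLs to the site's print view. A minimal sketch of the effect, using a made-up article URL purely for illustration:

    url = 'http://www.corren.se/sport/artikel.aspx?articleid=123'
    print(url.replace("sport/artikel.aspx", "Print.aspx"))
    # -> http://www.corren.se/Print.aspx?articleid=123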
32  recipes/dagens_industri.recipe  Normal file
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-

__license__ = 'GPLv3'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1311450855(BasicNewsRecipe):
    title = u'Dagens Industri'
    __author__ = 'Jonas Svensson'
    description = 'Economy news from Sweden'
    publisher = 'DI'
    category = 'news, politics, Sweden'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'sv'

    feeds = [(u'DI', u'http://di.se/rss')]

    keep_only_tags = [dict(name='h1', attrs={'id':'ctl00_ExtraWideContentRegion_WideContentRegion_MainRegion_MainContentRegion_MainBodyRegion_headlineNormal'}),dict(name='div', attrs={'id':'articleBody'})]

    remove_tags = [
        dict(name='div',attrs={'class':'article-actions clear'})
        ,dict(name='div',attrs={'class':'article-action-popup'})
        ,dict(name='div',attrs={'class':'header'})
        ,dict(name='div',attrs={'class':'content clear'})
        ,dict(name='div',attrs={'id':'articleAdvertisementDiv'})
        ,dict(name='ul',attrs={'class':'action-list'})
    ]
@@ -12,7 +12,7 @@ from datetime import date

class Guardian(BasicNewsRecipe):

    title = u'The Guardian / The Observer'
    title = u'The Guardian and The Observer'
    if date.today().weekday() == 6:
        base_url = "http://www.guardian.co.uk/theobserver"
    else:
@@ -28,7 +28,7 @@ class Guardian(BasicNewsRecipe):
    # List of section titles to ignore
    # For example: ['Sport']
    ignore_sections = []

    timefmt = ' [%a, %d %b %Y]'
    keep_only_tags = [
        dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
@@ -94,7 +94,7 @@ class Guardian(BasicNewsRecipe):
                prefix = section_title + ': '
                for subsection in s.parent.findAll('a', attrs={'class':'book-section'}):
                    yield (prefix + self.tag_to_string(subsection), subsection['href'])

    def find_articles(self, url):
        soup = self.index_to_soup(url)
        div = soup.find('div', attrs={'class':'book-index'})
@@ -115,7 +115,7 @@ class Guardian(BasicNewsRecipe):
                'title': title, 'url':url, 'description':desc,
                'date' : strftime('%a, %d %b'),
            }

    def parse_index(self):
        try:
            feeds = []
@@ -43,7 +43,7 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll('div', attrs={'class':'cornerControls'}):
@@ -63,3 +63,8 @@ class AdvancedUserRecipe1299694372(BasicNewsRecipe):
    def populate_article_metadata(self, article, soup, first):
        article.title = soup.find('title').contents[0].strip()

    def postprocess_html(self, soup, first_fetch):
        for link_tag in soup.findAll(attrs={"id" : "story"}):
            link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')

        return soup
@@ -1258,6 +1258,16 @@ class StoreEHarlequinStore(StoreBase):
    formats = ['EPUB', 'PDF']
    affiliate = True

class StoreEKnigiStore(StoreBase):
    name = u'еКниги'
    author = 'Alex Stanev'
    description = u'Онлайн книжарница за електронни книги и аудио риалити романи'
    actual_plugin = 'calibre.gui2.store.stores.eknigi_plugin:eKnigiStore'

    headquarters = 'BG'
    formats = ['EPUB', 'PDF', 'HTML']
    #affiliate = True

class StoreEpubBudStore(StoreBase):
    name = 'ePub Bud'
    description = 'Well, it\'s pretty much just "YouTube for Children\'s eBooks. A not-for-profit organization devoted to brining self published childrens books to the world.'
@@ -1483,6 +1493,7 @@ plugins += [
    StoreEBookShoppeUKStore,
    # StoreEPubBuyDEStore,
    StoreEHarlequinStore,
    StoreEKnigiStore,
    StoreEpubBudStore,
    StoreFeedbooksStore,
    StoreFoylesUKStore,
@@ -12,7 +12,7 @@ from datetime import datetime
from dateutil.tz import tzoffset

from calibre.constants import plugins
from calibre.utils.date import parse_date, local_tz
from calibre.utils.date import parse_date, local_tz, UNDEFINED_DATE
from calibre.ebooks.metadata import author_to_author_sort

_c_speedup = plugins['speedup'][0]
@@ -29,8 +29,11 @@ def _c_convert_timestamp(val):
    if ret is None:
        return parse_date(val, as_utc=False)
    year, month, day, hour, minutes, seconds, tzsecs = ret
    return datetime(year, month, day, hour, minutes, seconds,
    try:
        return datetime(year, month, day, hour, minutes, seconds,
            tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
    except OverflowError:
        return UNDEFINED_DATE.astimezone(local_tz)

class Table(object):
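The try/except added above guards against stored timestamps (for example year-1 dates combined with a positive UTC offset) that overflow when shifted into the local timezone. A minimal standalone sketch of the same guard; the helper name is hypothetical, while local_tz and UNDEFINED_DATE are the calibre.utils.date names already imported above:

    from datetime import datetime
    from dateutil.tz import tzoffset
    from calibre.utils.date import local_tz, UNDEFINED_DATE

    def to_local_or_undefined(year, month, day, hour, minutes, seconds, tzsecs):
        # Illustrative only: fall back to the sentinel date instead of crashing.
        try:
            return datetime(year, month, day, hour, minutes, seconds,
                            tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
        except OverflowError:
            # e.g. 0001-01-01 00:00 with a positive offset underflows datetime.min
            return UNDEFINED_DATE.astimezone(local_tz)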
@@ -151,7 +151,7 @@ class ISBNDB(Source):

        bl = feed.find('BookList')
        if bl is None:
            err = tostring(etree.find('errormessage'))
            err = tostring(feed.find('errormessage'))
            raise ValueError('ISBNDb query failed:' + err)
        total_results = int(bl.get('total_results'))
        shown_results = int(bl.get('shown_results'))
@@ -12,7 +12,7 @@ from collections import OrderedDict, defaultdict
from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
        get_trailing_data)
        get_trailing_data, decode_tbs)
from calibre.utils.magick.draw import identify_data

# PalmDB {{{
@@ -399,6 +399,7 @@ class IndexHeader(object): # {{{
    def __init__(self, record):
        self.record = record
        raw = self.record.raw
        #open('/t/index_header.bin', 'wb').write(raw)
        if raw[:4] != b'INDX':
            raise ValueError('Invalid Primary Index Record')
@ -948,22 +949,25 @@ class TBSIndexing(object): # {{{
|
||||
ans.append(('\t\tIndex Entry: %d (Parent index: %d, '
|
||||
'Depth: %d, Offset: %d, Size: %d) [%s]')%(
|
||||
x.index, x.parent_index, x.depth, x.offset, x.size, x.label))
|
||||
def bin3(num):
|
||||
def bin4(num):
|
||||
ans = bin(num)[2:]
|
||||
return '0'*(3-len(ans)) + ans
|
||||
return bytes('0'*(4-len(ans)) + ans)
|
||||
|
||||
def repr_extra(x):
|
||||
return str({bin4(k):v for k, v in extra.iteritems()})
|
||||
|
||||
tbs_type = 0
|
||||
if len(byts):
|
||||
outer, consumed = decint(byts)
|
||||
outermost_index, extra, consumed = decode_tbs(byts)
|
||||
byts = byts[consumed:]
|
||||
tbs_type = outer & 0b111
|
||||
ans.append('TBS Type: %s (%d)'%(bin3(tbs_type), tbs_type))
|
||||
ans.append('Outer Index entry: %d'%(outer >> 3))
|
||||
arg1, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('Unknown (vwi: always 0?): %d'%arg1)
|
||||
for k in extra:
|
||||
tbs_type |= k
|
||||
ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
|
||||
ans.append('Outermost index: %d'%outermost_index)
|
||||
ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
|
||||
if self.doc_type in (257, 259): # Hierarchical periodical
|
||||
byts, a = self.interpret_periodical(tbs_type, byts)
|
||||
byts, a = self.interpret_periodical(tbs_type, byts,
|
||||
dat['geom'][0])
|
||||
ans += a
|
||||
if byts:
|
||||
sbyts = tuple(hex(b)[2:] for b in byts)
|
||||
@ -972,159 +976,87 @@ class TBSIndexing(object): # {{{
|
||||
ans.append('')
|
||||
return tbs_type, ans
|
||||
|
||||
def interpret_periodical(self, tbs_type, byts):
|
||||
def interpret_periodical(self, tbs_type, byts, record_offset):
|
||||
ans = []
|
||||
|
||||
def tbs_type_6(byts, psi=None, msg=None, fmsg='Unknown'): # {{{
|
||||
def read_section_transitions(byts, psi=None): # {{{
|
||||
if psi is None:
|
||||
# Assume parent section is 1
|
||||
# Assume previous section is 1
|
||||
psi = self.get_index(1)
|
||||
if msg is None:
|
||||
msg = ('Article index at start of record or first article'
|
||||
' index, relative to parent section')
|
||||
if byts:
|
||||
# byts could be empty
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
flags = (arg & 0b1111)
|
||||
ai = (arg >> 4)
|
||||
ans.append('%s (fvwi): %d [%d absolute]'%(msg, ai,
|
||||
ai+psi.index))
|
||||
if flags == 1:
|
||||
arg, consumed = decint(byts)
|
||||
if arg == 0:
|
||||
# EOF of record, otherwise ignore and hope someone else
|
||||
# will deal with these bytes
|
||||
byts = byts[consumed:]
|
||||
ans.append('EOF (vwi: should be 0): %d'%arg)
|
||||
elif flags in (4, 5):
|
||||
num = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Number of article nodes in the record (byte): %d'%num)
|
||||
if flags == 5:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('%s (vwi)): %d'%(fmsg, arg))
|
||||
elif flags == 0:
|
||||
pass
|
||||
else:
|
||||
raise ValueError('Unknown flags: %d'%flags)
|
||||
return byts
|
||||
|
||||
# }}}
|
||||
|
||||
if tbs_type == 3: # {{{
|
||||
arg2, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('Unknown (vwi: always 0?): %d'%arg2)
|
||||
|
||||
arg3, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
fsi = arg3 >> 4
|
||||
flags = arg3 & 0b1111
|
||||
ans.append('First section index (fvwi): %d'%fsi)
|
||||
psi = self.get_index(fsi)
|
||||
ans.append('Flags (flag: always 0?): %d'%flags)
|
||||
if flags == 4:
|
||||
ans.append('Number of articles in this section: %d'%byts[0])
|
||||
byts = byts[1:]
|
||||
elif flags == 0:
|
||||
pass
|
||||
else:
|
||||
raise ValueError('Unknown flags value: %d'%flags)
|
||||
|
||||
|
||||
if byts:
|
||||
byts = tbs_type_6(byts, psi=psi,
|
||||
msg=('First article of ending section, relative to its'
|
||||
' parent\'s index'),
|
||||
fmsg=('->Offset from start of record to beginning of'
|
||||
' last starting section'))
|
||||
while byts:
|
||||
# We have a transition not just an opening first section
|
||||
psi = self.get_index(psi.index+1)
|
||||
arg, consumed = decint(byts)
|
||||
off = arg >> 4
|
||||
ai, extra, consumed = decode_tbs(byts)
|
||||
byts = byts[consumed:]
|
||||
flags = arg & 0b1111
|
||||
ans.append('Last article of ending section w.r.t. starting'
|
||||
' section offset (fvwi): %d [%d absolute]'%(off,
|
||||
psi.index+off))
|
||||
ans.append('Flags (always 8?): %d'%flags)
|
||||
byts = tbs_type_6(byts, psi=psi)
|
||||
if byts:
|
||||
# Ended with flag 1,and not EOF, which means there's
|
||||
# another section transition in this record
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('->Offset from start of record to beginning of '
|
||||
'last starting section: %d'%(arg))
|
||||
if extra.get(0b0010, None) is not None:
|
||||
raise ValueError('Dont know how to interpret flag 0b0010'
|
||||
' while reading section transitions')
|
||||
if extra.get(0b1000, None) is not None:
|
||||
if len(extra) > 1:
|
||||
raise ValueError('Dont know how to interpret flags'
|
||||
' %r while reading section transitions'%extra)
|
||||
nsi = self.get_index(psi.index+1)
|
||||
ans.append('Last article in this record of section %d'
|
||||
' (relative to next section index [%d]): '
|
||||
'%d [%d absolute index]'%(psi.index, nsi.index, ai,
|
||||
ai+nsi.index))
|
||||
psi = nsi
|
||||
continue
|
||||
|
||||
ans.append('First article in this record of section %d'
|
||||
' (relative to its parent section): '
|
||||
'%d [%d absolute index]'%(psi.index, ai, ai+psi.index))
|
||||
|
||||
num = extra.get(0b0100, None)
|
||||
if num is None:
|
||||
msg = ('The section %d has at most one article'
|
||||
' in this record')%psi.index
|
||||
else:
|
||||
break
|
||||
msg = ('Number of articles in this record of '
|
||||
'section %d: %d')%(psi.index, num)
|
||||
ans.append(msg)
|
||||
|
||||
# }}}
|
||||
offset = extra.get(0b0001, None)
|
||||
if offset is not None:
|
||||
if offset == 0:
|
||||
ans.append('This record is spanned by the article:'
|
||||
'%d'%(ai+psi.index))
|
||||
else:
|
||||
ans.append('->Offset to start of next section (%d) from start'
|
||||
' of record: %d [%d absolute offset]'%(psi.index+1,
|
||||
offset, offset+record_offset))
|
||||
return byts
|
||||
# }}}
|
||||
|
||||
elif tbs_type == 7: # {{{
|
||||
# This occurs for records that have no section nodes and
|
||||
# whose parent section's index == 1
|
||||
ans.append('Unknown (maybe vwi?): %r'%bytes(byts[:2]))
|
||||
byts = byts[2:]
|
||||
arg, consumed = decint(byts)
|
||||
def read_starting_section(byts): # {{{
|
||||
si, extra, consumed = decode_tbs(byts)
|
||||
byts = byts[consumed:]
|
||||
ai = arg >> 4
|
||||
flags = arg & 0b1111
|
||||
ans.append('Article at start of record (fvwi): %d'%ai)
|
||||
if flags == 4:
|
||||
num = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Number of articles in record (byte): %d'%num)
|
||||
elif flags == 0:
|
||||
pass
|
||||
elif flags == 1:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('EOF (vwi: should be 0): %d'%arg)
|
||||
else:
|
||||
raise ValueError('Unknown flags value: %d'%flags)
|
||||
if len(extra) > 1 or 0b0010 in extra or 0b1000 in extra:
|
||||
raise ValueError('Dont know how to interpret flags %r'
|
||||
' when reading starting section'%extra)
|
||||
si = self.get_index(si)
|
||||
ans.append('The section at the start of this record is:'
|
||||
' %d'%si.index)
|
||||
if 0b0100 in extra:
|
||||
num = extra[0b0100]
|
||||
ans.append('The number of articles from the section %d'
|
||||
' in this record: %d'%(si.index, num))
|
||||
elif 0b0001 in extra:
|
||||
eof = extra[0b0001]
|
||||
if eof != 0:
|
||||
raise ValueError('Unknown eof value %s when reading'
|
||||
' starting section'%eof)
|
||||
ans.append('This record is spanned by an article from'
|
||||
' the section: %d'%si.index)
|
||||
return si, byts
|
||||
# }}}
|
||||
|
||||
elif tbs_type == 6: # {{{
|
||||
# This is used for records spanned by an article whose parent
|
||||
# section's index == 1 or for the opening record if it contains the
|
||||
# periodical start, section 1 start and at least one article. The
|
||||
# two cases are distinguished by the flags on the article index
|
||||
# vwi.
|
||||
unk = byts[0]
|
||||
byts = byts[1:]
|
||||
ans.append('Unknown (byte: always 2?): %d'%unk)
|
||||
byts = tbs_type_6(byts)
|
||||
# }}}
|
||||
if tbs_type & 0b0100:
|
||||
# Starting section is the first section
|
||||
ssi = self.get_index(1)
|
||||
else:
|
||||
ssi, byts = read_starting_section(byts)
|
||||
|
||||
elif tbs_type == 2: # {{{
|
||||
# This occurs for records with no section nodes and whose parent
|
||||
# section's index != 1 (undefined (records before the first
|
||||
# section) or > 1)
|
||||
# This is also used for records that are spanned by an article
|
||||
# whose parent section index > 1. In this case the flags of the
|
||||
# vwi referring to the article at the start
|
||||
# of the record are set to 1 instead of 4.
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
flags = (arg & 0b1111)
|
||||
psi = (arg >> 4)
|
||||
ans.append('Parent section index (fvwi): %d'%psi)
|
||||
psi = self.get_index(psi)
|
||||
ans.append('Flags: %d'%flags)
|
||||
if flags == 1:
|
||||
arg, consumed = decint(byts)
|
||||
byts = byts[consumed:]
|
||||
ans.append('Unknown (vwi?: always 0?): %d'%arg)
|
||||
byts = tbs_type_6(byts, psi=psi)
|
||||
elif flags == 0:
|
||||
byts = tbs_type_6(byts, psi=psi)
|
||||
else:
|
||||
raise ValueError('Unkown flags: %d'%flags)
|
||||
# }}}
|
||||
byts = read_section_transitions(byts, ssi)
|
||||
|
||||
return byts, ans
|
||||
|
||||
|
@@ -3,6 +3,20 @@ Reverse engineering the trailing byte sequences for hierarchical periodicals

In the following, *vwi* means variable width integer and *fvwi* means a vwi whose lowest four bits are used as a flag. All the following information/inferences are from examining the output of kindlegen on a sample periodical. Given the general level of Amazon's incompetence, there are no guarantees that this information is the *best/most complete* way to do TBS indexing.

Sequence encoding:

0b1000 : Continuation bit

First sequences:
0b0010 : 80
0b0011 : 80 80
0b0110 : 80 2
0b0111 : 80 2 80

Other sequences:
0b0101 : 4 1a
0b0001 : c b1
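Tying the two definitions together, a small illustrative sketch (not calibre code) of how an fvwi couples a value with its 4-bit flag nibble; the byte-level vwi encoding itself is handled separately by encint/decint::

    def pack_fvwi(value, flags):
        # value occupies the high bits, the 4 flag bits sit in the low nibble
        return (value << 4) | (flags & 0b1111)

    def unpack_fvwi(packed):
        return packed >> 4, packed & 0b1111

    assert unpack_fvwi(pack_fvwi(5, 0b0100)) == (5, 0b0100)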
Opening record
|
||||
----------------
|
||||
|
||||
@ -52,10 +66,60 @@ The text record that contains the opening node for the periodical (depth=0 node
|
||||
|
||||
If there was only a single article, instead of 2, then the last two bytes would be: c0, i.e. there would be no byte giving the number of articles in the record.
|
||||
|
||||
Starting record with two section transitions::
|
||||
|
||||
Record #1: Starts at: 0 Ends at: 4095
|
||||
Contains: 7 index entries (0 ends, 4 complete, 3 starts)
|
||||
TBS bytes: 86 80 2 c0 b8 c4 3
|
||||
Complete:
|
||||
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
|
||||
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
|
||||
Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 1014) [Max and the Magic Marker for iPad: Review]
|
||||
Index Entry: 7 (Parent index: 2, Depth: 2, Offset: 1961, Size: 1077) [iPad 2 steers itself into home console gaming territory with Real Racing 2 HD]
|
||||
Starts:
|
||||
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 35372) [j_x's Google reader]
|
||||
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 10368) [Neowin.net]
|
||||
Index Entry: 8 (Parent index: 2, Depth: 2, Offset: 3038, Size: 1082) [Microsoft's Joe Belfiore still working on upcoming Zune hardware]
|
||||
TBS Type: 110 (6)
|
||||
Outer Index entry: 0
|
||||
Unknown (vwi: always 0?): 0
|
||||
Unknown (byte: always 2?): 2
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
|
||||
Remaining bytes: b8 c4 3
|
||||
|
||||
Starting record with three section transitions::
|
||||
|
||||
Record #1: Starts at: 0 Ends at: 4095
|
||||
Contains: 10 index entries (0 ends, 7 complete, 3 starts)
|
||||
TBS bytes: 86 80 2 c0 b8 c0 b8 c4 4
|
||||
Complete:
|
||||
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 564, Size: 375) [Ars Technica]
|
||||
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 939, Size: 316) [Neowin.net]
|
||||
Index Entry: 5 (Parent index: 1, Depth: 2, Offset: 572, Size: 367) [Week in gaming: 3DS review, Crysis 2, George Hotz]
|
||||
Index Entry: 6 (Parent index: 2, Depth: 2, Offset: 947, Size: 308) [Max and the Magic Marker for iPad: Review]
|
||||
Index Entry: 7 (Parent index: 3, Depth: 2, Offset: 1263, Size: 760) [OSnews Asks on Interrupts: The Results]
|
||||
Index Entry: 8 (Parent index: 3, Depth: 2, Offset: 2023, Size: 693) [Apple Ditches SAMBA in Favour of Homegrown Replacement]
|
||||
Index Entry: 9 (Parent index: 3, Depth: 2, Offset: 2716, Size: 747) [ITC: Apple's Mobile Products Do Not Violate Nokia Patents]
|
||||
Starts:
|
||||
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 25320) [j_x's Google reader]
|
||||
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 1255, Size: 6829) [OSNews]
|
||||
Index Entry: 10 (Parent index: 3, Depth: 2, Offset: 3463, Size: 666) [Transparent Monitor Embedded in Window Glass]
|
||||
TBS Type: 110 (6)
|
||||
Outer Index entry: 0
|
||||
Unknown (vwi: always 0?): 0
|
||||
Unknown (byte: always 2?): 2
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 4 [5 absolute]
|
||||
Remaining bytes: b8 c0 b8 c4 4
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Records with no nodes
|
||||
------------------------
|
||||
|
||||
subtype = 010
|
||||
|
||||
These records are spanned by a single article. They are of two types:
|
||||
|
||||
1. If the parent section index is 1, TBS type of 6, like this::
|
||||
@ -247,7 +311,7 @@ In such a record there is a transition from one section to the next. As such the
|
||||
Last article of ending section w.r.t. starting section offset (fvwi): 12 [15 absolute]
|
||||
Flags (always 8?): 8
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 13 [16 absolute]
|
||||
Number of article nodes in the record (byte): 4
|
||||
Number of article nodes in the record belonging ot the last section (byte): 4
|
||||
|
||||
|
||||
Ending record
|
||||
@ -274,3 +338,26 @@ Logically, ending records must have at least one article ending, one section end
|
||||
|
||||
If the record had only a single article end, the last two bytes would be replaced with: f0
|
||||
|
||||
If the last record has multiple section transitions, it is of type 6 and looks like::
|
||||
|
||||
Record #9: Starts at: 32768 Ends at: 34953
|
||||
Contains: 9 index entries (3 ends, 6 complete, 0 starts)
|
||||
TBS bytes: 86 80 2 1 d0 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
|
||||
Ends:
|
||||
Index Entry: 0 (Parent index: -1, Depth: 0, Offset: 215, Size: 34739) [j_x's Google reader]
|
||||
Index Entry: 1 (Parent index: 0, Depth: 1, Offset: 7758, Size: 26279) [Ars Technica]
|
||||
Index Entry: 14 (Parent index: 1, Depth: 2, Offset: 31929, Size: 2108) [Trademarked keyword sales may soon be restricted in Europe]
|
||||
Complete:
|
||||
Index Entry: 2 (Parent index: 0, Depth: 1, Offset: 34037, Size: 316) [Neowin.net]
|
||||
Index Entry: 3 (Parent index: 0, Depth: 1, Offset: 34353, Size: 282) [OSNews]
|
||||
Index Entry: 4 (Parent index: 0, Depth: 1, Offset: 34635, Size: 319) [Slashdot]
|
||||
Index Entry: 15 (Parent index: 2, Depth: 2, Offset: 34045, Size: 308) [Max and the Magic Marker for iPad: Review]
|
||||
Index Entry: 16 (Parent index: 3, Depth: 2, Offset: 34361, Size: 274) [OSnews Asks on Interrupts: The Results]
|
||||
Index Entry: 17 (Parent index: 4, Depth: 2, Offset: 34643, Size: 311) [Leonard Nimoy Turns 80]
|
||||
TBS Type: 110 (6)
|
||||
Outer Index entry: 0
|
||||
Unknown (vwi: always 0?): 0
|
||||
Unknown (byte: always 2?): 2
|
||||
Article index at start of record or first article index, relative to parent section (fvwi): 13 [14 absolute]
|
||||
Remaining bytes: 1 c8 1 d0 1 c8 1 d0 1 c8 1 d0
|
||||
|
||||
|
@@ -11,6 +11,7 @@ import struct
from collections import OrderedDict

from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.ebooks import normalize

IMAGE_MAX_SIZE = 10 * 1024 * 1024

@@ -39,7 +40,7 @@ def encode_number_as_hex(num):
    The bytes that follow are simply the hexadecimal representation of the
    number.
    '''
    num = bytes(hex(num)[2:])
    num = bytes(hex(num)[2:].upper())
    ans = bytearray(num)
    ans.insert(0, len(num))
    return bytes(ans)
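For reference, the length-prefixed upper-case hex layout described in the docstring above gives, for example:

    # hex(4095) -> 'fff' -> 'FFF', prefixed with its length (3)
    assert encode_number_as_hex(4095) == b'\x03FFF'
    assert encode_number_as_hex(1) == b'\x011'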
@@ -197,3 +198,96 @@ def encode_trailing_data(raw):
        lsize += 1
    return raw + encoded

def encode_fvwi(val, flags):
    '''
    Encode the value val and the 4 bit flags flags as a fvwi. This encoding is
    used in the trailing byte sequences for indexing. Returns encoded
    bytestring.
    '''
    ans = (val << 4) | (flags & 0b1111)
    return encint(ans)


def decode_fvwi(byts):
    '''
    Decode encoded fvwi. Returns number, flags, consumed
    '''
    arg, consumed = decint(bytes(byts))
    return (arg >> 4), (arg & 0b1111), consumed

def decode_tbs(byts):
    '''
    Trailing byte sequences for indexing consist of a series of fvwi numbers.
    This function reads the fvwi number and its associated flags. It then uses
    the flags to read any more numbers that belong to the series. The flags are
    the lowest 4 bits of the vwi (see the encode_fvwi function above).

    Returns the fvwi number, a dictionary mapping flag bits to the associated
    data and the number of bytes consumed.
    '''
    byts = bytes(byts)
    val, flags, consumed = decode_fvwi(byts)
    extra = {}
    byts = byts[consumed:]
    if flags & 0b1000:
        extra[0b1000] = True
    if flags & 0b0010:
        x, consumed2 = decint(byts)
        byts = byts[consumed2:]
        extra[0b0010] = x
        consumed += consumed2
    if flags & 0b0100:
        extra[0b0100] = ord(byts[0])
        byts = byts[1:]
        consumed += 1
    if flags & 0b0001:
        x, consumed2 = decint(byts)
        byts = byts[consumed2:]
        extra[0b0001] = x
        consumed += consumed2
    return val, extra, consumed

def encode_tbs(val, extra):
    '''
    Encode the number val and the extra data in the extra dict as an fvwi. See
    decode_tbs above.
    '''
    flags = 0
    for flag in extra:
        flags |= flag
    ans = encode_fvwi(val, flags)

    if 0b0010 in extra:
        ans += encint(extra[0b0010])
    if 0b0100 in extra:
        ans += bytes(bytearray([extra[0b0100]]))
    if 0b0001 in extra:
        ans += encint(extra[0b0001])
    return ans
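A quick round-trip sketch of the two helpers above (values chosen arbitrarily; the 0b1000 continuation flag carries no payload, so it is left out here):

    val, extra = 5, {0b0100: 3, 0b0001: 7}
    raw = encode_tbs(val, extra)
    decoded_val, decoded_extra, consumed = decode_tbs(raw)
    assert (decoded_val, decoded_extra, consumed) == (val, extra, len(raw))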
def utf8_text(text):
    '''
    Convert a possibly null string to utf-8 bytes, guaranteeing to return a
    non-empty, normalized bytestring.
    '''
    if text and text.strip():
        text = text.strip()
        if not isinstance(text, unicode):
            text = text.decode('utf-8', 'replace')
        text = normalize(text).encode('utf-8')
    else:
        text = _('Unknown').encode('utf-8')
    return text

def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough pad bytes appended to ensure its length is a
    multiple of 4.
    '''
    extra = len(raw) % multiple
    if extra == 0: return raw
    return raw + pad*(multiple - extra)
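A couple of concrete cases for the padding helper above:

    assert align_block(b'abcd') == b'abcd'                  # already a multiple of 4
    assert align_block(b'abcde') == b'abcde\x00\x00\x00'    # padded up to 8 bytes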
@ -10,34 +10,13 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
from struct import pack
|
||||
from cStringIO import StringIO
|
||||
from collections import OrderedDict
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
from calibre.ebooks import normalize
|
||||
from calibre.ebook.mobi.writer2 import RECORD_SIZE
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex)
|
||||
from calibre.ebooks.mobi.writer2 import RECORD_SIZE
|
||||
from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
|
||||
encode_trailing_data, encode_tbs, align_block, utf8_text)
|
||||
from calibre.ebooks.mobi.langcodes import iana2mobi
|
||||
|
||||
def utf8_text(text):
|
||||
'''
|
||||
Convert a possibly null string to utf-8 bytes, guaranteeing to return a non
|
||||
empty, normalized bytestring.
|
||||
'''
|
||||
if text and text.strip():
|
||||
text = text.strip()
|
||||
if not isinstance(text, unicode):
|
||||
text = text.decode('utf-8', 'replace')
|
||||
text = normalize(text).encode('utf-8')
|
||||
else:
|
||||
text = _('Unknown').encode('utf-8')
|
||||
return text
|
||||
|
||||
def align_block(raw, multiple=4, pad=b'\0'):
|
||||
'''
|
||||
Return raw with enough pad bytes append to ensure its length is a multiple
|
||||
of 4.
|
||||
'''
|
||||
extra = len(raw) % multiple
|
||||
if extra == 0: return raw
|
||||
return raw + pad*(multiple - extra)
|
||||
|
||||
class CNCX(object): # {{{
|
||||
|
||||
@ -85,7 +64,7 @@ class CNCX(object): # {{{
|
||||
return self.strings[string]
|
||||
# }}}
|
||||
|
||||
class IndexEntry(object):
|
||||
class IndexEntry(object): # {{{
|
||||
|
||||
TAG_VALUES = {
|
||||
'offset': 1,
|
||||
@ -97,7 +76,7 @@ class IndexEntry(object):
|
||||
'first_child_index': 22,
|
||||
'last_child_index': 23,
|
||||
}
|
||||
RTAG_MAP = dict(TAG_VALUES.itervalues(), TAG_VALUES.iterkeys())
|
||||
RTAG_MAP = {v:k for k, v in TAG_VALUES.iteritems()}
|
||||
|
||||
BITMASKS = [1, 2, 3, 4, 5, 21, 22, 23,]
|
||||
|
||||
@ -112,6 +91,35 @@ class IndexEntry(object):
|
||||
self.first_child_index = None
|
||||
self.last_child_index = None
|
||||
|
||||
@classmethod
|
||||
def tagx_block(cls, for_periodical=True):
|
||||
buf = bytearray()
|
||||
|
||||
def add_tag(tag, num_values=1):
|
||||
buf.append(tag)
|
||||
buf.append(num_values)
|
||||
# bitmask
|
||||
buf.append(1 << (cls.BITMASKS.index(tag)))
|
||||
# eof
|
||||
buf.append(0)
|
||||
|
||||
for tag in xrange(1, 5):
|
||||
add_tag(tag)
|
||||
|
||||
if for_periodical:
|
||||
for tag in (5, 21, 22, 23):
|
||||
add_tag(tag)
|
||||
|
||||
# End of TAGX record
|
||||
for i in xrange(3): buf.append(0)
|
||||
buf.append(1)
|
||||
|
||||
header = b'TAGX'
|
||||
header += pack(b'>I', len(buf)) # table length
|
||||
header += pack(b'>I', 1) # control byte count
|
||||
|
||||
return header + bytes(buf)
|
||||
|
||||
@property
|
||||
def next_offset(self):
|
||||
return self.offset + self.length
|
||||
@ -147,8 +155,135 @@ class IndexEntry(object):
|
||||
ans = buf.get_value()
|
||||
return ans
|
||||
|
||||
# }}}
|
||||
|
||||
class Indexer(object):
|
||||
class TBS(object): # {{{
|
||||
|
||||
'''
|
||||
Take the list of index nodes starting/ending on a record and calculate the
|
||||
trailing byte sequence for the record.
|
||||
'''
|
||||
|
||||
def __init__(self, data, is_periodical, first=False, all_sections=[]):
|
||||
if not data:
|
||||
self.bytestring = encode_trailing_data(b'')
|
||||
else:
|
||||
self.section_map = OrderedDict((i.index, i) for i in
|
||||
sorted(all_sections, key=lambda x:x.offset))
|
||||
|
||||
if is_periodical:
|
||||
# The starting bytes.
|
||||
# The value is zero which I think indicates the periodical
|
||||
# index entry. The values for the various flags seem to be
|
||||
# unused. If the 0b0100 is present, it means that the record
|
||||
# deals with section 1 (or is the final record with section
|
||||
# transitions).
|
||||
self.type_010 = encode_tbs(0, {0b0010: 0})
|
||||
self.type_011 = encode_tbs(0, {0b0010: 0, 0b0001: 0})
|
||||
self.type_110 = encode_tbs(0, {0b0100: 2, 0b0010: 0})
|
||||
self.type_111 = encode_tbs(0, {0b0100: 2, 0b0010: 0, 0b0001: 0})
|
||||
|
||||
depth_map = defaultdict(list)
|
||||
for x in ('starts', 'ends', 'completes'):
|
||||
for idx in data[x]:
|
||||
depth_map[idx.depth].append(idx)
|
||||
for l in depth_map.itervalues():
|
||||
l.sort(key=lambda x:x.offset)
|
||||
self.periodical_tbs(data, first, depth_map)
|
||||
else:
|
||||
self.book_tbs(data, first)
|
||||
|
||||
def periodical_tbs(self, data, first, depth_map):
|
||||
buf = StringIO()
|
||||
|
||||
has_section_start = (depth_map[1] and depth_map[1][0] in
|
||||
data['starts'])
|
||||
spanner = data['spans']
|
||||
first_node = None
|
||||
for nodes in depth_map.values():
|
||||
for node in nodes:
|
||||
if (first_node is None or (node.offset, node.depth) <
|
||||
(first_node.offset, first_node.depth)):
|
||||
first_node = node
|
||||
|
||||
parent_section_index = -1
|
||||
if depth_map[0]:
|
||||
# We have a terminal record
|
||||
typ = (self.type_110 if has_section_start else self.type_010)
|
||||
if first_node.depth > 0:
|
||||
parent_section_index = (first_node.index if first_node.depth
|
||||
== 1 else first_node.parent_index)
|
||||
else:
|
||||
if spanner is not None:
|
||||
# record is spanned by a single article
|
||||
parent_section_index = spanner.parent_index
|
||||
typ = (self.type_110 if parent_section_index == 1 else
|
||||
self.type_010)
|
||||
elif not depth_map[1]:
|
||||
# has only article nodes, i.e. spanned by a section
|
||||
parent_section_index = self.depth_map[2][0].parent_index
|
||||
typ = (self.type_111 if parent_section_index == 1 else
|
||||
self.type_010)
|
||||
else:
|
||||
# has section transitions
|
||||
parent_section_index = self.depth_map[2][0].parent_index
|
||||
|
||||
buf.write(typ)
|
||||
|
||||
if parent_section_index > 1:
|
||||
# Write starting section information
|
||||
if spanner is None:
|
||||
num_articles = len(depth_map[1])
|
||||
extra = {}
|
||||
if num_articles > 1:
|
||||
extra = {0b0100: num_articles}
|
||||
else:
|
||||
extra = {0b0001: 0}
|
||||
buf.write(encode_tbs(parent_section_index, extra))
|
||||
|
||||
if spanner is None:
|
||||
articles = depth_map[2]
|
||||
sections = [self.section_map[a.parent_index] for a in articles]
|
||||
sections.sort(key=lambda x:x.offset)
|
||||
section_map = {s:[a for a in articles is a.parent_index ==
|
||||
s.index] for s in sections}
|
||||
for i, section in enumerate(sections):
|
||||
# All the articles in this record that belong to section
|
||||
articles = section_map[section]
|
||||
first_article = articles[0]
|
||||
last_article = articles[-1]
|
||||
num = len(articles)
|
||||
|
||||
try:
|
||||
next_sec = sections[i+1]
|
||||
except:
|
||||
next_sec == None
|
||||
|
||||
extra = {}
|
||||
if num > 1:
|
||||
extra[0b0100] = num
|
||||
if i == 0 and next_sec is not None:
|
||||
# Write offset to next section from start of record
|
||||
# For some reason kindlegen only writes this offset
|
||||
# for the first section transition. Imitate it.
|
||||
extra[0b0001] = next_sec.offset - data['offset']
|
||||
|
||||
buf.write(encode_tbs(first_article.index-section.index, extra))
|
||||
|
||||
if next_sec is not None:
|
||||
buf.write(encode_tbs(last_article.index-next_sec.index,
|
||||
{0b1000: 0}))
|
||||
else:
|
||||
buf.write(encode_tbs(spanner.index - parent_section_index,
|
||||
{0b0001: 0}))
|
||||
|
||||
self.bytestring = encode_trailing_data(buf.getvalue())
|
||||
|
||||
def book_tbs(self, data, first):
|
||||
self.bytestring = encode_trailing_data(b'')
|
||||
# }}}
|
||||
|
||||
class Indexer(object): # {{{
|
||||
|
||||
def __init__(self, serializer, number_of_text_records,
|
||||
size_of_last_text_record, opts, oeb):
|
||||
@ -160,7 +295,9 @@ class Indexer(object):
|
||||
self.log = oeb.log
|
||||
self.opts = opts
|
||||
|
||||
self.is_periodical = opts.mobi_periodical
|
||||
self.is_periodical = self.detect_periodical()
|
||||
self.log('Generating MOBI index for a %s'%('periodical' if
|
||||
self.is_periodical else 'book'))
|
||||
self.is_flat_periodical = False
|
||||
if opts.mobi_periodical:
|
||||
periodical_node = iter(oeb.toc).next()
|
||||
@ -172,15 +309,42 @@ class Indexer(object):
|
||||
self.cncx = CNCX(oeb.toc, opts)
|
||||
|
||||
if self.is_periodical:
|
||||
indices = self.create_periodical_index()
|
||||
self.indices = self.create_periodical_index()
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
self.indices = self.create_book_index()
|
||||
|
||||
self.records.append(self.create_index_record(indices))
|
||||
self.records.append(self.create_index_record())
|
||||
self.records.insert(0, self.create_header())
|
||||
self.records.extend(self.cncx.records)
|
||||
|
||||
def create_index_record(self, indices):
|
||||
self.calculate_trailing_byte_sequences()
|
||||
|
||||
def detect_periodical(self): # {{{
|
||||
for node in self.oeb.toc.iterdescendants():
|
||||
if node.depth() == 1 and node.klass != 'article':
|
||||
self.log.debug(
|
||||
'Not a periodical: Deepest node does not have '
|
||||
'class="article"')
|
||||
return False
|
||||
if node.depth() == 2 and node.klass != 'section':
|
||||
self.log.debug(
|
||||
'Not a periodical: Second deepest node does not have'
|
||||
' class="section"')
|
||||
return False
|
||||
if node.depth() == 3 and node.klass != 'periodical':
|
||||
self.log.debug('Not a periodical: Third deepest node'
|
||||
' does not have class="periodical"')
|
||||
return False
|
||||
if node.depth() > 3:
|
||||
self.log.debug('Not a periodical: Has nodes of depth > 3')
|
||||
return False
|
||||
return True
|
||||
# }}}
|
||||
|
||||
def create_index_record(self): # {{{
|
||||
header_length = 192
|
||||
buf = StringIO()
|
||||
indices = self.indices
|
||||
|
||||
# Write index entries
|
||||
offsets = []
|
||||
@ -218,6 +382,135 @@ class Indexer(object):
|
||||
if len(ans) > 0x10000:
|
||||
raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
|
||||
return ans
|
||||
# }}}
|
||||
|
||||
def create_header(self): # {{{
|
||||
buf = StringIO()
|
||||
tagx_block = IndexEntry.tagx_block(self.is_periodical)
|
||||
header_length = 192
|
||||
|
||||
# Ident 0 - 4
|
||||
buf.write(b'INDX')
|
||||
|
||||
# Header length 4 - 8
|
||||
buf.write(pack(b'>I', header_length))
|
||||
|
||||
# Unknown 8-16
|
||||
buf.write(b'\0'*8)
|
||||
|
||||
# Index type: 0 - normal, 2 - inflection 16 - 20
|
||||
buf.write(pack(b'>I', 2))
|
||||
|
||||
# IDXT offset 20-24
|
||||
buf.write(pack(b'>I', 0)) # Filled in later
|
||||
|
||||
# Number of index records 24-28
|
||||
buf.write(pack('b>I', len(self.records)))
|
||||
|
||||
# Index Encoding 28-32
|
||||
buf.write(pack(b'>I', 65001)) # utf-8
|
||||
|
||||
# Index language 32-36
|
||||
buf.write(iana2mobi(
|
||||
str(self.oeb.metadata.language[0])))
|
||||
|
||||
# Number of index entries 36-40
|
||||
buf.write(pack(b'>I', len(self.indices)))
|
||||
|
||||
# ORDT offset 40-44
|
||||
buf.write(pack(b'>I', 0))
|
||||
|
||||
# LIGT offset 44-48
|
||||
buf.write(pack(b'>I', 0))
|
||||
|
||||
# Number of LIGT entries 48-52
|
||||
buf.write(pack(b'>I', 0))
|
||||
|
||||
# Number of CNCX records 52-56
|
||||
buf.write(pack(b'>I', len(self.cncx.records)))
|
||||
|
||||
# Unknown 56-180
|
||||
buf.write(b'\0'*124)
|
||||
|
||||
# TAGX offset 180-184
|
||||
buf.write(pack(b'>I', header_length))
|
||||
|
||||
# Unknown 184-192
|
||||
buf.write(b'\0'*8)
|
||||
|
||||
# TAGX block
|
||||
buf.write(tagx_block)
|
||||
|
||||
num = len(self.indices)
|
||||
|
||||
# The index of the last entry in the NCX
|
||||
buf.write(encode_number_as_hex(num-1))
|
||||
|
||||
# The number of entries in the NCX
|
||||
buf.write(pack(b'>H', num))
|
||||
|
||||
# Padding
|
||||
pad = (4 - (buf.tell()%4))%4
|
||||
if pad:
|
||||
buf.write(b'\0'*pad)
|
||||
|
||||
idxt_offset = buf.tell()
|
||||
|
||||
buf.write(b'IDXT')
|
||||
buf.write(header_length + len(tagx_block))
|
||||
buf.write(b'\0')
|
||||
buf.seek(20)
|
||||
buf.write(pack(b'>I', idxt_offset))
|
||||
|
||||
return align_block(buf.getvalue())
|
||||
# }}}
|
||||
|
||||
def create_book_index(self): # {{{
|
||||
indices = []
|
||||
seen = set()
|
||||
id_offsets = self.serializer.id_offsets
|
||||
|
||||
for node in self.oeb.toc.iterdescendants():
|
||||
try:
|
||||
offset = id_offsets[node.href]
|
||||
label = self.cncx[node.title]
|
||||
except:
|
||||
self.log.warn('TOC item %s not found in document'%node.href)
|
||||
continue
|
||||
if offset in seen:
|
||||
continue
|
||||
seen.add(offset)
|
||||
index = IndexEntry(offset, label)
|
||||
self.indices.append(index)
|
||||
|
||||
indices.sort(key=lambda x:x.offset)
|
||||
|
||||
# Set lengths
|
||||
for i, index in indices:
|
||||
try:
|
||||
next_offset = indices[i+1].offset
|
||||
except:
|
||||
next_offset = self.serializer.body_end_offset
|
||||
index.length = next_offset - index.offset
|
||||
|
||||
# Remove empty nodes
|
||||
indices = [i for i in indices if i.length > 0]
|
||||
|
||||
# Set index values
|
||||
for i, index in indices:
|
||||
index.index = i
|
||||
|
||||
# Set lengths again to close up any gaps left by filtering
|
||||
for i, index in indices:
|
||||
try:
|
||||
next_offset = indices[i+1].offset
|
||||
except:
|
||||
next_offset = self.serializer.body_end_offset
|
||||
index.length = next_offset - index.offset
|
||||
|
||||
return indices
|
||||
|
||||
# }}}
|
||||
|
||||
def create_periodical_index(self): # {{{
|
||||
periodical_node = iter(self.oeb.toc).next()
|
||||
@ -361,14 +654,48 @@ class Indexer(object):
|
||||
return indices
|
||||
# }}}
|
||||
|
||||
def create_header(self):
|
||||
buf = StringIO()
|
||||
# TBS {{{
|
||||
def calculate_trailing_byte_sequences(self):
|
||||
self.tbs_map = {}
|
||||
found_node = False
|
||||
sections = [i for i in self.indices if i.depth == 1]
|
||||
for i in xrange(self.number_of_text_records):
|
||||
offset = i * RECORD_SIZE
|
||||
next_offset = offset + RECORD_SIZE
|
||||
data = OrderedDict([('ends',[]), ('completes',[]), ('starts',[]),
|
||||
('spans', None), ('offset', offset)])
|
||||
for index in self.indices:
|
||||
if index.offset >= next_offset:
|
||||
# Node starts after current record
|
||||
break
|
||||
if index.next_offset <= offset:
|
||||
# Node ends before current record
|
||||
continue
|
||||
if index.offset >= offset:
|
||||
# Node starts in current record
|
||||
if index.next_offset <= next_offset:
|
||||
# Node ends in current record
|
||||
data['completes'].append(index)
|
||||
else:
|
||||
data['starts'].append(index)
|
||||
else:
|
||||
# Node starts before current records
|
||||
if index.next_offset <= next_offset:
|
||||
# Node ends in current record
|
||||
data['ends'].append(index)
|
||||
else:
|
||||
data['spans'] = index
|
||||
if (data['ends'] or data['completes'] or data['starts'] or
|
||||
data['spans'] is not None):
|
||||
self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
|
||||
found_node, all_sections=sections)
|
||||
found_node = True
|
||||
else:
|
||||
self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False)
|
||||
|
||||
# Ident
|
||||
buf.write(b'INDX')
|
||||
def get_trailing_byte_sequence(self, num):
|
||||
return self.tbs_map[num].bytestring
|
||||
# }}}
|
||||
|
||||
# Header length
|
||||
buf.write(pack(b'>I', 192))
|
||||
# }}}
|
||||
|
||||
# Index type: 0 - normal, 2 - inflection
|
||||
buf.write(pack(b'>I', 2))
|
||||
|
@@ -93,6 +93,15 @@ class MobiWriter(object):
                    self.opts, self.oeb)
        except:
            self.log.exception('Failed to generate MOBI index:')
        else:
            self.primary_index_record_idx = len(self.records)
            for i in xrange(len(self.records)):
                if i == 0: continue
                tbs = self.indexer.get_trailing_byte_sequence(i)
                self.records[i] += tbs
            self.records.extend(self.indexer.records)

    # }}}

    def write_uncrossable_breaks(self): # {{{
@@ -6,6 +6,8 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

from calibre.utils.filenames import ascii_filename

class StorePlugin(object): # {{{
    '''
    A plugin representing an online ebook repository (store). The store can
@@ -43,7 +45,7 @@ class StorePlugin(object): # {{{
    The easiest way to handle affiliate money payouts is to randomly select
    between the author's affiliate id and calibre's affiliate id so that
    70% of the time the author's id is used.

    See declined.txt for a list of stores that do not want to be included.
    '''
@@ -53,7 +55,7 @@ class StorePlugin(object): # {{{
        self.gui = gui
        self.name = name
        self.base_plugin = None
        self.config = JSONConfig('store/stores/' + self.name)
        self.config = JSONConfig('store/stores/' + ascii_filename(self.name))

    def open(self, gui, parent=None, detail_item=None, external=False):
        '''
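The 70/30 affiliate split described in the docstring above amounts to a single random draw per purchase; a minimal sketch with placeholder identifiers (not calibre's actual fields):

    import random

    def pick_affiliate_id(author_id, calibre_id):
        # author's id ~70% of the time, calibre's id the remaining ~30%
        return author_id if random.random() < 0.7 else calibre_id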
@ -54,36 +54,21 @@ class ChitankaStore(BasicStoreConfig, StorePlugin):
|
||||
if counter <= 0:
|
||||
break
|
||||
|
||||
id = ''.join(data.xpath('.//a[@class="booklink"]/@href'))
|
||||
id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
|
||||
if not id:
|
||||
continue
|
||||
|
||||
cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src'))
|
||||
title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()'))
|
||||
author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()'))
|
||||
fb2 = ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href'))
|
||||
epub = ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href'))
|
||||
txt = ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href'))
|
||||
|
||||
# remove .zip extensions
|
||||
if fb2.find('.zip') != -1:
|
||||
fb2 = fb2[:fb2.find('.zip')]
|
||||
if epub.find('.zip') != -1:
|
||||
epub = epub[:epub.find('.zip')]
|
||||
if txt.find('.zip') != -1:
|
||||
txt = txt[:txt.find('.zip')]
|
||||
|
||||
counter -= 1
|
||||
|
||||
s = SearchResult()
|
||||
s.cover_url = cover_url
|
||||
s.title = title.strip()
|
||||
s.author = author.strip()
|
||||
s.detail_item = id.strip()
|
||||
s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
|
||||
s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
|
||||
s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
|
||||
s.detail_item = id
|
||||
s.drm = SearchResult.DRM_UNLOCKED
|
||||
s.downloads['FB2'] = base_url + fb2.strip()
|
||||
s.downloads['EPUB'] = base_url + epub.strip()
|
||||
s.downloads['TXT'] = base_url + txt.strip()
|
||||
s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
|
||||
s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
|
||||
s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
|
||||
s.formats = 'FB2, EPUB, TXT, SFB'
|
||||
yield s
|
||||
|
||||
@ -105,35 +90,20 @@ class ChitankaStore(BasicStoreConfig, StorePlugin):
|
||||
if counter <= 0:
|
||||
break
|
||||
|
||||
id = ''.join(data.xpath('.//a[@class="booklink"]/@href'))
|
||||
id = ''.join(data.xpath('.//a[@class="booklink"]/@href')).strip()
|
||||
if not id:
|
||||
continue
|
||||
|
||||
cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src'))
|
||||
title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()'))
|
||||
author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()'))
|
||||
fb2 = ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href'))
|
||||
epub = ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href'))
|
||||
txt = ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href'))
|
||||
|
||||
# remove .zip extensions
|
||||
if fb2.find('.zip') != -1:
|
||||
fb2 = fb2[:fb2.find('.zip')]
|
||||
if epub.find('.zip') != -1:
|
||||
epub = epub[:epub.find('.zip')]
|
||||
if txt.find('.zip') != -1:
|
||||
txt = txt[:txt.find('.zip')]
|
||||
|
||||
counter -= 1
|
||||
|
||||
s = SearchResult()
|
||||
s.cover_url = cover_url
|
||||
s.title = title.strip()
|
||||
s.author = author.strip()
|
||||
s.detail_item = id.strip()
|
||||
s.cover_url = ''.join(data.xpath('.//a[@class="booklink"]/img/@src')).strip()
|
||||
s.title = ''.join(data.xpath('.//a[@class="booklink"]/i/text()')).strip()
|
||||
s.author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
|
||||
s.detail_item = id
|
||||
s.drm = SearchResult.DRM_UNLOCKED
|
||||
s.downloads['FB2'] = base_url + fb2.strip()
|
||||
s.downloads['EPUB'] = base_url + epub.strip()
|
||||
s.downloads['TXT'] = base_url + txt.strip()
|
||||
s.downloads['FB2'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
|
||||
s.downloads['EPUB'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
|
||||
s.downloads['TXT'] = base_url + ''.join(data.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
|
||||
s.formats = 'FB2, EPUB, TXT, SFB'
|
||||
yield s
|
||||
|
88  src/calibre/gui2/store/stores/eknigi_plugin.py  Normal file
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Alex Stanev <alex@stanev.org>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import random
|
||||
import urllib2
|
||||
from contextlib import closing
|
||||
|
||||
from lxml import html
|
||||
|
||||
from PyQt4.Qt import QUrl
|
||||
|
||||
from calibre import browser, url_slash_cleaner
|
||||
from calibre.gui2 import open_url
|
||||
from calibre.gui2.store import StorePlugin
|
||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
||||
from calibre.gui2.store.search_result import SearchResult
|
||||
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||
|
||||
class eKnigiStore(BasicStoreConfig, StorePlugin):
|
||||
|
||||
def open(self, parent=None, detail_item=None, external=False):
|
||||
# Use Kovid's affiliate id 30% of the time
|
||||
if random.randint(1, 10) in (1, 2, 3):
|
||||
aff_suffix = '&amigosid=23'
|
||||
else:
|
||||
aff_suffix = '&amigosid=22'
|
||||
url = 'http://e-knigi.net/?' + aff_suffix[1:]
|
||||
|
||||
if external or self.config.get('open_external', False):
|
||||
if detail_item:
|
||||
url = detail_item + aff_suffix
|
||||
open_url(QUrl(url_slash_cleaner(url)))
|
||||
else:
|
||||
detail_url = None
|
||||
if detail_item:
|
||||
url = detail_item + aff_suffix
|
||||
d = WebStoreDialog(self.gui, url, parent, detail_url)
|
||||
d.setWindowTitle(self.name)
|
||||
d.set_tags(self.config.get('tags', ''))
|
||||
d.exec_()
|
||||
|
||||
def search(self, query, max_results=10, timeout=60):
|
||||
base_url = 'http://e-knigi.net'
|
||||
url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&limitstart=0&limit=' + str(max_results) + '&keyword=' + urllib2.quote(query)
|
||||
|
||||
br = browser()
|
||||
|
||||
counter = max_results
|
||||
with closing(br.open(url, timeout=timeout)) as f:
|
||||
doc = html.fromstring(f.read())
|
||||
|
||||
# if the store finds only one product, it opens directly detail view
|
||||
for data in doc.xpath('//div[@class="prod_details"]'):
|
||||
s = SearchResult()
|
||||
s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
|
||||
s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
|
||||
s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
|
||||
s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
|
||||
s.detail_item = url
|
||||
s.drm = SearchResult.DRM_UNLOCKED
|
||||
|
||||
yield s
|
||||
return
|
||||
|
||||
# search in store results
|
||||
for data in doc.xpath('//div[@class="browseProductContainer"]'):
|
||||
if counter <= 0:
|
||||
break
|
||||
id = ''.join(data.xpath('.//a[1]/@href')).strip()
|
||||
if not id:
|
||||
continue
|
||||
|
||||
counter -= 1
|
||||
|
||||
s = SearchResult()
|
||||
s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
|
||||
s.title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
|
||||
s.author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')
|
||||
s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
|
||||
s.detail_item = base_url + id
|
||||
s.drm = SearchResult.DRM_UNLOCKED
|
||||
|
||||
yield s
|
@@ -15,6 +15,7 @@ from calibre.gui2 import config, dynamic, open_url
from calibre.gui2.dialogs.plugin_updater import get_plugin_updates_available

URL = 'http://status.calibre-ebook.com/latest'
#URL = 'http://localhost:8000/latest'
NO_CALIBRE_UPDATE = '-0.0.0'
VSEP = '|'
@@ -17,7 +17,7 @@ from datetime import datetime
from functools import partial

from calibre.ebooks.metadata import title_sort, author_to_author_sort
from calibre.utils.date import parse_date, isoformat, local_tz
from calibre.utils.date import parse_date, isoformat, local_tz, UNDEFINED_DATE
from calibre import isbytestring, force_unicode
from calibre.constants import iswindows, DEBUG, plugins
from calibre.utils.icu import strcmp
@@ -39,8 +39,11 @@ def _c_convert_timestamp(val):
    if ret is None:
        return parse_date(val, as_utc=False)
    year, month, day, hour, minutes, seconds, tzsecs = ret
    return datetime(year, month, day, hour, minutes, seconds,
    try:
        return datetime(year, month, day, hour, minutes, seconds,
            tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
    except OverflowError:
        return UNDEFINED_DATE.astimezone(local_tz)

def _py_convert_timestamp(val):
    if val:
@@ -141,7 +141,8 @@ class BaseJob(object):
    def log_file(self):
        if self.log_path:
            return open(self.log_path, 'rb')
        return cStringIO.StringIO(_('No details available.'))
        return cStringIO.StringIO(_('No details available.').encode('utf-8',
            'replace'))

    @property
    def details(self):