Reviewed pyflake changes to metadata.mobi, commented out a diagnostic

This commit is contained in:
GRiker 2010-01-26 11:08:15 -07:00
commit c0f25a4428
9 changed files with 521 additions and 48 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 835 B

View File

@ -10,26 +10,44 @@ discovermagazine.com
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class DiscoverMagazine(BasicNewsRecipe): class DiscoverMagazine(BasicNewsRecipe):
title = u'Discover Magazine' title = u'Discover Magazine'
description = u'Science, Technology and the Future' description = u'Science, Technology and the Future'
__author__ = 'Mike Diaz' __author__ = 'Mike Diaz'
oldest_article = 33
language = 'en' language = 'en'
max_articles_per_feed = 20 oldest_article = 33
max_articles_per_feed = 20
no_stylesheets = True
remove_javascript = True
use_embedded_content = False
encoding = 'utf-8'
extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'
remove_tags_before = dict(id='articlePage')
keep_only_tags = [dict(name='div', attrs={'id':'articlePage'})]
remove_tags = [dict(attrs={'id':['buttons', 'tool-box', 'teaser', 'already-subscriber', 'teaser-suite', 'related-articles', 'relatedItem', 'box-popular', 'box-blogs', 'box-news', 'footer']}),
dict(attrs={'class':'popularNewsBox'}),
dict(name=['img', 'style', 'head'])]
remove_tags_after = dict(id='articlePage')
feeds = [ feeds = [
(u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'), (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
(u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'), (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
(u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'), (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
(u'Space', u'http://discovermagazine.com/topics/space/rss.xml'), (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'),
(u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'), (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'),
(u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'), (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'),
(u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'), (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'),
(u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'), (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'),
(u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'), (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
(u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'), (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
(u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'), (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
(u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'), (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'),
(u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'), (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'),
(u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php') (u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php')
] ]

View File

@ -0,0 +1,239 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
macleans.ca
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from datetime import timedelta, date
class Macleans(BasicNewsRecipe):
    """Recipe that builds its article index by scraping macleans.ca pages.

    Every entry of ``sectionlist`` is fetched and scanned for article links;
    the links are rewritten to the site's ``print/`` variant and grouped by
    section title.
    """

    title = u'Macleans Magazine'
    __author__ = 'Nick Redding'
    language = 'en_CA'
    description = ('Macleans Magazine')

    no_stylesheets = True
    timefmt = ' [%b %d]'

    # customization notes: delete sections you are not interested in
    # set oldest_article to the maximum number of days back from today to include articles
    sectionlist = [
        ['http://www2.macleans.ca/','Front Page'],
        ['http://www2.macleans.ca/category/canada/','Canada'],
        ['http://www2.macleans.ca/category/world-from-the-magazine/','World'],
        ['http://www2.macleans.ca/category/business','Business'],
        ['http://www2.macleans.ca/category/arts-culture/','Culture'],
        ['http://www2.macleans.ca/category/opinion','Opinion'],
        ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'],
        ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'],
        ['http://www2.macleans.ca/category/education/','On Campus'],
        ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel']
    ]
    oldest_article = 7

    # formatting for print version of articles
    extra_css = '''h2{font-family:Times,serif; font-size:large;}
small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
'''

    # tag handling for print version of articles
    keep_only_tags = [dict(id='tw-print')]
    remove_tags = [dict({'class':'postmetadata'})]

    def preprocess_html(self,soup):
        """Move every image into a paragraph of its own and return *soup*.

        An image wrapped in a link replaces that link with a new ``<p>``
        holding only the image.  An image inside a paragraph that also has
        text is moved to a new ``<p>`` placed in front of the original
        paragraph, both wrapped in a new ``<div>``.
        """
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup,'p')
                new_tag.insert(0,img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if not self.tag_to_string(parent_tag) == '':
                    new_div = Tag(soup,'div')
                    new_tag = Tag(soup,'p')
                    new_tag.insert(0,img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0,new_tag)
                    new_div.insert(1,parent_tag)
        return soup

    def parse_index(self):
        """Scrape every page in ``sectionlist``.

        Returns a list of ``(section title, [article dict, ...])`` tuples in
        ``sectionlist`` order; sections that produced no article are left out.
        """
        # section title -> list of article dicts, filled by the helpers below
        articles = {}
        month_names = ['january','february','march','april','may','june','july',
                       'august','september','october','november','december']

        def parse_index_page(page_url,page_title):

            def add_article(title, url, description='', author='', datestr=''):
                # setdefault replaces the has_key()/assign dance repeated at
                # every call site of the original code.
                articles.setdefault(page_title, []).append(
                    dict(title=title,url=url,date=datestr,description=description,author=author,content=''))

            def decode_date(datestr):
                # Parses "<ignored>, <month name> <day>, <year> ..." into a
                # date; raises IndexError/ValueError on any other shape.
                dmysplit = datestr.strip().lower().split(',')
                mdsplit = dmysplit[1].split()
                m = month_names.index(mdsplit[0])+1
                d = int(mdsplit[1])
                y = int(dmysplit[2].split()[0])
                return date(y,m,d)

            def article_title(tag):
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                return self.tag_to_string(atag)

            def article_url(tag):
                # Link target of the first anchor, pointed at the print view.
                atag = tag.find('a',href=True)
                if not atag:
                    return ''
                return atag['href']+'print/'

            def article_description(tag):
                # Text of the first non-empty paragraph, '' when there is none.
                for p_tag in tag.findAll('p'):
                    d = self.tag_to_string(p_tag,False)
                    if not d == '':
                        return d
                return ''

            def compound_title(first, second):
                # "first—second" when both headings exist, whichever one is
                # present otherwise, '' when neither is.
                if first:
                    if second:
                        return self.tag_to_string(first,False)+u'\u2014'+self.tag_to_string(second,False)
                    return self.tag_to_string(first,False)
                if second:
                    return self.tag_to_string(second,False)
                return ''

            def compound_h4_h3_title(tag):
                return compound_title(tag.h4, tag.h3)

            def compound_h2_h4_title(tag):
                return compound_title(tag.h2, tag.h4)

            def handle_article(header_tag, outer_tag):
                if not header_tag:
                    return
                author_date_tag = outer_tag.h4
                if not author_date_tag:
                    return
                url = article_url(header_tag)
                title = article_title(header_tag)
                author_date = self.tag_to_string(author_date_tag,False).split(' - ')
                if len(author_date) < 2:
                    # A byline without the "author - date" separator used to
                    # raise IndexError and abort the whole download.
                    self.log("Skipping article with unrecognised byline: %s" % title)
                    return
                author = author_date[0].strip()
                try:
                    article_date = decode_date(author_date[1])
                except (IndexError, ValueError):
                    # Without a date the oldest_article limit cannot be
                    # applied, so the article is left out rather than guessed.
                    self.log("Skipping article with unparseable date %s" % author_date[1])
                    return
                earliest_date = date.today() - timedelta(days=self.oldest_article)
                if article_date < earliest_date:
                    self.log("Skipping article dated %s" % author_date[1])
                    return
                excerpt_div = outer_tag.find('div','excerpt')
                description = article_description(excerpt_div) if excerpt_div else ''
                add_article(title, url, description, author, author_date[1])

            def handle_category_article(cat, header_tag, outer_tag):
                if not header_tag or not outer_tag:
                    # The callers pass the result of find()/attribute lookups,
                    # which is None when the markup lacks the element.
                    return
                url = article_url(header_tag)
                title = article_title(header_tag)
                if not title == '':
                    title = cat+u'\u2014'+title
                a_tag = outer_tag.find('span','authorLink')
                if a_tag:
                    author = self.tag_to_string(a_tag,False)
                    # Drop the author's container so it is not picked up as
                    # the description below.
                    a_tag.parent.extract()
                else:
                    author = ''
                description = article_description(outer_tag)
                add_article(title, url, description, author)

            soup = self.index_to_soup(page_url)

            if page_title == 'Front Page':
                # special processing for the front page
                top_stories = soup.find('div',{ "id" : "macleansFeatured" })
                if top_stories:
                    for div_slide in top_stories.findAll('div','slide'):
                        url = article_url(div_slide)
                        div_title = div_slide.find('div','header')
                        if div_title:
                            title = self.tag_to_string(div_title,False)
                        else:
                            title = ''
                        description = article_description(div_slide)
                        add_article(title, url, description)

                from_macleans = soup.find('div',{ "id" : "fromMacleans" })
                if from_macleans:
                    for li_tag in from_macleans.findAll('li','fromMacleansArticle'):
                        title = compound_h4_h3_title(li_tag)
                        url = article_url(li_tag)
                        description = article_description(li_tag)
                        add_article(title, url, description)

                blog_central = soup.find('div',{ "id" : "bloglist" })
                if blog_central:
                    for li_tag in blog_central.findAll('li'):
                        title = compound_h2_h4_title(li_tag)
                        if li_tag.h4:
                            # Only entries with an h4 carry the link; adding
                            # inside this check keeps a stale or undefined
                            # url from ever being used.
                            url = article_url(li_tag.h4)
                            add_article(title, url)

                # need_to_know = soup.find('div',{ "id" : "needToKnow" })
                # if need_to_know:
                #     for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
                #         title = compound_h4_h3_title(div_tag)
                #         url = article_url(div_tag)
                #         description = article_description(div_tag)
                #         add_article(title, url, description)

                for news_category in soup.findAll('div','newsCategory'):
                    news_cat = self.tag_to_string(news_category.h4,False)
                    handle_category_article(news_cat, news_category.find('h2'), news_category.find('div'))
                    for news_item in news_category.findAll('li'):
                        handle_category_article(news_cat,news_item.h3,news_item)
                return

            # find the div containing the highlight article
            div_post = soup.find('div','post')
            if div_post:
                h1_tag = div_post.h1
                handle_article(h1_tag,div_post)

                # find the divs containing the rest of the articles
                div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
                if div_other:
                    for div_entry in div_other.findAll('div','entry'):
                        h2_tag = div_entry.h2
                        handle_article(h2_tag,div_entry)

        for page_name,page_title in self.sectionlist:
            parse_index_page(page_name,page_title)

        return [(section_title, articles[section_title])
                for section_url, section_title in self.sectionlist
                if section_title in articles]

View File

@ -0,0 +1,24 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Metro_Montreal(BasicNewsRecipe):
    """Feed-based recipe for journalmetro.com; articles are fetched through
    the URL rewrite done in print_version()."""

    title = u'M\xe9tro Montr\xe9al'
    __author__ = 'Jerry Clapperton'
    # Escaped unicode literal, like the title: a raw accented character in a
    # byte string is a SyntaxError on Python 2 when the file declares no
    # source encoding.
    description = u'Le quotidien le plus branch\xe9 sur le monde'
    language = 'fr'

    oldest_article = 7
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    encoding = 'utf-8'

    extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'

    remove_tags = [dict(attrs={'id':'buttons'}), dict(name=['img', 'style'])]

    feeds = [
        (u"L'info", u'http://journalmetro.com/linfo/rss'),
        (u'Monde', u'http://journalmetro.com/monde/rss'),
        (u'Culture', u'http://journalmetro.com/culture/rss'),
        (u'Sports', u'http://journalmetro.com/sports/rss'),
        (u'Paroles', u'http://journalmetro.com/paroles/rss'),
    ]

    def print_version(self, url):
        """Return *url* with every 'article' replaced by 'ArticlePrint' and
        '?language=fr' appended."""
        # NOTE(review): str.replace rewrites every occurrence, so a slug that
        # itself contains "article" is rewritten too - confirm against real
        # feed URLs.
        return url.replace('article', 'ArticlePrint') + '?language=fr'

View File

@ -1,31 +1,40 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class NewsandObserver(BasicNewsRecipe): class NewsandObserver(BasicNewsRecipe):
title = u'News and Observer' title = u'Raleigh News & Observer'
description = 'News from Raleigh, North Carolina' description = 'News from Raleigh, North Carolina'
language = 'en' language = 'en'
__author__ = 'Krittika Goyal' __author__ = 'Krittika Goyal updated by Walt Anthony'
oldest_article = 5 #days oldest_article = 3 #days
max_articles_per_feed = 25 max_articles_per_feed = 25
summary_length = 150
no_stylesheets = True
remove_javascript = True
remove_stylesheets = True
remove_tags_before = dict(name='h1', attrs={'id':'story_headline'}) remove_tags_before = dict(name='h1', attrs={'id':'story_headline'})
remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'}) remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'})
remove_tags = [ remove_tags = [
dict(name='iframe'), dict(name='iframe'),
dict(name='div', attrs={'id':['right-rail', 'story_tools']}), dict(name='div', attrs={'id':['right-rail', 'story_tools', 'toolbox', 'toolbar', 'tool', 'shirttail', 'comment_widget', 'story_keywords', 'txtResizeTool']}),
dict(name='div', attrs={'class':['Buy-It-Now', 'story_link_share']}),
dict(name='ul', attrs={'class':'bold_tabs_nav'}), dict(name='ul', attrs={'class':'bold_tabs_nav'}),
] ]
feeds = [ feeds = [
('Cover', 'http://www.newsobserver.com/100/index.rss'), ('Cover', 'http://www.newsobserver.com/100/index.rss'),
('News', 'http://www.newsobserver.com/102/index.rss'), ('News', 'http://www.newsobserver.com/102/index.rss'),
('Politics', 'http://www.newsobserver.com/105/index.rss'), ('Politics', 'http://www.newsobserver.com/105/index.rss'),
('Business', 'http://www.newsobserver.com/104/index.rss'), ('Business', 'http://www.newsobserver.com/104/index.rss'),
('Sports', 'http://www.newsobserver.com/103/index.rss'), ('Sports', 'http://www.newsobserver.com/103/index.rss'),
('College Sports', 'http://www.newsobserver.com/119/index.rss'), ('College Sports', 'http://www.newsobserver.com/119/index.rss'),
('Lifestyles', 'http://www.newsobserver.com/106/index.rss'), ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'),
('Editorials', 'http://www.newsobserver.com/158/index.rss')] ('Editorials', 'http://www.newsobserver.com/158/index.rss')
]

View File

@ -1,10 +1,12 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
pagina12.com.ar pagina12.com.ar
''' '''
import time
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Pagina12(BasicNewsRecipe): class Pagina12(BasicNewsRecipe):
@ -19,6 +21,8 @@ class Pagina12(BasicNewsRecipe):
encoding = 'cp1252' encoding = 'cp1252'
use_embedded_content = False use_embedded_content = False
language = 'es' language = 'es'
remove_empty_feeds = True
extra_css = ' body{font-family: sans-serif} '
conversion_options = { conversion_options = {
'comment' : description 'comment' : description
@ -47,3 +51,8 @@ class Pagina12(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/') return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')
def get_cover_url(self):
    """Build today's cover URL: a dated folder plus a file name that depends
    on the day of the week."""
    # tm_wday counts from Monday == 0, so the tuple reads Monday..Sunday.
    weekday = time.localtime().tm_wday
    cover_by_weekday = (
        'tapan.jpg',
        'tapagn.jpg',
        'tapan_gr.jpg',
        'tapagn.jpg',
        'tapagn.jpg',
        'tapan.jpg',
        'tapagn.jpg',
    )
    dated_folder = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/')
    return dated_folder + cover_by_weekday[weekday]

View File

@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe
class The_Gazette(BasicNewsRecipe):
    """Feed-based recipe for The Gazette, using the canada.com RSS feeds
    listed in ``feeds``."""

    # The previous value was 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg',
    # a file on the recipe author's own disk that cannot resolve anywhere
    # else.  None means no fixed cover image.
    # NOTE(review): confirm None is the BasicNewsRecipe default for cover_url.
    cover_url = None

    title = u'The Gazette'
    __author__ = 'Jerry Clapperton'
    description = 'Montreal news in English'
    language = 'en_CA'

    oldest_article = 7
    max_articles_per_feed = 20
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    encoding = 'utf-8'

    keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})]

    extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'

    feeds = [
        (u'News', u'http://feeds.canada.com/canwest/F297'),
        (u'Opinion', u'http://feeds.canada.com/canwest/F7383'),
        (u'Arts', u'http://feeds.canada.com/canwest/F7366'),
        (u'Life', u'http://rss.canada.com/get/?F6934'),
        (u'Business', u'http://feeds.canada.com/canwest/F6939'),
        (u'Travel', u'http://rss.canada.com/get/?F6938'),
        (u'Health', u'http://feeds.canada.com/canwest/F7397'),
        (u'Technology', u'http://feeds.canada.com/canwest/F7411'),
    ]

View File

@ -808,6 +808,8 @@ class Device(DeviceConfig, DevicePlugin):
ext = os.path.splitext(fname)[1] ext = os.path.splitext(fname)[1]
from calibre.library.save_to_disk import get_components from calibre.library.save_to_disk import get_components
if not isinstance(template, unicode):
template = template.decode('utf-8')
extra_components = get_components(template, mdata, fname) extra_components = get_components(template, mdata, fname)
if not extra_components: if not extra_components:
extra_components.append(sanitize(self.filename_callback(fname, extra_components.append(sanitize(self.filename_callback(fname,

View File

@ -17,6 +17,8 @@ from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN
from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.ebooks.mobi.langcodes import iana2mobi
import struct
class StreamSlicer(object): class StreamSlicer(object):
def __init__(self, stream, start=0, stop=None): def __init__(self, stream, start=0, stop=None):
@ -72,25 +74,50 @@ class StreamSlicer(object):
return stream.write(value) return stream.write(value)
raise TypeError("stream indices must be integers") raise TypeError("stream indices must be integers")
def update(self, data_blocks):
# Rewrite the stream
stream = self._stream
base = self.start
stream.seek(base)
self._stream.truncate(base)
for block in data_blocks:
stream.write(block)
def truncate(self, value):
self._stream.truncate(value)
class MetadataUpdater(object): class MetadataUpdater(object):
def __init__(self, stream): def __init__(self, stream):
self.stream = stream self.stream = stream
data = self.data = StreamSlicer(stream) data = self.data = StreamSlicer(stream)
type = self.type = data[60:68] self.type = data[60:68]
self.nrecs, = unpack('>H', data[76:78]) self.nrecs, = unpack('>H', data[76:78])
record0 = self.record0 = self.record(0) record0 = self.record0 = self.record(0)
self.encryption_type, = unpack('>H', record0[12:14]) self.encryption_type, = unpack('>H', record0[12:14])
codepage, = unpack('>I', record0[28:32]) codepage, = unpack('>I', record0[28:32])
self.codec = 'utf-8' if codepage == 65001 else 'cp1252' self.codec = 'utf-8' if codepage == 65001 else 'cp1252'
image_base, = unpack('>I', record0[108:112]) image_base, = unpack('>I', record0[108:112])
flags, = unpack('>I', record0[128:132]) flags, = self.flags, = unpack('>I', record0[128:132])
have_exth = self.have_exth = (flags & 0x40) != 0 have_exth = self.have_exth = (flags & 0x40) != 0
self.cover_record = self.thumbnail_record = None self.cover_record = self.thumbnail_record = None
self.timestamp = None self.timestamp = None
self.pdbrecords = self.get_pdbrecords()
if not have_exth: if not have_exth:
return self.create_exth()
# Fetch timestamp, cover_record, thumbnail_record
self.fetchEXTHFields()
def fetchEXTHFields(self):
stream = self.stream
record0 = self.record0
# 20:24 = mobiHeaderLength, 16=PDBHeader size
exth_off = unpack('>I', record0[20:24])[0] + 16 + record0.start exth_off = unpack('>I', record0[20:24])[0] + 16 + record0.start
image_base, = unpack('>I', record0[108:112])
# Fetch EXTH block
exth = self.exth = StreamSlicer(stream, exth_off, record0.stop) exth = self.exth = StreamSlicer(stream, exth_off, record0.stop)
nitems, = unpack('>I', exth[8:12]) nitems, = unpack('>I', exth[8:12])
pos = 12 pos = 12
@ -109,6 +136,130 @@ class MetadataUpdater(object):
rindex, = self.thumbnail_rindex, = unpack('>I', content) rindex, = self.thumbnail_rindex, = unpack('>I', content)
self.thumbnail_record = self.record(rindex + image_base) self.thumbnail_record = self.record(rindex + image_base)
def patch(self, off, new_record0):
# Save the current size of each record
record_sizes = [len(new_record0)]
for i in range(1,self.nrecs-1):
record_sizes.append(self.pdbrecords[i+1][0]-self.pdbrecords[i][0])
# And the last one
record_sizes.append(self.data.stop - self.pdbrecords[self.nrecs-1][0])
# pdbrecord[0] is the offset of record0. It will not change
# record1 offset will be offset of record0 + len(new_record0)
updated_pdbrecords = [self.pdbrecords[0][0]]
record0_offset = self.pdbrecords[0][0]
updated_offset = record0_offset + len(new_record0)
for i in range(1,self.nrecs-1):
updated_pdbrecords.append(updated_offset)
updated_offset += record_sizes[i]
# Update the last pdbrecord
updated_pdbrecords.append(updated_offset)
# Read in current records 1 to last
data_blocks = [new_record0]
for i in range(1,self.nrecs):
data_blocks.append(self.data[self.pdbrecords[i][0]:self.pdbrecords[i][0] + record_sizes[i]])
# Rewrite the stream
self.record0.update(data_blocks)
# Rewrite the pdbrecords
self.update_pdbrecords(updated_pdbrecords)
# Truncate if necessary
if (updated_pdbrecords[-1] + record_sizes[-1]) < self.data.stop:
self.data.truncate(updated_pdbrecords[-1] + record_sizes[-1])
else:
self.data.stop = updated_pdbrecords[-1] + record_sizes[-1]
def patchSection(self, section, new):
off = self.pdbrecords[section][0]
self.patch(off, new)
def create_exth(self, exth=None):
    """Rebuild record 0 so that it carries an EXTH block, then rewrite the
    stream through patchSection().

    exth -- the complete EXTH block to insert; when empty or None an EXTH
            block with zero records is generated instead.

    The new record 0 is the first 0x10 + mobi_header_length bytes of the
    current record 0, then the EXTH block, then the title, then padding.
    NOTE(review): anything that followed the title in the old record 0 is not
    copied into the new one - confirm nothing of value lives there.
    """
    # Add an EXTH block to record 0, rewrite the stream
    # self.hexdump(self.record0)

    # Fetch the title
    # (offset and length are stored at 0x54 and 0x58; the offset is used as
    # an index into record 0 itself)
    title_offset, = struct.unpack('>L', self.record0[0x54:0x58])
    title_length, = struct.unpack('>L', self.record0[0x58:0x5c])
    title_in_file, = struct.unpack('%ds' % (title_length), self.record0[title_offset:title_offset + title_length])

    # Adjust length to accommodate PrimaryINDX if necessary
    mobi_header_length, = unpack('>L', self.record0[0x14:0x18])
    if mobi_header_length == 0xe4:
        # Patch mobi_header_length to 0xE8
        # (only the low byte of the 4-byte field at 0x14 changes)
        self.record0[0x17] = "\xe8"
        # The four bytes gained by the longer header are set to 0xFFFFFFFF
        self.record0[0xf4:0xf8] = pack('>L', 0xFFFFFFFF)
        mobi_header_length = 0xe8

    # Set EXTH flag (0x40)
    self.record0[0x80:0x84] = pack('>L', self.flags|0x40)

    if not exth:
        # Construct an empty EXTH block
        # ('EXTH', the value 12, a record count of 0, then 4 bytes of padding)
        pad = '\0' * 4
        exth = ['EXTH', pack('>II', 12, 0), pad]
        exth = ''.join(exth)

    # Update title_offset
    # (the title now sits directly behind the EXTH block)
    self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth))

    # Create an updated Record0
    # (the header slice is taken after the in-place patches above, so it
    # already contains the new length, flag and title offset)
    new_record0 = StringIO()
    new_record0.write(self.record0[:0x10 + mobi_header_length])
    new_record0.write(exth)
    new_record0.write(title_in_file)

    # Pad to a 4-byte boundary
    trail = len(new_record0.getvalue()) % 4
    pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte
    new_record0.write(pad)

    #self.hexdump(new_record0.getvalue())

    # Rebuild the stream, update the pdbrecords pointers
    self.patchSection(0,new_record0.getvalue())

    # Update record0
    # (re-fetch it so it reflects the rewritten stream)
    self.record0 = self.record(0)
def hexdump(self, src, length=16):
    """Diagnostic: print *src* in rows of *length* characters, each row
    showing the starting position in hex, the character codes in hex and the
    characters themselves."""
    # Translation table: a character is kept when its repr() is exactly
    # three characters long (quote, character, quote); anything else is
    # shown as '.'.
    printable = ''.join([chr(code) if len(repr(chr(code))) == 3 else '.' for code in range(256)])
    position = 0
    rows = []
    while src:
        chunk, src = src[:length], src[length:]
        hex_column = ' '.join(["%02X" % ord(ch) for ch in chunk])
        rows.append("%04X %-*s %s\n" % (position, length*3, hex_column, chunk.translate(printable)))
        position += length
    print(''.join(rows))
def get_pdbrecords(self):
pdbrecords = []
for i in xrange(self.nrecs):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.data[78+i*8:78+i*8+8])
flags, val = a1, a2<<16|a3<<8|a4
pdbrecords.append( [offset, flags, val] )
return pdbrecords
def update_pdbrecords(self, updated_pdbrecords):
for (i, pdbrecord) in enumerate(updated_pdbrecords):
self.data[78+i*8:78+i*8 + 4] = pack('>L',pdbrecord)
# Refresh local copy
self.pdbrecords = self.get_pdbrecords()
def dump_pdbrecords(self):
# Diagnostic
print "MetadataUpdater.dump_pdbrecords()"
print "%10s %10s %10s" % ("offset","flags","val")
for i in xrange(len(self.pdbrecords)):
pdbrecord = self.pdbrecords[i]
print "%10X %10X %10X" % (pdbrecord[0], pdbrecord[1], pdbrecord[2])
def record(self, n): def record(self, n):
if n >= self.nrecs: if n >= self.nrecs:
raise ValueError('non-existent record %r' % n) raise ValueError('non-existent record %r' % n)
@ -142,7 +293,6 @@ class MetadataUpdater(object):
if mi.tags: if mi.tags:
subjects = '; '.join(mi.tags) subjects = '; '.join(mi.tags)
recs.append((105, subjects.encode(self.codec, 'replace'))) recs.append((105, subjects.encode(self.codec, 'replace')))
if mi.pubdate: if mi.pubdate:
recs.append((106, str(mi.pubdate).encode(self.codec, 'replace'))) recs.append((106, str(mi.pubdate).encode(self.codec, 'replace')))
elif mi.timestamp: elif mi.timestamp:
@ -151,15 +301,16 @@ class MetadataUpdater(object):
recs.append((106, self.timestamp)) recs.append((106, self.timestamp))
else: else:
recs.append((106, str(datetime.now()).encode(self.codec, 'replace'))) recs.append((106, str(datetime.now()).encode(self.codec, 'replace')))
if self.cover_record is not None: if self.cover_record is not None:
recs.append((201, pack('>I', self.cover_rindex))) recs.append((201, pack('>I', self.cover_rindex)))
recs.append((203, pack('>I', 0))) recs.append((203, pack('>I', 0)))
if self.thumbnail_record is not None: if self.thumbnail_record is not None:
recs.append((202, pack('>I', self.thumbnail_rindex))) recs.append((202, pack('>I', self.thumbnail_rindex)))
exth = StringIO()
if getattr(self, 'encryption_type', -1) != 0: if getattr(self, 'encryption_type', -1) != 0:
raise MobiError('Setting metadata in DRMed MOBI files is not supported.') raise MobiError('Setting metadata in DRMed MOBI files is not supported.')
exth = StringIO()
for code, data in recs: for code, data in recs:
exth.write(pack('>II', code, len(data) + 8)) exth.write(pack('>II', code, len(data) + 8))
exth.write(data) exth.write(data)
@ -168,17 +319,16 @@ class MetadataUpdater(object):
pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte
exth = ['EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad] exth = ['EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad]
exth = ''.join(exth) exth = ''.join(exth)
title = (mi.title or _('Unknown')).encode(self.codec, 'replace')
if getattr(self, 'exth', None) is None: if getattr(self, 'exth', None) is None:
raise MobiError('No existing EXTH record. Cannot update metadata.') raise MobiError('No existing EXTH record. Cannot update metadata.')
title_off = (self.exth.start - self.record0.start) + len(exth)
title_len = len(title)
trail = len(self.exth) - len(exth) - len(title)
if trail < 0:
raise MobiError("Insufficient space to update metadata")
self.exth[:] = ''.join([exth, title, '\0' * trail])
self.record0[84:92] = pack('>II', title_off, title_len)
self.record0[92:96] = iana2mobi(mi.language) self.record0[92:96] = iana2mobi(mi.language)
self.create_exth(exth)
# Fetch updated timestamp, cover_record, thumbnail_record
self.fetchEXTHFields()
if mi.cover_data[1] or mi.cover: if mi.cover_data[1] or mi.cover:
try: try:
data = mi.cover_data[1] if mi.cover_data[1] else open(mi.cover, 'rb').read() data = mi.cover_data[1] if mi.cover_data[1] else open(mi.cover, 'rb').read()