From d5c756a84d06b0f744b5cd0fc0d149ec99453b6d Mon Sep 17 00:00:00 2001 From: GRiker Date: Mon, 25 Jan 2010 10:32:42 -0700 Subject: [PATCH 1/6] GwR revisions to extend EXTH header --- src/calibre/ebooks/metadata/mobi.py | 180 ++++++++++++++++++++++++++-- 1 file changed, 168 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index 2f8c90f4c4..f4809e3cff 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -17,6 +17,8 @@ from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN from calibre.ebooks.mobi.langcodes import iana2mobi +import struct, traceback + class StreamSlicer(object): def __init__(self, stream, start=0, stop=None): @@ -72,6 +74,17 @@ class StreamSlicer(object): return stream.write(value) raise TypeError("stream indices must be integers") + def update(self, data_blocks): + # Rewrite the stream + stream = self._stream + base = self.start + stream.seek(base) + self._stream.truncate(base) + for block in data_blocks: + stream.write(block) + + def truncate(self, value): + self._stream.truncate(value) class MetadataUpdater(object): def __init__(self, stream): @@ -84,13 +97,27 @@ class MetadataUpdater(object): codepage, = unpack('>I', record0[28:32]) self.codec = 'utf-8' if codepage == 65001 else 'cp1252' image_base, = unpack('>I', record0[108:112]) - flags, = unpack('>I', record0[128:132]) + flags, = self.flags, = unpack('>I', record0[128:132]) have_exth = self.have_exth = (flags & 0x40) != 0 self.cover_record = self.thumbnail_record = None self.timestamp = None + + self.pdbrecords = self.get_pdbrecords() if not have_exth: - return + self.create_exth() + + # Fetch timestamp, cover_record, thumbnail_record + self.fetchEXTHFields() + + def fetchEXTHFields(self): + stream = self.stream + record0 = self.record0 + + # 20:24 = mobiHeaderLength, 16=PDBHeader size exth_off = unpack('>I', record0[20:24])[0] + 16 + record0.start + image_base, = unpack('>I', record0[108:112]) + + # Fetch EXTH block exth = self.exth = StreamSlicer(stream, exth_off, record0.stop) nitems, = unpack('>I', exth[8:12]) pos = 12 @@ -108,7 +135,136 @@ class MetadataUpdater(object): elif id == 202: rindex, = self.thumbnail_rindex, = unpack('>I', content) self.thumbnail_record = self.record(rindex + image_base) + + def patch(self, off, new_record0): + # Save the current size of each record + record_sizes = [len(new_record0)] + for i in range(1,self.nrecs-1): + record_sizes.append(self.pdbrecords[i+1][0]-self.pdbrecords[i][0]) + # And the last one + record_sizes.append(self.data.stop - self.pdbrecords[self.nrecs-1][0]) + + # pdbrecord[0] is the offset of record0. It will not change + # record1 offset will be offset of record0 + len(new_record0) + updated_pdbrecords = [self.pdbrecords[0][0]] + record0_offset = self.pdbrecords[0][0] + current_offset = self.pdbrecords[1][0] + updated_offset = record0_offset + len(new_record0) + + for i in range(1,self.nrecs-1): + updated_pdbrecords.append(updated_offset) + updated_offset += record_sizes[i] + # Update the last pdbrecord + updated_pdbrecords.append(updated_offset) + # Read in current records 1 to last + data_blocks = [new_record0] + for i in range(1,self.nrecs): + data_blocks.append(self.data[self.pdbrecords[i][0]:self.pdbrecords[i][0] + record_sizes[i]]) + + # Rewrite the stream + self.record0.update(data_blocks) + + # Rewrite the pdbrecords + self.update_pdbrecords(updated_pdbrecords) + + # Truncate if necessary + if (updated_pdbrecords[-1] + record_sizes[-1]) < self.data.stop: + self.data.truncate(updated_pdbrecords[-1] + record_sizes[-1]) + else: + self.data.stop = updated_pdbrecords[-1] + record_sizes[-1] + + def patchSection(self, section, new): + if (section + 1 == self.nrecs): + endoff = len(self.stream) + else: + endoff = self.pdbrecords[section + 1][0] + off = self.pdbrecords[section][0] + self.patch(off, new) + + def create_exth(self, exth=None): + # Add an EXTH block to record 0, rewrite the stream + # self.hexdump(self.record0) + + # Fetch the title + title_offset, = struct.unpack('>L', self.record0[0x54:0x58]) + title_length, = struct.unpack('>L', self.record0[0x58:0x5c]) + title_in_file, = struct.unpack('%ds' % (title_length), self.record0[title_offset:title_offset + title_length]) + + # Adjust length to accommodate PrimaryINDX if necessary + mobi_header_length, = unpack('>L', self.record0[0x14:0x18]) + if mobi_header_length == 0xe4: + # Patch mobi_header_length to 0xE8 + self.record0[0x17] = "\xe8" + self.record0[0xf4:0xf8] = pack('>L', 0xFFFFFFFF) + mobi_header_length = 0xe8 + + # Set EXTH flag (0x40) + self.record0[0x80:0x84] = pack('>L', self.flags|0x40) + + if not exth: + # Construct an empty EXTH block + pad = '\0' * 4 + exth = ['EXTH', pack('>II', 12, 0), pad] + exth = ''.join(exth) + + # Update title_offset + self.record0[0x54:0x58] = pack('>L', 0x10 + mobi_header_length + len(exth)) + + # Create an updated Record0 + new_record0 = StringIO() + new_record0.write(self.record0[:0x10 + mobi_header_length]) + new_record0.write(exth) + new_record0.write(title_in_file) + + # Pad to a 4-byte boundary + trail = len(new_record0.getvalue()) % 4 + pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte + new_record0.write(pad) + + self.hexdump(new_record0.getvalue()) + + # Rebuild the stream, update the pdbrecords pointers + self.patchSection(0,new_record0.getvalue()) + + # Update record0 + self.record0 = self.record(0) + + def hexdump(self, src, length=16): + # Diagnostic + FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) + N=0; result='' + while src: + s,src = src[:length],src[length:] + hexa = ' '.join(["%02X"%ord(x) for x in s]) + s = s.translate(FILTER) + result += "%04X %-*s %s\n" % (N, length*3, hexa, s) + N+=length + print result + + def get_pdbrecords(self): + pdbrecords = [] + for i in xrange(self.nrecs): + offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.data[78+i*8:78+i*8+8]) + flags, val = a1, a2<<16|a3<<8|a4 + pdbrecords.append( [offset, flags, val] ) + return pdbrecords + + def update_pdbrecords(self, updated_pdbrecords): + for (i, pdbrecord) in enumerate(updated_pdbrecords): + self.data[78+i*8:78+i*8 + 4] = pack('>L',pdbrecord) + + # Refresh local copy + self.pdbrecords = self.get_pdbrecords() + + def dump_pdbrecords(self): + # Diagnostic + print "MetadataUpdater.dump_pdbrecords()" + print "%10s %10s %10s" % ("offset","flags","val") + for i in xrange(len(self.pdbrecords)): + pdbrecord = self.pdbrecords[i] + print "%10X %10X %10X" % (pdbrecord[0], pdbrecord[1], pdbrecord[2]) + def record(self, n): if n >= self.nrecs: raise ValueError('non-existent record %r' % n) @@ -142,7 +298,6 @@ class MetadataUpdater(object): if mi.tags: subjects = '; '.join(mi.tags) recs.append((105, subjects.encode(self.codec, 'replace'))) - if mi.pubdate: recs.append((106, str(mi.pubdate).encode(self.codec, 'replace'))) elif mi.timestamp: @@ -151,15 +306,16 @@ class MetadataUpdater(object): recs.append((106, self.timestamp)) else: recs.append((106, str(datetime.now()).encode(self.codec, 'replace'))) - if self.cover_record is not None: recs.append((201, pack('>I', self.cover_rindex))) recs.append((203, pack('>I', 0))) if self.thumbnail_record is not None: recs.append((202, pack('>I', self.thumbnail_rindex))) - exth = StringIO() + if getattr(self, 'encryption_type', -1) != 0: raise MobiError('Setting metadata in DRMed MOBI files is not supported.') + + exth = StringIO() for code, data in recs: exth.write(pack('>II', code, len(data) + 8)) exth.write(data) @@ -168,17 +324,17 @@ class MetadataUpdater(object): pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte exth = ['EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad] exth = ''.join(exth) + title = (mi.title or _('Unknown')).encode(self.codec, 'replace') if getattr(self, 'exth', None) is None: raise MobiError('No existing EXTH record. Cannot update metadata.') - title_off = (self.exth.start - self.record0.start) + len(exth) - title_len = len(title) - trail = len(self.exth) - len(exth) - len(title) - if trail < 0: - raise MobiError("Insufficient space to update metadata") - self.exth[:] = ''.join([exth, title, '\0' * trail]) - self.record0[84:92] = pack('>II', title_off, title_len) + self.record0[92:96] = iana2mobi(mi.language) + self.create_exth(exth) + + # Fetch updated timestamp, cover_record, thumbnail_record + self.fetchEXTHFields() + if mi.cover_data[1] or mi.cover: try: data = mi.cover_data[1] if mi.cover_data[1] else open(mi.cover, 'rb').read() From ff081d1515b95f8299ead074dc22d83bb66477ed Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jan 2010 01:59:10 -0700 Subject: [PATCH 2/6] New recipe for Macleans Magazine by Nick Redding and improved recipe for Raleigh News and Observer --- resources/images/news/observer.png | Bin 0 -> 835 bytes resources/recipes/macleans.recipe | 239 +++++++++++++++++++++++++++++ resources/recipes/observer.recipe | 37 +++-- 3 files changed, 262 insertions(+), 14 deletions(-) create mode 100644 resources/images/news/observer.png create mode 100644 resources/recipes/macleans.recipe diff --git a/resources/images/news/observer.png b/resources/images/news/observer.png new file mode 100644 index 0000000000000000000000000000000000000000..5fbb7a6ccc5d89e65aab9f7263ea81dab39db99b GIT binary patch literal 835 zcmV-J1HAl+P)BcB5d@K5{K079Ttgw|cV*s*6X&8~~c{W)*0 zoH|a^%oJWMhO{h<)(9yZ9b*~`Fb2FNv4IRbhmK$!*w%JVPmI#)#*vnVkdjE;C7szz zYySYsaqw4H5K?ZIWg)djZEP^VwuV~1Jy^=#~`PuOcH+3O|f9 z`Sx3c>+6k^^!i^cO;4ekBMvH&MAgsS`SK7==`>4U4m0=Z$2@uZEUC;OPFtG7wXeWv zgrzWAQ$?Xv#6~&}wo=rpRcy}#1o`n%vZLSdloaSNq>|6!mrH;EK-xA2kTULK#XZW4 z3uGt8SiCw67;d?^;ZZ=C8BL@sOC7c7ca0j zH(NKTK|w=3!fKWI&pu)IiPQAGdX)Bk&ttSkI!@#BUQaKoDMnaXZ#dIB*xhk1UdW?_ z;QsXqjPK+1^bz0DNmII=^2|-V?-ii3=)CqI^jdT zqAe|i)hc17!hc~ftkewQOGBswuN~#{Y?k7U?~$b<;$d)>mkA9hApk?ofUGkVYX>o^xk8ywICP5IQwf-OT!$7)QxAMp{&(YD<&EFK*D%r8rnxX&z N002ovPDHLkV1hsDiQoVL literal 0 HcmV?d00001 diff --git a/resources/recipes/macleans.recipe b/resources/recipes/macleans.recipe new file mode 100644 index 0000000000..296a56f5f3 --- /dev/null +++ b/resources/recipes/macleans.recipe @@ -0,0 +1,239 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +macleans.ca +''' +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag +from datetime import timedelta, date + +class Macleans(BasicNewsRecipe): + title = u'Macleans Magazine' + __author__ = 'Nick Redding' + language = 'en_CA' + description = ('Macleans Magazine') + + no_stylesheets = True + timefmt = ' [%b %d]' + + # customization notes: delete sections you are not interested in + # set oldest_article to the maximum number of days back from today to include articles + sectionlist = [ + ['http://www2.macleans.ca/','Front Page'], + ['http://www2.macleans.ca/category/canada/','Canada'], + ['http://www2.macleans.ca/category/world-from-the-magazine/','World'], + ['http://www2.macleans.ca/category/business','Business'], + ['http://www2.macleans.ca/category/arts-culture/','Culture'], + ['http://www2.macleans.ca/category/opinion','Opinion'], + ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'], + ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'], + ['http://www2.macleans.ca/category/education/','On Campus'], + ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel'] + ] + oldest_article = 7 + + # formatting for print version of articles + extra_css = '''h2{font-family:Times,serif; font-size:large;} + small {font-family:Times,serif; font-size:xx-small; list-style-type: none;} + ''' + + # tag handling for print version of articles + keep_only_tags = [dict(id='tw-print')] + remove_tags = [dict({'class':'postmetadata'})] + + + def preprocess_html(self,soup): + for img_tag in soup.findAll('img'): + parent_tag = img_tag.parent + if parent_tag.name == 'a': + new_tag = Tag(soup,'p') + new_tag.insert(0,img_tag) + parent_tag.replaceWith(new_tag) + elif parent_tag.name == 'p': + if not self.tag_to_string(parent_tag) == '': + new_div = Tag(soup,'div') + new_tag = Tag(soup,'p') + new_tag.insert(0,img_tag) + parent_tag.replaceWith(new_div) + new_div.insert(0,new_tag) + new_div.insert(1,parent_tag) + return soup + + def parse_index(self): + + + + articles = {} + key = None + ans = [] + + def parse_index_page(page_url,page_title): + + def decode_date(datestr): + dmysplit = datestr.strip().lower().split(',') + mdsplit = dmysplit[1].split() + m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(mdsplit[0])+1 + d = int(mdsplit[1]) + y = int(dmysplit[2].split()[0]) + return date(y,m,d) + + def article_title(tag): + atag = tag.find('a',href=True) + if not atag: + return '' + return self.tag_to_string(atag) + + def article_url(tag): + atag = tag.find('a',href=True) + if not atag: + return '' + return atag['href']+'print/' + + def article_description(tag): + for p_tag in tag.findAll('p'): + d = self.tag_to_string(p_tag,False) + if not d == '': + return d + return '' + + def compound_h4_h3_title(tag): + if tag.h4: + if tag.h3: + return self.tag_to_string(tag.h4,False)+u'\u2014'+self.tag_to_string(tag.h3,False) + else: + return self.tag_to_string(tag.h4,False) + elif tag.h3: + return self.tag_to_string(tag.h3,False) + else: + return '' + + def compound_h2_h4_title(tag): + if tag.h2: + if tag.h4: + return self.tag_to_string(tag.h2,False)+u'\u2014'+self.tag_to_string(tag.h4,False) + else: + return self.tag_to_string(tag.h2,False) + elif tag.h4: + return self.tag_to_string(tag.h4,False) + else: + return '' + + + def handle_article(header_tag, outer_tag): + if header_tag: + url = article_url(header_tag) + title = article_title(header_tag) + author_date_tag = outer_tag.h4 + if author_date_tag: + author_date = self.tag_to_string(author_date_tag,False).split(' - ') + author = author_date[0].strip() + article_date = decode_date(author_date[1]) + earliest_date = date.today() - timedelta(days=self.oldest_article) + if article_date < earliest_date: + self.log("Skipping article dated %s" % author_date[1]) + else: + excerpt_div = outer_tag.find('div','excerpt') + if excerpt_div: + description = article_description(excerpt_div) + else: + description = '' + if not articles.has_key(page_title): + articles[page_title] = [] + articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content='')) + + def handle_category_article(cat, header_tag, outer_tag): + url = article_url(header_tag) + title = article_title(header_tag) + if not title == '': + title = cat+u'\u2014'+title + a_tag = outer_tag.find('span','authorLink') + if a_tag: + author = self.tag_to_string(a_tag,False) + a_tag.parent.extract() + else: + author = '' + description = article_description(outer_tag) + if not articles.has_key(page_title): + articles[page_title] = [] + articles[page_title].append(dict(title=title,url=url,date='',description=description,author=author,content='')) + + + soup = self.index_to_soup(page_url) + + if page_title == 'Front Page': + # special processing for the front page + top_stories = soup.find('div',{ "id" : "macleansFeatured" }) + if top_stories: + for div_slide in top_stories.findAll('div','slide'): + url = article_url(div_slide) + div_title = div_slide.find('div','header') + if div_title: + title = self.tag_to_string(div_title,False) + else: + title = '' + description = article_description(div_slide) + if not articles.has_key(page_title): + articles[page_title] = [] + articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + from_macleans = soup.find('div',{ "id" : "fromMacleans" }) + if from_macleans: + for li_tag in from_macleans.findAll('li','fromMacleansArticle'): + title = compound_h4_h3_title(li_tag) + url = article_url(li_tag) + description = article_description(li_tag) + if not articles.has_key(page_title): + articles[page_title] = [] + articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + blog_central = soup.find('div',{ "id" : "bloglist" }) + if blog_central: + for li_tag in blog_central.findAll('li'): + title = compound_h2_h4_title(li_tag) + if li_tag.h4: + url = article_url(li_tag.h4) + if not articles.has_key(page_title): + articles[page_title] = [] + articles[page_title].append(dict(title=title,url=url,date='',description='',author='',content='')) + +# need_to_know = soup.find('div',{ "id" : "needToKnow" }) +# if need_to_know: +# for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}): +# title = compound_h4_h3_title(div_tag) +# url = article_url(div_tag) +# description = article_description(div_tag) +# if not articles.has_key(page_title): +# articles[page_title] = [] +# articles[page_title].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + for news_category in soup.findAll('div','newsCategory'): + news_cat = self.tag_to_string(news_category.h4,False) + handle_category_article(news_cat, news_category.find('h2'), news_category.find('div')) + for news_item in news_category.findAll('li'): + handle_category_article(news_cat,news_item.h3,news_item) + + return + + # find the div containing the highlight article + div_post = soup.find('div','post') + if div_post: + h1_tag = div_post.h1 + handle_article(h1_tag,div_post) + + # find the divs containing the rest of the articles + div_other = div_post.find('div', { "id" : "categoryOtherPosts" }) + if div_other: + for div_entry in div_other.findAll('div','entry'): + h2_tag = div_entry.h2 + handle_article(h2_tag,div_entry) + + + + for page_name,page_title in self.sectionlist: + parse_index_page(page_name,page_title) + ans.append(page_title) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/observer.recipe b/resources/recipes/observer.recipe index 139d1ff7d4..dec9da8f37 100644 --- a/resources/recipes/observer.recipe +++ b/resources/recipes/observer.recipe @@ -1,31 +1,40 @@ from calibre.web.feeds.news import BasicNewsRecipe class NewsandObserver(BasicNewsRecipe): - title = u'News and Observer' + title = u'Raleigh News & Observer' description = 'News from Raleigh, North Carolina' language = 'en' - __author__ = 'Krittika Goyal' - oldest_article = 5 #days + __author__ = 'Krittika Goyal updated by Walt Anthony' + oldest_article = 3 #days max_articles_per_feed = 25 + summary_length = 150 + + no_stylesheets = True + remove_javascript = True - remove_stylesheets = True remove_tags_before = dict(name='h1', attrs={'id':'story_headline'}) - remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'}) + remove_tags_after = dict(name='div', attrs={'id':'story_text_remaining'}) + + remove_tags = [ dict(name='iframe'), - dict(name='div', attrs={'id':['right-rail', 'story_tools']}), + dict(name='div', attrs={'id':['right-rail', 'story_tools', 'toolbox', 'toolbar', 'tool', 'shirttail', 'comment_widget', 'story_keywords', 'txtResizeTool']}), + dict(name='div', attrs={'class':['Buy-It-Now', 'story_link_share']}), dict(name='ul', attrs={'class':'bold_tabs_nav'}), + ] + feeds = [ - ('Cover', 'http://www.newsobserver.com/100/index.rss'), - ('News', 'http://www.newsobserver.com/102/index.rss'), - ('Politics', 'http://www.newsobserver.com/105/index.rss'), - ('Business', 'http://www.newsobserver.com/104/index.rss'), - ('Sports', 'http://www.newsobserver.com/103/index.rss'), - ('College Sports', 'http://www.newsobserver.com/119/index.rss'), - ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'), - ('Editorials', 'http://www.newsobserver.com/158/index.rss')] + ('Cover', 'http://www.newsobserver.com/100/index.rss'), + ('News', 'http://www.newsobserver.com/102/index.rss'), + ('Politics', 'http://www.newsobserver.com/105/index.rss'), + ('Business', 'http://www.newsobserver.com/104/index.rss'), + ('Sports', 'http://www.newsobserver.com/103/index.rss'), + ('College Sports', 'http://www.newsobserver.com/119/index.rss'), + ('Lifestyles', 'http://www.newsobserver.com/106/index.rss'), + ('Editorials', 'http://www.newsobserver.com/158/index.rss') + ] From 78888577c5e5f3522518c3e6b84100bc0db1bd9c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jan 2010 08:51:32 -0700 Subject: [PATCH 3/6] Implement #4689 (Updated recipe for Pagina 12) --- resources/recipes/pagina12.recipe | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/resources/recipes/pagina12.recipe b/resources/recipes/pagina12.recipe index 3a271e055d..bcf73c7b1c 100644 --- a/resources/recipes/pagina12.recipe +++ b/resources/recipes/pagina12.recipe @@ -1,10 +1,12 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic ' +__copyright__ = '2008-2010, Darko Miletic ' ''' pagina12.com.ar ''' +import time +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class Pagina12(BasicNewsRecipe): @@ -19,6 +21,8 @@ class Pagina12(BasicNewsRecipe): encoding = 'cp1252' use_embedded_content = False language = 'es' + remove_empty_feeds = True + extra_css = ' body{font-family: sans-serif} ' conversion_options = { 'comment' : description @@ -28,7 +32,7 @@ class Pagina12(BasicNewsRecipe): } remove_tags = [dict(name='div', attrs={'id':['volver','logo','logo_suple','fin','permalink']})] - + feeds = [ (u'Edicion impresa', u'http://www.pagina12.com.ar/diario/rss/principal.xml' ) @@ -47,3 +51,8 @@ class Pagina12(BasicNewsRecipe): def print_version(self, url): return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/') + def get_cover_url(self): + imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg'] + weekday = time.localtime().tm_wday + return strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/') + imgnames[weekday] + \ No newline at end of file From c3cba391b73c2d322d9cc58409651778d713ba22 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jan 2010 09:29:53 -0700 Subject: [PATCH 4/6] Fix #4690 (Error while sending book with non-ascii character in title/author to device) --- src/calibre/devices/usbms/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index 6ddfc81cf3..5cfb60dede 100644 --- a/src/calibre/devices/usbms/device.py +++ b/src/calibre/devices/usbms/device.py @@ -808,6 +808,8 @@ class Device(DeviceConfig, DevicePlugin): ext = os.path.splitext(fname)[1] from calibre.library.save_to_disk import get_components + if not isinstance(template, unicode): + template = template.decode('utf-8') extra_components = get_components(template, mdata, fname) if not extra_components: extra_components.append(sanitize(self.filename_callback(fname, From 0a73a7cf514fbad0ce5264cae43a7b796612dcda Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jan 2010 10:06:56 -0700 Subject: [PATCH 5/6] Remove ununsed vars from mobi.py --- src/calibre/ebooks/metadata/mobi.py | 48 +++++++++++++---------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index f4809e3cff..def23c47e9 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -17,7 +17,7 @@ from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.writer import rescale_image, MAX_THUMB_DIMEN from calibre.ebooks.mobi.langcodes import iana2mobi -import struct, traceback +import struct class StreamSlicer(object): @@ -74,15 +74,15 @@ class StreamSlicer(object): return stream.write(value) raise TypeError("stream indices must be integers") - def update(self, data_blocks): + def update(self, data_blocks): # Rewrite the stream stream = self._stream base = self.start stream.seek(base) self._stream.truncate(base) - for block in data_blocks: + for block in data_blocks: stream.write(block) - + def truncate(self, value): self._stream.truncate(value) @@ -90,7 +90,7 @@ class MetadataUpdater(object): def __init__(self, stream): self.stream = stream data = self.data = StreamSlicer(stream) - type = self.type = data[60:68] + self.type = data[60:68] self.nrecs, = unpack('>H', data[76:78]) record0 = self.record0 = self.record(0) self.encryption_type, = unpack('>H', record0[12:14]) @@ -135,25 +135,24 @@ class MetadataUpdater(object): elif id == 202: rindex, = self.thumbnail_rindex, = unpack('>I', content) self.thumbnail_record = self.record(rindex + image_base) - - def patch(self, off, new_record0): + + def patch(self, off, new_record0): # Save the current size of each record record_sizes = [len(new_record0)] for i in range(1,self.nrecs-1): record_sizes.append(self.pdbrecords[i+1][0]-self.pdbrecords[i][0]) # And the last one record_sizes.append(self.data.stop - self.pdbrecords[self.nrecs-1][0]) - + # pdbrecord[0] is the offset of record0. It will not change # record1 offset will be offset of record0 + len(new_record0) updated_pdbrecords = [self.pdbrecords[0][0]] record0_offset = self.pdbrecords[0][0] - current_offset = self.pdbrecords[1][0] updated_offset = record0_offset + len(new_record0) - + for i in range(1,self.nrecs-1): updated_pdbrecords.append(updated_offset) - updated_offset += record_sizes[i] + updated_offset += record_sizes[i] # Update the last pdbrecord updated_pdbrecords.append(updated_offset) @@ -175,17 +174,13 @@ class MetadataUpdater(object): self.data.stop = updated_pdbrecords[-1] + record_sizes[-1] def patchSection(self, section, new): - if (section + 1 == self.nrecs): - endoff = len(self.stream) - else: - endoff = self.pdbrecords[section + 1][0] off = self.pdbrecords[section][0] self.patch(off, new) def create_exth(self, exth=None): # Add an EXTH block to record 0, rewrite the stream # self.hexdump(self.record0) - + # Fetch the title title_offset, = struct.unpack('>L', self.record0[0x54:0x58]) title_length, = struct.unpack('>L', self.record0[0x58:0x5c]) @@ -198,10 +193,10 @@ class MetadataUpdater(object): self.record0[0x17] = "\xe8" self.record0[0xf4:0xf8] = pack('>L', 0xFFFFFFFF) mobi_header_length = 0xe8 - + # Set EXTH flag (0x40) self.record0[0x80:0x84] = pack('>L', self.flags|0x40) - + if not exth: # Construct an empty EXTH block pad = '\0' * 4 @@ -216,23 +211,23 @@ class MetadataUpdater(object): new_record0.write(self.record0[:0x10 + mobi_header_length]) new_record0.write(exth) new_record0.write(title_in_file) - + # Pad to a 4-byte boundary trail = len(new_record0.getvalue()) % 4 pad = '\0' * (4 - trail) # Always pad w/ at least 1 byte new_record0.write(pad) self.hexdump(new_record0.getvalue()) - + # Rebuild the stream, update the pdbrecords pointers self.patchSection(0,new_record0.getvalue()) - + # Update record0 self.record0 = self.record(0) - + def hexdump(self, src, length=16): # Diagnostic - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) + FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) N=0; result='' while src: s,src = src[:length],src[length:] @@ -249,14 +244,14 @@ class MetadataUpdater(object): flags, val = a1, a2<<16|a3<<8|a4 pdbrecords.append( [offset, flags, val] ) return pdbrecords - + def update_pdbrecords(self, updated_pdbrecords): for (i, pdbrecord) in enumerate(updated_pdbrecords): self.data[78+i*8:78+i*8 + 4] = pack('>L',pdbrecord) # Refresh local copy self.pdbrecords = self.get_pdbrecords() - + def dump_pdbrecords(self): # Diagnostic print "MetadataUpdater.dump_pdbrecords()" @@ -264,7 +259,7 @@ class MetadataUpdater(object): for i in xrange(len(self.pdbrecords)): pdbrecord = self.pdbrecords[i] print "%10X %10X %10X" % (pdbrecord[0], pdbrecord[1], pdbrecord[2]) - + def record(self, n): if n >= self.nrecs: raise ValueError('non-existent record %r' % n) @@ -325,7 +320,6 @@ class MetadataUpdater(object): exth = ['EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad] exth = ''.join(exth) - title = (mi.title or _('Unknown')).encode(self.codec, 'replace') if getattr(self, 'exth', None) is None: raise MobiError('No existing EXTH record. Cannot update metadata.') From b476baeaca132dbad59a374901d0cfbbb0b5bbda Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 26 Jan 2010 10:08:32 -0700 Subject: [PATCH 6/6] Updated recipe for DIscover Magazine and new Recipes for The Metro Montreal and The Gazette by Jerry Clapperton --- resources/recipes/discover_magazine.recipe | 56 ++++++++++++++-------- resources/recipes/metro_montreal.recipe | 24 ++++++++++ resources/recipes/pagina12.recipe | 4 +- resources/recipes/the_gazette.recipe | 22 +++++++++ 4 files changed, 85 insertions(+), 21 deletions(-) create mode 100644 resources/recipes/metro_montreal.recipe create mode 100644 resources/recipes/the_gazette.recipe diff --git a/resources/recipes/discover_magazine.recipe b/resources/recipes/discover_magazine.recipe index a3562bbbd1..0d2ee3ee74 100644 --- a/resources/recipes/discover_magazine.recipe +++ b/resources/recipes/discover_magazine.recipe @@ -10,26 +10,44 @@ doscovermagazine.com from calibre.web.feeds.news import BasicNewsRecipe class DiscoverMagazine(BasicNewsRecipe): + title = u'Discover Magazine' - description = u'Science, Technology and the Future' - __author__ = 'Mike Diaz' - oldest_article = 33 + description = u'Science, Technology and the Future' + __author__ = 'Mike Diaz' language = 'en' - - max_articles_per_feed = 20 + + oldest_article = 33 + max_articles_per_feed = 20 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + encoding = 'utf-8' + + extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' + + remove_tags_before = dict(id='articlePage') + + keep_only_tags = [dict(name='div', attrs={'id':'articlePage'})] + + remove_tags = [dict(attrs={'id':['buttons', 'tool-box', 'teaser', 'already-subscriber', 'teaser-suite', 'related-articles', 'relatedItem', 'box-popular', 'box-blogs', 'box-news', 'footer']}), + dict(attrs={'class':'popularNewsBox'}), + dict(name=['img', 'style', 'head'])] + + remove_tags_after = dict(id='articlePage') + feeds = [ - (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'), - (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'), - (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'), - (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'), - (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'), - (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'), - (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'), - (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'), - (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'), - (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'), - (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'), - (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'), - (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'), + (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'), + (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'), + (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'), + (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'), + (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'), + (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'), + (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'), + (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'), + (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'), + (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'), + (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'), + (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'), + (u'Stupid Science Word of the Month', u'http://discovermagazine.com/columns/stupid-science-word-of-the-month/rss.xml'), (u'Science Not Fiction', u'http://blogs.discovermagazine.com/sciencenotfiction/wp-rss.php') - ] \ No newline at end of file + ] diff --git a/resources/recipes/metro_montreal.recipe b/resources/recipes/metro_montreal.recipe new file mode 100644 index 0000000000..9c308a91d8 --- /dev/null +++ b/resources/recipes/metro_montreal.recipe @@ -0,0 +1,24 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class Metro_Montreal(BasicNewsRecipe): + + title = u'M\xe9tro Montr\xe9al' + __author__ = 'Jerry Clapperton' + description = 'Le quotidien le plus branché sur le monde' + language = 'fr' + + oldest_article = 7 + max_articles_per_feed = 20 + use_embedded_content = False + remove_javascript = True + no_stylesheets = True + encoding = 'utf-8' + + extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' + + remove_tags = [dict(attrs={'id':'buttons'}), dict(name=['img', 'style'])] + + feeds = [(u"L'info", u'http://journalmetro.com/linfo/rss'), (u'Monde', u'http://journalmetro.com/monde/rss'), (u'Culture', u'http://journalmetro.com/culture/rss'), (u'Sports', u'http://journalmetro.com/sports/rss'), (u'Paroles', u'http://journalmetro.com/paroles/rss')] + + def print_version(self, url): + return url.replace('article', 'ArticlePrint') + '?language=fr' diff --git a/resources/recipes/pagina12.recipe b/resources/recipes/pagina12.recipe index bcf73c7b1c..2fb433dc82 100644 --- a/resources/recipes/pagina12.recipe +++ b/resources/recipes/pagina12.recipe @@ -32,7 +32,7 @@ class Pagina12(BasicNewsRecipe): } remove_tags = [dict(name='div', attrs={'id':['volver','logo','logo_suple','fin','permalink']})] - + feeds = [ (u'Edicion impresa', u'http://www.pagina12.com.ar/diario/rss/principal.xml' ) @@ -55,4 +55,4 @@ class Pagina12(BasicNewsRecipe): imgnames = ['tapan.jpg','tapagn.jpg','tapan_gr.jpg','tapagn.jpg','tapagn.jpg','tapan.jpg','tapagn.jpg'] weekday = time.localtime().tm_wday return strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/') + imgnames[weekday] - \ No newline at end of file + diff --git a/resources/recipes/the_gazette.recipe b/resources/recipes/the_gazette.recipe new file mode 100644 index 0000000000..19afff986e --- /dev/null +++ b/resources/recipes/the_gazette.recipe @@ -0,0 +1,22 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class The_Gazette(BasicNewsRecipe): + + cover_url = 'file:///D:/Documents/Pictures/Covers/The_Gazette.jpg' + title = u'The Gazette' + __author__ = 'Jerry Clapperton' + description = 'Montreal news in English' + language = 'en_CA' + + oldest_article = 7 + max_articles_per_feed = 20 + use_embedded_content = False + remove_javascript = True + no_stylesheets = True + encoding = 'utf-8' + + keep_only_tags = [dict(name='div', attrs={'id':['storyheader','page1']})] + + extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' + + feeds = [(u'News', u'http://feeds.canada.com/canwest/F297'), (u'Opinion', u'http://feeds.canada.com/canwest/F7383'), (u'Arts', u'http://feeds.canada.com/canwest/F7366'), (u'Life', u'http://rss.canada.com/get/?F6934'), (u'Business', u'http://feeds.canada.com/canwest/F6939'), (u'Travel', u'http://rss.canada.com/get/?F6938'), (u'Health', u'http://feeds.canada.com/canwest/F7397'), (u'Technology', u'http://feeds.canada.com/canwest/F7411')]